1. Install the package mlbench and use the following to import the data:
library(mlbench)
library(tidyverse)
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
df <- df %>% 
  rename(target = diabetes)
set.seed(2020)
library(caret)
splitIndex <- createDataPartition(df$target, p = .80, 
                                  list = FALSE)
df_train <- df[ splitIndex,]
df_test <- df[-splitIndex,]
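# Quick sanity check (not required by the exercise): createDataPartition
# stratifies on the outcome, so train and test should show similar class
# proportions.
prop.table(table(df_train$target))
prop.table(table(df_test$target))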

2. Use 10-fold cross-validation to find the maxdepth with the greatest accuracy. Plot the accuracy associated with each maxdepth against the maxdepth values. The range to search for maxdepth is 1 to 10.
tuneGrid_cv <- expand.grid(maxdepth = 1:10)
trControl_cv <- trainControl(method = "cv", number = 10)
tree_cv <- train(target ~ ., data = df_train, method = "rpart2",
                 trControl = trControl_cv, tuneGrid = tuneGrid_cv)
plot(tree_cv)

print(tree_cv)
## CART 
## 
## 615 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 554, 554, 554, 554, 553, 554, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  Accuracy   Kappa    
##    1        0.7398202  0.3630173
##    2        0.7367530  0.4149078
##    3        0.7367530  0.4149078
##    4        0.7367530  0.4149078
##    5        0.7335008  0.4126352
##    6        0.7367266  0.4210583
##    7        0.7302485  0.4080404
##    8        0.7302485  0.4080404
##    9        0.7237705  0.3891845
##   10        0.7221576  0.3804269
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 1.
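# Aside: the tuning summary can also be pulled off the train object directly
# instead of being read from the printout.
tree_cv$bestTune                              # selected maxdepth
tree_cv$results[, c("maxdepth", "Accuracy")]  # resampled accuracy per depth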

3. Make the final decision on the maxdepth for your decision tree. Is your selected maxdepth the same as the one found in 2?
# I select the tree with a max depth of 1, the same model caret selected: it is the simplest candidate and also has the highest cross-validated accuracy.
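# A minimal sketch for inspecting the chosen model; it assumes the rpart.plot
# package is installed. tree_cv$finalModel is the fitted rpart object.
library(rpart.plot)
rpart.plot(tree_cv$finalModel)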

4. Calculate the accuracy of your decision tree (the tree with the maxdepth selected in 3) on the test data.
# Max Depth of 1
pred_cv <- predict(tree_cv, df_test)
cm_cv <- confusionMatrix(data = pred_cv, reference = df_test$target, positive = "pos")
cm_cv$overall[1]
##  Accuracy 
## 0.7254902
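# Accuracy alone can mask class-specific behaviour; the same confusionMatrix
# object also carries per-class metrics.
cm_cv$byClass[c("Sensitivity", "Specificity")]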

5. Redo 2-4 with an alternative resampling method to cross-validation.
# Approach 2: leave-group-out cross-validation (LGOCV), i.e. repeated
# train/test splits. With number = 10, caret draws 10 random splits; the
# default training fraction is 75%.
tuneGrid <- expand.grid(maxdepth = 1:10)
trControl <- trainControl(method = "LGOCV",
                          number = 10)
tree_approach2 <- train(target ~ ., data = df_train,
                        method = "rpart2",
                        trControl = trControl,
                        tuneGrid = tuneGrid)
plot(tree_approach2)

print(tree_approach2)
## CART 
## 
## 615 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Repeated Train/Test Splits Estimated (10 reps, 75%) 
## Summary of sample sizes: 462, 462, 462, 462, 462, 462, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  Accuracy   Kappa    
##    1        0.7379085  0.3708796
##    2        0.7516340  0.4363065
##    3        0.7516340  0.4363065
##    4        0.7562092  0.4513885
##    5        0.7535948  0.4461648
##    6        0.7568627  0.4506245
##    7        0.7575163  0.4513745
##    8        0.7581699  0.4496120
##    9        0.7555556  0.4473423
##   10        0.7555556  0.4466964
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 8.
# I choose the model with a max depth of 7: it is simpler than the maxdepth-8 model and its accuracy is nearly identical.
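# Alternative sketch: rather than refitting with rpart directly (as done
# below), the chosen maxdepth could be fixed inside caret so the fit goes
# through the same rpart2 wrapper used during tuning; method = "none" fits a
# single model with no resampling. (tree_d7 is an illustrative name.)
tree_d7 <- train(target ~ ., data = df_train, method = "rpart2",
                 trControl = trainControl(method = "none"),
                 tuneGrid = data.frame(maxdepth = 7))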

# Max Depth of 7
library(rpart)
tree_model <- rpart(target ~ ., data = df_train,
                    control = rpart.control(maxdepth = 7))
pred <- predict(tree_model, df_test, type = "class")  # class labels, not probabilities
cm <- confusionMatrix(data = pred, reference = df_test$target, positive = "pos")
cm$overall[1]
##  Accuracy 
## 0.7712418
# On the test data the maxdepth-7 tree outperforms the maxdepth-1 tree (0.771 vs. 0.725), so the best model is the tree with a max depth of 7.
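# Convenience summary (not part of the exercise): test accuracy of the two
# tuned trees computed above, side by side.
data.frame(
  method = c("10-fold CV, maxdepth = 1", "LGOCV, maxdepth = 7"),
  test_accuracy = c(cm_cv$overall["Accuracy"], cm$overall["Accuracy"])
)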