- Install the mlbench package and use the following code to import the data.
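# One-time installs, in case these packages are not already available
# (mlbench is named in the prompt; the others are used below):
install.packages("mlbench")
install.packages(c("tidyverse", "caret", "rpart"))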
library(mlbench)
library(tidyverse)
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
# Rename the outcome column to "target" for convenience
df <- df %>%
  rename(target = diabetes)
set.seed(2020)
- Partition the data into 80% training and 20% testing.
library(caret)
# Stratified 80/20 split on the target
splitIndex <- createDataPartition(df$target, p = 0.80,
                                  list = FALSE)
df_train <- df[splitIndex, ]
df_test  <- df[-splitIndex, ]
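# Optional sanity check (not part of the prompt): createDataPartition
# stratifies on the target, so the class mix should be nearly identical
# in the two splits.
prop.table(table(df_train$target))
prop.table(table(df_test$target))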
- Use 10-fold cross-validation to find the maxdepth with the greatest accuracy, searching maxdepth from 1 to 10. Plot the accuracy associated with each maxdepth against maxdepth.
# Search maxdepth = 1..10 with 10-fold cross-validation
tuneGrid_cv <- expand.grid(maxdepth = 1:10)
trControl_cv <- trainControl(method = "cv", number = 10)
tree_cv <- train(target ~ ., data = df_train, method = "rpart2",
                 trControl = trControl_cv, tuneGrid = tuneGrid_cv)
plot(tree_cv)
print(tree_cv)
## CART
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 554, 554, 554, 554, 553, 554, ...
## Resampling results across tuning parameters:
##
## maxdepth Accuracy Kappa
## 1 0.7398202 0.3630173
## 2 0.7367530 0.4149078
## 3 0.7367530 0.4149078
## 4 0.7367530 0.4149078
## 5 0.7335008 0.4126352
## 6 0.7367266 0.4210583
## 7 0.7302485 0.4080404
## 8 0.7302485 0.4080404
## 9 0.7237705 0.3891845
## 10 0.7221576 0.3804269
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 1.
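# The selected depth can also be read off the fitted object directly;
# per the printout above this returns maxdepth = 1.
tree_cv$bestTune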
- Make the final decision on the maxdepth for your decision tree. Is your selected maxdepth the same as the maxdepth found in part 2?
# I select a max depth of 1, the same value caret chose: it has the highest cross-validated accuracy and is also the simplest model.
- Calculate the accuracy of your decision tree (the tree with the maxdepth selected in part 3) on the test data.
# Max Depth of 1
pred_cv <- predict(tree_cv, df_test)
cm_cv <- confusionMatrix(data = pred_cv, reference = df_test$target, positive = "pos")
cm_cv$overall[1]
## Accuracy
## 0.7254902
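# For reference, the full confusion matrix on the test set shows where
# the errors fall:
cm_cv$table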
- Redo parts 2-4 with a resampling method other than cross-validation.
# Approach 2: leave-group-out CV (repeated train/test splits)
tuneGrid <- expand.grid(maxdepth = 1:10)
trControl <- trainControl(method = "LGOCV",
                          number = 10)
tree_approach2 <- train(target ~ ., data = df_train,
                        method = "rpart2",
                        trControl = trControl,
                        tuneGrid = tuneGrid)
plot(tree_approach2)
print(tree_approach2)
## CART
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Repeated Train/Test Splits Estimated (10 reps, 75%)
## Summary of sample sizes: 462, 462, 462, 462, 462, 462, ...
## Resampling results across tuning parameters:
##
## maxdepth Accuracy Kappa
## 1 0.7379085 0.3708796
## 2 0.7516340 0.4363065
## 3 0.7516340 0.4363065
## 4 0.7562092 0.4513885
## 5 0.7535948 0.4461648
## 6 0.7568627 0.4506245
## 7 0.7575163 0.4513745
## 8 0.7581699 0.4496120
## 9 0.7555556 0.4473423
## 10 0.7555556 0.4466964
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 8.
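# Comparing the resampled accuracies numerically before choosing a depth:
tree_approach2$results[, c("maxdepth", "Accuracy")]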
# I choose the model with a max depth of 7: it is simpler than the depth-8 model that caret selected, and its accuracy is nearly identical (0.7575 vs. 0.7582).
# Max Depth of 7
library(rpart)
# Fit the final tree directly with rpart, capping the depth at 7
# (rpart's other defaults, e.g. cp = 0.01, still apply)
tree_model <- rpart(target ~ ., data = df_train,
                    control = rpart.control(maxdepth = 7))
pred <- predict(tree_model, df_test, type = "class")
cm <- confusionMatrix(data = pred, reference = df_test$target, positive = "pos")
cm$overall[1]
## Accuracy
## 0.7712418
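# Test accuracies of the two final models side by side (the labels are
# mine, for readability):
c(cv_depth1    = unname(cm_cv$overall["Accuracy"]),
  lgocv_depth7 = unname(cm$overall["Accuracy"]))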
# The tree with a max depth of 7 is the better final model: its test accuracy (0.7712) exceeds that of the depth-1 tree (0.7255).
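# A sketch of an equivalent caret-based fit that pins maxdepth = 7
# (trainControl(method = "none") fits a single model at the supplied
# tuning value); results may differ slightly from the plain rpart fit
# above depending on rpart's defaults.
tree_final <- train(target ~ ., data = df_train, method = "rpart2",
                    trControl = trainControl(method = "none"),
                    tuneGrid = expand.grid(maxdepth = 7))
confusionMatrix(predict(tree_final, df_test),
                df_test$target, positive = "pos")$overall[1]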