1. Install the package mlbench and use the following code to import the data.
library(mlbench)
library(tidyverse)
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
df <- df %>% 
  rename(target = diabetes)
set.seed(2020)
library(caret)
splitIndex <- createDataPartition(df$target, p = .80, 
                                  list = FALSE)
df_train <- df[ splitIndex,]
df_test <- df[-splitIndex,]
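
As a quick sanity check (not part of the original assignment), the class proportions in the training and test sets can be compared to confirm that createDataPartition produced a stratified split:

# class balance should be roughly the same in both sets
prop.table(table(df_train$target))
prop.table(table(df_test$target))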

2. Use 30-fold cross-validation to tune a random forest (method = 'rf'). What is the mtry value that produces the greatest accuracy?
# candidate values of mtry, the number of predictors sampled at each split
tuneGrid <- expand.grid(mtry = 2:4)
# 30-fold cross-validation
trControl <- trainControl(method = "cv",
                          number = 30)
forest_cv <- train(target ~ ., data = df_train,
                   method = "rf",
                   trControl = trControl,
                   tuneGrid = tuneGrid)
plot(forest_cv)

print(forest_cv)
## Random Forest 
## 
## 615 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (30 fold) 
## Summary of sample sizes: 594, 594, 594, 594, 594, 594, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7619048  0.4579314
##   3     0.7586508  0.4527499
##   4     0.7540476  0.4391002
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
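
The same answer can also be read off programmatically. This is a small extra check, not part of the original output, using the standard bestTune and results components of a caret train object:

forest_cv$bestTune                # mtry = 2
max(forest_cv$results$Accuracy)   # about 0.762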

3. Use 30-fold cross-validation to tune a random forest via ranger (method = 'ranger'). What are the parameter values that produce the greatest accuracy?
library(ranger)
# ranger tunes three parameters: mtry, splitrule, and min.node.size
tuneGrid1 <- expand.grid(mtry = 2:4,
                         splitrule = c("gini", "extratrees"),
                         min.node.size = 1:10)
trControl1 <-  trainControl(method = "cv",
                            number = 30)
forest_cv1 <- train(target~., data=df_train, 
                                method = "ranger", 
                                trControl = trControl1,
                                tuneGrid = tuneGrid1)
plot(forest_cv1)

print(forest_cv1)
## Random Forest 
## 
## 615 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (30 fold) 
## Summary of sample sizes: 594, 595, 594, 595, 594, 594, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   min.node.size  Accuracy   Kappa    
##   2     gini         1             0.7553319  0.4457612
##   2     gini         2             0.7650938  0.4662415
##   2     gini         3             0.7587374  0.4562096
##   2     gini         4             0.7569192  0.4486464
##   2     gini         5             0.7635859  0.4670094
##   2     gini         6             0.7635786  0.4660176
##   2     gini         7             0.7601732  0.4563121
##   2     gini         8             0.7681169  0.4726715
##   2     gini         9             0.7637374  0.4645133
##   2     gini        10             0.7635786  0.4683995
##   2     extratrees   1             0.7617605  0.4478260
##   2     extratrees   2             0.7637302  0.4497748
##   2     extratrees   3             0.7648557  0.4542585
##   2     extratrees   4             0.7730303  0.4729684
##   2     extratrees   5             0.7585786  0.4388954
##   2     extratrees   6             0.7617532  0.4460765
##   2     extratrees   7             0.7681818  0.4577856
##   2     extratrees   8             0.7616667  0.4414683
##   2     extratrees   9             0.7664358  0.4551059
##   2     extratrees  10             0.7696898  0.4629252
##   3     gini         1             0.7551732  0.4489382
##   3     gini         2             0.7519913  0.4441697
##   3     gini         3             0.7684271  0.4745410
##   3     gini         4             0.7552525  0.4462296
##   3     gini         5             0.7601732  0.4557906
##   3     gini         6             0.7603247  0.4598666
##   3     gini         7             0.7589683  0.4550149
##   3     gini         8             0.7602453  0.4613334
##   3     gini         9             0.7615945  0.4600149
##   3     gini        10             0.7620707  0.4636517
##   3     extratrees   1             0.7619048  0.4527152
##   3     extratrees   2             0.7600866  0.4502276
##   3     extratrees   3             0.7570707  0.4448358
##   3     extratrees   4             0.7651659  0.4633151
##   3     extratrees   5             0.7716739  0.4741049
##   3     extratrees   6             0.7684271  0.4673906
##   3     extratrees   7             0.7763636  0.4853232
##   3     extratrees   8             0.7600938  0.4473356
##   3     extratrees   9             0.7653247  0.4609842
##   3     extratrees  10             0.7666811  0.4619098
##   4     gini         1             0.7582756  0.4564950
##   4     gini         2             0.7648629  0.4683228
##   4     gini         3             0.7666017  0.4727282
##   4     gini         4             0.7554040  0.4497458
##   4     gini         5             0.7585786  0.4577724
##   4     gini         6             0.7488961  0.4282760
##   4     gini         7             0.7554040  0.4493349
##   4     gini         8             0.7602453  0.4572937
##   4     gini         9             0.7603247  0.4615195
##   4     gini        10             0.7569120  0.4549579
##   4     extratrees   1             0.7618326  0.4549654
##   4     extratrees   2             0.7523882  0.4348504
##   4     extratrees   3             0.7652453  0.4615495
##   4     extratrees   4             0.7554906  0.4388246
##   4     extratrees   5             0.7684199  0.4705288
##   4     extratrees   6             0.7716017  0.4804711
##   4     extratrees   7             0.7761977  0.4878911
##   4     extratrees   8             0.7667532  0.4670799
##   4     extratrees   9             0.7653247  0.4621284
##   4     extratrees  10             0.7603247  0.4501064
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = extratrees
##  and min.node.size = 7.
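
With the tuned values in hand, the same forest could be refit directly with ranger outside of caret. This is only a sketch of how the tuned parameters map onto ranger's own arguments, not something the assignment asks for (the object name ranger_direct is just illustrative):

# refit using the parameters selected by caret
ranger_direct <- ranger(target ~ ., data = df_train,
                        mtry = 3,
                        splitrule = "extratrees",
                        min.node.size = 7)
ranger_direct$prediction.error   # out-of-bag error as an additional check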

4. Go to https://topepo.github.io/caret/available-models.html, pick a classification model, and tune it using 30-fold cross-validation.
library(caTools)
# number of boosting iterations to try
tuneGrid2 <- expand.grid(nIter = 2:5)
trControl2 <- trainControl(method = "cv",
                           number = 30)
boost_model <- train(target~., data=df_train, 
                                method = "LogitBoost", 
                                trControl = trControl2,
                                tuneGrid = tuneGrid2)
plot(boost_model)

print(boost_model)
## Boosted Logistic Regression 
## 
## 615 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (30 fold) 
## Summary of sample sizes: 594, 595, 595, 595, 595, 595, ... 
## Resampling results across tuning parameters:
## 
##   nIter  Accuracy   Kappa    
##   2      0.8175585  0.5514783
##   3      0.7295455  0.3604972
##   4      0.7967926  0.5270912
##   5      0.7444012  0.4156511
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 2.
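
The tuning grid only covers nIter = 2 to 5. As a hypothetical extension, not part of the original solution, a wider grid could be used to check whether a larger number of boosting iterations changes the result:

# wider grid of boosting iterations (illustrative only)
tuneGrid2_wide <- expand.grid(nIter = seq(10, 100, by = 10))
boost_wide <- train(target ~ ., data = df_train,
                    method = "LogitBoost",
                    trControl = trControl2,
                    tuneGrid = tuneGrid2_wide)
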
5. Compare the three models from questions 2, 3, and 4 to select the final model. Evaluate the accuracy of the final model on the test data.

The final model selected is the boosted logistic regression model, since it has the highest cross-validated accuracy in the comparison below.

results <- resamples(list(forest = forest_cv,
                          ranger = forest_cv1,
                          boost = boost_model))
bwplot(results)
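
In addition to the box-and-whisker plot, the numeric resampling summary can be printed as an extra check (output not shown in the original write-up):

summary(results)   # accuracy and kappa distributions across the 30 folds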

# predict on the held-out test set and compute the confusion matrix
pred <- predict(boost_model, df_test)

cm <- confusionMatrix(data = pred, reference = df_test$target, positive = "pos")

cm$overall[1]
##  Accuracy 
## 0.8314607

With an accuracy of about 0.83, the final model performs well on the held-out test data.
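
Because the two classes are not balanced, accuracy alone can be flattering. As a supplementary check, not required by the assignment, sensitivity and specificity can be pulled from the same confusion matrix object:

cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]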