The Pima Indians Diabetes data set is in the mlbench package; use the following code to import the data and rename the outcome variable to target.
library(mlbench)
library(tidyverse)
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
df <- df %>%
  rename(target = diabetes)
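A quick look at the imported data (an optional check, not part of the original code) confirms the eight predictors and the two-level target:
# Structure of the predictors and the renamed outcome
str(df)
# Class counts for the target: 'neg' vs 'pos'
table(df$target)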
set.seed(2020)
library(caret)
splitIndex <- createDataPartition(df$target, p = .80,
                                  list = FALSE)
df_train <- df[ splitIndex,]
df_test <- df[-splitIndex,]
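Because createDataPartition stratifies on the outcome, the class proportions should be roughly the same in both partitions; a quick check (again, an optional addition) is:
# Class proportions in the training and test sets
prop.table(table(df_train$target))
prop.table(table(df_test$target))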
Which mtry value produces the greatest accuracy? Tune mtry over 2 to 4 using 30-fold cross-validation:
tuneGrid <- expand.grid(mtry = 2:4)
trControl <- trainControl(method = "cv",
                          number = 30)
forest_cv <- train(target ~ ., data = df_train,
                   method = "rf",
                   trControl = trControl,
                   tuneGrid = tuneGrid)
plot(forest_cv)
print(forest_cv)
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (30 fold)
## Summary of sample sizes: 594, 594, 594, 594, 594, 594, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7619048 0.4579314
## 3 0.7586508 0.4527499
## 4 0.7540476 0.4391002
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
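The printed summary answers the question: mtry = 2 gives the greatest cross-validated accuracy. The selected value can also be read directly from the fitted caret object, for example:
# Tuning parameter value chosen by caret
forest_cv$bestTune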
Next, tune additional hyperparameters (splitrule and min.node.size, in addition to mtry) using the ranger implementation of random forests.
library(ranger)
tuneGrid1 <- expand.grid(mtry = 2:4,
                         splitrule = c("gini", "extratrees"),
                         min.node.size = 1:10)
trControl1 <- trainControl(method = "cv",
                           number = 30)
forest_cv1 <- train(target ~ ., data = df_train,
                    method = "ranger",
                    trControl = trControl1,
                    tuneGrid = tuneGrid1)
plot(forest_cv1)
print(forest_cv1)
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (30 fold)
## Summary of sample sizes: 594, 595, 594, 595, 594, 594, ...
## Resampling results across tuning parameters:
##
## mtry splitrule min.node.size Accuracy Kappa
## 2 gini 1 0.7553319 0.4457612
## 2 gini 2 0.7650938 0.4662415
## 2 gini 3 0.7587374 0.4562096
## 2 gini 4 0.7569192 0.4486464
## 2 gini 5 0.7635859 0.4670094
## 2 gini 6 0.7635786 0.4660176
## 2 gini 7 0.7601732 0.4563121
## 2 gini 8 0.7681169 0.4726715
## 2 gini 9 0.7637374 0.4645133
## 2 gini 10 0.7635786 0.4683995
## 2 extratrees 1 0.7617605 0.4478260
## 2 extratrees 2 0.7637302 0.4497748
## 2 extratrees 3 0.7648557 0.4542585
## 2 extratrees 4 0.7730303 0.4729684
## 2 extratrees 5 0.7585786 0.4388954
## 2 extratrees 6 0.7617532 0.4460765
## 2 extratrees 7 0.7681818 0.4577856
## 2 extratrees 8 0.7616667 0.4414683
## 2 extratrees 9 0.7664358 0.4551059
## 2 extratrees 10 0.7696898 0.4629252
## 3 gini 1 0.7551732 0.4489382
## 3 gini 2 0.7519913 0.4441697
## 3 gini 3 0.7684271 0.4745410
## 3 gini 4 0.7552525 0.4462296
## 3 gini 5 0.7601732 0.4557906
## 3 gini 6 0.7603247 0.4598666
## 3 gini 7 0.7589683 0.4550149
## 3 gini 8 0.7602453 0.4613334
## 3 gini 9 0.7615945 0.4600149
## 3 gini 10 0.7620707 0.4636517
## 3 extratrees 1 0.7619048 0.4527152
## 3 extratrees 2 0.7600866 0.4502276
## 3 extratrees 3 0.7570707 0.4448358
## 3 extratrees 4 0.7651659 0.4633151
## 3 extratrees 5 0.7716739 0.4741049
## 3 extratrees 6 0.7684271 0.4673906
## 3 extratrees 7 0.7763636 0.4853232
## 3 extratrees 8 0.7600938 0.4473356
## 3 extratrees 9 0.7653247 0.4609842
## 3 extratrees 10 0.7666811 0.4619098
## 4 gini 1 0.7582756 0.4564950
## 4 gini 2 0.7648629 0.4683228
## 4 gini 3 0.7666017 0.4727282
## 4 gini 4 0.7554040 0.4497458
## 4 gini 5 0.7585786 0.4577724
## 4 gini 6 0.7488961 0.4282760
## 4 gini 7 0.7554040 0.4493349
## 4 gini 8 0.7602453 0.4572937
## 4 gini 9 0.7603247 0.4615195
## 4 gini 10 0.7569120 0.4549579
## 4 extratrees 1 0.7618326 0.4549654
## 4 extratrees 2 0.7523882 0.4348504
## 4 extratrees 3 0.7652453 0.4615495
## 4 extratrees 4 0.7554906 0.4388246
## 4 extratrees 5 0.7684199 0.4705288
## 4 extratrees 6 0.7716017 0.4804711
## 4 extratrees 7 0.7761977 0.4878911
## 4 extratrees 8 0.7667532 0.4670799
## 4 extratrees 9 0.7653247 0.4621284
## 4 extratrees 10 0.7603247 0.4501064
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = extratrees
## and min.node.size = 7.
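As before, the winning combination (mtry = 3, splitrule = extratrees, min.node.size = 7) and its cross-validated accuracy can be pulled from the fitted object instead of read off the table; one way (a small addition, not in the original) is:
# Best hyperparameter combination and its cross-validated accuracy
forest_cv1$bestTune
max(forest_cv1$results$Accuracy)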
Finally, fit a boosted logistic regression model (LogitBoost from the caTools package), tuning the number of boosting iterations nIter.
library(caTools)
tuneGrid2 <- expand.grid(nIter = 2:5)
trControl2 <- trainControl(method = "cv",
                           number = 30)
boost_model <- train(target ~ ., data = df_train,
                     method = "LogitBoost",
                     trControl = trControl2,
                     tuneGrid = tuneGrid2)
plot(boost_model)
print(boost_model)
## Boosted Logistic Regression
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (30 fold)
## Summary of sample sizes: 594, 595, 595, 595, 595, 595, ...
## Resampling results across tuning parameters:
##
## nIter Accuracy Kappa
## 2 0.8175585 0.5514783
## 3 0.7295455 0.3604972
## 4 0.7967926 0.5270912
## 5 0.7444012 0.4156511
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 2.
The boosted logistic regression model is selected as the final model, since it has the highest cross-validated accuracy in the comparison below.
results <- resamples(list(forest = forest_cv,
                          ranger = forest_cv1,
                          boost = boost_model))
bwplot(results)
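The boxplot compares the three models visually; a numeric summary of the same resamples (an extra line, not in the original) makes the accuracy ranking explicit:
# Distribution of accuracy and Kappa across the 30 folds for each model
summary(results)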
pred <- predict(boost_model, df_test)
cm <- confusionMatrix(data = pred, reference = df_test$target, positive = "pos")
cm$overall[1]
## Accuracy
## 0.8314607
Overall, the boosted logistic regression model reaches about 83% accuracy on the held-out test data, consistent with its cross-validation performance.
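Since accuracy alone can be flattering when one class dominates, an optional follow-up check is to also look at how well each class is identified (recall that positive = "pos" was set above):
# Sensitivity (recall for 'pos') and specificity (recall for 'neg') on the test set
cm$byClass[c("Sensitivity", "Specificity")]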