Submission: Submit the GitHub link of the assignment to Blackboard.


  1. Install the package mlbench and use the following code to import the data:
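# If mlbench (or any other package used below) is not yet installed, install it
# once before loading, e.g. install.packages("mlbench")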
library(mlbench)
library(tidyverse)
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
df <- df %>% 
  rename(target = diabetes)
sum(is.na(df))
## [1] 0
library(caret)
set.seed(2020)
splitIndex <- createDataPartition(df$target, p = .80, 
                                  list = FALSE)
df_train <- df[ splitIndex,]
df_test <- df[-splitIndex,]
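# Optional sanity check (not required by the assignment): createDataPartition
# samples within each class, so the class proportions of target should be
# nearly identical in the training and test sets.
prop.table(table(df_train$target))
prop.table(table(df_test$target))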

  2. Practice Decision Tree. Do the following:
library(rpart)
tree_model <- rpart(target ~ ., data = df_train,
                 control = rpart.control(maxdepth = 3))
pred <- predict(tree_model, df_test, type = "class")
cm <- confusionMatrix(data = pred, reference = df_test$target, positive = "pos")
cm$overall[1]
##  Accuracy 
## 0.7254902
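# Optional: besides overall accuracy, the caret confusion matrix also stores
# class-level metrics; sensitivity and specificity for the "pos" class can be
# pulled out like this.
cm$byClass[c("Sensitivity", "Specificity")]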
library(rattle)
fancyRpartPlot(tree_model)

barplot(tree_model$variable.importance)


  3. Practice Random Forest. Do the following:
library(randomForest)
forest_model <- randomForest(target ~ ., data = df_train, ntree = 1000)
pred_forest <- predict(forest_model, df_test, type = "class")
cm_forest <- confusionMatrix(data = pred_forest, reference = df_test$target, positive = "pos")
cm_forest$overall[1]
##  Accuracy 
## 0.7908497
importance(forest_model)
##          MeanDecreaseGini
## pregnant         22.90061
## glucose          74.01213
## pressure         24.65639
## triceps          19.37641
## insulin          20.69518
## mass             44.06675
## pedigree         34.78747
## age              37.35174
varImpPlot(forest_model)
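# Optional: the plot method for a randomForest object shows how the OOB error
# changes as trees are added, which helps judge whether 1000 trees is enough
# (a quick sketch, not part of the required answer).
plot(forest_model)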


  4. Compare the accuracy of a forest of 1000 trees and a forest of 2000 trees.
# 1000 trees
cm_forest$overall[1]
##  Accuracy 
## 0.7908497
# 2000 trees
forest_model1 <- randomForest(target ~ ., data = df_train, ntree = 2000)
pred_forest1 <- predict(forest_model1, df_test, type = "class")
cm_forest1 <- confusionMatrix(data = pred_forest1, reference = df_test$target, positive = "pos")
cm_forest1$overall[1]
##  Accuracy 
## 0.7843137
# The 1000-tree forest is slightly more accurate than the 2000-tree forest on this test set, but the difference is small and may simply reflect run-to-run randomness, since each forest is grown from different random draws.
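# Optional reproducibility check (a small sketch reusing the seed from above):
# re-seeding before growing each forest makes the comparison repeatable.
set.seed(2020)
acc_1000 <- mean(predict(randomForest(target ~ ., data = df_train, ntree = 1000),
                         df_test) == df_test$target)
set.seed(2020)
acc_2000 <- mean(predict(randomForest(target ~ ., data = df_train, ntree = 2000),
                         df_test) == df_test$target)
c(acc_1000 = acc_1000, acc_2000 = acc_2000)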

  5. Using caret, create a tree with a maximum depth of 3 and a forest of 1000 trees. Compare the accuracy of these two models.
# Tree with Max Depth of 3
# maxdepth is the tuning parameter of method "rpart2", so fix it at 3 via tuneGrid
model1 <- train(target ~ ., data = df_train,
                method = "rpart2",
                tuneGrid = data.frame(maxdepth = 3))
pred_model1 <- predict(model1, df_test)
cm_model1 <- confusionMatrix(data = pred_model1, reference = df_test$target, positive = "pos")
cm_model1$overall[1]
##  Accuracy 
## 0.7254902
# Random Forest of 1000 trees
model2 <- train(target ~ ., data = df_train,
                method = "rf",
                ntree = 1000)
pred_model2 <- predict(model2, df_test)
cm_model2 <- confusionMatrix(data = pred_model2, reference = df_test$target, positive = "pos")
cm_model2$overall[1]
##  Accuracy 
## 0.7843137
# On this test set the random forest is more accurate than the depth-3 decision tree (about 0.784 vs 0.725).
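# Optional side-by-side summary of the two caret models, built from the
# confusion matrices computed above (a small sketch).
data.frame(model = c("rpart2, maxdepth = 3", "rf, 1000 trees"),
           accuracy = c(cm_model1$overall[1], cm_model2$overall[1]))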

  6. Plot variable importance for the two models from item 5.
plot(varImp(model1))

plot(varImp(model2))
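# Optional: varImp() returns an object whose $importance slot holds the raw
# scores, which can be sorted or exported if a table is preferred over a plot.
varImp(model2)$importance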