Submission: Submit the GitHub link of the assignment to Blackboard.


  1. Install the package mlbench and use the following code to import the data:
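# If mlbench (or any other package used below) is not yet installed, install it
# once before loading, e.g. install.packages("mlbench")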
library(mlbench)
library(tidyverse)
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
df <- df %>% 
  rename(target = diabetes)
sum(is.na(df))
## [1] 0
library(caret)
set.seed(2020)
splitIndex <- createDataPartition(df$target, p = .80, 
                                  list = FALSE)
df_train <- df[ splitIndex,]
df_test <- df[-splitIndex,]
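# Optional sanity check (not required by the assignment): createDataPartition
# samples within each class, so the class proportions of target should be
# nearly identical in the training and test sets.
prop.table(table(df_train$target))
prop.table(table(df_test$target))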

  2. Practice Decision Tree. Do the following:
library(rpart)
tree_model <- rpart(target ~ ., data = df_train,
                 control = rpart.control(maxdepth = 3))
pred <- predict(tree_model, df_test, type = "class")
cm <- confusionMatrix(data = pred, reference = df_test$target, positive = "pos")
cm$overall[1]
##  Accuracy 
## 0.7254902
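# Optional: besides overall accuracy, the caret confusion matrix also stores
# class-level metrics; sensitivity and specificity for the "pos" class can be
# pulled out like this.
cm$byClass[c("Sensitivity", "Specificity")]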
library(rattle)
fancyRpartPlot(tree_model)

barplot(tree_model$variable.importance)


  3. Practice Random Forest. Do the following:
library(randomForest)
forest_model <- randomForest(target ~ ., data = df_train, ntree = 1000)
pred_forest <- predict(forest_model, df_test, type = "class")
cm_forest <- confusionMatrix(data = pred_forest, reference = df_test$target, positive = "pos")
cm_forest$overall[1]
##  Accuracy 
## 0.7908497
importance(forest_model)
##          MeanDecreaseGini
## pregnant         22.90061
## glucose          74.01213
## pressure         24.65639
## triceps          19.37641
## insulin          20.69518
## mass             44.06675
## pedigree         34.78747
## age              37.35174
varImpPlot(forest_model)
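# Optional: the plot method for a randomForest object shows how the OOB error
# changes as trees are added, which helps judge whether 1000 trees is enough
# (a quick sketch, not part of the required answer).
plot(forest_model)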


  4. Compare the accuracy of a forest of 1000 trees and a forest of 2000 trees.
# 1000 trees
cm_forest$overall[1]
##  Accuracy 
## 0.7908497
# 2000 trees
forest_model1 <- randomForest(target ~ ., data = df_train, ntree = 2000)
pred_forest1 <- predict(forest_model1, df_test, type = "class")
cm_forest1 <- confusionMatrix(data = pred_forest1, reference = df_test$target, positive = "pos")
cm_forest1$overall[1]
##  Accuracy 
## 0.7843137
# The 1000-tree forest is slightly more accurate than the 2000-tree forest on this test set, but the difference is small and may simply reflect run-to-run randomness, since each forest is grown from different random draws.
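# Optional reproducibility check (a small sketch reusing the seed from above):
# re-seeding before growing each forest makes the comparison repeatable.
set.seed(2020)
acc_1000 <- mean(predict(randomForest(target ~ ., data = df_train, ntree = 1000),
                         df_test) == df_test$target)
set.seed(2020)
acc_2000 <- mean(predict(randomForest(target ~ ., data = df_train, ntree = 2000),
                         df_test) == df_test$target)
c(acc_1000 = acc_1000, acc_2000 = acc_2000)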

  5. Using caret, create a tree with a maximum depth of 3 and a forest of 1000 trees. Compare the accuracy of these two models.
# Tree with Max Depth of 3
# maxdepth is the tuning parameter of method "rpart2", so fix it at 3 via tuneGrid
model1 <- train(target ~ ., data = df_train,
                method = "rpart2",
                tuneGrid = data.frame(maxdepth = 3))
pred_model1 <- predict(model1, df_test)
cm_model1 <- confusionMatrix(data = pred_model1, reference = df_test$target, positive = "pos")
cm_model1$overall[1]
##  Accuracy 
## 0.7254902
# Random Forest of 1000 trees
model2 <- train(target ~ ., data = df_train,
                method = "rf",
                ntree = 1000)
pred_model2 <- predict(model2, df_test)
cm_model2 <- confusionMatrix(data = pred_model2, reference = df_test$target, positive = "pos")
cm_model2$overall[1]
##  Accuracy 
## 0.7843137
# On this test set the random forest is more accurate than the depth-3 decision tree (about 0.784 vs 0.725).
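# Optional side-by-side summary of the two caret models, built from the
# confusion matrices computed above (a small sketch).
data.frame(model = c("rpart2, maxdepth = 3", "rf, 1000 trees"),
           accuracy = c(cm_model1$overall[1], cm_model2$overall[1]))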

  6. Plot variable importance for the two models from item 5.
plot(varImp(model1))

plot(varImp(model2))
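# Optional: varImp() returns an object whose $importance slot holds the raw
# scores, which can be sorted or exported if a table is preferred over a plot.
varImp(model2)$importance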