# Submission: submit the GitHub link of the assignment to Blackboard.

# Import the Pima Indians Diabetes data from the mlbench package.
# (The original extraction fused the prose with the library call:
# "…import the datalibrary(mlbench)" — restored to valid R here.)
library(mlbench)
library(tidyverse)

data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes

# Rename the outcome column so downstream code can refer to `target`.
df <- df %>%
  rename(target = diabetes)

# Sanity check: count missing values before modeling (expected 0).
sum(is.na(df))
## [1] 0
# Stratified 80/20 train/test split on the outcome with caret.
library(caret)

set.seed(2020)
splitIndex <- createDataPartition(
  df$target,
  p = 0.80,
  list = FALSE
)

df_train <- df[splitIndex, ]
df_test  <- df[-splitIndex, ]
# Using the rpart package, fit a decision tree with a maximum depth of 3.
# (The original extraction fused the prose with the library call:
# "…depth of 3.library(rpart)" — restored to valid R here.)
library(rpart)

tree_model <- rpart(
  target ~ .,
  data = df_train,
  control = rpart.control(maxdepth = 3)
)

# Evaluate on the held-out test set; "pos" is the positive (diabetic) class.
pred <- predict(tree_model, df_test, type = "class")
cm <- confusionMatrix(data = pred, reference = df_test$target, positive = "pos")
cm$overall[1]
## Accuracy
## 0.7254902

# Visualize the fitted tree and its variable importance.
library(rattle)
fancyRpartPlot(tree_model)
barplot(tree_model$variable.importance)
# Using the randomForest package, fit a random forest of 1000 trees.
# (The original extraction fused the prose with the library call:
# "…1000 trees.library(randomForest)" — restored to valid R here.)
library(randomForest)

# NOTE(review): forest training consumes the RNG, so the numbers echoed
# below depend on the RNG state at this point; set a seed here if exact
# reproducibility of the forest itself is required.
forest_model <- randomForest(target ~ ., data = df_train, ntree = 1000)

pred_forest <- predict(forest_model, df_test, type = "class")
cm_forest <- confusionMatrix(
  data = pred_forest,
  reference = df_test$target,
  positive = "pos"
)
cm_forest$overall[1]
## Accuracy
## 0.7908497

# Variable importance: mean decrease in Gini impurity per predictor.
importance(forest_model)
## MeanDecreaseGini
## pregnant 22.90061
## glucose 74.01213
## pressure 24.65639
## triceps 19.37641
## insulin 20.69518
## mass 44.06675
## pedigree 34.78747
## age 37.35174
varImpPlot(forest_model)
# Compare test accuracy: 1000 trees vs. 2000 trees.

# Accuracy with 1000 trees (model fit above).
cm_forest$overall["Accuracy"]
## Accuracy
## 0.7908497

# Accuracy with 2000 trees.
forest_model1 <- randomForest(target ~ ., data = df_train, ntree = 2000)
pred_forest1 <- predict(forest_model1, df_test, type = "class")
cm_forest1 <- confusionMatrix(
  data = pred_forest1,
  reference = df_test$target,
  positive = "pos"
)
cm_forest1$overall["Accuracy"]
## Accuracy
## 0.7843137

# The 1000-tree forest came out slightly more accurate than the
# 2000-tree forest on this test set.
# Tree with max depth of 3, fit through caret's train().
# Fix: "maxdepth" is the tuning parameter of method "rpart2", so it must
# be fixed via tuneGrid; passing it through `...` is not how caret
# expects tuning parameters to be set and caret would otherwise tune it.
model1 <- train(
  target ~ .,
  data = df_train,
  method = "rpart2",
  tuneGrid = data.frame(maxdepth = 3)
)

pred_model1 <- predict(model1, df_test)
cm_model1 <- confusionMatrix(
  data = pred_model1,
  reference = df_test$target,
  positive = "pos"
)
cm_model1$overall[1]
## Accuracy
## 0.7254902
# Random forest of 1000 trees, fit through caret's train().
model2 <- train(
  target ~ .,
  data = df_train,
  method = "rf",
  ntree = 1000
)

pred_model2 <- predict(model2, df_test)
cm_model2 <- confusionMatrix(
  data = pred_model2,
  reference = df_test$target,
  positive = "pos"
)
cm_model2$overall["Accuracy"]
## Accuracy
## 0.7843137

# The random forest is more accurate than the single decision tree
# on the test set. Compare variable-importance profiles of both models.
plot(varImp(model1))
plot(varImp(model2))