I. Data Wrangling

  1. Download the data file hdd0318cy.sas7bdat.

  2. Use read_sas from the haven package to read the data.

library(haven)
df <- read_sas('hdd0318cy.sas7bdat')
  3. Filter the data to keep only patients from the year 2018 (yod == 2018).
library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
summary(df$yod)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00    6.00   10.00   10.35   14.00   18.00
# yod stores the year as two digits, so 2018 corresponds to yod == 18
df_cleaned <- df %>% 
  filter(yod == 18)
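A quick sanity check that only 2018 records remain:

table(df_cleaned$yod)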
  4. Select to work with only the following variables:
                      "yod", "payfix", "pay_ub92", "age",
                      "sex", "raceethn", "provider", "moa",
                      "yoa", "mod", "admtype", "asource",
                      "preopday", "los", "service", "icu", "ccu",
                      "dispub92", "payer", "drg", "trandb",
                      "randbg", "randbs", "orr", "anes", "seq",
                      "lab", "dtest", "ther", "blood", "phar",
                      "other", "patcon", "bwght", "total", "tot",
                      "ecodub92", "b_wt", "pt_state", "diag_adm", "ancilar",
                      "campus", "er_fee", "er_chrg", "er_mode", "obs_chrg",
                      "obs_hour", "psycchrg", "nicu_day"
df_cleaned <- df_cleaned %>% 
  select(yod,payfix,pay_ub92,age,sex,raceethn,provider,moa,yoa,mod,admtype,
         asource,preopday,los,service,icu,ccu,dispub92,payer,drg,trandb,
         randbg,randbs,orr,anes,seq,lab,dtest,ther,blood,phar,other,patcon,
         bwght,total,tot,ecodub92,b_wt,pt_state,diag_adm,ancilar,campus,
         er_fee,er_chrg,er_mode,obs_chrg,obs_hour,psycchrg,nicu_day)
  5. Which variables have missing values?
colSums(is.na(df_cleaned))
##      yod   payfix pay_ub92      age      sex raceethn provider      moa 
##        0   131478        0        0        0        0        0        0 
##      yoa      mod  admtype  asource preopday      los  service      icu 
##        0        0        0        0    60486        0        0        0 
##      ccu dispub92    payer      drg   trandb   randbg   randbs      orr 
##        0        0        0        0        0        0        0        0 
##     anes      seq      lab    dtest     ther    blood     phar    other 
##        0        0        0        0        0        0        0        0 
##   patcon    bwght    total      tot ecodub92     b_wt pt_state diag_adm 
##        0        0        0        0        0        0        0        0 
##  ancilar   campus   er_fee  er_chrg  er_mode obs_chrg obs_hour psycchrg 
##        0        0        0        0        0        0    77343        0 
## nicu_day 
##   124090
# Variables with missing values: payfix, preopday, obs_hour, nicu_day
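The same answer can be extracted programmatically instead of reading the printout:

names(df_cleaned)[colSums(is.na(df_cleaned)) > 0]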
  6. Remove all variables with missing values.
df_cleaned <- df_cleaned %>% 
  select(-payfix,-preopday,-obs_hour,-nicu_day)
colSums(is.na(df_cleaned))
##      yod pay_ub92      age      sex raceethn provider      moa      yoa 
##        0        0        0        0        0        0        0        0 
##      mod  admtype  asource      los  service      icu      ccu dispub92 
##        0        0        0        0        0        0        0        0 
##    payer      drg   trandb   randbg   randbs      orr     anes      seq 
##        0        0        0        0        0        0        0        0 
##      lab    dtest     ther    blood     phar    other   patcon    bwght 
##        0        0        0        0        0        0        0        0 
##    total      tot ecodub92     b_wt pt_state diag_adm  ancilar   campus 
##        0        0        0        0        0        0        0        0 
##   er_fee  er_chrg  er_mode obs_chrg psycchrg 
##        0        0        0        0        0
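An equivalent, name-agnostic alternative would drop every column that contains any NA (same result as the explicit select above):

# df_cleaned %>% select(where(~ !any(is.na(.x))))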
  7. Referring to the data description in the file HDD2015-18cy6-20-19.docx, which variable records the month of admission, and which records the month of discharge?

Per the variable labels in the data description (also visible in the str() output in part III), moa records the month of admission and mod records the month of discharge.
  8. Which month admitted the most patients? Which month admitted the most male patients?
df_cleaned %>% 
  group_by(moa) %>% 
  summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##      moa patients
##    <dbl>    <int>
##  1     1    11309
##  2     2    10259
##  3     3    11073
##  4     4    11174
##  5     5    11404
##  6     6    10690
##  7     7    10962
##  8     8    11199
##  9     9    10666
## 10    10    11408
## 11    11    10636
## 12    12    10698
# Month 10 (October) admitted the most patients (11,408).
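Rather than scanning the table by eye, the top month can be extracted directly; a small sketch with count() and slice_max():

df_cleaned %>% 
  count(moa, name = "patients") %>% 
  slice_max(patients, n = 1)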
df_cleaned %>% 
  filter(sex == 1) %>% 
  group_by(moa) %>% 
  summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##      moa patients
##    <dbl>    <int>
##  1     1     4970
##  2     2     4546
##  3     3     4978
##  4     4     4989
##  5     5     5066
##  6     6     4810
##  7     7     4910
##  8     8     5028
##  9     9     4683
## 10    10     5164
## 11    11     4800
## 12    12     4855
# October again admitted the most male patients (5,164).
  9. Which month has the most teenage female patients?
# Teenagers are defined here as ages 13-17
df_cleaned %>% 
  filter(sex == 2 & age %in% 13:17) %>% 
  group_by(moa) %>% 
  summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##      moa patients
##    <dbl>    <int>
##  1     1      139
##  2     2      111
##  3     3      158
##  4     4      123
##  5     5      150
##  6     6       97
##  7     7       97
##  8     8      105
##  9     9      131
## 10    10      126
## 11    11      128
## 12    12      131
# Month 3 (March) admitted the most teenage female patients (158).
  10. Which provider has the most female patients in October?
df_cleaned %>% 
  filter(sex == 2 & moa == 10) %>% 
  group_by(provider) %>% 
  summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    provider patients
##    <chr>       <int>
##  1 7201          294
##  2 7202          212
##  3 7204          872
##  4 7205         1517
##  5 7206          333
##  6 7209          351
##  7 7210          699
##  8 7211          122
##  9 7213          317
## 10 7214         1224
## 11 7215           46
## 12 7216          256
# Provider 7205, Rhode Island Hospital, had the most female patients in October (1,517).
  11. Are female patients older than male patients, on average?
df_cleaned %>% 
  group_by(sex) %>% 
  summarise(avg_age = mean(age))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
##   sex   avg_age
##   <chr>   <dbl>
## 1 1        51.5
## 2 2        50.9
## 3 9        43
# Disregarding sex code 9 (unknown), female patients are not older than male patients on average; they are younger by about half a year (50.9 vs. 51.5 years).
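Whether this roughly half-year gap is statistically meaningful could be checked with a two-sample t-test; a minimal sketch, restricted to the recorded sexes 1 and 2:

t.test(age ~ sex, data = filter(df_cleaned, sex %in% c("1", "2")))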
  12. Calculate the average age of patients by month. Which month has the oldest patients on average?
df_cleaned %>% 
  group_by(moa) %>% 
  summarise(avg_age = mean(age))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##      moa avg_age
##    <dbl>   <dbl>
##  1     1    51.8
##  2     2    51.2
##  3     3    51.1
##  4     4    51.4
##  5     5    50.6
##  6     6    51.5
##  7     7    51.3
##  8     8    50.9
##  9     9    50.6
## 10    10    50.8
## 11    11    51.0
## 12    12    51.5
# Based on the summary above, month 1 (January) has the oldest patients on average (51.8 years).
  13. What is the name of the provider that has the highest total charge?
df_cleaned %>% 
  group_by(provider) %>% 
  summarise(avg_total_charge = mean(tot))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    provider avg_total_charge
##    <chr>               <dbl>
##  1 7201               22775.
##  2 7202               35504.
##  3 7204               35277.
##  4 7205               48739.
##  5 7206               31018.
##  6 7209               24539.
##  7 7210               27691.
##  8 7211               24089.
##  9 7213               38200.
## 10 7214               22362.
## 11 7215               69946.
## 12 7216               17782.
# Provider 7215 (Bradley) has the highest average total charge.
  14. What is the name of the provider that has the lowest total charge for teenage males?
df_cleaned %>% 
  filter(sex == 1 & age %in% 13:17) %>% 
  group_by(provider) %>% 
  summarise(avg_total_charge = mean(tot))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 2
##   provider avg_total_charge
##   <chr>               <dbl>
## 1 7201               14487 
## 2 7202                7797 
## 3 7205               36611.
## 4 7209               45669 
## 5 7210               11255 
## 6 7215               99346.
## 7 7216               27064.
# Provider 7202 (St. Joseph Health Services of RI) has the lowest average total charge for teenage males.
  15. Calculate the length of stay by race. Which race has the longest length of stay on average?
df_cleaned %>% 
  group_by(raceethn) %>% 
  summarise(avg_los = mean(los))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 2
##   raceethn avg_los
##   <chr>      <dbl>
## 1 ""          4.38
## 2 "1"         4.93
## 3 "2"         5.28
## 4 "3"         4.41
## 5 "4"         5.54
## 6 "6"         5.35
## 7 "7"         4.67
## 8 "9"         6.18
# Code 9 (unknown race) has the longest average stay (6.18 days); among the known races, American Indian patients (code 4, 5.54 days) stay the longest, not Hispanic patients.
  16. On average, how much is a 20-year-old white male charged for staying one day?
df_cleaned %>% 
  filter(sex == 1 & raceethn == 1 & age == 20 & los == 1) %>% 
  summarise(avg_total_charge = mean(tot))
## # A tibble: 1 x 1
##   avg_total_charge
##              <dbl>
## 1           15320.

II. Data Visualization

Continue with the data from part I.

  1. Provide at least 10 meaningful plots and comment on them. All plots should have a title, a caption, and appropriate labels on the x- and y-axes.
df_cleaned %>% 
  ggplot(aes(x=age)) + geom_histogram() + labs(x="Age",y="Count",title="Age Distribution of Patients",caption="The strongest concentrations of patients are age 0 (babies), age 30 and age 65.")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
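The message above suggests choosing the bin width explicitly; for ages, 5-year bins are a reasonable choice (any sensible width works):

df_cleaned %>% 
  ggplot(aes(x=age)) + geom_histogram(binwidth = 5) + labs(x="Age",y="Count",title="Age Distribution of Patients",caption="Same histogram with an explicit 5-year bin width.")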


df_cleaned %>% 
  ggplot(aes(x=provider)) + geom_bar() + labs(x="Provider",y="Count",title="Number of Patients by Provider",caption="Rhode Island Hospital (7205) had the most patients in 2018.")

df_cleaned %>% 
  ggplot(aes(x=factor(moa),fill=sex)) + geom_bar(position = 'fill') + labs(x="Month of Admission",y="Proportion",title="Male vs. Female Patient Admissions by Month",caption="The male:female ratio is consistent across months, but female admissions are always higher.")

df_cleaned %>%
  filter(los <= 200 & tot <= 1000000) %>% 
  ggplot(aes(x=los,y=tot)) + geom_point(position = 'jitter') + labs(x="Length of Stay",y="Total Cost",title="Total Cost vs. Length of Stay",caption="Charges generally rise with length of stay.")

df_cleaned %>%
  filter(los <= 200 & tot <= 1000000 & raceethn %in% c(1:7)) %>% 
  ggplot(aes(x=los,y=tot,color=raceethn)) + geom_point(position = 'jitter') + facet_wrap(~raceethn) + labs(x="Length of Stay",y="Total Cost",title="Total Cost vs. Length of Stay by Race",color="Race",caption="There is much more data for white patients than any other type of patient, so it is a difficult basis for comparison. However, it seems as though Hispanic patients (7) have a steeper slope for cost than other races.")

df_cleaned %>% 
  filter(raceethn %in% c(1:7)) %>% 
  ggplot(aes(x=provider,fill=raceethn)) + geom_bar() + labs(x="Provider",y="Patients",title="Patients by Race for each Provider in 2018",caption="White patients are again dominant for all providers, but 7205 and 7214 have more minority patients than other providers.",fill="Race")

df_cleaned %>% 
  filter(sex %in% c(1:2)) %>% 
  ggplot(aes(x=age,y=sex)) + geom_boxplot() + labs(x="Age",y="Sex",title="Distribution of Age by Sex",caption="On average, male patients are older than female patients, although the age distribution for females is wider.")

df_cleaned %>% 
  filter(sex %in% c(1:2),tot < 50000) %>% 
  ggplot(aes(x=tot,y=sex)) + geom_boxplot() + labs(x="Total Cost",y="Sex",title="Total Cost by Sex",caption="The median cost for male and female patients is almost identical, but the interquartile range is somewhat wider for males.")

df_cleaned %>% 
  group_by(provider) %>% 
  summarise(avg_los = mean(los)) %>% 
  ggplot(aes(x=provider,y=avg_los)) + geom_col() + labs(x="Provider",y="Average Length of Stay",title="Average Length of Stay by Provider",caption="Provider 7215 (Bradley) is the outlier, with a much longer average stay than any other provider.")
## `summarise()` ungrouping output (override with `.groups` argument)

df_cleaned %>% 
  filter(raceethn %in% c(1:7)) %>% 
  group_by(provider,raceethn) %>% 
  summarise(avg_tot = mean(tot)) %>% 
  ggplot(aes(x=provider,y=avg_tot,fill=raceethn)) + geom_col(position = 'fill') +
  labs(x="Provider",y="Share of Average Total Cost",title="Average Total Cost by Provider and Race",caption="Between providers, there is a good amount of variation between what share different races have of the total cost in a given year.",fill="Race")
## `summarise()` regrouping output by 'provider' (override with `.groups` argument)

  2. Make an animation.
library(gganimate)
df_cleaned %>% 
  group_by(provider,moa) %>% 
  summarise(patients = n()) %>% 
  ggplot(aes(x=moa,y=patients,color=provider)) + geom_line() + geom_point(size=3) +
  transition_reveal(moa) + labs(x="Month of Admission",y="Number of Patients",color="Provider",title="Fluctuation of Patients by Provider over 2018",caption="Three distinct groups emerge: 1) Rhode Island Hospital has the most patients and is the outlier, 2) three providers are in the middle of the pack, 3) the other hospitals are not as large and don't admit nearly as many patients each month.")
## `summarise()` regrouping output by 'provider' (override with `.groups` argument)
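To save the rendered animation to disk, gganimate's anim_save() writes the most recently rendered animation (the file name below is arbitrary):

# anim_save("patients_by_provider_2018.gif")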


III. Predictive Models

Continue with the data from part I. Use the follows as the target and input variables:

Target Variable: Create a binary target that takes the value 'low cost' when the total charge (tot) is below its median and 'high cost' otherwise:

median(df_cleaned$tot)
## [1] 21854
df_cleaned$target <- case_when(
  df_cleaned$tot < median(df_cleaned$tot) ~ 'low cost',
  TRUE ~ 'high cost'
)
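Because the split is at the median, the two classes should be roughly balanced; a quick check:

table(df_cleaned$target)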

Input Variables:

df_model <- df_cleaned %>% 
  select(target,age,sex,raceethn,provider,moa,mod,admtype,campus,los) %>% 
  filter(admtype != '' & raceethn != '' & sex != 9)

str(df_model)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
##  $ target  : chr [1:127523] "low cost" "high cost" "high cost" "low cost" ...
##  $ age     : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
##   ..- attr(*, "label")= chr " patient age at admission"
##  $ sex     : chr [1:127523] "1" "2" "2" "1" ...
##   ..- attr(*, "label")= chr " patient sex (1=M,2=F) alpha"
##   ..- attr(*, "format.sas")= chr "$SEX"
##  $ raceethn: chr [1:127523] "1" "1" "1" "7" ...
##   ..- attr(*, "label")= chr " patient race/ethnicity"
##   ..- attr(*, "format.sas")= chr "$RACEETH"
##  $ provider: chr [1:127523] "7214" "7214" "7214" "7214" ...
##   ..- attr(*, "label")= chr " provider"
##   ..- attr(*, "format.sas")= chr "$PROVID"
##  $ moa     : num [1:127523] 1 1 12 1 12 12 1 1 1 12 ...
##   ..- attr(*, "label")= chr " month of admission  mm"
##   ..- attr(*, "format.sas")= chr "MONTHNME"
##  $ mod     : num [1:127523] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr " month of discharge  mm"
##   ..- attr(*, "format.sas")= chr "MONTHNME"
##  $ admtype : chr [1:127523] "4" "3" "2" "4" ...
##   ..- attr(*, "label")= chr " admission type"
##   ..- attr(*, "format.sas")= chr "$ADMTYPE"
##  $ campus  : chr [1:127523] "0" "0" "0" "0" ...
##  $ los     : num [1:127523] 2 1 3 4 35 17 2 3 2 12 ...
##   ..- attr(*, "label")= chr " length of stay,in days for hosp[in hours for ED"
##  - attr(*, "label")= chr "HDD0318CY                       "
df_model$target <- factor(df_model$target)
df_model$sex <- factor(df_model$sex)
df_model$raceethn <- factor(df_model$raceethn)
df_model$provider <- factor(df_model$provider)
df_model$moa <- factor(df_model$moa)
df_model$mod <- factor(df_model$mod)
df_model$admtype <- factor(df_model$admtype)
df_model$campus <- factor(df_model$campus)
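The eight factor() assignments above could be collapsed into a single step with dplyr::across(); an equivalent sketch:

df_model <- df_model %>% 
  mutate(across(c(target, sex, raceethn, provider, moa, mod, admtype, campus), factor))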
str(df_model)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
##  $ target  : Factor w/ 2 levels "high cost","low cost": 2 1 1 2 1 1 2 2 1 1 ...
##  $ age     : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
##   ..- attr(*, "label")= chr " patient age at admission"
##  $ sex     : Factor w/ 2 levels "1","2": 1 2 2 1 1 2 2 2 2 2 ...
##  $ raceethn: Factor w/ 7 levels "1","2","3","4",..: 1 1 1 6 1 7 1 1 1 1 ...
##  $ provider: Factor w/ 12 levels "7201","7202",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ moa     : Factor w/ 12 levels "1","2","3","4",..: 1 1 12 1 12 12 1 1 1 12 ...
##  $ mod     : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ admtype : Factor w/ 6 levels "1","2","3","4",..: 4 3 2 4 4 4 4 4 2 2 ...
##  $ campus  : Factor w/ 4 levels "0","1","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ los     : num [1:127523] 2 1 3 4 35 17 2 3 2 12 ...
##   ..- attr(*, "label")= chr " length of stay,in days for hosp[in hours for ED"
##  - attr(*, "label")= chr "HDD0318CY                       "

  1. Set Training : Testing Split = 10 : 90
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(2020)
splitIndex <- createDataPartition(df_model$target, p = .10, 
                                  list = FALSE)
df_train <- df_model[ splitIndex,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
df_test <- df_model[-splitIndex,]
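createDataPartition() stratifies on the target, so the class mix should be preserved on both sides of the split; a quick check:

prop.table(table(df_train$target))
prop.table(table(df_test$target))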
  2. Train a decision tree using rpart. Plot the decision tree. Plot the variable importance ranked by the tree.
library(rpart)
tree_model <- rpart(target ~ ., data = df_train,
                 control = rpart.control(maxdepth = 4))
barplot(tree_model$variable.importance)
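The question also asks for a plot of the tree itself. A minimal sketch using the rpart.plot package (an assumption; any rpart plotting routine works):

library(rpart.plot)
rpart.plot(tree_model)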

  3. Using caret for this question, set the training control to use 5-fold cross-validation across all models. Train and tune at least 3 different models (i.e., three different values of method = in caret's train function). Plot the hyper-parameter tuning plot for each model.
# Random forest using method "rf"
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
tuneGrid <-  expand.grid(mtry = 2:4)
trControl <-  trainControl(method = "cv", number = 5)
forest_cv <- train(target~., data=df_train, 
                                method = "rf", 
                                trControl = trControl,
                                tuneGrid = tuneGrid)
plot(forest_cv)

print(forest_cv)
## Random Forest 
## 
## 12753 samples
##     9 predictor
##     2 classes: 'high cost', 'low cost' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10202, 10203, 10203, 10202, 10202 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7604471  0.5200016
##   3     0.8028680  0.6055959
##   4     0.8224715  0.6450468
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Random forest using method "ranger"
library(ranger)
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
tuneGrid1 <-  expand.grid(mtry = 2:4,
                       splitrule = c('gini', 'extratrees'),
                       min.node.size = c(1:10))
trControl1 <-  trainControl(method = "cv",
                            number = 5)
forest_cv1 <- train(target~., data=df_train, 
                                method = "ranger", 
                                trControl = trControl1,
                                tuneGrid = tuneGrid1)
plot(forest_cv1)

print(forest_cv1)
## Random Forest 
## 
## 12753 samples
##     9 predictor
##     2 classes: 'high cost', 'low cost' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10202, 10203, 10202, 10203, 10202 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   min.node.size  Accuracy   Kappa    
##   2     gini         1             0.7580962  0.5152840
##   2     gini         2             0.7626446  0.5244383
##   2     gini         3             0.7611544  0.5214363
##   2     gini         4             0.7571558  0.5134060
##   2     gini         5             0.7602928  0.5196904
##   2     gini         6             0.7642132  0.5275920
##   2     gini         7             0.7589592  0.5169921
##   2     gini         8             0.7613898  0.5219041
##   2     gini         9             0.7616253  0.5223592
##   2     gini        10             0.7589587  0.5170099
##   2     extratrees   1             0.6858768  0.3703760
##   2     extratrees   2             0.6889349  0.3765306
##   2     extratrees   3             0.6868964  0.3723963
##   2     extratrees   4             0.6865827  0.3718363
##   2     extratrees   5             0.6855631  0.3697412
##   2     extratrees   6             0.6880721  0.3747650
##   2     extratrees   7             0.6863472  0.3713801
##   2     extratrees   8             0.6877586  0.3740763
##   2     extratrees   9             0.6876021  0.3739544
##   2     extratrees  10             0.6879156  0.3744518
##   3     gini         1             0.8014591  0.6027933
##   3     gini         2             0.8001258  0.6001249
##   3     gini         3             0.8013022  0.6024431
##   3     gini         4             0.8020862  0.6040998
##   3     gini         5             0.8027136  0.6053472
##   3     gini         6             0.8024777  0.6048510
##   3     gini         7             0.8034972  0.6069226
##   3     gini         8             0.8031053  0.6061233
##   3     gini         9             0.8028705  0.6056154
##   3     gini        10             0.8018510  0.6035682
##   3     extratrees   1             0.6966193  0.3921308
##   3     extratrees   2             0.6956786  0.3902531
##   3     extratrees   3             0.6952082  0.3892662
##   3     extratrees   4             0.6977171  0.3943212
##   3     extratrees   5             0.6968545  0.3925909
##   3     extratrees   6             0.6971685  0.3932609
##   3     extratrees   7             0.6961492  0.3911957
##   3     extratrees   8             0.6959922  0.3908196
##   3     extratrees   9             0.6956002  0.3900701
##   3     extratrees  10             0.6966196  0.3921375
##   4     gini         1             0.8246685  0.6494573
##   4     gini         2             0.8241980  0.6485001
##   4     gini         3             0.8230218  0.6461497
##   4     gini         4             0.8253745  0.6508872
##   4     gini         5             0.8249822  0.6500655
##   4     gini         6             0.8251391  0.6504060
##   4     gini         7             0.8243548  0.6487971
##   4     gini         8             0.8241197  0.6483321
##   4     gini         9             0.8258451  0.6518079
##   4     gini        10             0.8235708  0.6472646
##   4     extratrees   1             0.7107343  0.4205289
##   4     extratrees   2             0.7116747  0.4224149
##   4     extratrees   3             0.7117535  0.4225640
##   4     extratrees   4             0.7102632  0.4195703
##   4     extratrees   5             0.7097931  0.4186542
##   4     extratrees   6             0.7114394  0.4219408
##   4     extratrees   7             0.7109693  0.4209892
##   4     extratrees   8             0.7108906  0.4208521
##   4     extratrees   9             0.7119886  0.4230364
##   4     extratrees  10             0.7108126  0.4206615
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
##  and min.node.size = 9.
# Boosted Logistic Regression
library(caTools)
tuneGrid2 <-  expand.grid(nIter = 2:6)
trControl2 <-  trainControl(method = "cv",
                            number = 5)
boost_model <- train(target~., data=df_train, 
                                method = "LogitBoost", 
                                trControl = trControl2,
                                tuneGrid = tuneGrid2)
plot(boost_model)

print(boost_model)
## Boosted Logistic Regression 
## 
## 12753 samples
##     9 predictor
##     2 classes: 'high cost', 'low cost' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10203, 10202, 10202, 10202, 10203 
## Resampling results across tuning parameters:
## 
##   nIter  Accuracy   Kappa    
##   2      0.8604150  0.7175903
##   3      0.7620952  0.5254575
##   4      0.8766155  0.7410386
##   5      0.8055356  0.6117290
##   6      0.8827906  0.7576157
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 6.
  4. Plot the comparison of the models from question 3.
results <- resamples(list(forest_rf = forest_cv,
                          forest_ranger = forest_cv1,
                          boost = boost_model))
bwplot(results)

  5. What is your final model selection? Test the accuracy of your final model on the test data.

The final model selected is the boosted logistic regression, because it has the highest cross-validated accuracy (0.883 at nIter = 6).

pred <- predict(boost_model,df_test)
cm <- confusionMatrix(data = factor(pred), reference = df_test$target, positive = "high cost")
cm$overall[1]
##  Accuracy 
## 0.8763689
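Beyond overall accuracy, the confusionMatrix object also carries class-level metrics worth reporting, for example:

cm$byClass[c("Sensitivity", "Specificity")]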
  6. Create another (binary) target variable, decide the input variables, and redo questions 1 to 5.
# Creation of the new target: 'short' when los is below its median, 'long' otherwise
median(df_cleaned$los)
## [1] 3
df_cleaned$target1 <- case_when(
  df_cleaned$los < median(df_cleaned$los) ~ 'short',
  TRUE ~ 'long'
)

df_model1 <- df_cleaned %>% 
  select(target1,age,sex,raceethn,provider,moa,mod,admtype,campus,icu) %>%
  filter(admtype != '' & raceethn != '' & sex != 9)

str(df_model1)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
##  $ target1 : chr [1:127523] "short" "short" "long" "long" ...
##  $ age     : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
##   ..- attr(*, "label")= chr " patient age at admission"
##  $ sex     : chr [1:127523] "1" "2" "2" "1" ...
##   ..- attr(*, "label")= chr " patient sex (1=M,2=F) alpha"
##   ..- attr(*, "format.sas")= chr "$SEX"
##  $ raceethn: chr [1:127523] "1" "1" "1" "7" ...
##   ..- attr(*, "label")= chr " patient race/ethnicity"
##   ..- attr(*, "format.sas")= chr "$RACEETH"
##  $ provider: chr [1:127523] "7214" "7214" "7214" "7214" ...
##   ..- attr(*, "label")= chr " provider"
##   ..- attr(*, "format.sas")= chr "$PROVID"
##  $ moa     : num [1:127523] 1 1 12 1 12 12 1 1 1 12 ...
##   ..- attr(*, "label")= chr " month of admission  mm"
##   ..- attr(*, "format.sas")= chr "MONTHNME"
##  $ mod     : num [1:127523] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr " month of discharge  mm"
##   ..- attr(*, "format.sas")= chr "MONTHNME"
##  $ admtype : chr [1:127523] "4" "3" "2" "4" ...
##   ..- attr(*, "label")= chr " admission type"
##   ..- attr(*, "format.sas")= chr "$ADMTYPE"
##  $ campus  : chr [1:127523] "0" "0" "0" "0" ...
##  $ icu     : chr [1:127523] "000000" "000000" "000000" "000000" ...
##   ..- attr(*, "label")= chr " icu days"
##  - attr(*, "label")= chr "HDD0318CY                       "
df_model1$target1 <- factor(df_model1$target1)
df_model1$sex <- factor(df_model1$sex)
df_model1$raceethn <- factor(df_model1$raceethn)
df_model1$provider <- factor(df_model1$provider)
df_model1$moa <- factor(df_model1$moa)
df_model1$mod <- factor(df_model1$mod)
df_model1$admtype <- factor(df_model1$admtype)
df_model1$campus <- factor(df_model1$campus)
df_model1$icu <- as.numeric(df_model1$icu)
str(df_model1)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
##  $ target1 : Factor w/ 2 levels "long","short": 2 2 1 1 1 1 2 1 2 1 ...
##  $ age     : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
##   ..- attr(*, "label")= chr " patient age at admission"
##  $ sex     : Factor w/ 2 levels "1","2": 1 2 2 1 1 2 2 2 2 2 ...
##  $ raceethn: Factor w/ 7 levels "1","2","3","4",..: 1 1 1 6 1 7 1 1 1 1 ...
##  $ provider: Factor w/ 12 levels "7201","7202",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ moa     : Factor w/ 12 levels "1","2","3","4",..: 1 1 12 1 12 12 1 1 1 12 ...
##  $ mod     : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ admtype : Factor w/ 6 levels "1","2","3","4",..: 4 3 2 4 4 4 4 4 2 2 ...
##  $ campus  : Factor w/ 4 levels "0","1","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ icu     : num [1:127523] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "label")= chr "HDD0318CY                       "
set.seed(2020)

splitIndex <- createDataPartition(df_model1$target1, p = .10, 
                                  list = FALSE)
df_train1 <- df_model1[ splitIndex,]
df_test1 <- df_model1[-splitIndex,]

# Decision Tree
tree_model1 <- rpart(target1 ~ ., data = df_train1,
                 control = rpart.control(maxdepth = 4))
barplot(tree_model1$variable.importance)
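As in question 2, the tree itself could also be plotted (again assuming the rpart.plot package):

rpart.plot(tree_model1)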

# Other models
# Random forest using method "rf"
tuneGrid <-  expand.grid(mtry = 2:4)
trControl <-  trainControl(method = "cv", number = 5)
forest_cv2 <- train(target1~., data=df_train1, 
                                method = "rf", 
                                trControl = trControl,
                                tuneGrid = tuneGrid)
plot(forest_cv2)

print(forest_cv2)
## Random Forest 
## 
## 12753 samples
##     9 predictor
##     2 classes: 'long', 'short' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10202, 10203, 10203, 10202, 10202 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##   2     0.6374189  0.09394915
##   3     0.6552178  0.16987011
##   4     0.6631377  0.20267392
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Random forest using method "ranger"
tuneGrid1 <-  expand.grid(mtry = 2:4,
                       splitrule = c('gini', 'extratrees'),
                       min.node.size = c(1:10))
trControl1 <-  trainControl(method = "cv",
                            number = 5)
forest_cv3 <- train(target1~., data=df_train1, 
                                method = "ranger", 
                                trControl = trControl1,
                                tuneGrid = tuneGrid1)
plot(forest_cv3)

print(forest_cv3)
## Random Forest 
## 
## 12753 samples
##     9 predictor
##     2 classes: 'long', 'short' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10202, 10202, 10203, 10202, 10203 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   min.node.size  Accuracy   Kappa     
##   2     gini         1             0.6369483  0.09313624
##   2     gini         2             0.6376539  0.09570761
##   2     gini         3             0.6368699  0.09449983
##   2     gini         4             0.6384381  0.09963092
##   2     gini         5             0.6377323  0.09551458
##   2     gini         6             0.6372615  0.09541257
##   2     gini         7             0.6379675  0.09683047
##   2     gini         8             0.6385163  0.09925521
##   2     gini         9             0.6375752  0.09517255
##   2     gini        10             0.6374188  0.09389143
##   2     extratrees   1             0.6310673  0.06390385
##   2     extratrees   2             0.6314594  0.06620800
##   2     extratrees   3             0.6311458  0.06520552
##   2     extratrees   4             0.6309101  0.06394520
##   2     extratrees   5             0.6294203  0.05766925
##   2     extratrees   6             0.6328706  0.07099306
##   2     extratrees   7             0.6310673  0.06056804
##   2     extratrees   8             0.6303613  0.06401206
##   2     extratrees   9             0.6312240  0.06499019
##   2     extratrees  10             0.6301262  0.06191651
##   3     gini         1             0.6571001  0.17641526
##   3     gini         2             0.6571002  0.17617180
##   3     gini         3             0.6575705  0.17762761
##   3     gini         4             0.6583545  0.17917858
##   3     gini         5             0.6566297  0.17468132
##   3     gini         6             0.6574919  0.17801664
##   3     gini         7             0.6567082  0.17417483
##   3     gini         8             0.6571788  0.17796753
##   3     gini         9             0.6575706  0.17751798
##   3     gini        10             0.6567079  0.17634121
##   3     extratrees   1             0.6493374  0.15139854
##   3     extratrees   2             0.6495728  0.15357448
##   3     extratrees   3             0.6504353  0.15488133
##   3     extratrees   4             0.6494160  0.15337083
##   3     extratrees   5             0.6476906  0.14673001
##   3     extratrees   6             0.6500435  0.15449128
##   3     extratrees   7             0.6498080  0.15335439
##   3     extratrees   8             0.6501217  0.15461944
##   3     extratrees   9             0.6482397  0.14812540
##   3     extratrees  10             0.6502002  0.15483896
##   4     gini         1             0.6658036  0.21344044
##   4     gini         2             0.6654116  0.21360228
##   4     gini         3             0.6662743  0.21568741
##   4     gini         4             0.6643139  0.21030472
##   4     gini         5             0.6658819  0.21450764
##   4     gini         6             0.6654902  0.21379426
##   4     gini         7             0.6658038  0.21504798
##   4     gini         8             0.6658039  0.21523027
##   4     gini         9             0.6651763  0.21191781
##   4     gini        10             0.6659606  0.21531078
##   4     extratrees   1             0.6536500  0.17602614
##   4     extratrees   2             0.6539635  0.17830201
##   4     extratrees   3             0.6530230  0.17382592
##   4     extratrees   4             0.6533362  0.17472997
##   4     extratrees   5             0.6530225  0.17426732
##   4     extratrees   6             0.6527089  0.17424796
##   4     extratrees   7             0.6533363  0.17439831
##   4     extratrees   8             0.6544340  0.17822430
##   4     extratrees   9             0.6541991  0.17702780
##   4     extratrees  10             0.6532578  0.17568152
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
##  and min.node.size = 3.
# Boosted Logistic Regression
tuneGrid2 <-  expand.grid(nIter = 2:6)
trControl2 <-  trainControl(method = "cv",
                            number = 5)
boost_model1 <- train(target1~., data=df_train1, 
                                method = "LogitBoost", 
                                trControl = trControl2,
                                tuneGrid = tuneGrid2)
plot(boost_model1)

print(boost_model1)
## Boosted Logistic Regression 
## 
## 12753 samples
##     9 predictor
##     2 classes: 'long', 'short' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10202, 10202, 10203, 10202, 10203 
## Resampling results across tuning parameters:
## 
##   nIter  Accuracy   Kappa    
##   2      0.7863052  0.5769924
##   3      0.6466719  0.1570180
##   4      0.8019985  0.6047507
##   5      0.6418890  0.1419172
##   6      0.8169745  0.6096988
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 6.
#Comparison of Models
results1 <- resamples(list(forest_rf = forest_cv2,
                          forest_ranger = forest_cv3,
                          boost = boost_model1))
bwplot(results1)

The final chosen model is again the boosted logistic regression, because it has the highest cross-validated accuracy (0.817 at nIter = 6).

pred1 <- predict(boost_model1,df_test1)
cm1 <- confusionMatrix(data = pred1, reference = df_test1$target1, positive = "long")
cm1$overall[1]
##  Accuracy 
## 0.8449282