Download the data file hdd0318cy.sas7bdat. Use read_sas from the haven package to read the data.
library(haven)
df <- read_sas('Midterm_Data.sas7bdat')
library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
summary(df$yod)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 6.00 10.00 10.35 14.00 18.00
# Keep only records with year of discharge 2018 (yod is numeric, per the summary above)
df_cleaned <- df %>%
  filter(yod == 18)
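A quick sanity check (not part of the original output) that only 2018 discharges remain:
stopifnot(all(df_cleaned$yod == 18))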
"yod", "payfix","pay_ub92","age",
"sex","raceethn","provider","moa",
"yoa","mod","admtype", "asource" ,
"preopday" ,"los", "service" , "icu","ccu",
"dispub92", "payer" ,"drg","trandb",
"randbg","randbs","orr", "anes","seq",
"lab","dtest", "ther","blood","phar",
"other","patcon","bwght","total","tot" ,
"ecodub92","b_wt","pt_state","diag_adm","ancilar" ,
"campus","er_fee","er_chrg","er_mode","obs_chrg",
"obs_hour","psycchrg","nicu_day"
df_cleaned <- df_cleaned %>%
select(yod,payfix,pay_ub92,age,sex,raceethn,provider,moa,yoa,mod,admtype,
asource,preopday,los,service,icu,ccu,dispub92,payer,drg,trandb,
randbg,randbs,orr,anes,seq,lab,dtest,ther,blood,phar,other,patcon,
bwght,total,tot,ecodub92,b_wt,pt_state,diag_adm,ancilar,campus,
er_fee,er_chrg,er_mode,obs_chrg,obs_hour,psycchrg,nicu_day)
colSums(is.na(df_cleaned))
## yod payfix pay_ub92 age sex raceethn provider moa
## 0 131478 0 0 0 0 0 0
## yoa mod admtype asource preopday los service icu
## 0 0 0 0 60486 0 0 0
## ccu dispub92 payer drg trandb randbg randbs orr
## 0 0 0 0 0 0 0 0
## anes seq lab dtest ther blood phar other
## 0 0 0 0 0 0 0 0
## patcon bwght total tot ecodub92 b_wt pt_state diag_adm
## 0 0 0 0 0 0 0 0
## ancilar campus er_fee er_chrg er_mode obs_chrg obs_hour psycchrg
## 0 0 0 0 0 0 77343 0
## nicu_day
## 124090
# Drop the four variables with large numbers of missing values:
# payfix, preopday, obs_hour, nicu_day
df_cleaned <- df_cleaned %>%
select(-payfix,-preopday,-obs_hour,-nicu_day)
colSums(is.na(df_cleaned))
## yod pay_ub92 age sex raceethn provider moa yoa
## 0 0 0 0 0 0 0 0
## mod admtype asource los service icu ccu dispub92
## 0 0 0 0 0 0 0 0
## payer drg trandb randbg randbs orr anes seq
## 0 0 0 0 0 0 0 0
## lab dtest ther blood phar other patcon bwght
## 0 0 0 0 0 0 0 0
## total tot ecodub92 b_wt pt_state diag_adm ancilar campus
## 0 0 0 0 0 0 0 0
## er_fee er_chrg er_mode obs_chrg psycchrg
## 0 0 0 0 0
According to the data dictionary HDD2015-18cy6-20-19.docx, which variable records the month of admission, and which records the month of discharge? The month of admission is moa and the month of discharge is mod.
df_cleaned %>%
group_by(moa) %>%
summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## moa patients
## <dbl> <int>
## 1 1 11309
## 2 2 10259
## 3 3 11073
## 4 4 11174
## 5 5 11404
## 6 6 10690
## 7 7 10962
## 8 8 11199
## 9 9 10666
## 10 10 11408
## 11 11 10636
## 12 12 10698
# October (month 10) admitted the most patients.
df_cleaned %>%
filter(sex == 1) %>%
group_by(moa) %>%
summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## moa patients
## <dbl> <int>
## 1 1 4970
## 2 2 4546
## 3 3 4978
## 4 4 4989
## 5 5 5066
## 6 6 4810
## 7 7 4910
## 8 8 5028
## 9 9 4683
## 10 10 5164
## 11 11 4800
## 12 12 4855
# The same month, October, admitted the most male patients.
df_cleaned %>%
filter(sex == 2 & age %in% c(13:17)) %>%
group_by(moa) %>%
summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## moa patients
## <dbl> <int>
## 1 1 139
## 2 2 111
## 3 3 158
## 4 4 123
## 5 5 150
## 6 6 97
## 7 7 97
## 8 8 105
## 9 9 131
## 10 10 126
## 11 11 128
## 12 12 131
# March (month 3) admitted the most teenage female patients.
df_cleaned %>%
filter(sex == 2 & moa == 10) %>%
group_by(provider) %>%
summarise(patients = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## provider patients
## <chr> <int>
## 1 7201 294
## 2 7202 212
## 3 7204 872
## 4 7205 1517
## 5 7206 333
## 6 7209 351
## 7 7210 699
## 8 7211 122
## 9 7213 317
## 10 7214 1224
## 11 7215 46
## 12 7216 256
# Provider number 7205, Rhode Island Hospital, had the most female patients admitted in October.
df_cleaned %>%
group_by(sex) %>%
summarise(avg_age = mean(age))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## sex avg_age
## <chr> <dbl>
## 1 1 51.5
## 2 2 50.9
## 3 9 43
# Disregarding code 9 (unknown sex, which reflects missing values), female patients are not older than male patients on average; they are younger by roughly half a year (50.9 vs. 51.5).
df_cleaned %>%
group_by(moa) %>%
summarise(avg_age = mean(age))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## moa avg_age
## <dbl> <dbl>
## 1 1 51.8
## 2 2 51.2
## 3 3 51.1
## 4 4 51.4
## 5 5 50.6
## 6 6 51.5
## 7 7 51.3
## 8 8 50.9
## 9 9 50.6
## 10 10 50.8
## 11 11 51.0
## 12 12 51.5
# Based on the summary above, month 1 (January) has the oldest average age of admitted patients.
df_cleaned %>%
group_by(provider) %>%
summarise(avg_total_charge = mean(tot))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## provider avg_total_charge
## <chr> <dbl>
## 1 7201 22775.
## 2 7202 35504.
## 3 7204 35277.
## 4 7205 48739.
## 5 7206 31018.
## 6 7209 24539.
## 7 7210 27691.
## 8 7211 24089.
## 9 7213 38200.
## 10 7214 22362.
## 11 7215 69946.
## 12 7216 17782.
# Provider number 7215, or Bradley, has the highest average total charge.
df_cleaned %>%
filter(sex == 1 & age %in% c(13:17)) %>%
group_by(provider) %>%
summarise(avg_total_charge = mean(tot))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 2
## provider avg_total_charge
## <chr> <dbl>
## 1 7201 14487
## 2 7202 7797
## 3 7205 36611.
## 4 7209 45669
## 5 7210 11255
## 6 7215 99346.
## 7 7216 27064.
# Provider number 7202, or St. Joseph Health Services of RI, has the lowest average total charges for teenage males.
df_cleaned %>%
group_by(raceethn) %>%
summarise(avg_los = mean(los))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 2
## raceethn avg_los
## <chr> <dbl>
## 1 "" 4.38
## 2 "1" 4.93
## 3 "2" 5.28
## 4 "3" 4.41
## 5 "4" 5.54
## 6 "6" 5.35
## 7 "7" 4.67
## 8 "9" 6.18
# The longest average length of stay is for patients of unknown race (9); among known races, the longest average stay belongs to American Indian patients (4), not Hispanic patients.
df_cleaned %>%
filter(sex == 1 & raceethn == 1 & age == 20 & los == 1) %>%
summarise(avg_total_charge = mean(tot))
## # A tibble: 1 x 1
## avg_total_charge
## <dbl>
## 1 15320.
Continue with the data from part I.
df_cleaned %>%
ggplot(aes(x=age)) + geom_histogram() + labs(x="Age",y="Count",title="Age Distribution of Patients",caption="The strongest concentrations of patients are age 0 (babies), age 30 and age 65.")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df_cleaned %>%
  ggplot(aes(x=provider)) + geom_bar() + labs(x="Provider",y="Count",title="Number of Patients by Provider",caption="Rhode Island Hospital has the most patients in 2018.")
df_cleaned %>%
  ggplot(aes(x=factor(moa),fill=sex)) + geom_bar(position = 'fill') + labs(x="Month of Admission",y="Proportion",title="Male vs. Female Patient Admits by Month",caption="The male:female ratio is consistent across the months, but every month admits more female than male patients.")
df_cleaned %>%
filter(los <= 200 & tot <= 1000000) %>%
  ggplot(aes(x=los,y=tot)) + geom_point(position = 'jitter') + labs(x="Length of Stay",y="Total Cost",title="Total Cost vs. Length of Stay",caption="The longer a patient stays, the more they are charged.")
df_cleaned %>%
filter(los <= 200 & tot <= 1000000 & raceethn %in% c(1:7)) %>%
ggplot(aes(x=los,y=tot,color=raceethn)) + geom_point(position = 'jitter') + facet_wrap(~raceethn) + labs(x="Length of Stay",y="Total Cost",title="Total Cost vs. Length of Stay by Race",color="Race",caption="There is much more data for white patients than any other type of patient, so it is a difficult basis for comparison. However, it seems as though Hispanic patients (7) have a steeper slope for cost than other races.")
df_cleaned %>%
filter(raceethn %in% c(1:7)) %>%
ggplot(aes(x=provider,fill=raceethn)) + geom_bar() + labs(x="Provider",y="Patients",title="Patients by Race for each Provider in 2018",caption="White patients are again dominant for all providers, but 7205 and 7214 have more minority patients than other providers.",fill="Race")
df_cleaned %>%
filter(sex %in% c(1:2)) %>%
  ggplot(aes(x=age,y=sex)) + geom_boxplot() + labs(x="Age",y="Sex",title="Distribution of Age by Sex",caption="On average, male patients are older than female patients, although the spread of ages is wider for females.")
df_cleaned %>%
filter(sex %in% c(1:2),tot < 50000) %>%
  ggplot(aes(x=tot,y=sex)) + geom_boxplot() + labs(x="Total Cost",y="Sex",title="Total Cost by Sex",caption="The median cost for male and female patients is almost exactly the same, but the interquartile range is slightly wider for males than for females.")
df_cleaned %>%
group_by(provider) %>%
summarise(avg_los = mean(los)) %>%
  ggplot(aes(x=provider,y=avg_los)) + geom_col() + labs(x="Provider",y="Average Length of Stay",title="Average Length of Stay by Provider",caption="Provider 7215, Bradley, is the outlier, with a much longer average stay than all other providers.")
## `summarise()` ungrouping output (override with `.groups` argument)
df_cleaned %>%
filter(raceethn %in% c(1:7)) %>%
group_by(provider,raceethn) %>%
summarise(avg_tot = mean(tot)) %>%
ggplot(aes(x=provider,y=avg_tot,fill=raceethn)) + geom_col(position = 'fill') +
labs(x="Provider",y="Share of Average Total Cost",title="Average Total Cost by Provider and Race",caption="Between providers, there is a good amount of variation between what share different races have of the total cost in a given year.",fill="Race")
## `summarise()` regrouping output by 'provider' (override with `.groups` argument)
library(gganimate)
df_cleaned %>%
group_by(provider,moa) %>%
summarise(patients = n()) %>%
ggplot(aes(x=moa,y=patients,color=provider)) + geom_line() + geom_point(size=3) +
transition_reveal(moa) + labs(x="Month of Admission",y="Number of Patients",color="Provider",title="Fluctuation of Patients by Provider over 2018",caption="Three distinct groups emerge: 1) Rhode Island Hospital has the most patients and is the outlier, 2) three providers are in the middle of the pack, 3) the other hospitals are not as large and don't admit nearly as many patients each month.")
## `summarise()` regrouping output by 'provider' (override with `.groups` argument)
Continue with the data from part I. Use the following as the target and input variables:
Target Variable: Create the target variable taking the value low cost if the total charge of a patient (tot) is smaller than the median of the total charge, and high cost otherwise.
median(df_cleaned$tot)
## [1] 21854
df_cleaned$target <- case_when(
  df_cleaned$tot < median(df_cleaned$tot) ~ 'low cost',
  TRUE ~ 'high cost'
)
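A quick check (not part of the original output) that the two cost classes are roughly balanced:
table(df_cleaned$target)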
Input Variables:
df_model <- df_cleaned %>%
select(target,age,sex,raceethn,provider,moa,mod,admtype,campus,los) %>%
filter(admtype != '' & raceethn != '' & sex != 9)
str(df_model)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
## $ target : chr [1:127523] "low cost" "high cost" "high cost" "low cost" ...
## $ age : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
## ..- attr(*, "label")= chr " patient age at admission"
## $ sex : chr [1:127523] "1" "2" "2" "1" ...
## ..- attr(*, "label")= chr " patient sex (1=M,2=F) alpha"
## ..- attr(*, "format.sas")= chr "$SEX"
## $ raceethn: chr [1:127523] "1" "1" "1" "7" ...
## ..- attr(*, "label")= chr " patient race/ethnicity"
## ..- attr(*, "format.sas")= chr "$RACEETH"
## $ provider: chr [1:127523] "7214" "7214" "7214" "7214" ...
## ..- attr(*, "label")= chr " provider"
## ..- attr(*, "format.sas")= chr "$PROVID"
## $ moa : num [1:127523] 1 1 12 1 12 12 1 1 1 12 ...
## ..- attr(*, "label")= chr " month of admission mm"
## ..- attr(*, "format.sas")= chr "MONTHNME"
## $ mod : num [1:127523] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr " month of discharge mm"
## ..- attr(*, "format.sas")= chr "MONTHNME"
## $ admtype : chr [1:127523] "4" "3" "2" "4" ...
## ..- attr(*, "label")= chr " admission type"
## ..- attr(*, "format.sas")= chr "$ADMTYPE"
## $ campus : chr [1:127523] "0" "0" "0" "0" ...
## $ los : num [1:127523] 2 1 3 4 35 17 2 3 2 12 ...
## ..- attr(*, "label")= chr " length of stay,in days for hosp[in hours for ED"
## - attr(*, "label")= chr "HDD0318CY "
df_model$target <- factor(df_model$target)
df_model$sex <- factor(df_model$sex)
df_model$raceethn <- factor(df_model$raceethn)
df_model$provider <- factor(df_model$provider)
df_model$moa <- factor(df_model$moa)
df_model$mod <- factor(df_model$mod)
df_model$admtype <- factor(df_model$admtype)
df_model$campus <- factor(df_model$campus)
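The eight factor() assignments above can be written more compactly with dplyr 1.0's across(); this is a stylistic alternative, not code from the original:
# Convert the target and all categorical inputs to factors in one step
df_model <- df_model %>%
  mutate(across(c(target, sex, raceethn, provider, moa, mod, admtype, campus), factor))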
str(df_model)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
## $ target : Factor w/ 2 levels "high cost","low cost": 2 1 1 2 1 1 2 2 1 1 ...
## $ age : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
## ..- attr(*, "label")= chr " patient age at admission"
## $ sex : Factor w/ 2 levels "1","2": 1 2 2 1 1 2 2 2 2 2 ...
## $ raceethn: Factor w/ 7 levels "1","2","3","4",..: 1 1 1 6 1 7 1 1 1 1 ...
## $ provider: Factor w/ 12 levels "7201","7202",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ moa : Factor w/ 12 levels "1","2","3","4",..: 1 1 12 1 12 12 1 1 1 12 ...
## $ mod : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ admtype : Factor w/ 6 levels "1","2","3","4",..: 4 3 2 4 4 4 4 4 2 2 ...
## $ campus : Factor w/ 4 levels "0","1","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ los : num [1:127523] 2 1 3 4 35 17 2 3 2 12 ...
## ..- attr(*, "label")= chr " length of stay,in days for hosp[in hours for ED"
## - attr(*, "label")= chr "HDD0318CY "
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(2020)
splitIndex <- createDataPartition(df_model$target, p = .10,
list = FALSE)
df_train <- df_model[ splitIndex,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
df_test <- df_model[-splitIndex,]
Fit a decision tree using rpart. Plot the decision tree. Plot the variable importance ranked by the tree.
library(rpart)
tree_model <- rpart(target ~ ., data = df_train,
control = rpart.control(maxdepth = 4))
barplot(tree_model$variable.importance)
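The bar plot covers variable importance, but the tree itself should be drawn as well. A minimal sketch using base graphics (the rpart.plot package would give a prettier rendering if installed):
# Draw the fitted tree structure and label its splits
plot(tree_model, uniform = TRUE, margin = 0.1)
text(tree_model, use.n = TRUE, cex = 0.8)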
Set the training control to use cross-validation with 5 folds across all models. Train and tune at least 3 different models (i.e., three different values for method= in caret's train function). Plot the hyper-parameter tuning plot for each model.
# Random forest using method "rf"
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
tuneGrid <- expand.grid(mtry = 2:4)
trControl <- trainControl(method = "cv", number = 5)
forest_cv <- train(target~., data=df_train,
method = "rf",
trControl = trControl,
tuneGrid = tuneGrid)
plot(forest_cv)
print(forest_cv)
## Random Forest
##
## 12753 samples
## 9 predictor
## 2 classes: 'high cost', 'low cost'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10202, 10203, 10203, 10202, 10202
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7604471 0.5200016
## 3 0.8028680 0.6055959
## 4 0.8224715 0.6450468
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Random forest using method "ranger"
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
tuneGrid1 <- expand.grid(mtry = 2:4,
splitrule = c('gini', 'extratrees'),
min.node.size = c(1:10))
trControl1 <- trainControl(method = "cv",
number = 5)
forest_cv1 <- train(target~., data=df_train,
method = "ranger",
trControl = trControl1,
tuneGrid = tuneGrid1)
plot(forest_cv1)
print(forest_cv1)
## Random Forest
##
## 12753 samples
## 9 predictor
## 2 classes: 'high cost', 'low cost'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10202, 10203, 10202, 10203, 10202
## Resampling results across tuning parameters:
##
## mtry splitrule min.node.size Accuracy Kappa
## 2 gini 1 0.7580962 0.5152840
## 2 gini 2 0.7626446 0.5244383
## 2 gini 3 0.7611544 0.5214363
## 2 gini 4 0.7571558 0.5134060
## 2 gini 5 0.7602928 0.5196904
## 2 gini 6 0.7642132 0.5275920
## 2 gini 7 0.7589592 0.5169921
## 2 gini 8 0.7613898 0.5219041
## 2 gini 9 0.7616253 0.5223592
## 2 gini 10 0.7589587 0.5170099
## 2 extratrees 1 0.6858768 0.3703760
## 2 extratrees 2 0.6889349 0.3765306
## 2 extratrees 3 0.6868964 0.3723963
## 2 extratrees 4 0.6865827 0.3718363
## 2 extratrees 5 0.6855631 0.3697412
## 2 extratrees 6 0.6880721 0.3747650
## 2 extratrees 7 0.6863472 0.3713801
## 2 extratrees 8 0.6877586 0.3740763
## 2 extratrees 9 0.6876021 0.3739544
## 2 extratrees 10 0.6879156 0.3744518
## 3 gini 1 0.8014591 0.6027933
## 3 gini 2 0.8001258 0.6001249
## 3 gini 3 0.8013022 0.6024431
## 3 gini 4 0.8020862 0.6040998
## 3 gini 5 0.8027136 0.6053472
## 3 gini 6 0.8024777 0.6048510
## 3 gini 7 0.8034972 0.6069226
## 3 gini 8 0.8031053 0.6061233
## 3 gini 9 0.8028705 0.6056154
## 3 gini 10 0.8018510 0.6035682
## 3 extratrees 1 0.6966193 0.3921308
## 3 extratrees 2 0.6956786 0.3902531
## 3 extratrees 3 0.6952082 0.3892662
## 3 extratrees 4 0.6977171 0.3943212
## 3 extratrees 5 0.6968545 0.3925909
## 3 extratrees 6 0.6971685 0.3932609
## 3 extratrees 7 0.6961492 0.3911957
## 3 extratrees 8 0.6959922 0.3908196
## 3 extratrees 9 0.6956002 0.3900701
## 3 extratrees 10 0.6966196 0.3921375
## 4 gini 1 0.8246685 0.6494573
## 4 gini 2 0.8241980 0.6485001
## 4 gini 3 0.8230218 0.6461497
## 4 gini 4 0.8253745 0.6508872
## 4 gini 5 0.8249822 0.6500655
## 4 gini 6 0.8251391 0.6504060
## 4 gini 7 0.8243548 0.6487971
## 4 gini 8 0.8241197 0.6483321
## 4 gini 9 0.8258451 0.6518079
## 4 gini 10 0.8235708 0.6472646
## 4 extratrees 1 0.7107343 0.4205289
## 4 extratrees 2 0.7116747 0.4224149
## 4 extratrees 3 0.7117535 0.4225640
## 4 extratrees 4 0.7102632 0.4195703
## 4 extratrees 5 0.7097931 0.4186542
## 4 extratrees 6 0.7114394 0.4219408
## 4 extratrees 7 0.7109693 0.4209892
## 4 extratrees 8 0.7108906 0.4208521
## 4 extratrees 9 0.7119886 0.4230364
## 4 extratrees 10 0.7108126 0.4206615
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
## and min.node.size = 9.
# Boosted Logistic Regression
library(caTools)
tuneGrid2 <- expand.grid(nIter = 2:6)
trControl2 <- trainControl(method = "cv",
number = 5)
boost_model <- train(target~., data=df_train,
method = "LogitBoost",
trControl = trControl2,
tuneGrid = tuneGrid2)
plot(boost_model)
print(boost_model)
## Boosted Logistic Regression
##
## 12753 samples
## 9 predictor
## 2 classes: 'high cost', 'low cost'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10203, 10202, 10202, 10202, 10203
## Resampling results across tuning parameters:
##
## nIter Accuracy Kappa
## 2 0.8604150 0.7175903
## 3 0.7620952 0.5254575
## 4 0.8766155 0.7410386
## 5 0.8055356 0.6117290
## 6 0.8827906 0.7576157
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 6.
results <- resamples(list(forest_rf = forest_cv,
forest_ranger = forest_cv1,
boost = boost_model))
bwplot(results)
The final model selected is the boosted logistic regression, because it has the highest cross-validated accuracy.
pred <- predict(boost_model,df_test)
cm <- confusionMatrix(data = factor(pred), reference = df_test$target, positive = "high cost")
cm$overall[1]
## Accuracy
## 0.8763689
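Beyond overall accuracy, the full confusion matrix and the class-level metrics are worth inspecting; a quick follow-up, not part of the original output:
cm$table                                       # raw confusion-matrix counts
cm$byClass[c("Sensitivity", "Specificity")]    # per-class performance for "high cost"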
Create another target variable (binary), decide the input variables, and redo steps 1 to 5.
# Creation of model
median(df_cleaned$los)
## [1] 3
df_cleaned$target1 <- case_when(
  df_cleaned$los < median(df_cleaned$los) ~ 'short',
  TRUE ~ 'long'
)
df_model1 <- df_cleaned %>%
select(target1,age,sex,raceethn,provider,moa,mod,admtype,campus,icu) %>%
filter(admtype != '' & raceethn != '' & sex != 9)
str(df_model1)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
## $ target1 : chr [1:127523] "short" "short" "long" "long" ...
## $ age : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
## ..- attr(*, "label")= chr " patient age at admission"
## $ sex : chr [1:127523] "1" "2" "2" "1" ...
## ..- attr(*, "label")= chr " patient sex (1=M,2=F) alpha"
## ..- attr(*, "format.sas")= chr "$SEX"
## $ raceethn: chr [1:127523] "1" "1" "1" "7" ...
## ..- attr(*, "label")= chr " patient race/ethnicity"
## ..- attr(*, "format.sas")= chr "$RACEETH"
## $ provider: chr [1:127523] "7214" "7214" "7214" "7214" ...
## ..- attr(*, "label")= chr " provider"
## ..- attr(*, "format.sas")= chr "$PROVID"
## $ moa : num [1:127523] 1 1 12 1 12 12 1 1 1 12 ...
## ..- attr(*, "label")= chr " month of admission mm"
## ..- attr(*, "format.sas")= chr "MONTHNME"
## $ mod : num [1:127523] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr " month of discharge mm"
## ..- attr(*, "format.sas")= chr "MONTHNME"
## $ admtype : chr [1:127523] "4" "3" "2" "4" ...
## ..- attr(*, "label")= chr " admission type"
## ..- attr(*, "format.sas")= chr "$ADMTYPE"
## $ campus : chr [1:127523] "0" "0" "0" "0" ...
## $ icu : chr [1:127523] "000000" "000000" "000000" "000000" ...
## ..- attr(*, "label")= chr " icu days"
## - attr(*, "label")= chr "HDD0318CY "
df_model1$target1 <- factor(df_model1$target1)
df_model1$sex <- factor(df_model1$sex)
df_model1$raceethn <- factor(df_model1$raceethn)
df_model1$provider <- factor(df_model1$provider)
df_model1$moa <- factor(df_model1$moa)
df_model1$mod <- factor(df_model1$mod)
df_model1$admtype <- factor(df_model1$admtype)
df_model1$campus <- factor(df_model1$campus)
df_model1$icu <- as.numeric(df_model1$icu)
str(df_model1)
## tibble [127,523 x 10] (S3: tbl_df/tbl/data.frame)
## $ target1 : Factor w/ 2 levels "long","short": 2 2 1 1 1 1 2 1 2 1 ...
## $ age : num [1:127523] 0 76 50 0 0 0 0 0 70 25 ...
## ..- attr(*, "label")= chr " patient age at admission"
## $ sex : Factor w/ 2 levels "1","2": 1 2 2 1 1 2 2 2 2 2 ...
## $ raceethn: Factor w/ 7 levels "1","2","3","4",..: 1 1 1 6 1 7 1 1 1 1 ...
## $ provider: Factor w/ 12 levels "7201","7202",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ moa : Factor w/ 12 levels "1","2","3","4",..: 1 1 12 1 12 12 1 1 1 12 ...
## $ mod : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ admtype : Factor w/ 6 levels "1","2","3","4",..: 4 3 2 4 4 4 4 4 2 2 ...
## $ campus : Factor w/ 4 levels "0","1","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ icu : num [1:127523] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "label")= chr "HDD0318CY "
set.seed(2020)
splitIndex <- createDataPartition(df_model1$target1, p = .10,
list = FALSE)
df_train1 <- df_model1[ splitIndex,]
df_test1 <- df_model1[-splitIndex,]
# Decision Tree
tree_model1 <- rpart(target1 ~ ., data = df_train1,
control = rpart.control(maxdepth = 4))
barplot(tree_model1$variable.importance)
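As in part I, the tree structure itself can be drawn in addition to the importance bar plot (same base-graphics sketch as before):
plot(tree_model1, uniform = TRUE, margin = 0.1)
text(tree_model1, use.n = TRUE, cex = 0.8)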
# Other models
# Random forest using method "rf"
tuneGrid <- expand.grid(mtry = 2:4)
trControl <- trainControl(method = "cv", number = 5)
forest_cv2 <- train(target1~., data=df_train1,
method = "rf",
trControl = trControl,
tuneGrid = tuneGrid)
plot(forest_cv2)
print(forest_cv2)
## Random Forest
##
## 12753 samples
## 9 predictor
## 2 classes: 'long', 'short'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10202, 10203, 10203, 10202, 10202
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.6374189 0.09394915
## 3 0.6552178 0.16987011
## 4 0.6631377 0.20267392
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Random forest using method "ranger"
tuneGrid1 <- expand.grid(mtry = 2:4,
splitrule = c('gini', 'extratrees'),
min.node.size = c(1:10))
trControl1 <- trainControl(method = "cv",
number = 5)
forest_cv3 <- train(target1~., data=df_train1,
method = "ranger",
trControl = trControl1,
tuneGrid = tuneGrid1)
plot(forest_cv3)
print(forest_cv3)
## Random Forest
##
## 12753 samples
## 9 predictor
## 2 classes: 'long', 'short'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10202, 10202, 10203, 10202, 10203
## Resampling results across tuning parameters:
##
## mtry splitrule min.node.size Accuracy Kappa
## 2 gini 1 0.6369483 0.09313624
## 2 gini 2 0.6376539 0.09570761
## 2 gini 3 0.6368699 0.09449983
## 2 gini 4 0.6384381 0.09963092
## 2 gini 5 0.6377323 0.09551458
## 2 gini 6 0.6372615 0.09541257
## 2 gini 7 0.6379675 0.09683047
## 2 gini 8 0.6385163 0.09925521
## 2 gini 9 0.6375752 0.09517255
## 2 gini 10 0.6374188 0.09389143
## 2 extratrees 1 0.6310673 0.06390385
## 2 extratrees 2 0.6314594 0.06620800
## 2 extratrees 3 0.6311458 0.06520552
## 2 extratrees 4 0.6309101 0.06394520
## 2 extratrees 5 0.6294203 0.05766925
## 2 extratrees 6 0.6328706 0.07099306
## 2 extratrees 7 0.6310673 0.06056804
## 2 extratrees 8 0.6303613 0.06401206
## 2 extratrees 9 0.6312240 0.06499019
## 2 extratrees 10 0.6301262 0.06191651
## 3 gini 1 0.6571001 0.17641526
## 3 gini 2 0.6571002 0.17617180
## 3 gini 3 0.6575705 0.17762761
## 3 gini 4 0.6583545 0.17917858
## 3 gini 5 0.6566297 0.17468132
## 3 gini 6 0.6574919 0.17801664
## 3 gini 7 0.6567082 0.17417483
## 3 gini 8 0.6571788 0.17796753
## 3 gini 9 0.6575706 0.17751798
## 3 gini 10 0.6567079 0.17634121
## 3 extratrees 1 0.6493374 0.15139854
## 3 extratrees 2 0.6495728 0.15357448
## 3 extratrees 3 0.6504353 0.15488133
## 3 extratrees 4 0.6494160 0.15337083
## 3 extratrees 5 0.6476906 0.14673001
## 3 extratrees 6 0.6500435 0.15449128
## 3 extratrees 7 0.6498080 0.15335439
## 3 extratrees 8 0.6501217 0.15461944
## 3 extratrees 9 0.6482397 0.14812540
## 3 extratrees 10 0.6502002 0.15483896
## 4 gini 1 0.6658036 0.21344044
## 4 gini 2 0.6654116 0.21360228
## 4 gini 3 0.6662743 0.21568741
## 4 gini 4 0.6643139 0.21030472
## 4 gini 5 0.6658819 0.21450764
## 4 gini 6 0.6654902 0.21379426
## 4 gini 7 0.6658038 0.21504798
## 4 gini 8 0.6658039 0.21523027
## 4 gini 9 0.6651763 0.21191781
## 4 gini 10 0.6659606 0.21531078
## 4 extratrees 1 0.6536500 0.17602614
## 4 extratrees 2 0.6539635 0.17830201
## 4 extratrees 3 0.6530230 0.17382592
## 4 extratrees 4 0.6533362 0.17472997
## 4 extratrees 5 0.6530225 0.17426732
## 4 extratrees 6 0.6527089 0.17424796
## 4 extratrees 7 0.6533363 0.17439831
## 4 extratrees 8 0.6544340 0.17822430
## 4 extratrees 9 0.6541991 0.17702780
## 4 extratrees 10 0.6532578 0.17568152
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
## and min.node.size = 3.
# Boosted Logistic Regression
tuneGrid2 <- expand.grid(nIter = 2:6)
trControl2 <- trainControl(method = "cv",
number = 5)
boost_model1 <- train(target1~., data=df_train1,
method = "LogitBoost",
trControl = trControl2,
tuneGrid = tuneGrid2)
plot(boost_model1)
print(boost_model1)
## Boosted Logistic Regression
##
## 12753 samples
## 9 predictor
## 2 classes: 'long', 'short'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10202, 10202, 10203, 10202, 10203
## Resampling results across tuning parameters:
##
## nIter Accuracy Kappa
## 2 0.7863052 0.5769924
## 3 0.6466719 0.1570180
## 4 0.8019985 0.6047507
## 5 0.6418890 0.1419172
## 6 0.8169745 0.6096988
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 6.
#Comparison of Models
results1 <- resamples(list(forest_rf = forest_cv2,
forest_ranger = forest_cv3,
boost = boost_model1))
bwplot(results1)
The final chosen model is again the boosted logistic regression, because it has the highest cross-validated accuracy.
pred1 <- predict(boost_model1,df_test1)
cm1 <- confusionMatrix(data = pred1, reference = df_test1$target1, positive = "long")
cm1$overall[1]
## Accuracy
## 0.8449282