tidyverse package
An R package can be installed with the install.packages function. Install tidyverse if you have not done so.
# Install tidyverse only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
read_csv
Use the read_csv function to import the US Covid 19 data at link. Don't forget to import tidyverse (library(tidyverse)) so that you can use read_csv.
library(tidyverse)
# Read the all-states history CSV straight from the URL into COVID.
COVID <- read_csv(
  file = "https://covidtracking.com/data/download/all-states-history.csv"
)
# Change the date column to a date (example on df).
library(lubridate)
df[["date"]] <- ymd(df[["date"]])
library(lubridate)
# Parse COVID's date column with lubridate's ymd() and store it back in place.
COVID[["date"]] <- ymd(COVID[["date"]])
# Create month, weekday and monthday variables from the date column
# (example on df). Uses `<-` for assignment throughout.
# month of the year
df$month <- month(df$date)
# day of the week
df$weekday <- wday(df$date)
# day of the month
df$monthday <- mday(df$date)
# Derive calendar parts of COVID's date column with lubridate:
# month number, day of the week, and day of the month.
COVID[["month"]] <- month(COVID[["date"]])
COVID[["weekday"]] <- wday(COVID[["date"]])
COVID[["monthday"]] <- mday(COVID[["date"]])
case_when.
The function case_when is a good option for creating a new variable from an existing variable. For example, the code below creates a new variable, daily_death, from the deathIncrease variable. deathIncrease is the number of new daily deaths from Covid19. The new variable daily_death takes three values: low (if deathIncrease is less than 3), medium (deathIncrease from 3 to 14), and high (deathIncrease more than 14).
# Bucket deathIncrease into "low" (< 3), "medium" (up to 14) and "high"
# (everything else). case_when() uses the first condition that matches, so
# each branch only needs its upper bound.
df[["daily_death"]] <- case_when(
  df[["deathIncrease"]] < 3 ~ "low",
  df[["deathIncrease"]] <= 14 ~ "medium",
  TRUE ~ "high"
)
# Create month2, which takes three values: early_month (day of the month
# from 1-10), mid_month (day of the month from 11-20), and end_month
# (everything else). case_when() uses the first condition that matches.
COVID[["month2"]] <- case_when(
  COVID[["monthday"]] <= 10 ~ "early_month",
  COVID[["monthday"]] <= 20 ~ "mid_month",
  TRUE ~ "end_month"
)
Use the select function to deselect the column totalTestsViral from the data.
# Drop the totalTestsViral column and keep every other column.
COVID <- COVID %>%
  select(-totalTestsViral)
The pipe operator offers another way to write R code. It often makes the code more readable. The pipe works very well with all the tidyverse packages. Refer to these slides (slides 15, 16, 17 and 18) to rewrite the code below using the pipe operator.
# Nested-call version (the form to be rewritten with the pipe).
x <- 1:10
# square root of x
sqrt(x)
# sum of the square roots
sum(sqrt(x))
# natural log of that sum
log(sum(sqrt(x)))
# log base 2 of 16
log(16, base = 2)
# Pipe version of the nested calls: each step feeds the next.
x <- 1:10
# square root of x
x %>% sqrt()
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427
## [9] 3.000000 3.162278
# sum of the square roots
x %>%
  sqrt() %>%
  sum()
## [1] 22.46828
# natural log of that sum
x %>%
  sqrt() %>%
  sum() %>%
  log()
## [1] 3.112104
# log base 2 of 16
16 %>% log(base = 2)
## [1] 4
This combo is used when you want to apply a function/calculation to different groups of the data. For example, to calculate the average number of cases (positiveIncrease) by dataQualityGrade, we use:
# Average number of cases (positiveIncrease) for each dataQualityGrade
# (example on df).
df %>%
  group_by(dataQualityGrade) %>%
  summarise(mean(positiveIncrease))
# Median number of cases (positiveIncrease) by dataQualityGrade.
# NOTE(review): the result is stored under the name `median`, which is also a
# base R function name; calls to median() still resolve to the function.
median <- COVID %>%
  group_by(dataQualityGrade) %>%
  summarise(median(positiveIncrease))
median
## # A tibble: 7 x 2
## dataQualityGrade `median(positiveIncrease)`
## <chr> <dbl>
## 1 A 360
## 2 A+ 430
## 3 B 213
## 4 C 25
## 5 D 11
## 6 F 0
## 7 <NA> 2
# Maximum number of cases (positiveIncrease) by dataQualityGrade.
# NOTE(review): the result is stored under the name `max`, which is also a
# base R function name; calls to max() still resolve to the function.
max <- COVID %>%
  group_by(dataQualityGrade) %>%
  summarise(max(positiveIncrease))
max
## # A tibble: 7 x 2
## dataQualityGrade `max(positiveIncrease)`
## <chr> <dbl>
## 1 A 15300
## 2 A+ 7569
## 3 B 14916
## 4 C 1323
## 5 D 1240
## 6 F 141
## 7 <NA> 4812
# Average number of cases (positiveIncrease) by month2.
avg_month2 <- COVID %>%
  group_by(month2) %>%
  summarise(mean(positiveIncrease))
avg_month2
## # A tibble: 3 x 2
## month2 `mean(positiveIncrease)`
## <chr> <dbl>
## 1 early_month 596.
## 2 end_month 622.
## 3 mid_month 567.
# Average number of cases (positiveIncrease) by weekday.
avg_weekday <- COVID %>%
  group_by(weekday) %>%
  summarise(mean(positiveIncrease))
avg_weekday
## # A tibble: 7 x 2
## weekday `mean(positiveIncrease)`
## <dbl> <dbl>
## 1 1 552.
## 2 2 510.
## 3 3 562.
## 4 4 601.
## 5 5 640.
## 6 6 677.
## 7 7 630.
An example: to calculate the average number of cases (positiveIncrease) by dataQualityGrade in RI, we use:
# Average number of cases (positiveIncrease) by dataQualityGrade in RI
# (example on df). Every step must end with %>% so the filtered rows flow into
# group_by(); without it the chain stops after filter() and group_by() is
# called with no data.
df %>%
  filter(state == "RI") %>%
  group_by(dataQualityGrade) %>%
  summarise(mean(positiveIncrease))
# Total number of new cases (positiveIncrease) by weekday in MA.
new_cases_weekday <- COVID %>%
  filter(state == "MA") %>%
  group_by(weekday) %>%
  summarise(sum(positiveIncrease))
new_cases_weekday
## # A tibble: 7 x 2
## weekday `sum(positiveIncrease)`
## <dbl> <dbl>
## 1 1 17212
## 2 2 17324
## 3 3 15092
## 4 4 10468
## 5 5 21340
## 6 6 21182
## 7 7 20525
# Total number of new cases (positiveIncrease) by month2 in RI and MA.
# Is there any difference between the total number of cases for each
# category of month2?
new_cases_month2 <- COVID %>%
  filter(state %in% c("MA", "RI")) %>%
  group_by(month2) %>%
  summarise(sum(positiveIncrease))
new_cases_month2
## # A tibble: 3 x 2
## month2 `sum(positiveIncrease)`
## <chr> <dbl>
## 1 early_month 45189
## 2 end_month 52557
## 3 mid_month 48071
# Average number of cases (positiveIncrease) across the New England states.
# Vermont's postal abbreviation is "VT"; "VE" matches no state, so %in% would
# silently leave Vermont out of the average.
avg_New_England <- COVID %>%
  filter(state %in% c("MA", "RI", "NH", "ME", "CT", "VT")) %>%
  summarise(mean(positiveIncrease))
avg_New_England
# NOTE(review): the printed value below was recorded with "VE" in the state
# list; re-run to refresh it.
## # A tibble: 1 x 1
## `mean(positiveIncrease)`
## <dbl>
## 1 214.
# Average number of cases (positiveIncrease) across all states outside
# New England.
# `state != c(...)` recycles the six codes element-by-element down the column
# instead of testing set membership, so almost no New England rows are
# removed. Negating %in% excludes every row whose state is in the set.
# Vermont's postal abbreviation is "VT", not "VE".
avg_other_states <- COVID %>%
  filter(!state %in% c("MA", "RI", "NH", "ME", "CT", "VT")) %>%
  summarise(mean(positiveIncrease))
avg_other_states
# NOTE(review): the printed value below was recorded with the recycled `!=`
# comparison; re-run to refresh it.
## # A tibble: 1 x 1
## `mean(positiveIncrease)`
## <dbl>
## 1 600.
# Is the average outside New England larger than the average inside it?
avg_New_England < avg_other_states
## mean(positiveIncrease)
## [1,] TRUE