1. Install tidyverse package

An R package can be installed with the install.packages function. Install tidyverse if you have not already done so.

# Install tidyverse only when it is not already available, so that re-running
# this script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

2. Read the data using read_csv

Use the read_csv function to import the US Covid-19 data from the link. Don't forget to load tidyverse (library(tidyverse)) so that you can use read_csv.

# Attach the tidyverse so that read_csv is available, then download the
# all-states-history CSV and keep it as COVID.
library("tidyverse")
COVID <- read_csv(
  file = "https://covidtracking.com/data/download/all-states-history.csv"
)

3. Fix the date and create some new variables

library(lubridate)

# All columns are derived on COVID, the data frame loaded in step 2.
# mutate() evaluates its arguments in order, so the helpers below see the
# already-parsed `date` column.
COVID <- COVID %>%
  mutate(
    # parse the date column with lubridate's year-month-day parser
    date = ymd(date),
    # month of the year
    month = month(date),
    # day of the week (numeric; 1 = Sunday under lubridate's default settings)
    weekday = wday(date),
    # day of the month
    monthday = mday(date)
  )

4. Create new variables with case_when.

The function case_when is a good option for creating a new variable from an existing variable. For example, the code below creates a new variable, daily_death, from the deathIncrease variable. deathIncrease is the number of new daily deaths from Covid-19. The new variable daily_death takes three values: low (if deathIncrease is less than 3), medium (deathIncrease from 3 to 14), and high (deathIncrease more than 14).

# Bucket the daily death count (deathIncrease) into three levels. Conditions
# are checked top to bottom, so "medium" only sees values that are already >= 3.
# NOTE(review): rows where deathIncrease is NA satisfy neither comparison and
# fall through to the TRUE branch, i.e. they are labelled "high" - confirm that
# this is intended.
COVID$daily_death <- case_when(
  COVID$deathIncrease < 3 ~ "low",
  COVID$deathIncrease <= 14 ~ "medium",
  TRUE ~ "high"
)

# Split each month into thirds by day of month: up to the 10th, the 11th to
# the 20th, and everything after the 20th.
COVID$month2 <- case_when(
  COVID$monthday <= 10 ~ "early_month",
  COVID$monthday <= 20 ~ "mid_month",
  TRUE ~ "end_month"
)

5. Select function

Use the select function to deselect the column totalTestsViral from the data.

# Drop the totalTestsViral column; every other column is kept.
COVID <- COVID %>%
  select(-totalTestsViral)

6. Pipe Operator ( %>% )

The pipe operator offers another way to write R code. In many cases it makes the code more readable. The pipe works very well with all the tidyverse packages. Refer to these slides (slides 15, 16, 17 and 18) to rewrite the code below using the pipe operator.

x <- 1:10

# Nested-call versions ----

# square root of x
sqrt(x)

# sum of the square roots
sum(sqrt(x))

# natural log of that sum
log(sum(sqrt(x)))

# log base 2 of 16
log(16, base = 2)

# The same four computations written with the pipe ----

x %>% sqrt()
##  [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427
##  [9] 3.000000 3.162278
x %>% sqrt() %>% sum()
## [1] 22.46828
x %>% sqrt() %>% sum() %>% log()
## [1] 3.112104
16 %>% log(base = 2)
## [1] 4

7. Combo 1: group_by + summarise

This combo is used when you want to apply a function/calculation to different groups of the data. For example, to calculate the average number of cases (positiveIncrease) by dataQualityGrade, we use:

# Mean of daily new cases (positiveIncrease) within each data-quality grade.
# Runs on COVID, the data frame loaded in step 2.
COVID %>%
  group_by(dataQualityGrade) %>%
  summarise(mean(positiveIncrease))
# NOTE: the result names `median` and `max` are also the names of base R
# functions. Calls such as max(x) still resolve to the functions, but the
# names are easy to confuse.

# Median daily new cases for each data-quality grade.
median <- group_by(COVID, dataQualityGrade) %>%
  summarize(median(positiveIncrease))
print(median)
## # A tibble: 7 x 2
##   dataQualityGrade `median(positiveIncrease)`
##   <chr>                                 <dbl>
## 1 A                                       360
## 2 A+                                      430
## 3 B                                       213
## 4 C                                        25
## 5 D                                        11
## 6 F                                         0
## 7 <NA>                                      2

# Largest single-day case count for each data-quality grade.
max <- group_by(COVID, dataQualityGrade) %>%
  summarize(max(positiveIncrease))
print(max)
## # A tibble: 7 x 2
##   dataQualityGrade `max(positiveIncrease)`
##   <chr>                              <dbl>
## 1 A                                  15300
## 2 A+                                  7569
## 3 B                                  14916
## 4 C                                   1323
## 5 D                                   1240
## 6 F                                    141
## 7 <NA>                                4812

# Mean daily new cases for each third of the month (month2).
avg_month2 <- group_by(COVID, month2) %>%
  summarize(mean(positiveIncrease))
print(avg_month2)
## # A tibble: 3 x 2
##   month2      `mean(positiveIncrease)`
##   <chr>                          <dbl>
## 1 early_month                     596.
## 2 end_month                       622.
## 3 mid_month                       567.

# Mean daily new cases for each day of the week.
avg_weekday <- group_by(COVID, weekday) %>%
  summarize(mean(positiveIncrease))
print(avg_weekday)
## # A tibble: 7 x 2
##   weekday `mean(positiveIncrease)`
##     <dbl>                    <dbl>
## 1       1                     552.
## 2       2                     510.
## 3       3                     562.
## 4       4                     601.
## 5       5                     640.
## 6       6                     677.
## 7       7                     630.

8. Combo 2: filter + group_by + summarise

An example: to calculate the average number of cases (positiveIncrease) by dataQualityGrade in RI, we use:

# Mean of daily new cases by data-quality grade, restricted to Rhode Island.
# Every step must end in %>% so that the filtered rows flow into group_by();
# without it the chain stops after filter() and group_by() receives no data.
COVID %>%
  filter(state == "RI") %>%
  group_by(dataQualityGrade) %>%
  summarise(mean(positiveIncrease))
# Total new cases in Massachusetts, broken down by day of the week.
new_cases_weekday <- filter(COVID, state == "MA") %>%
  group_by(weekday) %>%
  summarize(sum(positiveIncrease))
print(new_cases_weekday)
## # A tibble: 7 x 2
##   weekday `sum(positiveIncrease)`
##     <dbl>                   <dbl>
## 1       1                   17212
## 2       2                   17324
## 3       3                   15092
## 4       4                   10468
## 5       5                   21340
## 6       6                   21182
## 7       7                   20525

# Total new cases in Massachusetts and Rhode Island combined, broken down by
# third of the month (month2).
new_cases_month2 <- filter(COVID, state %in% c("MA", "RI")) %>%
  group_by(month2) %>%
  summarize(sum(positiveIncrease))
print(new_cases_month2)
## # A tibble: 3 x 2
##   month2      `sum(positiveIncrease)`
##   <chr>                         <dbl>
## 1 early_month                   45189
## 2 end_month                     52557
## 3 mid_month                     48071
# Postal codes of the six New England states. Vermont is "VT"; "VE" is not a
# US state code, so using it leaves Vermont out of the New England group.
new_england <- c("MA", "RI", "NH", "ME", "CT", "VT")

# NOTE: every "##" output in this block was recorded before the state code and
# the exclusion filter were corrected; re-run the block to refresh the numbers.

# Mean daily new cases across the New England states.
avg_New_England <- COVID %>%
  filter(state %in% new_england) %>%
  summarise(mean(positiveIncrease))
avg_New_England
## # A tibble: 1 x 1
##   `mean(positiveIncrease)`
##                      <dbl>
## 1                     214.

# Mean daily new cases across all other states. Exclusion must be a
# set-membership test: `state != c(...)` recycles the six codes down the rows
# and compares position by position, which does not remove the New England rows.
avg_other_states <- COVID %>%
  filter(!(state %in% new_england)) %>%
  summarise(mean(positiveIncrease))
avg_other_states
## # A tibble: 1 x 1
##   `mean(positiveIncrease)`
##                      <dbl>
## 1                     600.

# Is the average outside New England higher than inside it?
avg_other_states > avg_New_England
##      mean(positiveIncrease)
## [1,]                   TRUE