# Use read.csv() to import the COVID-19 data from the WHO (link).
# NOTE(review): the first column comes back named `ï..Date_reported` (see the
# str() output below); this looks like a byte-order-mark artifact -- confirm
# whether fileEncoding = "UTF-8-BOM" should be used so the column is simply
# `Date_reported`. Every later statement refers to the mangled name, so it is
# left as-is here.
# NOTE(review): the 173 missing Country_code values reported below are
# presumably a literal "NA" country code being parsed as a missing value by
# the default na.strings -- verify against the raw file.
WHOdf <- read.csv("WHO-COVID-19-global-data.csv", header = TRUE)
# Show the column types and a preview of the values of the imported data.
str(WHOdf)
## 'data.frame': 39001 obs. of 8 variables:
## $ ï..Date_reported : chr "2/24/2020" "2/25/2020" "2/26/2020" "2/27/2020" ...
## $ Country_code : chr "AF" "AF" "AF" "AF" ...
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ WHO_region : chr "EMRO" "EMRO" "EMRO" "EMRO" ...
## $ New_cases : int 5 0 0 0 0 0 0 0 0 0 ...
## $ Cumulative_cases : int 5 5 5 5 5 5 5 5 5 5 ...
## $ New_deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Cumulative_deaths: int 0 0 0 0 0 0 0 0 0 0 ...
# Number of rows, then number of columns.
c(nrow(WHOdf), ncol(WHOdf))
## [1] 39001 8
# Total count of missing values over the whole data frame.
sum(colSums(is.na(WHOdf)))
## [1] 173
# Missing values broken down by column.
vapply(WHOdf, function(column) sum(is.na(column)), numeric(1))
## ï..Date_reported Country_code Country WHO_region
## 0 173 0 0
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## 0 0 0 0
# The reporting date is still stored as text at this point.
class(WHOdf[["ï..Date_reported"]])
## [1] "character"
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Parse the month/day/year strings with lubridate and overwrite the column,
# then confirm that it now holds Date values.
WHOdf[["ï..Date_reported"]] <- mdy(WHOdf[["ï..Date_reported"]])
class(WHOdf[["ï..Date_reported"]])
## [1] "Date"
# Mean of the New_cases column over every row of the data.
# (Wrapping an assignment in parentheses also prints the assigned value.)
(daily_new_cases <- mean(WHOdf[["New_cases"]]))
## [1] 656.4618
# Mean of the Cumulative_cases column over every row of the data.
(daily_cumulative_cases <- mean(WHOdf[["Cumulative_cases"]]))
## [1] 40414.05
# Largest single New_cases value in the data.
(max_cases <- max(WHOdf[["New_cases"]]))
## [1] 78761
# Use which.max() to find the index of the row that contains the maximum
# number of new cases, then use that row index to look up the date and country.
which_max <- which.max(WHOdf$New_cases)
which_max
## [1] 16888
# Index with the computed position rather than a hard-coded row number, so the
# lookup stays correct if the data file is refreshed.
WHOdf[which_max, 1:3]
## ï..Date_reported Country_code Country
## 16888 2020-08-30 IN India
# Count how many rows are recorded under each WHO region label.
table(WHOdf[["WHO_region"]])
##
## AFRO AMRO EMRO EURO Other SEARO WPRO
## 8219 9425 4056 11397 211 1950 3743
# The table shows 7 region labels (including "Other").
# Keep only the rows for the European region and average their new cases.
WHOdf_Europe <- WHOdf[WHOdf[["WHO_region"]] == "EURO", ]
(average_Europe <- mean(WHOdf_Europe[["New_cases"]]))
## [1] 378.2812
# Mean of New_cases computed separately for each WHO region. The EURO value
# printed here matches the average_Europe figure computed above. The group
# headers in the printed result echo the grouping expression as written.
by(WHOdf$New_cases, WHOdf$WHO_region, mean)
## WHOdf$WHO_region: AFRO
## [1] 129.1473
## ------------------------------------------------------------
## WHOdf$WHO_region: AMRO
## [1] 1429.151
## ------------------------------------------------------------
## WHOdf$WHO_region: EMRO
## [1] 478.1075
## ------------------------------------------------------------
## WHOdf$WHO_region: EURO
## [1] 378.2812
## ------------------------------------------------------------
## WHOdf$WHO_region: Other
## [1] 3.511848
## ------------------------------------------------------------
## WHOdf$WHO_region: SEARO
## [1] 2214.503
## ------------------------------------------------------------
## WHOdf$WHO_region: WPRO
## [1] 134.1061
# Average new cases in the United States over the five most recent reported
# days. The window is anchored to the latest date present in the data instead
# of Sys.Date(), so the result no longer depends on when the script is run
# (the earlier version needed a hand-tuned 10-day offset to make up for the
# gap between the system date and the last reported date).
WHOdf_US <- WHOdf[WHOdf$Country == "United States of America", ]
last_reported <- max(WHOdf_US$ï..Date_reported, na.rm = TRUE)
# Differences of 0 to 4 days from the latest report select the last five days.
WHOdf_US <- WHOdf_US[last_reported - WHOdf_US$ï..Date_reported < 5, ]
mean(WHOdf_US$New_cases)
## [1] 40611
# Number each reporting date by its day of the week. week_start = 7 pins
# lubridate's default numbering (1 = Sunday, ..., 7 = Saturday) so that a
# user-level `lubridate.week.start` option cannot silently shift the labels.
WHOdf$weekday <- wday(WHOdf$ï..Date_reported, week_start = 7)
# Restrict to the SEARO region and total the new cases for each weekday.
WHOdf_SEARO <- WHOdf[WHOdf$WHO_region == "SEARO", ]
by(WHOdf_SEARO$New_cases,WHOdf_SEARO$weekday,sum)
## WHOdf_SEARO$weekday: 1
## [1] 635936
## ------------------------------------------------------------
## WHOdf_SEARO$weekday: 2
## [1] 617453
## ------------------------------------------------------------
## WHOdf_SEARO$weekday: 3
## [1] 587327
## ------------------------------------------------------------
## WHOdf_SEARO$weekday: 4
## [1] 633744
## ------------------------------------------------------------
## WHOdf_SEARO$weekday: 5
## [1] 597792
## ------------------------------------------------------------
## WHOdf_SEARO$weekday: 6
## [1] 620520
## ------------------------------------------------------------
## WHOdf_SEARO$weekday: 7
## [1] 625509
# Day 1 (Sunday) has the highest total number of new cases in the SEARO region.