Input: a data frame
Output: a data frame in which every missing value of a numeric variable is replaced by the mean of that variable.
Hint: Similar function
# Packages ----
library(tidyverse)

# Titanic data, read from disk with the Cabin column dropped ----
df <- select(read_csv('titanic.csv'), -Cabin)

# Insurance data ----
# Per the original author's note this data set has no missing values; it is
# loaded only to exercise the graphing functions.
insurance <- read_csv('insurance.csv')
#' Replace missing values of a numeric vector with the vector's mean
#'
#' Written with base R assignment rather than `tidyr::replace_na()` so that an
#' integer vector with a fractional mean is promoted to double instead of
#' failing on a lossy cast of the replacement value.
#'
#' @param x A vector, typically one column of a data frame.
#' @return `x` unchanged when it is not numeric, has no missing values, or has
#'   no observed values to average. Otherwise `x` with every missing entry set
#'   to the mean of the observed entries (integer input comes back as double).
impute_mean <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }

  missing <- is.na(x)
  # Nothing to fill, or nothing to compute a mean from: leave the data alone
  # rather than writing NaN into it.
  if (!any(missing) || all(missing)) {
    return(x)
  }

  x[missing] <- mean(x[!missing])
  x
}
#' Mean-impute every numeric column of a data frame
#'
#' @param x A data frame (or list of columns).
#' @return `x` with each column passed through `impute_mean()`; non-numeric
#'   columns come back as `impute_mean()` returns them.
df_impute <- function(x) {
  # seq_along() yields an empty sequence for a zero-column input, whereas
  # 1:length(x) would count 1, 0 and fail on x[[1]].
  for (col in seq_along(x)) {
    x[[col]] <- impute_mean(x[[col]])
  }
  x
}
# Count the missing values in each column before imputation.
colSums(is.na(df))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Embarked
## 0 0 0 0 2
# Apply df_impute and recount. In the recorded output below Age drops from
# 177 missing values to 0, while Embarked keeps its 2.
df1 <- df_impute(df)
colSums(is.na(df1))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Embarked
## 0 0 0 0 2
Input: a data frame
Output: a data frame in which every missing value is replaced by the mean of its variable (for numeric variables) or by the mode of its variable (for non-numeric variables).
Hint: Combine the function in Problem 1 and the function in this example
#' Replace missing values with the mean (numeric) or the mode (non-numeric)
#'
#' Uses base R assignment instead of `tidyr::replace_na()`, so the replacement
#' never has to be cast to the type of `x`: integer vectors are promoted to
#' double when the mean is fractional, and the mode keeps the type of `x`
#' (logical, factor, ...) instead of being reduced to a character label.
#'
#' @param x A vector, typically one column of a data frame.
#' @return `x` unchanged when it has no missing values or no observed values.
#'   Otherwise `x` with every missing entry set to the mean of the observed
#'   entries (numeric `x`) or to their most frequent value (any other type).
#'   Ties for the mode go to the value that sorts first, which is the order
#'   `table()` would list them in.
impute_all <- function(x) {
  missing <- is.na(x)
  # Nothing to fill, or nothing to derive a replacement from.
  if (!any(missing) || all(missing)) {
    return(x)
  }

  observed <- x[!missing]
  if (is.numeric(x)) {
    fill <- mean(observed)
  } else {
    # Count occurrences per distinct value without converting to strings, so
    # `fill` has the same type as `x`.
    candidates <- sort(unique(observed))
    counts <- tabulate(match(observed, candidates), nbins = length(candidates))
    fill <- candidates[which.max(counts)]
  }

  x[missing] <- fill
  x
}
#' Impute every column of a data frame
#'
#' @param x A data frame (or list of columns).
#' @return `x` with each column passed through `impute_all()`.
df_impute_all <- function(x) {
  # seq_along() yields an empty sequence for a zero-column input, whereas
  # 1:length(x) would count 1, 0 and fail on x[[1]].
  for (col in seq_along(x)) {
    x[[col]] <- impute_all(x[[col]])
  }
  x
}
# Count the missing values in each column before imputation.
colSums(is.na(df))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Embarked
## 0 0 0 0 2
# Apply df_impute_all and recount. In the recorded output below both Age and
# Embarked end up with 0 missing values.
df2 <- df_impute_all(df)
colSums(is.na(df2))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Embarked
## 0 0 0 0 0
Input: a data frame
Output: Bar plots of all non-numeric variables
Hint: Similar function
#' Print a bar plot for every non-numeric column of a data frame
#'
#' Columns are referenced through the `.data` pronoun rather than by
#' evaluating `d[[i]]` inside `aes()`, so the aesthetic is resolved against the
#' plot's own data. ggplot2 is called with `ggplot2::` and is therefore only
#' needed when there is something to draw.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for the side effect of printing one plot
#'   per non-numeric column, in column order; nothing is printed when `d` has
#'   no such column (including when it has no columns at all).
bar_plot <- function(d) {
  is_categorical <- !vapply(d, is.numeric, logical(1))

  for (col in names(d)[is_categorical]) {
    plot <- ggplot2::ggplot(d, ggplot2::aes(x = .data[[col]])) +
      ggplot2::geom_bar() +
      ggplot2::labs(x = col)
    # Plots are not auto-printed inside a loop, so print explicitly.
    print(plot)
  }

  invisible(NULL)
}
# Convert Pclass and Survived to factors before plotting: bar_plot only draws
# non-numeric columns (presumably both are read in as numeric codes - confirm).
df2$Pclass <- factor(df2$Pclass)
df2$Survived <- factor(df2$Survived)
# Draw the bar plots for the imputed Titanic data and for the insurance data.
bar_plot(df2)
bar_plot(insurance)
Input: a data frame
Output: all possible bar plots of one non-numeric variable filled by another non-numeric variable.
Hint: Similar function
#' Print filled bar plots for every pair of non-numeric columns
#'
#' For each pair of non-numeric columns, taken in column order with the earlier
#' column on the x axis and the later one as the fill, a stacked bar plot is
#' printed. Pairs are enumerated with `combn()` instead of `1:(l - 1)` /
#' `(i + 1):l`, which count backwards and fail when `d` has fewer than two
#' columns.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for the side effect of printing plots;
#'   nothing is printed when `d` has fewer than two non-numeric columns.
bar_plot2 <- function(d) {
  categorical <- names(d)[!vapply(d, is.numeric, logical(1))]
  if (length(categorical) < 2L) {
    return(invisible(NULL))
  }

  for (pair in utils::combn(categorical, 2L, simplify = FALSE)) {
    x_col <- pair[1]
    fill_col <- pair[2]
    plot <- ggplot2::ggplot(
      d,
      ggplot2::aes(x = .data[[x_col]], fill = .data[[fill_col]])
    ) +
      ggplot2::geom_bar() +
      ggplot2::labs(x = x_col, fill = fill_col)
    # Plots are not auto-printed inside a loop, so print explicitly.
    print(plot)
  }

  invisible(NULL)
}
# Draw the pairwise filled bar plots for both data sets. Note that the
# original df is passed here, not the imputed df2.
bar_plot2(df)
bar_plot2(insurance)
Input: a data frame
Output:
all possible bar plots of one non-numeric variable filled by another non-numeric variable.
all possible density plots of a numeric variable colored by a non-numeric variable.
all possible scatter plots.
Hint: Combine this function, this function, and the function in Question 4. One way to combine them is to create a new function, quick_plot, and call these three functions within quick_plot.
#' Print a scatter plot for every pair of numeric columns
#'
#' For each pair of numeric columns, taken in column order with the earlier
#' column on the x axis and the later one on the y axis, a scatter plot is
#' printed. Pairs are enumerated with `combn()` instead of `1:(l - 1)` /
#' `(i + 1):l`, which count backwards and fail when `d` has fewer than two
#' columns.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for the side effect of printing plots;
#'   nothing is printed when `d` has fewer than two numeric columns.
scatter_plot <- function(d) {
  numeric_cols <- names(d)[vapply(d, is.numeric, logical(1))]
  if (length(numeric_cols) < 2L) {
    return(invisible(NULL))
  }

  for (pair in utils::combn(numeric_cols, 2L, simplify = FALSE)) {
    x_col <- pair[1]
    y_col <- pair[2]
    plot <- ggplot2::ggplot(
      d,
      ggplot2::aes(x = .data[[x_col]], y = .data[[y_col]])
    ) +
      ggplot2::geom_point() +
      ggplot2::labs(x = x_col, y = y_col)
    # Plots are not auto-printed inside a loop, so print explicitly.
    print(plot)
  }

  invisible(NULL)
}
#' Print density plots of every numeric column coloured by every non-numeric
#' column
#'
#' Every (numeric, non-numeric) combination is drawn regardless of where the
#' two columns sit in `d`. Restricting the pairs to "numeric column first"
#' would silently skip, for example, a numeric column that comes last in the
#' data frame.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for the side effect of printing plots,
#'   ordered by numeric column and then by non-numeric column; nothing is
#'   printed when `d` lacks either kind of column.
density_plot2 <- function(d) {
  is_num <- vapply(d, is.numeric, logical(1))
  numeric_cols <- names(d)[is_num]
  group_cols <- names(d)[!is_num]

  for (x_col in numeric_cols) {
    for (color_col in group_cols) {
      plot <- ggplot2::ggplot(
        d,
        ggplot2::aes(x = .data[[x_col]], color = .data[[color_col]])
      ) +
        ggplot2::geom_density() +
        ggplot2::labs(x = x_col, color = color_col)
      # Plots are not auto-printed inside a loop, so print explicitly.
      print(plot)
    }
  }

  invisible(NULL)
}
#' Run all three pairwise plotters on one data frame
#'
#' Calls, in this order, `scatter_plot()`, `density_plot2()` and `bar_plot2()`
#' on `d`.
#'
#' @param d A data frame.
#' @return Whatever `bar_plot2(d)` returns.
quick_plot <- function(d) {
  leading_plotters <- list(scatter_plot, density_plot2)
  for (draw in leading_plotters) {
    draw(d)
  }
  bar_plot2(d)
}
# Run the scatter, density and filled-bar plotters on the insurance data.
quick_plot(insurance)