Assignment 14

Write the following function. Give an example to test your function.

Input: a data frame
Output: a data frame with all missing values of numeric variables replaced by the corresponding column means.

library(tidyverse)
# Read the Titanic data and drop the Cabin column in a single pipeline.
df <- read_csv('titanic.csv') %>%
  select(-Cabin)

# The insurance data has no missing values; it is only used to exercise the
# plotting functions below.
insurance <- read_csv('insurance.csv')

#' Replace missing values in a numeric vector with the vector's mean.
#'
#' @param x A vector (typically a single data-frame column).
#' @return `x` with every `NA` replaced by the mean of its observed values
#'   when `x` is numeric. Non-numeric vectors, vectors without missing
#'   values, and vectors with no observed values are returned unchanged.
#'   An integer vector that contains missing values is returned as double,
#'   because the mean is generally not a whole number.
impute_mean <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }
  missing <- is.na(x)
  # Nothing to fill, or nothing to average (empty / all-NA column).
  if (!any(missing) || all(missing)) {
    return(x)
  }
  # Base assignment promotes integer to double instead of failing on the cast.
  x[missing] <- mean(x[!missing])
  x
}

#' Mean-impute every numeric column of a data frame.
#'
#' @param x A data frame.
#' @return `x` with each column passed through `impute_mean()`; non-numeric
#'   columns are left as they are.
df_impute <- function(x) {
  # seq_along() is empty for a zero-column data frame, whereas 1:length(x)
  # would iterate over c(1, 0) and fail on the subscript.
  for (i in seq_along(x)) {
    x[[i]] <- impute_mean(x[[i]])
  }
  x
}

# Count the missing values in each column of the raw data.
colSums(is.na(df))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           2

# Apply df_impute() and recount the missing values per column.
df1 <- df_impute(df)
colSums(is.na(df1))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           2

Write the following function. Give an example to test your function.

Input: a data frame
Output: a data frame with all missing values replaced by the corresponding column mean (for numeric variables) or mode (for non-numeric variables).

Hint: Combine the function in Problem 1 and the function in this example

#' Replace missing values in a vector with its mean or its mode.
#'
#' Numeric vectors are filled with the mean of the observed values; all other
#' vectors are filled with the most frequent observed value. Ties for the
#' mode are broken in favour of the value that sorts first.
#'
#' @param x A vector (typically a single data-frame column).
#' @return `x` with every `NA` replaced. The type of a non-numeric vector is
#'   preserved (factors stay factors, logicals stay logical). An integer
#'   vector with missing values is returned as double. Vectors without
#'   missing values, or with no observed values at all, are returned
#'   unchanged.
impute_all <- function(x) {
  missing <- is.na(x)
  # Nothing to fill, or nothing to derive a fill value from.
  if (!any(missing) || all(missing)) {
    return(x)
  }
  observed <- x[!missing]
  if (is.numeric(x)) {
    x[missing] <- mean(observed)
  } else {
    # Count occurrences on the values themselves rather than on the names of
    # a table, so the fill value keeps the column's own type.
    candidates <- sort(unique(observed))
    counts <- tabulate(match(observed, candidates), nbins = length(candidates))
    x[missing] <- candidates[which.max(counts)]
  }
  x
}

#' Impute every column of a data frame.
#'
#' @param x A data frame.
#' @return `x` with each column passed through `impute_all()`.
df_impute_all <- function(x) {
  # seq_along() is empty for a zero-column data frame, whereas 1:length(x)
  # would iterate over c(1, 0) and fail on the subscript.
  for (i in seq_along(x)) {
    x[[i]] <- impute_all(x[[i]])
  }
  x
}

# Count the missing values in each column of the raw data.
colSums(is.na(df))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           2

# Apply df_impute_all() and recount the missing values per column.
df2 <- df_impute_all(df)
colSums(is.na(df2))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           0

Write the following function. Give an example to test your function.

Input: a data frame
Output: Bar plots of all non-numeric variables

Hint: Similar function

#' Print a bar plot for every non-numeric column of a data frame.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for its side effect of printing one
#'   ggplot2 bar chart per non-numeric column, in column order.
bar_plot <- function(d) {
  # seq_along() keeps the loop empty for a zero-column data frame.
  for (i in seq_along(d)) {
    if (!is.numeric(d[[i]])) {
      col <- names(d)[i]
      # Namespace-qualified calls avoid attaching ggplot2 from inside the
      # function; .data[[col]] maps the column by name within the plot data.
      print(
        ggplot2::ggplot(d, ggplot2::aes(x = .data[[col]])) +
          ggplot2::geom_bar() +
          ggplot2::labs(x = col)
      )
    }
  }
  invisible(NULL)
}

# Treat the numerically coded class and survival columns as categorical so
# that bar_plot() picks them up.
df2[c("Pclass", "Survived")] <- lapply(df2[c("Pclass", "Survived")], factor)
bar_plot(df2)

bar_plot(insurance)

Write the following function. Give an example to test your function.

Input: a data frame
Output: all possible bar plots of a non-numeric variable filled by a non-numeric variable.

Hint: Similar function

#' Print filled bar plots for pairs of non-numeric columns.
#'
#' For every pair of non-numeric columns, the earlier column is placed on the
#' x axis and the later column is used as the fill.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for its side effect of printing plots.
#'   Nothing is printed when `d` has fewer than two non-numeric columns.
bar_plot2 <- function(d) {
  cat_idx <- unname(which(!vapply(d, is.numeric, logical(1))))
  n_cat <- length(cat_idx)
  # 1:(n - 1) and (i + 1):n count backwards when fewer than two columns
  # qualify, so stop before building the index ranges.
  if (n_cat < 2L) {
    return(invisible(NULL))
  }
  for (a in seq_len(n_cat - 1L)) {
    for (b in (a + 1L):n_cat) {
      x_col <- names(d)[cat_idx[a]]
      fill_col <- names(d)[cat_idx[b]]
      print(
        ggplot2::ggplot(
          d,
          ggplot2::aes(x = .data[[x_col]], fill = .data[[fill_col]])
        ) +
          ggplot2::geom_bar() +
          ggplot2::labs(x = x_col, fill = fill_col)
      )
    }
  }
  invisible(NULL)
}

# Filled bar plots for the non-numeric column pairs of the Titanic data.
bar_plot2(df)

# The same plots for the insurance data.
bar_plot2(insurance)

Write the following function. Give an example to test your function.

Input: a data frame
Output:
- all possible bar plots of a non-numeric variable filled by a non-numeric variable.
- all possible density plots of a numeric variable colored by a non-numeric variable.
- all possible scatter plots.

Hint: Combine this function, this function, and the function in Question 4. One way to combine them is to create a new function, quick_plot, and call these three functions within quick_plot.

#' Print a scatter plot for every pair of numeric columns.
#'
#' For every pair of numeric columns, the earlier column is placed on the
#' x axis and the later column on the y axis.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for its side effect of printing plots.
#'   Nothing is printed when `d` has fewer than two numeric columns.
scatter_plot <- function(d) {
  num_idx <- unname(which(vapply(d, is.numeric, logical(1))))
  n_num <- length(num_idx)
  # 1:(n - 1) and (i + 1):n count backwards when fewer than two columns
  # qualify, so stop before building the index ranges.
  if (n_num < 2L) {
    return(invisible(NULL))
  }
  for (a in seq_len(n_num - 1L)) {
    for (b in (a + 1L):n_num) {
      x_col <- names(d)[num_idx[a]]
      y_col <- names(d)[num_idx[b]]
      print(
        ggplot2::ggplot(
          d,
          ggplot2::aes(x = .data[[x_col]], y = .data[[y_col]])
        ) +
          ggplot2::geom_point() +
          ggplot2::labs(x = x_col, y = y_col)
      )
    }
  }
  invisible(NULL)
}

#' Print density plots of numeric columns coloured by non-numeric columns.
#'
#' Every numeric column is paired with every non-numeric column, regardless
#' of which of the two comes first in the data frame. Restricting the pairs
#' to "numeric column before non-numeric column" would silently skip
#' combinations such as a numeric column in the last position.
#'
#' @param d A data frame.
#' @return `NULL`, invisibly. Called for its side effect of printing plots.
#'   Nothing is printed when `d` lacks either kind of column.
density_plot2 <- function(d) {
  is_num <- vapply(d, is.numeric, logical(1))
  num_idx <- unname(which(is_num))
  cat_idx <- unname(which(!is_num))
  # Iterating over the index vectors themselves keeps both loops empty when
  # one kind of column is absent.
  for (i in num_idx) {
    for (j in cat_idx) {
      x_col <- names(d)[i]
      color_col <- names(d)[j]
      print(
        ggplot2::ggplot(
          d,
          ggplot2::aes(x = .data[[x_col]], color = .data[[color_col]])
        ) +
          ggplot2::geom_density() +
          ggplot2::labs(x = x_col, color = color_col)
      )
    }
  }
  invisible(NULL)
}

#' Print the scatter, density, and filled bar plots for a data frame.
#'
#' @param d A data frame; it is handed unchanged to `scatter_plot()`,
#'   `density_plot2()`, and `bar_plot2()`, in that order.
#' @return `NULL`, invisibly.
quick_plot <- function(d) {
  plotters <- list(scatter_plot, density_plot2, bar_plot2)
  for (draw in plotters) {
    draw(d)
  }
}

# Run the scatter, density, and filled bar plots on the insurance data.
quick_plot(insurance)

Assignment 14

Kaitlyn Fales

10/28/2020