1 R Programming Exercises

This file contains exercises to practice the concepts covered in the R introduction. Each exercise includes questions in comments and solutions in hidden code chunks.

1.1 Note to Students

Try to solve each exercise before looking at the solutions
Copy the questions (the commented lines) into a new R script file and write your solutions there
Practice running the code in RStudio
Experiment with modifying the code to understand how it works
Use the help documentation (?function_name) when you’re unsure about a function

1.2 Exercise 1: Basic Data Types and Vectors

# Question 1: Create a numeric vector with values 10, 20, 30, 40, 50
# Question 2: Create a character vector with your name and two hobbies
# Question 3: Create a logical vector with TRUE, FALSE, TRUE, FALSE
# Question 4: Check the class and length of each vector you created

# Write your code here:

Solution

# Solutions:
# 1. Numeric vector: the multiples of 10 from 10 up to 50
numbers <- seq(from = 10, to = 50, by = 10)

# 2. Character vector: a name followed by two hobbies
personal_info <- c("Your Name", "Reading", "Swimming")

# 3. Logical vector: the pair TRUE, FALSE repeated twice
logical_values <- rep(c(TRUE, FALSE), times = 2)

# 4. For each vector, class() reports the type of its elements and
#    length() reports how many elements it holds
class(numbers)
length(numbers)
class(personal_info)
length(personal_info)
class(logical_values)
length(logical_values)

1.3 Exercise 2: Vector Operations

# Question 1: Create two numeric vectors x = (1, 3, 5, 7, 9) and y = (2, 4, 6, 8, 10)
# Question 2: Calculate x + y, x * y, and x^2
# Question 3: Find the mean, sum, and standard deviation of vector x
# Question 4: Create a sequence from 1 to 20 with step size 2

# Write your code here:

Solution

# Solutions:
# 1. x holds the odd numbers 1 to 9; y holds the even numbers 2 to 10,
#    i.e. every element of x shifted up by one
x <- seq(from = 1, to = 9, by = 2)
y <- x + 1

# 2. Arithmetic operators work element by element on vectors
x + y
x * y
x^2

# 3. Mean, total and sample standard deviation of x
mean(x)
sum(x)
sd(x)

# 4. Sequence starting at 1 in steps of 2; the last value is 19 because
#    the next step (21) would go past the upper limit of 20
seq(from = 1, to = 20, by = 2)

1.4 Exercise 3: Factors and Categorical Data

# Question 1: Create a factor from the vector: c("Low", "Medium", "High", "Low", "Medium")
# Question 2: Create an ordered factor with levels: "Beginner", "Intermediate", "Advanced"
# Question 3: Group ages 18, 25, 30, 35, 40, 45, 50 into three categories: "Young" (<30), "Middle" (30-45), "Senior" (>45)
# Question 4: Create a frequency table of your factor from question 1

# Write your code here:

Solution

# Solutions:
# 1. Create factor
#    factor() stores the distinct values as levels, in sorted order
levels_factor <- factor(c("Low", "Medium", "High", "Low", "Medium"))

# 2. Create ordered factor
#    `levels` fixes the order of the categories and `ordered = TRUE` makes
#    comparisons such as skill_levels < "Advanced" meaningful
skill_levels <- factor(
  c("Beginner", "Intermediate", "Advanced", "Beginner", "Advanced"),
  levels = c("Beginner", "Intermediate", "Advanced"),
  ordered = TRUE
)

# 3. Group ages: "Young" (< 30), "Middle" (30-45), "Senior" (> 45)
#    A single cut() call cannot express these boundaries because it closes
#    every interval on the same side: with the default right = TRUE, breaks
#    at 30 and 45 put age 30 in "Young"; with right = FALSE they put age 45
#    in "Senior". The conditions are therefore written out with ifelse() and
#    the result is turned into a factor with the levels in logical order.
ages <- c(18, 25, 30, 35, 40, 45, 50)
age_groups <- factor(
  ifelse(ages < 30, "Young", ifelse(ages <= 45, "Middle", "Senior")),
  levels = c("Young", "Middle", "Senior")
)

# 4. Frequency table
#    table() counts how many times each level occurs
table(levels_factor)

1.5 Exercise 4: Data Frames and Basic Operations

# Question 1: Create a data frame with columns: Name, Age, City, and Salary
# Question 2: Add a new column called "Age_Group" based on age (<30="Young", 30-50="Middle", >50="Senior")
# Question 3: Find the mean salary by age group
# Question 4: Select only people from a specific city

# Write your code here:

Solution

# Solutions:
# 1. Five employees, one column per attribute
employees <- data.frame(
  Name   = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
  Age    = c(25, 35, 42, 28, 55),
  City   = c("NYC", "LA", "Chicago", "NYC", "Boston"),
  Salary = c(50000, 65000, 75000, 55000, 80000)
)

# 2. Age group column: start with the oldest label for everybody, then
#    overwrite it for ages up to 50 and finally for ages below 30
employees$Age_Group <- "Senior"
employees$Age_Group[employees$Age <= 50] <- "Middle"
employees$Age_Group[employees$Age < 30] <- "Young"

# 3. Average salary within each age group
aggregate(
  Salary ~ Age_Group,
  data = employees,
  FUN = mean
)

# 4. Rows of the employees who live in NYC (all columns are kept)
nyc_employees <- subset(employees, City == "NYC")

1.6 Exercise 5: Data Manipulation with dplyr

# Question 1: Load the dplyr package and create a sample data frame
# Question 2: Use filter() to select rows where age > 30
# Question 3: Use select() to keep only Name and Salary columns
# Question 4: Use mutate() to create a new column "Salary_K" (salary in thousands)
# Question 5: Use group_by() and summarize() to find mean salary by city

# Write your code here:

Solution

# Solutions:
library(dplyr)

# 1. Example data: five people with their age, city and salary
sample_data <- data.frame(
  Name   = c("John", "Jane", "Mike", "Sarah", "Tom"),
  Age    = c(25, 32, 28, 35, 29),
  City   = c("NYC", "LA", "NYC", "Chicago", "LA"),
  Salary = c(45000, 60000, 52000, 70000, 58000)
)

# 2. filter() keeps the rows for which the condition is TRUE
older_employees <- filter(sample_data, Age > 30)

# 3. select() keeps only the listed columns
name_salary <- select(sample_data, Name, Salary)

# 4. mutate() adds a column computed from the existing ones
sample_data_with_k <- mutate(sample_data, Salary_K = Salary / 1000)

# 5. One row per city with its average salary and number of people;
#    .groups = "drop" returns an ungrouped result
city_salary_summary <- sample_data |>
  group_by(City) |>
  summarise(Mean_Salary = mean(Salary), Count = n(), .groups = "drop")

1.7 Exercise 6: Data Reshaping (Wide to Long)

# Question 1: Create a wide format data frame with test scores at three time points
# Question 2: Reshape the data from wide to long format
# Question 3: Calculate the mean score for each time point
# Question 4: Create a new column showing score improvement from Time1 to Time3

# Write your code here:

Solution

# Solutions:
# Both packages are needed: tidyr provides pivot_longer(), while mutate(),
# group_by() and summarize() come from dplyr
library(dplyr)
library(tidyr)

# 1. Create wide format data
#    One row per person, one column per measurement occasion
wide_data <- data.frame(
  ID = 1:5,
  Score_Time1 = c(75, 82, 68, 90, 78),
  Score_Time2 = c(78, 85, 72, 92, 81),
  Score_Time3 = c(82, 88, 76, 95, 85)
)

# 2. Reshape to long format
#    Every Score_* column becomes one row per person; names_prefix strips
#    "Score_Time" from the old column names, leaving "1", "2", "3" in Time
long_data <- wide_data |>
  pivot_longer(
    cols = starts_with("Score_"),
    names_to = "Time",
    names_prefix = "Score_Time",
    values_to = "Score"
  )

# 3. Mean score by time
time_means <- long_data |>
  group_by(Time) |>
  summarize(Mean_Score = mean(Score), .groups = "drop")

# 4. Score improvement
#    Computed on the wide data, where both scores of a person share a row
improvement_data <- wide_data |>
  mutate(Improvement = Score_Time3 - Score_Time1)

1.8 Exercise 7: Basic Statistics and Visualization

# Question 1: Create a numeric vector and calculate basic statistics (mean, median, sd, min, max)
# Question 2: Create a histogram of your data
# Question 3: Create a boxplot comparing two groups
# Question 4: Create a scatter plot with two variables

# Write your code here:

Solution

# Solutions:
# 1. Fifteen sample values and their descriptive statistics
data_vector <- c(
  12, 15, 18, 22, 25,
  28, 30, 32, 35, 38,
  40, 42, 45, 48, 50
)

mean(data_vector)
median(data_vector)
sd(data_vector)
min(data_vector)
max(data_vector)
summary(data_vector) # minimum, quartiles, mean and maximum in one call

# 2. Histogram of the sample values
hist(
  data_vector,
  col = "lightblue",
  xlab = "Values",
  main = "Distribution of Data"
)

# 3. Boxplot of two groups taken from the sample:
#    the five smallest values and five of the larger ones
group1 <- data_vector[1:5]
group2 <- data_vector[9:13]
boxplot(
  group1, group2,
  main = "Comparison of Groups",
  names = c("Group 1", "Group 2")
)

# 4. Scatter plot of each value against its position in the vector
x_values <- seq_along(data_vector)
y_values <- data_vector
plot(
  x_values, y_values,
  pch = 16,
  xlab = "X",
  ylab = "Y",
  main = "Scatter Plot"
)

1.9 Exercise 8: Control Structures and Functions

# Question 1: Write an if-else statement to classify scores as "Pass" (>=60) or "Fail" (<60)
# Question 2: Use a for loop to calculate the square of numbers 1 to 10
# Question 3: Use sapply() to calculate the square root of numbers 1 to 10
# Question 4: Write a function that calculates the area of a circle given the radius

# Write your code here:

Solution

# Solutions:
# 1. Classify a single score: 60 or more passes, anything lower fails.
#    if/else is an expression in R, so its value can be assigned directly
score <- 75
result <- if (score >= 60) "Pass" else "Fail"

# 2. Squares of 1 to 10 with a for loop.
#    The result vector is created at its full length first and then
#    filled in position by position
squares <- numeric(10)
for (i in seq_along(squares)) {
  squares[i] <- i^2
}

# 3. Square roots of 1 to 10: sapply() calls sqrt() on every element
#    and collects the results in a vector
numbers <- 1:10
square_roots <- sapply(X = numbers, FUN = sqrt)

# 4. Function for circle area
#
# circle_area() returns the area of a circle, pi * radius^2.
#   radius: numeric vector of radii; every value must be zero or positive
#           (NA is passed through and yields NA)
#   returns: numeric vector of areas, one per radius
# The function stops with an error when radius is not numeric or contains
# a negative value, instead of silently returning a meaningless area.
circle_area <- function(radius) {
  stopifnot(is.numeric(radius), all(radius >= 0, na.rm = TRUE))
  # The value of the last expression is what the function returns
  pi * radius^2
}

# Test the function
circle_area(5)

1.10 Exercise 9: Working with Missing Data

# Question 1: Create a vector with some NA values
# Question 2: Count how many NA values are in your vector
# Question 3: Remove NA values from your vector
# Question 4: Replace NA values with the mean of non-NA values

# Write your code here:

Solution

# Solutions:
# 1. Ten values, three of which are missing
data_with_na <- c(1, 2, NA, 4, 5, NA, 7, 8, NA, 10)

# 2. is.na() flags the missing entries; summing the flags counts them
na_count <- sum(is.na(data_with_na))

# 3. Two ways to drop the missing entries:
#    logical indexing keeps only the entries that are not NA ...
data_clean2 <- data_with_na[!is.na(data_with_na)]
#    ... and na.omit() does the same in a single call
data_clean <- na.omit(data_with_na)

# 4. Fill every missing entry with the mean of the observed values;
#    na.rm = TRUE is required, otherwise the mean itself would be NA
data_imputed <- replace(
  data_with_na,
  is.na(data_with_na),
  mean(data_with_na, na.rm = TRUE)
)

1.11 Exercise 10: Comprehensive Data Analysis

# Question 1: Create a comprehensive dataset with multiple variables
# Question 2: Perform exploratory data analysis (summary statistics, visualizations)
# Question 3: Create new variables through transformations
# Question 4: Group the data and calculate summary statistics by group

# Write your code here:

Solution

# Solutions:
# group_by(), summarize() and n() come from dplyr
library(dplyr)

# sample() draws random values; fixing the seed makes the simulated data,
# and therefore every result below, the same on each run
set.seed(123)

# 1. Create comprehensive dataset
#    20 simulated respondents; each variable is drawn with replacement
comprehensive_data <- data.frame(
  ID = 1:20,
  Age = sample(18:65, 20, replace = TRUE),
  Gender = sample(c("Male", "Female"), 20, replace = TRUE),
  Education = sample(c("High School", "College", "Graduate"), 20, replace = TRUE),
  Income = sample(30000:100000, 20, replace = TRUE),
  Satisfaction = sample(1:10, 20, replace = TRUE)
)

# 2. Exploratory data analysis
summary(comprehensive_data) # per-column summary statistics
str(comprehensive_data)     # column types and first values

# Visualizations
hist(comprehensive_data$Age, main = "Age Distribution")
boxplot(Income ~ Gender, data = comprehensive_data, main = "Income by Gender")

# 3. Create new variables
#    cut() closes intervals on the right by default, so the groups are
#    (0, 30], (30, 50] and (50, 100]
comprehensive_data$Age_Group <- cut(
  comprehensive_data$Age,
  breaks = c(0, 30, 50, 100),
  labels = c("Young", "Middle", "Senior")
)

#    Income expressed in thousands
comprehensive_data$Income_K <- comprehensive_data$Income / 1000

# 4. Group analysis
#    One row per Gender/Education combination that occurs in the data
group_summary <- comprehensive_data |>
  group_by(Gender, Education) |>
  summarize(
    Mean_Income = mean(Income),
    Mean_Satisfaction = mean(Satisfaction),
    Count = n(),
    .groups = "drop"
  )

---
title: "R Programming Exercises"
execute:
  eval: false
format:
  html:
    toc: true
    toc_float: true
    toc_depth: 2
    number-sections: true
    code-tools: true
    code-summary: "Solution"
---

# R Programming Exercises

This file contains exercises to practice the concepts covered in the R introduction. Each exercise includes questions in comments and solutions in hidden code chunks.

## Note to Students

- Try to solve each exercise before looking at the solutions
- Copy the Questions (the commented lines) into a new R script file, and write your solutions in that file
- Practice running the code in RStudio
- Experiment with modifying the code to understand how it works
- Use the help documentation (?function_name) when you're unsure about a function

## Exercise 1: Basic Data Types and Vectors

```{r}
#| eval: false
# Question 1: Create a numeric vector with values 10, 20, 30, 40, 50
# Question 2: Create a character vector with your name and two hobbies
# Question 3: Create a logical vector with TRUE, FALSE, TRUE, FALSE
# Question 4: Check the class and length of each vector you created

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# 1. Create numeric vector
numbers <- c(10, 20, 30, 40, 50)

# 2. Create character vector
personal_info <- c("Your Name", "Reading", "Swimming")

# 3. Create logical vector
logical_values <- c(TRUE, FALSE, TRUE, FALSE)

# 4. Check properties
class(numbers)
length(numbers)
class(personal_info)
length(personal_info)
class(logical_values)
length(logical_values)
```

## Exercise 2: Vector Operations

```{r}
#| eval: false
# Question 1: Create two numeric vectors x = (1, 3, 5, 7, 9) and y = (2, 4, 6, 8, 10)
# Question 2: Calculate x + y, x * y, and x^2
# Question 3: Find the mean, sum, and standard deviation of vector x
# Question 4: Create a sequence from 1 to 20 with step size 2

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# 1. Create vectors
x <- c(1, 3, 5, 7, 9)
y <- c(2, 4, 6, 8, 10)

# 2. Vector operations
x + y
x * y
x^2

# 3. Summary statistics
mean(x)
sum(x)
sd(x)

# 4. Sequence
seq(1, 20, by = 2)
```

## Exercise 3: Factors and Categorical Data

```{r}
#| eval: false
# Question 1: Create a factor from the vector: c("Low", "Medium", "High", "Low", "Medium")
# Question 2: Create an ordered factor with levels: "Beginner", "Intermediate", "Advanced"
# Question 3: Group ages 18, 25, 30, 35, 40, 45, 50 into three categories: "Young" (<30), "Middle" (30-45), "Senior" (>45)
# Question 4: Create a frequency table of your factor from question 1

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# 1. Create factor
levels_factor <- factor(c("Low", "Medium", "High", "Low", "Medium"))

# 2. Create ordered factor
skill_levels <- factor(
  c("Beginner", "Intermediate", "Advanced", "Beginner", "Advanced"),
  levels = c("Beginner", "Intermediate", "Advanced"),
  ordered = TRUE
)

# 3. Group ages
#    A single cut() call closes every interval on the same side, so it cannot
#    give both "30 is Middle" and "45 is Middle"; the conditions are written
#    out with ifelse() and converted to a factor with the levels in order
ages <- c(18, 25, 30, 35, 40, 45, 50)
age_groups <- factor(
  ifelse(ages < 30, "Young", ifelse(ages <= 45, "Middle", "Senior")),
  levels = c("Young", "Middle", "Senior")
)

# 4. Frequency table
table(levels_factor)
```

## Exercise 4: Data Frames and Basic Operations

```{r}
#| eval: false
# Question 1: Create a data frame with columns: Name, Age, City, and Salary
# Question 2: Add a new column called "Age_Group" based on age (<30="Young", 30-50="Middle", >50="Senior")
# Question 3: Find the mean salary by age group
# Question 4: Select only people from a specific city

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# 1. Create data frame
employees <- data.frame(
  Name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
  Age = c(25, 35, 42, 28, 55),
  City = c("NYC", "LA", "Chicago", "NYC", "Boston"),
  Salary = c(50000, 65000, 75000, 55000, 80000)
)

# 2. Add age group column
employees$Age_Group <- ifelse(employees$Age < 30, "Young",
                             ifelse(employees$Age <= 50, "Middle", "Senior"))

# 3. Mean salary by age group
aggregate(Salary ~ Age_Group, data = employees, FUN = mean)

# 4. Select NYC employees
nyc_employees <- employees[employees$City == "NYC", ]
```

## Exercise 5: Data Manipulation with dplyr

```{r}
#| eval: false
# Question 1: Load the dplyr package and create a sample data frame
# Question 2: Use filter() to select rows where age > 30
# Question 3: Use select() to keep only Name and Salary columns
# Question 4: Use mutate() to create a new column "Salary_K" (salary in thousands)
# Question 5: Use group_by() and summarize() to find mean salary by city

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
library(dplyr)

# 1. Create sample data
sample_data <- data.frame(
  Name = c("John", "Jane", "Mike", "Sarah", "Tom"),
  Age = c(25, 32, 28, 35, 29),
  City = c("NYC", "LA", "NYC", "Chicago", "LA"),
  Salary = c(45000, 60000, 52000, 70000, 58000)
)

# 2. Filter age > 30
older_employees <- sample_data |>
  filter(Age > 30)

# 3. Select specific columns
name_salary <- sample_data |>
  select(Name, Salary)

# 4. Create new column
sample_data_with_k <- sample_data |>
  mutate(Salary_K = Salary / 1000)

# 5. Group and summarize
city_salary_summary <- sample_data |>
  group_by(City) |>
  summarize(
    Mean_Salary = mean(Salary),
    Count = n(),
    .groups = "drop"
  )
```

## Exercise 6: Data Reshaping (Wide to Long)

```{r}
#| eval: false
# Question 1: Create a wide format data frame with test scores at three time points
# Question 2: Reshape the data from wide to long format
# Question 3: Calculate the mean score for each time point
# Question 4: Create a new column showing score improvement from Time1 to Time3

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# tidyr provides pivot_longer(); mutate(), group_by() and summarize()
# come from dplyr
library(dplyr)
library(tidyr)

# 1. Create wide format data
wide_data <- data.frame(
  ID = 1:5,
  Score_Time1 = c(75, 82, 68, 90, 78),
  Score_Time2 = c(78, 85, 72, 92, 81),
  Score_Time3 = c(82, 88, 76, 95, 85)
)

# 2. Reshape to long format
#    names_prefix strips "Score_Time" from the old column names,
#    leaving "1", "2", "3" in Time
long_data <- wide_data |>
  pivot_longer(
    cols = starts_with("Score_"),
    names_to = "Time",
    names_prefix = "Score_Time",
    values_to = "Score"
  )

# 3. Mean score by time
time_means <- long_data |>
  group_by(Time) |>
  summarize(Mean_Score = mean(Score), .groups = "drop")

# 4. Score improvement
improvement_data <- wide_data |>
  mutate(Improvement = Score_Time3 - Score_Time1)
```

## Exercise 7: Basic Statistics and Visualization

```{r}
#| eval: false
# Question 1: Create a numeric vector and calculate basic statistics (mean, median, sd, min, max)
# Question 2: Create a histogram of your data
# Question 3: Create a boxplot comparing two groups
# Question 4: Create a scatter plot with two variables

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# 1. Create data and calculate statistics
data_vector <- c(12, 15, 18, 22, 25, 28, 30, 32, 35, 38, 40, 42, 45, 48, 50)

mean(data_vector)
median(data_vector)
sd(data_vector)
min(data_vector)
max(data_vector)
summary(data_vector)

# 2. Histogram
hist(data_vector, main = "Distribution of Data", xlab = "Values", col = "lightblue")

# 3. Boxplot (create two groups)
group1 <- c(12, 15, 18, 22, 25)
group2 <- c(35, 38, 40, 42, 45)
boxplot(group1, group2, names = c("Group 1", "Group 2"), main = "Comparison of Groups")

# 4. Scatter plot
x_values <- 1:15
y_values <- data_vector
plot(x_values, y_values, main = "Scatter Plot", xlab = "X", ylab = "Y", pch = 16)
```

## Exercise 8: Control Structures and Functions

```{r}
#| eval: false
# Question 1: Write an if-else statement to classify scores as "Pass" (>=60) or "Fail" (<60)
# Question 2: Use a for loop to calculate the square of numbers 1 to 10
# Question 3: Use sapply() to calculate the square root of numbers 1 to 10
# Question 4: Write a function that calculates the area of a circle given the radius

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# 1. If-else for score classification
score <- 75
if (score >= 60) {
  result <- "Pass"
} else {
  result <- "Fail"
}

# 2. For loop for squares
squares <- numeric(10)
for (i in 1:10) {
  squares[i] <- i^2
}

# 3. sapply for square roots
numbers <- 1:10
square_roots <- sapply(numbers, sqrt)

# 4. Function for circle area
#    Stops with an error for non-numeric or negative radii; the value of
#    the last expression is what the function returns
circle_area <- function(radius) {
  stopifnot(is.numeric(radius), all(radius >= 0, na.rm = TRUE))
  pi * radius^2
}

# Test the function
circle_area(5)
```

## Exercise 9: Working with Missing Data

```{r}
#| eval: false
# Question 1: Create a vector with some NA values
# Question 2: Count how many NA values are in your vector
# Question 3: Remove NA values from your vector
# Question 4: Replace NA values with the mean of non-NA values

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# 1. Create vector with NA
data_with_na <- c(1, 2, NA, 4, 5, NA, 7, 8, NA, 10)

# 2. Count NA values
na_count <- sum(is.na(data_with_na))

# 3. Remove NA values
data_clean <- na.omit(data_with_na)
# or
data_clean2 <- data_with_na[!is.na(data_with_na)]

# 4. Replace NA with mean
data_imputed <- data_with_na
data_imputed[is.na(data_imputed)] <- mean(data_with_na, na.rm = TRUE)
```

## Exercise 10: Comprehensive Data Analysis

```{r}
#| eval: false
# Question 1: Create a comprehensive dataset with multiple variables
# Question 2: Perform exploratory data analysis (summary statistics, visualizations)
# Question 3: Create new variables through transformations
# Question 4: Group the data and calculate summary statistics by group

# Write your code here:

```

```{r}
#| code-fold: true
#| eval: false
# Solutions:
# group_by(), summarize() and n() come from dplyr
library(dplyr)

# Fix the seed so the randomly drawn data is the same on every run
set.seed(123)

# 1. Create comprehensive dataset
comprehensive_data <- data.frame(
  ID = 1:20,
  Age = sample(18:65, 20, replace = TRUE),
  Gender = sample(c("Male", "Female"), 20, replace = TRUE),
  Education = sample(c("High School", "College", "Graduate"), 20, replace = TRUE),
  Income = sample(30000:100000, 20, replace = TRUE),
  Satisfaction = sample(1:10, 20, replace = TRUE)
)

# 2. Exploratory data analysis
summary(comprehensive_data)
str(comprehensive_data)

# Visualizations
hist(comprehensive_data$Age, main = "Age Distribution")
boxplot(Income ~ Gender, data = comprehensive_data, main = "Income by Gender")

# 3. Create new variables
comprehensive_data$Age_Group <- cut(comprehensive_data$Age,
                                   breaks = c(0, 30, 50, 100),
                                   labels = c("Young", "Middle", "Senior"))

comprehensive_data$Income_K <- comprehensive_data$Income / 1000

# 4. Group analysis
group_summary <- comprehensive_data |>
  group_by(Gender, Education) |>
  summarize(
    Mean_Income = mean(Income),
    Mean_Satisfaction = mean(Satisfaction),
    Count = n(),
    .groups = "drop"
  )
```

------------------------------------------------------------------------