Load the necessary libraries
library(skimr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Q1. Read in and Inspect the data
# Load the built-in iris data set into the workspace and print
# per-column summary statistics for a first look at the data.
data(list = "iris")
print(summary(iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# There are 5 variables with 150 observations
Q2. Create a new data frame iris1 that contains only the species
virginica and versicolor with sepal lengths longer than 6 cm and sepal
widths longer than 2.5 cm. How many observations and variables are in
the data set?
# Keep only virginica and versicolor flowers whose sepals are longer than
# 6 cm and wider than 2.5 cm. The species restriction is stated explicitly
# instead of relying on the size cut-offs happening to exclude every setosa
# row. No group_by() is used: grouping does not subset rows, and it would
# leave a persistent grouping on every data frame derived from iris1.
iris1 <- iris %>%
  filter(
    Species %in% c("virginica", "versicolor"),
    Sepal.Length > 6,
    Sepal.Width > 2.5
  )
# summary() reports per-column statistics, including the count per species.
summary(iris1)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## Min. :6.100 Min. :2.600 Min. :4.000 Min. :1.20 setosa : 0
## 1st Qu.:6.300 1st Qu.:2.875 1st Qu.:4.775 1st Qu.:1.50 versicolor:17
## Median :6.600 Median :3.000 Median :5.400 Median :1.85 virginica :39
## Mean :6.698 Mean :3.041 Mean :5.359 Mean :1.87
## 3rd Qu.:6.900 3rd Qu.:3.200 3rd Qu.:5.800 3rd Qu.:2.20
## Max. :7.900 Max. :3.800 Max. :6.900 Max. :2.50
# My df has 5 variables with 56 observations
Q3. Create an iris2 data frame from iris1 that contains only the
columns for Species, Sepal.Length, and Sepal.Width. How many
observations and variables are in the data set?
# Narrow iris1 down to the species label and the two sepal measurements,
# then print per-column summary statistics of the result.
iris2 <- select(iris1, Species, Sepal.Length, Sepal.Width)
summary(iris2)
## Species Sepal.Length Sepal.Width
## setosa : 0 Min. :6.100 Min. :2.600
## versicolor:17 1st Qu.:6.300 1st Qu.:2.875
## virginica :39 Median :6.600 Median :3.000
## Mean :6.698 Mean :3.041
## 3rd Qu.:6.900 3rd Qu.:3.200
## Max. :7.900 Max. :3.800
# My df has 3 variables with 56 observations
Q4. Create an iris3 data frame from iris2 that orders the
observations from largest to smallest sepal length. Show the first 6
rows of this data set.
# Sort the rows so the longest sepals come first, then preview the
# first six rows of the sorted data.
iris3 <- arrange(iris2, desc(Sepal.Length))
head(iris3, n = 6L)
## # A tibble: 6 × 3
## # Groups: Species [1]
## Species Sepal.Length Sepal.Width
## <fct> <dbl> <dbl>
## 1 virginica 7.9 3.8
## 2 virginica 7.7 3.8
## 3 virginica 7.7 2.6
## 4 virginica 7.7 2.8
## 5 virginica 7.7 3
## 6 virginica 7.6 3
Q5. Create an iris4 data frame from iris3 that creates a column with
a sepal area (length * width) value for each observation. How many
observations and variables are in the data set?
# Add a Sepal.Area column (length multiplied by width, row by row)
# and preview the first six rows.
iris4 <- mutate(iris3, Sepal.Area = Sepal.Length * Sepal.Width)
head(iris4, n = 6L)
## # A tibble: 6 × 4
## # Groups: Species [1]
## Species Sepal.Length Sepal.Width Sepal.Area
## <fct> <dbl> <dbl> <dbl>
## 1 virginica 7.9 3.8 30.0
## 2 virginica 7.7 3.8 29.3
## 3 virginica 7.7 2.6 20.0
## 4 virginica 7.7 2.8 21.6
## 5 virginica 7.7 3 23.1
## 6 virginica 7.6 3 22.8
# Per-column summary statistics of iris4, including the new Sepal.Area.
print(summary(iris4))
## Species Sepal.Length Sepal.Width Sepal.Area
## setosa : 0 Min. :6.100 Min. :2.600 Min. :15.86
## versicolor:17 1st Qu.:6.300 1st Qu.:2.875 1st Qu.:18.30
## virginica :39 Median :6.600 Median :3.000 Median :20.48
## Mean :6.698 Mean :3.041 Mean :20.40
## 3rd Qu.:6.900 3rd Qu.:3.200 3rd Qu.:21.40
## Max. :7.900 Max. :3.800 Max. :30.02
# my df has 4 variables with 56 observations
Q6. Create iris5 that calculates the average sepal length, the
average sepal width, and the sample size of the entire iris4 data frame
and print iris5
# One-row summary of the whole iris4 data frame: mean sepal length,
# mean sepal width, and the number of rows.
iris5 <- data.frame(
  average.sepal.length = mean(iris4[["Sepal.Length"]]),
  average.sepal.width = mean(iris4[["Sepal.Width"]]),
  sample.size = nrow(iris4)
)
head(iris5, n = 6L)
## average.sepal.length average.sepal.width sample.size
## 1 6.698214 3.041071 56
Q7. Finally, create iris6 that calculates the average sepal length,
the average sepal width, and the sample size for each species in the
iris4 data frame and print iris6.
# Per-species mean sepal length, mean sepal width, and row count.
# Bare column names are used inside summarise() so each mean is computed
# within its species group; writing iris4$Sepal.Length there would bypass
# the grouping and return the overall mean for every species.
iris6 <- iris4 %>%
  group_by(Species) %>%
  summarise(
    average.sepal.length = mean(Sepal.Length),
    average.sepal.width = mean(Sepal.Width),
    sample.size = n()
  )
print(iris6)
## # A tibble: 2 × 4
## Species average.sepal.length average.sepal.width sample.size
## <fct> <dbl> <dbl> <int>
## 1 versicolor 6.70 3.04 17
## 2 virginica 6.70 3.04 39
Create a ‘longer’ data frame using the original iris data set with
three columns named “Species”, “Measure”, “Value”. The column “Species”
will retain the species names of the data set. The column “Measure” will
include whether the value corresponds to Sepal.Length, Sepal.Width,
Petal.Length, or Petal.Width and the column “Value” will include the
numerical values of those measurements.
# Reshape iris to long format: every column except Species is stacked, with
# the original column name stored in "Measure" and its number in "Value".
# The column names follow the capitalisation the task specifies
# ("Species", "Measure", "Value").
iris_longer <- iris %>%
  pivot_longer(
    cols = -Species,
    names_to = "Measure",
    values_to = "Value"
  )
summary(iris_longer)
## Species Measure value
## setosa :200 Length:600 Min. :0.100
## versicolor:200 Class :character 1st Qu.:1.700
## virginica :200 Mode :character Median :3.200
## Mean :3.465
## 3rd Qu.:5.100
## Max. :7.900