Load the necessary libraries

library(skimr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package 'purrr' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Q1. Read in and Inspect the data

# Bring the built-in iris data set into the workspace, then show
# per-column summary statistics for it.
data(iris)

summary(object = iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# There are 5 variables with 150 observations

Q2. Create a new data frame iris1 that contains only the species virginica and versicolor with sepal lengths longer than 6 cm and sepal widths longer than 2.5 cm. How many observations and variables are in the data set?

# Q2: keep only virginica and versicolor flowers whose sepals are longer than
# 6 cm and wider than 2.5 cm. The species condition is stated explicitly
# instead of relying on the fact that no setosa flower in iris passes the
# size thresholds (setosa has a count of 0 in the summary either way).
iris1 <- iris %>%
  # Grouping does not change these row-wise conditions; it is kept so that
  # iris1 remains a tibble grouped by Species, as before.
  group_by(Species) %>%
  filter(
    Species %in% c("virginica", "versicolor"),
    Sepal.Length > 6,
    Sepal.Width > 2.5
  )

# Column summaries; the Species counts add up to the number of rows.
summary(iris1)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width         Species  
##  Min.   :6.100   Min.   :2.600   Min.   :4.000   Min.   :1.20   setosa    : 0  
##  1st Qu.:6.300   1st Qu.:2.875   1st Qu.:4.775   1st Qu.:1.50   versicolor:17  
##  Median :6.600   Median :3.000   Median :5.400   Median :1.85   virginica :39  
##  Mean   :6.698   Mean   :3.041   Mean   :5.359   Mean   :1.87                  
##  3rd Qu.:6.900   3rd Qu.:3.200   3rd Qu.:5.800   3rd Qu.:2.20                  
##  Max.   :7.900   Max.   :3.800   Max.   :6.900   Max.   :2.50

# My df has 5 variables with 56 observations

Q3. Create an iris2 data frame from iris1 that contains only the columns for Species, Sepal.Length, and Sepal.Width. How many observations and variables are in the data set?

# Q3: reduce iris1 to the species label and the two sepal measurements.
iris2 <- select(iris1, Species, Sepal.Length, Sepal.Width)

# Column summaries for the three retained variables.
summary(iris2)

##        Species    Sepal.Length    Sepal.Width   
##  setosa    : 0   Min.   :6.100   Min.   :2.600  
##  versicolor:17   1st Qu.:6.300   1st Qu.:2.875  
##  virginica :39   Median :6.600   Median :3.000  
##                  Mean   :6.698   Mean   :3.041  
##                  3rd Qu.:6.900   3rd Qu.:3.200  
##                  Max.   :7.900   Max.   :3.800

# My df has 3 variables with 56 observations

Q4. Create an iris3 data frame from iris2 that orders the observations from largest to smallest sepal length. Show the first 6 rows of this data set.

# Q4: sort the rows of iris2 from the longest to the shortest sepal.
iris3 <- arrange(iris2, desc(Sepal.Length))

# Show the first six rows of the sorted data.
head(iris3)

## # A tibble: 6 × 3
## # Groups:   Species [1]
##   Species   Sepal.Length Sepal.Width
##   <fct>            <dbl>       <dbl>
## 1 virginica          7.9         3.8
## 2 virginica          7.7         3.8
## 3 virginica          7.7         2.6
## 4 virginica          7.7         2.8
## 5 virginica          7.7         3  
## 6 virginica          7.6         3

Q5. Create an iris4 data frame from iris3 that creates a column with a sepal area (length * width) value for each observation. How many observations and variables are in the data set?

# Q5: append Sepal.Area, the product of sepal length and sepal width,
# computed for every row of iris3.
iris4 <- mutate(iris3, Sepal.Area = Sepal.Length * Sepal.Width)

# Show the first six rows, including the new column.
head(iris4)

## # A tibble: 6 × 4
## # Groups:   Species [1]
##   Species   Sepal.Length Sepal.Width Sepal.Area
##   <fct>            <dbl>       <dbl>      <dbl>
## 1 virginica          7.9         3.8       30.0
## 2 virginica          7.7         3.8       29.3
## 3 virginica          7.7         2.6       20.0
## 4 virginica          7.7         2.8       21.6
## 5 virginica          7.7         3         23.1
## 6 virginica          7.6         3         22.8

# Column summaries for iris4, now including the Sepal.Area column.
summary(iris4)

##        Species    Sepal.Length    Sepal.Width      Sepal.Area   
##  setosa    : 0   Min.   :6.100   Min.   :2.600   Min.   :15.86  
##  versicolor:17   1st Qu.:6.300   1st Qu.:2.875   1st Qu.:18.30  
##  virginica :39   Median :6.600   Median :3.000   Median :20.48  
##                  Mean   :6.698   Mean   :3.041   Mean   :20.40  
##                  3rd Qu.:6.900   3rd Qu.:3.200   3rd Qu.:21.40  
##                  Max.   :7.900   Max.   :3.800   Max.   :30.02

# my df has 4 variables with 56 observations

Q6. Create iris5 that calculates the average sepal length, the average sepal width, and the sample size of the entire iris4 data frame and print iris5

# Q6: a one-row data frame with the overall mean sepal length, the overall
# mean sepal width, and the number of rows in iris4.
iris5 <- with(
  iris4,
  data.frame(
    average.sepal.length = mean(Sepal.Length),
    average.sepal.width = mean(Sepal.Width),
    sample.size = nrow(iris4)
  )
)
head(iris5)

##   average.sepal.length average.sepal.width sample.size
## 1             6.698214            3.041071          56

Q7. Finally, create iris6 that calculates the average sepal length, the average sepal width, and the sample size for each species in the iris4 data frame and print iris6.

# Q7: per-species average sepal length, average sepal width, and sample size.
# The columns are referenced by bare name inside summarise() so that each
# mean is computed within its Species group. Writing `iris4$Sepal.Length`
# instead pulls the whole column and repeats the overall mean in every row.
iris6 <- iris4 %>%
  group_by(Species) %>%
  summarise(
    average.sepal.length = mean(Sepal.Length),
    average.sepal.width = mean(Sepal.Width),
    sample.size = n()
  )
print(iris6)

## # A tibble: 2 × 4
##   Species    average.sepal.length average.sepal.width sample.size
##   <fct>                     <dbl>               <dbl>       <int>
## 1 versicolor                 6.48                2.99          17
## 2 virginica                  6.79                3.06          39

Q8. In these exercises, you have successively modified different versions of the data frame: iris1, iris2, iris3, iris4, iris5, and iris6. At each stage, the output data frame from one operation serves as the input for the next. A more efficient way to do this is to use the pipe operator %>% from the magrittr package (re-exported by dplyr and tidyr). See if you can rework all of your previous statements (except for iris5) into an extended piping operation that uses iris as the input and generates irisFinal as the output.

# Q8: the Q2-Q5 and Q7 steps chained into a single pipeline that goes from
# the raw iris data to the per-species summary.
irisFinal <- iris %>%
  # Q2: restrict to the two species explicitly, then apply the size limits.
  filter(
    Species %in% c("virginica", "versicolor"),
    Sepal.Length > 6,
    Sepal.Width > 2.5
  ) %>%
  # Q3: keep the species label and the sepal measurements.
  select(Species, Sepal.Length, Sepal.Width) %>%
  # Q4: longest sepals first.
  arrange(desc(Sepal.Length)) %>%
  # Q5: sepal area per observation.
  mutate(Sepal.Area = Sepal.Length * Sepal.Width) %>%
  # Q7: group only where the per-species summary needs it.
  group_by(Species) %>%
  summarise(
    average.sepal.length = mean(Sepal.Length),
    average.sepal.width = mean(Sepal.Width),
    sample.size = n()
  )

summary(irisFinal)

##        Species  average.sepal.length average.sepal.width  sample.size  
##  setosa    :0   Min.   :6.482        Min.   :2.988       Min.   :17.0  
##  versicolor:1   1st Qu.:6.560        1st Qu.:3.007       1st Qu.:22.5  
##  virginica :1   Median :6.637        Median :3.026       Median :28.0  
##                 Mean   :6.637        Mean   :3.026       Mean   :28.0  
##                 3rd Qu.:6.715        3rd Qu.:3.045       3rd Qu.:33.5  
##                 Max.   :6.792        Max.   :3.064       Max.   :39.0

Create a ‘longer’ data frame using the original iris data set with three columns named “Species”, “Measure”, “Value”. The column “Species” will retain the species names of the data set. The column “Measure” will include whether the value corresponds to Sepal.Length, Sepal.Width, Petal.Length, or Petal.Width and the column “Value” will include the numerical values of those measurements.

# Reshape iris into long format: every column except Species is stacked, with
# the former column names stored in "Measure" and the numbers in "Value".
# The value column is capitalised to match the column name the task asks for.
iris_longer <- iris %>%
  pivot_longer(
    cols = !Species,
    names_to = "Measure",
    values_to = "Value"
  )

summary(iris_longer)

##        Species      Measure              Value      
##  setosa    :200   Length:600         Min.   :0.100  
##  versicolor:200   Class :character   1st Qu.:1.700  
##  virginica :200   Mode  :character   Median :3.200  
##                                      Mean   :3.465  
##                                      3rd Qu.:5.100  
##                                      Max.   :7.900

homework_seven

Isaac Buabeng

2025-05-07

Load the necessary libraries

Q1. Read in and Inspect the data

Q2. Create a new data frame iris1 that contains only the species virginica and versicolor with sepal lengths longer than 6 cm and sepal widths longer than 2.5 cm. How many observations and variables are in the data set?

Q3. Create an iris2 data frame from iris1 that contains only the columns for Species, Sepal.Length, and Sepal.Width. How many observations and variables are in the data set?

Q4. Create an iris3 data frame from iris2 that orders the observations from largest to smallest sepal length. Show the first 6 rows of this data set.

Q5. Create an iris4 data frame from iris3 that creates a column with a sepal area (length * width) value for each observation. How many observations and variables are in the data set?

Q6. Create iris5 that calculates the average sepal length, the average sepal width, and the sample size of the entire iris4 data frame and print iris5

Q7. Finally, create iris6 that calculates the average sepal length, the average sepal width, and the sample size for each species in the iris4 data frame and print iris6.