Advanced Filter Exercises

Published

Last modified on September 09, 2024, 13:21:15 Eastern Daylight Time

Video here

Code

library(tidyverse)

Required packages and datasets

Run the code in the code chunk below to load the dataset happy_full.

Code

# install.packages("remotes")
# remotes::install_github("matthewhirschey/tidybiology", force = TRUE)
# can also use devtools::install_github() since devtools is installed in
# system library
library(tidybiology)
# Load the happy_full dataset into the workspace and browse it as an
# interactive table.
data(happy_full)
DT::datatable(happy_full)

Choose countries that are NOT in Europe, i.e., countries whose region entry is neither "Western Europe" nor "Central and Eastern Europe".

Code

# Keep only the rows whose region is outside both European regions.
ans1 <- happy_full |>
  filter(!(region %in% c("Western Europe", "Central and Eastern Europe")))

# Horizontal bar chart of the remaining regions, ordered by frequency.
ans1 |>
  ggplot(aes(x = fct_infreq(region))) +
  geom_bar(fill = "darkgreen", color = "black") +
  coord_flip() +
  theme_classic() +
  labs(x = "")

Choose countries that have neither an above-average healthy_life_expectancy nor an above-average generosity score, i.e., NOT (above-average healthy_life_expectancy OR above-average generosity).

Code

# Drop every country that is above average on healthy_life_expectancy OR
# above average on generosity; the negation wraps the whole OR.
ans3 <- filter(
  happy_full,
  !(healthy_life_expectancy > mean(healthy_life_expectancy) |
    generosity > mean(generosity))
)
dim(ans3)

[1] 27 20

Code

# Equivalent to negating (A | B): by De Morgan's law this is !A & !B.
# Comma-separated conditions in filter() are combined with AND.
ans4 <- happy_full |>
  filter(
    healthy_life_expectancy <= mean(healthy_life_expectancy),
    generosity <= mean(generosity)
  )
dim(ans4)

[1] 27 20

Choose countries that (are in Europe or have a ladder_score above 7) and that also have a healthy_life_expectancy above 70.

Code

# (European region OR ladder_score > 7) AND healthy_life_expectancy > 70.
# The two comma-separated conditions are combined with AND.
ans5 <- happy_full |>
  filter(
    region %in% c("Western Europe", "Central and Eastern Europe") |
      ladder_score > 7,
    healthy_life_expectancy > 70
  )
# Show the result with the filtered-on columns moved to the front.
DT::datatable(relocate(ans5, region, ladder_score, healthy_life_expectancy))

Code

# Note: without parentheses this does NOT require healthy_life_expectancy > 70
# for every row. `&` binds more tightly than `|`, so the condition is read as
# (healthy_life_expectancy > 70 & ladder_score > 7) | <European region>,
# which keeps every European country regardless of its life expectancy.
ans6 <- happy_full |>
  filter(
    healthy_life_expectancy > 70 & ladder_score > 7 |
      region %in% c("Western Europe", "Central and Eastern Europe")
  )
DT::datatable(relocate(ans6, region, ladder_score, healthy_life_expectancy))

Code

# Parentheses around the OR give the intended logic:
# healthy_life_expectancy > 70 AND (ladder_score > 7 OR European region).
ans6a <- happy_full |>
  filter(
    healthy_life_expectancy > 70 &
      (ladder_score > 7 |
        region %in% c("Western Europe", "Central and Eastern Europe"))
  )
DT::datatable(relocate(ans6a, region, ladder_score, healthy_life_expectancy))

Choose countries that have a generosity score that is between the median and mean generosity scores (inclusive) across all countries.

Code

# Median (MD) and mean (ME) of generosity across all rows.
summarise(
  happy_full,
  MD = median(generosity),
  ME = mean(generosity)
)

# A tibble: 1 × 2
      MD      ME
   <dbl>   <dbl>
1 -0.036 -0.0151

Code

# between() is inclusive on both ends. The median is passed as the lower
# bound because, for generosity, the median is smaller than the mean.
ans7 <- filter(
  happy_full,
  between(generosity, median(generosity), mean(generosity))
)
dim(ans7)

[1]  8 20

Code

# Check: the smallest and largest generosity values that were kept.
reframe(ans7, R = range(generosity))

# A tibble: 2 × 1
       R
   <dbl>
1 -0.036
2 -0.016

Code

# Another solution: spell out both inclusive bounds with comparison operators.
ans8 <- happy_full |>
  filter(generosity >= median(generosity) & generosity <= mean(generosity))
dim(ans8)

[1]  8 20

Choose countries whose ladder_score is within one standard error of the mean ladder_score, i.e., in the interval mean \(\pm\) SE, where SE = \(s/\sqrt{n}\) is the standard error of ladder_score across all countries.

Code

# Mean (xbar) and standard error (SE = s / sqrt(n)) of ladder_score, where n
# counts the non-missing values. Stored under its own name so that the
# filtered result already held in `ans8` is not overwritten by this summary.
ladder_summary <- happy_full |>
  summarize(
    xbar = mean(ladder_score),
    SE = sd(ladder_score) / sqrt(sum(!is.na(ladder_score)))
  )
ladder_summary

# A tibble: 1 × 2
   xbar     SE
  <dbl>  <dbl>
1  5.53 0.0880

Code

# Using between() with filter: keep rows whose ladder_score lies in
# [mean - SE, mean + SE], with SE = sd / sqrt(number of non-missing values).
ans9 <- filter(
  happy_full,
  between(
    ladder_score,
    mean(ladder_score) - sd(ladder_score) / sqrt(sum(!is.na(ladder_score))),
    mean(ladder_score) + sd(ladder_score) / sqrt(sum(!is.na(ladder_score)))
  )
)
dim(ans9)

[1]  7 20

Code

# Using logical operators with filter: the same interval written as two
# inclusive comparisons joined by `&`.
ans10 <- filter(
  happy_full,
  ladder_score >=
    mean(ladder_score) - sd(ladder_score) / sqrt(sum(!is.na(ladder_score))) &
    ladder_score <=
      mean(ladder_score) + sd(ladder_score) / sqrt(sum(!is.na(ladder_score)))
)
dim(ans10)

[1]  7 20

Code

# Using near() with filter: near(x, y, tol) tests abs(x - y) < tol, so with
# tol = SE it keeps scores strictly within one SE of the mean (a score lying
# exactly on a bound would be excluded, unlike with between()).
ans11 <- filter(
  happy_full,
  near(
    ladder_score,
    mean(ladder_score),
    tol = sd(ladder_score) / sqrt(sum(!is.na(ladder_score)))
  )
)
dim(ans11)

[1]  7 20