Favourite Things

R
web scraping
R packages & functions that make doing data science a joy, based on usage across projects
Author

Carl Goodwin

Published

July 26, 2020

Modified

November 24, 2022

Each project closes with a table summarising the R tools used. By visualising the most frequently used packages and functions I can get a sense of where I may most benefit from going deeper into the latest package versions.

It’s an opportunity to replace superseded functions, e.g. spread and gather with pivot_wider and pivot_longer, or to use newer packages, such as tidyclust, which brings cluster modelling to tidymodels (used in the latest quarto version of Finding Happiness in ‘The Smoke’), or bslib, used in the latest shiny app embedded in Plots Thicken.

# Use the black-and-white theme as the default for the plots that follow.
theme_set(theme_bw())

# Number of colours to draw and the paletteer palette to draw them from.
n <- 4
palette <- "harrypotter::always"

cols <- paletteer_c(palette, n = n)

# Preview the palette: one bar per colour, labelled with its hex code.
#
# Each bar is filled with the literal colour held in `hex` via
# scale_fill_identity(), so a bar and its label always agree. Mapping a factor
# of the hex codes through scale_fill_manual() instead hands colours out by
# the alphabetical order of the codes, which only lines up with the labels
# when the palette happens to be sorted that way.
tibble(x = seq_len(n), y = 1, hex = as.character(cols)) |>
  ggplot(aes(x, y, fill = hex)) +
  geom_col(colour = "white") +
  geom_label(
    # Strip a trailing "FF" (presumably the opaque alpha channel) from the
    # displayed code.
    aes(label = str_remove(hex, "FF$")),
    size = 4, vjust = 2, fill = "white"
  ) +
  # Palette name, centred horizontally across the bars.
  annotate(
    "label",
    x = (n + 1) / 2, y = 0.5,
    label = palette,
    fill = "white",
    alpha = 0.8,
    size = 6
  ) +
  scale_fill_identity() +
  theme_void() +
  theme(legend.position = "none")

I’ll start by grabbing the URLs of all the projects.

# Absolute URL of every project linked from the project index page: read the
# page, take the href of each ".quarto-grid-link" element and prefix it with
# the site root.
urls <- read_html("https://www.quantumjitter.com/project/") |>
  html_elements(".quarto-grid-link") |>
  html_attr("href") |>
  (\(href) str_c("https://www.quantumjitter.com/", href))()

This enables me to extract the package and function usage table for each one.

# Visit each project page, parse the tables matched by the selector and
# row-bind them into a single data frame of package / function usage.
table_df <- urls |>
  map_dfr(\(page_url) {
    page <- read_html(page_url)
    toolbox <- html_elements(page, "#r-toolbox , table")
    html_table(toolbox)
  }) |>
  # Removing "io" from the cleaned names presumably turns the "Function"
  # header into `functn`, the column selected below.
  clean_names(replace = c("io" = "")) |>
  select(package, functn) |>
  # Keep only rows that have both a package and a function entry.
  drop_na()

A little “spring cleaning” is needed, and separation of tidyverse and non-tidyverse packages.

# Every package listed by one of the three "tidy" meta-packages, de-duplicated.
tidy <- unique(c(
  tidyverse_packages(),
  fpp3_packages(),
  tidymodels_packages()
))

# One row per package / function pair with its summed usage count (`n`) and
# the "multiverse" the package belongs to: tidy, base or special.
tidy_df <- table_df |>
  # "fun_a[2]; fun_b[1]" -> one row per function entry ...
  separate_rows(functn, sep = ";") |>
  # ... then split each entry at the literal "[" into name and count.
  separate(functn, into = c("functn", "count"), sep = "\\Q[\\E") |>
  mutate(
    functn = str_squish(functn),
    count = as.integer(str_remove(count, "]"))
  ) |>
  count(package, functn, wt = count) |>
  mutate(
    multiverse = if_else(
      package %in% tidy,
      "tidy",
      if_else(package %in% c("base", "graphics"), "base", "special")
    )
  )

Then I can summarise usage and prepare for a faceted plot.

# Total usage per package.
pack_df <- tidy_df |>
  count(package, multiverse, wt = n) |>
  mutate(name = "package")

# Total usage per function.
fun_df <- tidy_df |>
  count(functn, multiverse, wt = n) |>
  mutate(name = "function")

# Number of distinct project pages scraped (shown in the plot title).
n_url <- urls |> n_distinct()

# Stack the package and function summaries, most used first.
#
# Deliberately no group_by(name): arrange() sorts the whole table regardless
# of grouping, `name` is recoded below (which should not be done while it is
# a grouping column), and leaving the result ungrouped keeps grouping from
# leaking into the plotting code that consumes `packfun_df`.
packfun_df <- pack_df |>
  bind_rows(fun_df) |>
  arrange(desc(n)) |>
  mutate(
    # Single label column: the package name on package rows, otherwise the
    # function name.
    packfun = coalesce(package, functn),
    name = fct_rev(name)
  )

Clearly dplyr reigns supreme, driven by mutate and filter.

# Fixed colour for each `multiverse` category. Naming the values pins the
# mapping, so a category keeps the same colour in both panels even when it is
# missing from one panel's (filtered) data; unnamed values are handed out
# positionally to whichever categories are present.
multiverse_cols <- setNames(
  as.character(cols)[c(1, 2, 4)],
  c("base", "special", "tidy")
)

# Left panel: usage per package. The legend is suppressed here because the
# right panel shows it.
p1 <- packfun_df |>
  filter(name == "package") |>
  ggplot(aes(fct_reorder(packfun, n), n, fill = multiverse)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  scale_fill_manual(values = multiverse_cols) +
  labs(
    title = glue("Favourite Things\nAcross {n_url} Projects"),
    subtitle = "Package Usage",
    x = NULL, y = NULL
  )

# Right panel: usage per function, limited to functions used at least 4 times.
p2 <- packfun_df |>
  filter(name == "function", n >= 4) |>
  ggplot(aes(fct_reorder(packfun, n), n, fill = multiverse)) +
  geom_col() +
  coord_flip() +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  scale_fill_manual(values = multiverse_cols) +
  labs(
    x = NULL, y = NULL,
    subtitle = "Function Usage >= 4"
  )

# Combine the two panels.
p1 + p2

I’d also like a word cloud generated as the new featured image for this project.

# Seed the random number generator so the sampled word angles below are
# reproducible. set.seed() has to be called; `set.seed = 123` only creates a
# variable named `set.seed` and leaves the generator unseeded.
set.seed(123)

# Word cloud of packages and functions: text size follows usage, colour
# follows the multiverse category.
packfun_df |>
  # Random rotation in multiples of 45 degrees; the weights make 0 degrees
  # four times as likely as each of the other angles.
  mutate(angle = 45 * sample(-2:2, n(),
                             replace = TRUE,
                             prob = c(1, 1, 4, 1, 1))) |>
  ggplot(aes(
    label = packfun,
    size = n,
    colour = multiverse,
    angle = angle
  )) +
  geom_text_wordcloud(
    eccentricity = 1,
    seed = 789
  ) +
  scale_size_area(max_size = 20) +
  # The first palette colour is used for the background, the rest for text.
  scale_colour_manual(values = cols[c(2, 3, 4)]) +
  theme_void() +
  theme(plot.background = element_rect(fill = cols[1]))

R Toolbox

A little bit circular, but I might as well include this code too in my “favourite things”.

Package Function
base as.character[1]; as.integer[1]; c[5]; conflicts[1]; cumsum[1]; function[2]; sample[1]; search[1]; sum[1]; unique[1]
dplyr filter[7]; arrange[3]; bind_rows[1]; case_when[1]; coalesce[1]; count[4]; desc[3]; group_by[2]; if_else[3]; mutate[10]; n_distinct[1]; pull[1]; select[1]; summarise[1]; transmute[1]
forcats fct_reorder[2]; fct_rev[2]
fpp3 fpp3_packages[1]
ggplot2 aes[7]; annotate[1]; coord_flip[2]; element_rect[1]; geom_col[3]; geom_label[3]; ggplot[4]; labs[2]; scale_colour_manual[1]; scale_fill_manual[3]; scale_size_area[1]; theme[2]; theme_bw[1]; theme_set[1]; theme_void[2]
ggwordcloud geom_text_wordcloud[1]
glue glue[1]
janitor clean_names[1]
paletteer paletteer_c[1]
purrr map[1]; map_dfr[1]; map2_dfr[1]; possibly[1]; set_names[1]
readr read_lines[1]
rvest html_attr[1]; html_elements[2]; html_table[1]; read_html[2]
stringr str_c[6]; str_count[1]; str_detect[2]; str_remove[4]; str_remove_all[1]; str_squish[1]; str_starts[1]
tibble as_tibble[2]; tibble[3]; enframe[1]
tidymodels tidymodels_packages[1]
tidyr drop_na[1]; separate[1]; separate_rows[1]; unnest[1]
tidyverse tidyverse_packages[1]