Favourite Things

Graphic by Carl Goodwin
library(tidyverse)
library(tidytext)
library(rvest)
library(wesanderson)
library(janitor)
library(glue)
library(kableExtra)
library(ggwordcloud)
library(fpp3)
library(tidymodels)
library(patchwork)
# Make the black-and-white ggplot2 theme the default for every plot.
theme_set(theme_bw())

# Wes Anderson "IsleofDogs1" palette, indexed by the charts further down.
# The bare name on the last line prints the palette.
cols <- wes_palette(name = "IsleofDogs1")
cols

Each project closes with a table summarising the R tools used. By visualising my most frequently used packages and functions I get a sense of where I may most benefit from going deeper and keeping abreast of the latest breaking changes.

I may also spot superseded functions, e.g. spread and gather, which may now be replaced by pivot_wider and pivot_longer. Or I may find an opportunity to switch a non-tidyverse package for a newer tidyverse (or ecosystem) alternative, e.g. for UpSetR I can now use ggupset, which plays well with ggplot.

I’ll start by listing the paths to the html files in the project directory.

# Collect every .html file below the parent directory (searched recursively).
files <- list.files(path = "..//", pattern = "\\.html$", recursive = TRUE)

# list.files() returns paths relative to `path`, so prefix each one with the
# parent of the working directory.
files <- str_c(dirname(getwd()), files, sep = "/")

# Discard any path that contains "world", "dt1" or "appfiles".
files <- files[!str_detect(files, "world|dt1|appfiles")]

This enables me to extract the usage table for each project.

# For each page, read the HTML, take every table found under the element with
# id "r-toolbox", and stack those tables. The per-page results are then
# stacked into a single data frame.
table_df <- files %>%
  map(function(page) {
    toolbox_nodes <- html_elements(read_html(page), "#r-toolbox table")
    bind_rows(html_table(toolbox_nodes))
  }) %>%
  bind_rows() %>%
  # Strip "io" from the column names before cleaning; presumably this turns a
  # "Function" header into the `functn` column used later - verify against
  # the scraped tables.
  clean_names(replace = c("io" = ""))

A little “spring cleaning” is needed, and separation of tidyverse and non-tidyverse packages.

# De-duplicated names of the packages reported by the tidyverse, fpp3 and
# tidymodels helpers.
tidy <- unique(
  c(tidyverse_packages(), fpp3_packages(), tidymodels_packages())
)

# One row per package/function pair with its summed usage `n`, tagged with the
# "multiverse" the package falls into.
tidy_df <- table_df %>%
  # A cell holds several ";"-separated entries; give each entry its own row.
  separate_rows(functn, sep = ";") %>%
  # Split each entry at the literal "[" into the name and the usage count.
  separate(functn, into = c("functn", "count"), sep = "\\Q[\\E") %>%
  mutate(
    functn = str_squish(functn),
    count = as.integer(str_remove(count, "]"))
  ) %>%
  # Sum the counts across projects.
  count(package, functn, wt = count) %>%
  mutate(
    multiverse = case_when(
      package %in% tidy ~ "tidy",
      package %in% c("base", "graphics") ~ "base",
      TRUE ~ "special"
    )
  )

Then I can summarise usage and prepare for a faceted plot.

# Usage summed per package (within its multiverse).
pack_df <- tidy_df %>%
  count(package, multiverse, wt = n) %>%
  mutate(name = "package")

# Usage summed per function name (within its multiverse).
fun_df <- tidy_df %>%
  count(functn, multiverse, wt = n) %>%
  mutate(name = "function")

# Number of distinct pages scanned; interpolated into the plot title.
n_url <- files %>% n_distinct()

# Stack the package and function summaries, highest usage first.
# The data is deliberately left ungrouped: arrange() ignores groups by default
# and coalesce() works row by row, so grouping by `name` added nothing here.
# It only left a grouped tibble behind, which makes downstream verbs such as
# n() run per group.
packfun_df <- pack_df %>%
  bind_rows(fun_df) %>%
  arrange(desc(n)) %>%
  mutate(
    # `package` is NA on function rows and `functn` is NA on package rows, so
    # this yields a single label column.
    packfun = coalesce(package, functn),
    name = fct_rev(name)
  )

Clearly, dplyr reigns supreme, driven by mutate and filter.

# Left panel: horizontal bars of total usage per package. The fill legend is
# suppressed on the bars here, and this panel carries the overall title.
p1 <- ggplot(
  filter(packfun_df, name == "package"),
  aes(x = fct_reorder(packfun, n), y = n, fill = multiverse)
) +
  geom_col(show.legend = FALSE) +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  coord_flip() +
  scale_fill_manual(values = cols[c(2, 3, 1)]) +
  labs(
    x = NULL, y = NULL,
    subtitle = "Package Usage",
    title = glue("Favourite Things\nAcross {n_url} Projects")
  )

# Right panel: the same chart for functions, limited to those used at least
# four times.
p2 <- ggplot(
  filter(packfun_df, name == "function", n >= 4),
  aes(x = fct_reorder(packfun, n), y = n, fill = multiverse)
) +
  geom_col() +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  coord_flip() +
  scale_fill_manual(values = cols[c(2, 3, 1)]) +
  labs(
    subtitle = "Function Usage >= 4",
    x = NULL, y = NULL
  )

# Place the two panels side by side (patchwork).
p1 + p2

I’d also like a wordcloud. And thanks to blogdown, the updated visualisation is picked up as the new featured image for this project.

# Seed the RNG so the random label rotations drawn by sample() below are
# reproducible. This must be a call: `set.seed = 123` only creates a variable
# named `set.seed` and leaves the generator unseeded.
set.seed(123)

packfun_df %>%
  # Rotate each label by 0 or 90 degrees, drawn with weights 60:40.
  mutate(
    angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))
  ) %>%
  ggplot(aes(label = packfun, size = n, colour = multiverse, angle = angle)) +
  # `seed` here is ggwordcloud's own argument; presumably it fixes the label
  # placement - confirm against the ggwordcloud documentation.
  geom_text_wordcloud(eccentricity = .9, seed = 789) +
  scale_radius(range = c(0, 40), limits = c(0, NA)) +
  scale_colour_manual(values = cols[c(2, 4, 5)]) +
  theme_void() +
  theme(plot.background = element_rect(fill = cols[1]))

R Toolbox

A little bit circular I know, but I might as well include this code too in my “favourite things”.

Package Function
base as.integer[1]; c[8]; conflicts[1]; cumsum[1]; dirname[1]; function[2]; getwd[1]; list.files[1]; sample[1]; search[1]; sum[1]; unique[1]
dplyr filter[8]; arrange[3]; bind_rows[2]; case_when[1]; coalesce[1]; count[4]; desc[3]; group_by[2]; if_else[3]; mutate[10]; n[8]; n_distinct[1]; pull[1]; summarise[1]
forcats fct_reorder[2]; fct_rev[1]
fpp3 fpp3_packages[1]
ggplot2 aes[5]; coord_flip[2]; element_rect[1]; geom_col[2]; geom_label[2]; ggplot[3]; labs[2]; scale_colour_manual[1]; scale_fill_manual[2]; scale_radius[1]; theme[1]; theme_bw[1]; theme_set[1]; theme_void[1]
ggwordcloud geom_text_wordcloud[1]
glue glue[1]
janitor clean_names[1]
kableExtra kable_material[1]; kbl[1]
purrr map[1]; map_dfr[1]; map2_dfr[1]; possibly[1]; set_names[1]
readr read_lines[1]
rvest html_elements[1]; html_table[1]; read_html[1]
stringr str_c[6]; str_count[1]; str_detect[3]; str_remove[3]; str_remove_all[1]; str_squish[1]; str_starts[1]
tibble as_tibble[2]; tibble[2]; enframe[1]
tidymodels tidymodels_packages[1]
tidyr separate[1]; separate_rows[1]; unnest[1]
tidyverse tidyverse_packages[1]
wesanderson wes_palette[1]
Carl Goodwin
Carl Goodwin
IBM Data Scientist & Growth Strategy Leader
comments powered by Disqus

Related