From 2ad422d4b58286f73e441e8b45511ac18f14e115 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Wed, 14 Feb 2024 09:10:58 -0500
Subject: [PATCH] refactor: move report to https://github.com/CCBR/reports

---
 CHANGELOG.md                        |   6 +-
 bin/render.R                        |   5 -
 bin/render.sh                       |  11 -
 bin/render_report_biowulf.sh        |  29 --
 docker/spacesavers2/Dockerfile      | 105 ------
 docker/spacesavers2/environment.txt |  26 --
 docker/spacesavers2/meta.yml        |   4 -
 report.Rmd                          | 504 ----------------------------
 8 files changed, 4 insertions(+), 686 deletions(-)
 delete mode 100755 bin/render.R
 delete mode 100755 bin/render.sh
 delete mode 100755 bin/render_report_biowulf.sh
 delete mode 100644 docker/spacesavers2/Dockerfile
 delete mode 100644 docker/spacesavers2/environment.txt
 delete mode 100644 docker/spacesavers2/meta.yml
 delete mode 100644 report.Rmd

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b65cf73..3ca870d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## spacesavers2 development version
 
+- Move the report to a separate internal repository (@kelly-sovacool)
+
 ### New features
 
 ### Bug fixes
@@ -21,11 +23,11 @@
 - `grubbers` `--limit` can be < 1 GiB (float) (#70, @kopardev)
 - `grubbers` output file format changed. New original file column added. Original file is required by `usurp`.
 - `mimeo` `--duplicateonly` now correctly handles duplicates owned by different UIDs. (#71, @kopardev)
-    - Update `blamematrix` and to account for corrected duplicate handling in `mimeo`.
+  - Update `blamematrix` to account for corrected duplicate handling in `mimeo`.
 - `usurp` now uses the new "original file" column from `grubbers` while creating hard-links.
 - Total size now closely resembles `df` results (fix #75 @kopardev)
 - Files with future timestamps are handled correctly (fix #76, @kopardev)
-  
+
 ## spacesavers2 0.10.2
 
 - Now tracking user-facing changes with a changelog. (#61, @kelly-sovacool)
diff --git a/bin/render.R b/bin/render.R
deleted file mode 100755
index ebb8ae1..0000000
--- a/bin/render.R
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env Rscript
-rmarkdown::render("report.Rmd",
-  output_file = "datashare/report.html",
-  params = list(input_dir = "data")
-)
diff --git a/bin/render.sh b/bin/render.sh
deleted file mode 100755
index 6c3242a..0000000
--- a/bin/render.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-# to be executed from /data/CCBR_Pipeliner/Tools/spacesavers2/report
-# Usage: bash bin/render_report_biowulf.sh
-module load singularity
-SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS
-
-# render report
-echo "cd /mnt && \
-    Rscript bin/render.R \
-    " |\
-    singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash
diff --git a/bin/render_report_biowulf.sh b/bin/render_report_biowulf.sh
deleted file mode 100755
index 714eaa9..0000000
--- a/bin/render_report_biowulf.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env bash
-# to be executed from /data/CCBR_Pipeliner/Tools/spacesavers2/report
-# Usage: bash bin/render_report_biowulf.sh
-module load singularity
-SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS
-
-today=$(date +'%Y-%m-%d')
-year=$(date +'%Y')
-mkdir -p datashare/$year
-html_filename="datashare/${year}/spacesavers2-report_${today}.html"
-recipient_email="kelly.sovacool@nih.gov,vishal.koparde@nih.gov"
-
-url=https://hpc.nih.gov/~CCBR_Pipeliner/spacesavers2/${year}/spacesavers2-report_${today}.html
-
-# update disk usage
-bash bin/disk_usage.sh
-# render report and send via email
-echo "cd /mnt && \
-    Rscript bin/render.R && \
-    cp datashare/report.html $html_filename && \
-    python src/send_email.py \
-        $html_filename \
-        $url \
-        $recipient_email \
-    " |\
-    singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash
-
-chmod -R a+r datashare/
-cp -r datashare/* /data/CCBR_Pipeliner/datashare/spacesavers2/
diff --git a/docker/spacesavers2/Dockerfile b/docker/spacesavers2/Dockerfile
deleted file mode 100644
index 233b5fe..0000000
--- a/docker/spacesavers2/Dockerfile
+++ /dev/null
@@ -1,105 +0,0 @@
-FROM ubuntu:20.04
-
-# build time variables
-ARG BUILD_DATE="000000"
-ENV BUILD_DATE=${BUILD_DATE}
-ARG BUILD_TAG="000000"
-ENV BUILD_TAG=${BUILD_TAG}
-ARG REPONAME="000000"
-ENV REPONAME=${REPONAME}
-
-RUN mkdir -p /opt2 && mkdir -p /data2
-ENV TZ=America/New_York
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-
-RUN apt update && apt-get -y upgrade
-# Set the locale
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-		locales build-essential cmake cpanminus && \
-	localedef -i en_US -f UTF-8 en_US.UTF-8 && \
-	cpanm FindBin Term::ReadLine
-
-# install basic dependencies with apt-get
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-	figlet \
-	g++ \
-	gcc \
-	gfortran \
-	git \
-	libatlas-base-dev \
-	libblas-dev \
-	libboost-dev \
-	libbz2-dev \
-	libcurl4-openssl-dev \
-	libexpat1-dev \
-	libfreetype6-dev \
-	libgd-dev \
-	libgd-perl \
-	libglib2.0-dev \
-    libgpgme11-dev \
-	libgs-dev \
-	libgsl-dev \
-	libgsl0-dev \
-	libhtml-template-compiled-perl \
-	libicu-dev \
-	libjudy-dev \
-	liblapack-dev \
-	liblzma-dev \
-	libmysqlclient-dev \
-	libncurses-dev \
-	libopenmpi-dev \
-	libpng-dev \
-	librtmp-dev \
-    libseccomp-dev \
-	libssl-dev \
-	libtool \
-	libxml-libxml-debugging-perl \
-	libxml-opml-simplegen-perl \
-	libxml2-dev \
-	libxslt-dev \
-	make \
-	manpages-dev \
-	openjdk-17-jre-headless \
-	parallel \
-	pigz \
-    pkg-config \
-	python3-pip \
-    python3-dev \
-	rsync \
-    squashfs-tools \
-	unzip \
-    uuid-dev \
-	wget \
-	zlib1g \
-	zlib1g-dev \
-	zlibc
-
-# Install conda and give write permissions to conda folder
-RUN echo 'export PATH=/opt2/conda/bin:$PATH' > /etc/profile.d/conda.sh && \
-    wget --quiet "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" -O ~/miniforge3.sh && \
-    /bin/bash ~/miniforge3.sh -b -p /opt2/conda && \
-    rm ~/miniforge3.sh && chmod 777 -R /opt2/conda/
-ENV PATH="/opt2/conda/bin:$PATH"
-
-# install pandoc & R packages
-COPY environment.txt /data2/
-RUN mamba install -c conda-forge --file /data2/environment.txt
-ENV R_LIBS_USER=/opt2/conda/lib/R/library/
-
-# install quarto
-ENV QUARTO_VERSION="1.3.450"
-ADD https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-amd64.tar.gz /opt2
-WORKDIR /opt2
-RUN tar -xzvf quarto-${QUARTO_VERSION}-linux-amd64.tar.gz
-ENV PATH="/opt2/quarto-${QUARTO_VERSION}/bin/:${PATH}"
-RUN quarto check
-
-# Save Dockerfile in the docker
-COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG}
-RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG}
-
-# cleanup
-WORKDIR /data2
-RUN apt-get clean && apt-get purge \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/docker/spacesavers2/environment.txt b/docker/spacesavers2/environment.txt
deleted file mode 100644
index 5334639..0000000
--- a/docker/spacesavers2/environment.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-pandoc
-r-base=4.3.1
-r-DT
-r-RColorBrewer
-r-bslib=0.5.1
-r-crosstalk
-r-downlit
-r-dplyr
-r-fontawesome
-r-ggplot2
-r-glue
-r-here
-r-janitor
-r-knitr
-r-lubridate
-r-plotly
-r-purrr
-r-readr
-r-rlang
-r-rmarkdown
-r-scales
-r-shiny
-r-stringr
-r-tibble
-r-tidyr
-r-xml2
diff --git a/docker/spacesavers2/meta.yml b/docker/spacesavers2/meta.yml
deleted file mode 100644
index d68cd0a..0000000
--- a/docker/spacesavers2/meta.yml
+++ /dev/null
@@ -1,4 +0,0 @@
-dockerhub_namespace: nciccbr
-image_name: spacesavers2
-version: 0.1.1
-container: "$(dockerhub_namespace)/$(image_name):$(version)"
diff --git a/report.Rmd b/report.Rmd
deleted file mode 100644
index c04f01f..0000000
--- a/report.Rmd
+++ /dev/null
@@ -1,504 +0,0 @@
----
-title: "spacesavers2 🚀 report"
-author: "CCR Collaborative Bioinformatics Resource"
-date: '`r lubridate::today()`'
-output:
-  html_document:
-    theme:
-      version: 5
-    code_folding: hide
-    toc: true
-    self_contained: true
-params:
-  input_dir: '/data/CCBR_Pipeliner/userdata/spacesavers2/'
-  n_top_users: 10
-knit: (function(inputFile, encoding) {
-  rmarkdown::render(inputFile, encoding = encoding, output_dir = "datashare/") })
----
-```{r setup}
-knitr::opts_chunk$set(message = FALSE, warning = FALSE)
-```
-
-Notice a bug or want to make a suggestion for this report? [Open an issue](https://github.com/CCBR/spacesavers2/issues) on GitHub.
-
-```{r load}
-library(bslib)
-library(dplyr)
-library(DT)
-library(fontawesome)
-library(ggplot2)
-library(glue)
-library(here)
-library(htmltools)
-library(knitr)
-library(lubridate)
-library(plotly)
-library(purrr)
-library(readr)
-library(rlang)
-library(scales)
-library(shiny)
-library(stringr)
-library(tidyr)
-theme_set(theme_bw())
-
-to_bytes <- function(x, from_unit) {
-  bytes_units <- list(
-    KiB = 1,
-    MiB = 2,
-    GiB = 3,
-    TiB = 4
-  )
-  return(x * (1024^bytes_units[[from_unit]]))
-}
-from_bytes <- function(x, to_unit) {
-  return(x * x / (to_bytes(x, to_unit)))
-}
-
-from_bytes_v <- Vectorize(from_bytes)
-to_bytes_v <- Vectorize(to_bytes)
-
-filter_users <- function(dat, usercol = username) {
-  non_people <- c("allusers", "rpcuser", "slurm")
-  dat %>%
-    filter(
-      !({{ usercol }} %in% non_people), # not actual people
-      !str_detect({{ usercol }}, "[0-9]") # entirely numeric usernames
-    )
-}
-
-is_large_range <- function(x, n_orders_magnitude = 5) {
-  xrange <- range(x)
-  return((xrange[2] - xrange[1]) >= 10^n_orders_magnitude)
-}
-
-plot_user_metric <- function(dat, x_metric) {
-  dat %>%
-    ggplot(aes(
-      x = eval_tidy(data_sym(x_metric)),
-      y = username,
-      fill = eval_tidy(data_sym(x_metric)),
-      text = glue("{username}\n{eval_tidy(data_sym(x_metric))} {x_metric}")
-    )) +
-    geom_col() +
-    # TODO: ggplotly doesn't know what to do with scale::label_log
-    # {if (is_large_range(dat %>% pull(x_metric))) scale_x_log10(labels = label_log(digits = 2)) } +
-    labs(x = x_metric, y = "") +
-    theme(legend.position = "none")
-}
-
-plot_metric_time <- function(dat, y_metric) {
-  dat %>%
-    ggplot(aes(
-      x = date,
-      y = eval_tidy(data_sym(y_metric)),
-      color = username
-    )) +
-    geom_line(alpha = 0.7) +
-    geom_point(aes(text = glue("{username}\n{eval_tidy(data_sym(y_metric))} {y_metric}"))) +
-    labs(y = y_metric)
-}
-
-min_user_bytes_GiB <- 10
-panel_summary <- function(dat,
-                          folder_path = "/data/CCBR",
-                          plot_fcn = plot_metric_time,
-                          min_bytes_GiB = min_user_bytes_GiB) {
-  summary_dat_folder <- dat %>%
-    filter(FolderPath == folder_path) %>%
-    mutate(TotalBytes_GiB = from_bytes(TotalBytes, 'GiB')) %>% 
-    # only keep users with at least 10 GiB total usage
-    filter(TotalBytes_GiB >= min_bytes_GiB) %>% 
-    select(-TotalBytes_GiB)
-  top_users <- summary_dat_folder %>%
-    pivot_longer(all_of(summary_metrics),
-      names_to = "metric"
-    ) %>%
-    mutate(value_adj = case_when(
-      metric == "OverallScore" ~ -value,
-      TRUE ~ value
-    )) %>%
-    group_by(metric) %>%
-    slice_max(order_by = value_adj, n = n_top_users) %>%
-    pull(username) %>%
-    unique()
-  plots <- summary_metrics %>% lapply(function(y_metric) {
-    user_order <- summary_dat_folder %>%
-      filter(username %in% top_users) %>%
-      pivot_longer(all_of(summary_metrics),
-        names_to = "metric"
-      ) %>%
-      mutate(value_adj = case_when(
-        metric == "OverallScore" ~ -value,
-        TRUE ~ value
-      )) %>%
-      filter(metric == y_metric) %>%
-      arrange(by = value_adj) %>%
-      pull(username) %>%
-      unique()
-    if (y_metric == "TotalBytes" | y_metric == "DuplicateBytes") {
-      to_unit <- "TiB" # TODO: dynamically set based on range of metric
-      new_metric_name <- glue("{y_metric}_{to_unit}")
-      summary_dat_folder <- summary_dat_folder %>%
-        mutate("{new_metric_name}" := from_bytes(eval_tidy(data_sym(y_metric)), to_unit))
-      y_metric <- new_metric_name
-    } else if (y_metric == "TotalMeanAge" | y_metric == "DuplicateMeanAge") {
-      new_metric_name <- glue("{y_metric}_Days")
-      summary_dat_folder <- summary_dat_folder %>%
-        rename("{new_metric_name}" := y_metric)
-      y_metric <- new_metric_name
-    } else if (y_metric == "TotalFiles" | y_metric == "DuplicateFiles") {
-      new_metric_name <- glue("{y_metric}_Millions")
-      summary_dat_folder <- summary_dat_folder %>%
-        mutate("{new_metric_name}" := eval_tidy(data_sym(y_metric)) / 10^6)
-      y_metric <- new_metric_name
-    }
-    p <- summary_dat_folder %>%
-      filter(username %in% user_order) %>%
-      mutate(username = factor(username, levels = user_order)) %>%
-      mutate(across(where(is.numeric), round, digits = 2)) %>%
-      plot_fcn(y_metric)
-    nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
-  })
-  nav_panel(
-    title = markdown(glue("`{folder_path}`")),
-    navset_pill_list(!!!plots)
-  )
-}
-```
-
-```{r read_data}
-n_top_users <- params$n_top_users
-input_dir <- params$input_dir # here("data")
-aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
-# TODO: only load last N weeks of data to keep RAM usage reasonably low
-all_files <- tibble(filename = list.dirs(input_dir) %>%
-  Filter(function(x) {
-    x != input_dir
-  }, .) %>%
-  lapply(function(x) {
-    list.files(x, full.names = TRUE)
-  }) %>%
-  unlist())
-user_dat <- all_files %>%
-  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext"),
-    too_few = "debug"
-  ) %>%
-  mutate(date = as_date(basename(date)))
-
-dates <- user_dat %>%
-  filter(!is.na(date)) %>%
-  pull(date) %>%
-  unique()
-most_recent_date <- dates %>% max()
-
-total_usage_tb <- user_dat %>%
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "summary",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_tsv() %>%
-  filter(FolderPath == "/data/CCBR") %>%
-  mutate(disk_usage_tb = from_bytes(TotalBytes, "TiB")) %>%
-  pull(disk_usage_tb)
-# TODO disk_usage_tb doesn't agree with output from `df`
-
-grubbers_allusers_err <- user_dat %>%
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "grubbers",
-    ext == "err",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_lines()
-grubbers_message <- grubbers_allusers_err[2] %>%
-  str_split(":") %>%
-  unlist() %>%
-  .[3]
-
-user_dat <- user_dat %>% filter_users()
-usernames <- user_dat %>%
-  pull(username) %>%
-  unique()
-
-summary_dat_recent <- user_dat %>%
-  filter(
-    date == most_recent_date, file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  )
-summary_metrics <- summary_dat_recent %>%
-  pivot_longer(where(is.numeric), names_to = "metric") %>%
-  pull(metric) %>%
-  unique()
-```
-
-## Total disk usage
-
-```{r disk_usage_latest}
-disk_usage <- read_tsv(here("results", "disk_usage.tsv")) %>% 
-      mutate(used_tib = from_bytes(to_bytes(Used, "KiB"), "TiB"),
-             avail_tib = from_bytes(to_bytes(Avail, "KiB"),"TiB"),
-             size_tib = used_tib + avail_tib)
-df_date <- disk_usage %>%
-  slice_max(datetime) %>% 
-  pull(datetime) %>%
-  as_date()
-
-layout_column_wrap(
-  width = 1 / 2,
-  value_box(
-    title = p(fa("hard-drive"), "  Disk space in /data/CCBR"),
-    value = markdown(disk_usage %>%
-                       slice_max(datetime) %>% 
-      mutate(Usage = glue("{round(used_tib,1)} / {size_tib}")) %>%
-      select(Usage, `Use%`) %>%
-      kable()),
-    theme = "warning"
-  ),
-  value_box(
-    title = p(fa("users", prefer_type = "regular"), "  Users"),
-    value = p(glue("{length(usernames)} users as of {format(df_date, '%b %d, %Y')}")),
-    theme = "primary"
-  )
-)
-```
-
-### Total usage over time
-
-```{r disk_usage_over_time}
-p <- disk_usage %>% 
-  mutate(datetime = lubridate::as_datetime(datetime)) %>%
-  rename(used = used_tib, size = size_tib, avail = avail_tib) %>% 
-  pivot_longer(c(used, size), names_to = 'metric') %>% 
-  mutate(value = round(value, 2)) %>%
-    ggplot(aes(
-      x = datetime,
-      y = value,
-      color = metric,
-      group = metric
-    )) +
-    geom_line(alpha = 0.7) +
-    geom_point(aes(text = glue("{value} TiB"))) +
-  scale_x_datetime(labels = date_format("%b %Y")) +
-  scale_color_brewer(palette = "Set2",
-                     breaks = c('size', 'used') # enforce order
-                     ) +
-    labs(y = 'TiB', x = '') + 
-  theme(legend.title = element_blank())
-
-card(ggplotly(p, tooltip = "text"))
-```
-
-
-## Summary over time
-
-Usage by top users for each spacesavers metric.
-Only users with at least `r min_user_bytes_GiB` GiB of total disk usage are shown.
-
-```{r summary_over_time}
-summary_dat_all <- user_dat %>%
-  filter(
-    file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = str_replace(basepath, ".*/", "") %>% as_date())
-
-navset_tab(
-  summary_dat_all %>% panel_summary("/data/CCBR", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/rawdata", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/projects", plot_metric_time),
-)
-```
-
-
-## Most recent summary (`r most_recent_date`)
-
-Usage by top users for each spacesavers metric.
-
-```{r summary_recent}
-navset_tab(
-  summary_dat_recent %>% panel_summary("/data/CCBR", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/rawdata", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/projects", plot_user_metric),
-)
-```
-
-## Summary table
-
-```{r allusers_summary}
-allusers_summary <- all_files %>%
-  filter(str_detect(filename, "_data_CCBR.allusers.summary.txt")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x)
-  }) %>%
-  list_rbind() %>%
-  mutate(
-    TotalBytes_GiB = round(from_bytes_v(TotalBytes, "GiB"), 2),
-    DuplicateBytes_GiB = round(from_bytes_v(DuplicateBytes, "GiB"), 2),
-    .before = "DuplicateBytes"
-  ) %>%
-  select(-c(TotalBytes, DuplicateBytes))
-
-card(
-  card_header("Summary across all users"),
-  datatable(allusers_summary, fillContainer = TRUE)
-)
-```
-
-
-## Blame matrix
-
-```{r blame}
-blame_matrix <- all_files %>%
-  filter(str_detect(filename, "blamematrix")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter(!is.na(date), file == "blamematrix", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x)
-  }) %>%
-  list_rbind()
-
-card(
-  card_header("Disk usage by user in subdirectories"),
-  datatable(blame_matrix, fillContainer = TRUE)
-)
-```
-
-
-## Duplicate files
-
-`r grubbers_message`
-
-### Potential savings per user
-
-```{r grub_err}
-grub_err <- user_dat %>%
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "err", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>%
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  filter(str_detect(X1, "Deleting")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(
-    date = as_date(basename(date)),
-    grub_msg = str_replace_all(X1, regex("^.*:"), ""),
-    savings_value = as.numeric(
-      str_replace_all(
-        grub_msg,
-        regex(".*save ([\\d\\.]*) [\\w!]+"),
-        "\\1"
-      )
-    ),
-    savings_unit = str_replace_all(
-      grub_msg,
-      regex(".*save [\\d\\.]* ([\\w]+)!"),
-      "\\1"
-    ),
-    savings_bytes = to_bytes_v(savings_value, savings_unit)
-  )
-
-user_grub_table <- grub_err %>%
-  arrange(desc(savings_bytes)) %>%
-  select(username, savings_value, savings_unit)
-
-card(
-  card_header("Savings per user"),
-  datatable(user_grub_table, fillContainer = TRUE)
-)
-```
-
-
-### All high-value duplicates
-
-```{r grubbers}
-grub_dat <- user_dat %>%
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>%
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  rename(
-    file_hash = X1,
-    file_count = X2,
-    total_disk_usage = X3,
-    single_disk_usage = X4,
-    filepaths = X5
-  ) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter_users() %>%
-  separate_wider_delim(total_disk_usage,
-    delim = " ",
-    names = c("total_disk_usage_value", "total_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  separate_wider_delim(single_disk_usage,
-    delim = " ",
-    names = c("single_disk_usage_value", "single_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  mutate(across(all_of(c("total_disk_usage_value", "single_disk_usage_value")), as.numeric))
-
-top_files <- grub_dat %>%
-  arrange(order_by = desc(total_disk_usage_value)) %>%
-  select(total_disk_usage_value, username, filepaths) %>%
-  rename(disk_usage_gb = total_disk_usage_value)
-
-card(card_header("Top files"), datatable(top_files, fillContainer = TRUE))
-```
-
-
-For instructions on how to replace duplicates with hard links, see the
-[`usurp` command in the spacesavers docs](https://ccbr.github.io/spacesavers2/usurp/).