From 2ad422d4b58286f73e441e8b45511ac18f14e115 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Wed, 14 Feb 2024 09:10:58 -0500
Subject: [PATCH 1/8] refactor: move report to https://github.com/CCBR/reports

---
 CHANGELOG.md                        |   6 +-
 bin/render.R                        |   5 -
 bin/render.sh                       |  11 -
 bin/render_report_biowulf.sh        |  29 --
 docker/spacesavers2/Dockerfile      | 105 ------
 docker/spacesavers2/environment.txt |  26 --
 docker/spacesavers2/meta.yml        |   4 -
 report.Rmd                          | 504 ----------------------------
 8 files changed, 4 insertions(+), 686 deletions(-)
 delete mode 100755 bin/render.R
 delete mode 100755 bin/render.sh
 delete mode 100755 bin/render_report_biowulf.sh
 delete mode 100644 docker/spacesavers2/Dockerfile
 delete mode 100644 docker/spacesavers2/environment.txt
 delete mode 100644 docker/spacesavers2/meta.yml
 delete mode 100644 report.Rmd

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b65cf73..3ca870d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## spacesavers2 development version
 
+- Move the report to a separate internal repository (@kelly-sovacool)
+
 ### New features
 
 ### Bug fixes
@@ -21,11 +23,11 @@
 - `grubbers` `--limit` can be < 1 GiB (float) (#70, @kopardev)
 - `grubbers` output file format changed. New original file column added. Original file is required by `usurp`.
 - `mimeo` `--duplicateonly` now correctly handles duplicates owned by different UIDs. (#71, @kopardev)
-    - Update `blamematrix` and to account for corrected duplicate handling in `mimeo`.
+  - Update `blamematrix` and to account for corrected duplicate handling in `mimeo`.
 - `usurp` now uses the new "original file" column from `grubbers` while creating hard-links.
 - Total size now closely resembles `df` results (fix #75 @kopardev)
 - Files with future timestamps are handled correctly (fix #76, @kopardev)
-  
+
 ## spacesavers2 0.10.2
 
 - Now tracking user-facing changes with a changelog. (#61, @kelly-sovacool)
diff --git a/bin/render.R b/bin/render.R
deleted file mode 100755
index ebb8ae1..0000000
--- a/bin/render.R
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env Rscript
-rmarkdown::render("report.Rmd",
-  output_file = "datashare/report.html",
-  params = list(input_dir = "data")
-)
diff --git a/bin/render.sh b/bin/render.sh
deleted file mode 100755
index 6c3242a..0000000
--- a/bin/render.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-# to be executed from /data/CCBR_Pipeliner/Tools/spacesavers2/report
-# Usage: bash bin/render_report_biowulf.sh
-module load singularity
-SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS
-
-# render report
-echo "cd /mnt && \
-    Rscript bin/render.R \
-    " |\
-    singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash
diff --git a/bin/render_report_biowulf.sh b/bin/render_report_biowulf.sh
deleted file mode 100755
index 714eaa9..0000000
--- a/bin/render_report_biowulf.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env bash
-# to be executed from /data/CCBR_Pipeliner/Tools/spacesavers2/report
-# Usage: bash bin/render_report_biowulf.sh
-module load singularity
-SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS
-
-today=$(date +'%Y-%m-%d')
-year=$(date +'%Y')
-mkdir -p datashare/$year
-html_filename="datashare/${year}/spacesavers2-report_${today}.html"
-recipient_email="kelly.sovacool@nih.gov,vishal.koparde@nih.gov"
-
-url=https://hpc.nih.gov/~CCBR_Pipeliner/spacesavers2/${year}/spacesavers2-report_${today}.html
-
-# update disk usage
-bash bin/disk_usage.sh
-# render report and send via email
-echo "cd /mnt && \
-    Rscript bin/render.R && \
-    cp datashare/report.html $html_filename && \
-    python src/send_email.py \
-        $html_filename \
-        $url \
-        $recipient_email \
-    " |\
-    singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash
-
-chmod -R a+r datashare/
-cp -r datashare/* /data/CCBR_Pipeliner/datashare/spacesavers2/
diff --git a/docker/spacesavers2/Dockerfile b/docker/spacesavers2/Dockerfile
deleted file mode 100644
index 233b5fe..0000000
--- a/docker/spacesavers2/Dockerfile
+++ /dev/null
@@ -1,105 +0,0 @@
-FROM ubuntu:20.04
-
-# build time variables
-ARG BUILD_DATE="000000"
-ENV BUILD_DATE=${BUILD_DATE}
-ARG BUILD_TAG="000000"
-ENV BUILD_TAG=${BUILD_TAG}
-ARG REPONAME="000000"
-ENV REPONAME=${REPONAME}
-
-RUN mkdir -p /opt2 && mkdir -p /data2
-ENV TZ=America/New_York
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-
-RUN apt update && apt-get -y upgrade
-# Set the locale
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-		locales build-essential cmake cpanminus && \
-	localedef -i en_US -f UTF-8 en_US.UTF-8 && \
-	cpanm FindBin Term::ReadLine
-
-# install basic dependencies with apt-get
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-	figlet \
-	g++ \
-	gcc \
-	gfortran \
-	git \
-	libatlas-base-dev \
-	libblas-dev \
-	libboost-dev \
-	libbz2-dev \
-	libcurl4-openssl-dev \
-	libexpat1-dev \
-	libfreetype6-dev \
-	libgd-dev \
-	libgd-perl \
-	libglib2.0-dev \
-    libgpgme11-dev \
-	libgs-dev \
-	libgsl-dev \
-	libgsl0-dev \
-	libhtml-template-compiled-perl \
-	libicu-dev \
-	libjudy-dev \
-	liblapack-dev \
-	liblzma-dev \
-	libmysqlclient-dev \
-	libncurses-dev \
-	libopenmpi-dev \
-	libpng-dev \
-	librtmp-dev \
-    libseccomp-dev \
-	libssl-dev \
-	libtool \
-	libxml-libxml-debugging-perl \
-	libxml-opml-simplegen-perl \
-	libxml2-dev \
-	libxslt-dev \
-	make \
-	manpages-dev \
-	openjdk-17-jre-headless \
-	parallel \
-	pigz \
-    pkg-config \
-	python3-pip \
-    python3-dev \
-	rsync \
-    squashfs-tools \
-	unzip \
-    uuid-dev \
-	wget \
-	zlib1g \
-	zlib1g-dev \
-	zlibc
-
-# Install conda and give write permissions to conda folder
-RUN echo 'export PATH=/opt2/conda/bin:$PATH' > /etc/profile.d/conda.sh && \
-    wget --quiet "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" -O ~/miniforge3.sh && \
-    /bin/bash ~/miniforge3.sh -b -p /opt2/conda && \
-    rm ~/miniforge3.sh && chmod 777 -R /opt2/conda/
-ENV PATH="/opt2/conda/bin:$PATH"
-
-# install pandoc & R packages
-COPY environment.txt /data2/
-RUN mamba install -c conda-forge --file /data2/environment.txt
-ENV R_LIBS_USER=/opt2/conda/lib/R/library/
-
-# install quarto
-ENV QUARTO_VERSION="1.3.450"
-ADD https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-amd64.tar.gz /opt2
-WORKDIR /opt2
-RUN tar -xzvf quarto-${QUARTO_VERSION}-linux-amd64.tar.gz
-ENV PATH="/opt2/quarto-${QUARTO_VERSION}/bin/:${PATH}"
-RUN quarto check
-
-# Save Dockerfile in the docker
-COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG}
-RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG}
-
-# cleanup
-WORKDIR /data2
-RUN apt-get clean && apt-get purge \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/docker/spacesavers2/environment.txt b/docker/spacesavers2/environment.txt
deleted file mode 100644
index 5334639..0000000
--- a/docker/spacesavers2/environment.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-pandoc
-r-base=4.3.1
-r-DT
-r-RColorBrewer
-r-bslib=0.5.1
-r-crosstalk
-r-downlit
-r-dplyr
-r-fontawesome
-r-ggplot2
-r-glue
-r-here
-r-janitor
-r-knitr
-r-lubridate
-r-plotly
-r-purrr
-r-readr
-r-rlang
-r-rmarkdown
-r-scales
-r-shiny
-r-stringr
-r-tibble
-r-tidyr
-r-xml2
diff --git a/docker/spacesavers2/meta.yml b/docker/spacesavers2/meta.yml
deleted file mode 100644
index d68cd0a..0000000
--- a/docker/spacesavers2/meta.yml
+++ /dev/null
@@ -1,4 +0,0 @@
-dockerhub_namespace: nciccbr
-image_name: spacesavers2
-version: 0.1.1
-container: "$(dockerhub_namespace)/$(image_name):$(version)"
diff --git a/report.Rmd b/report.Rmd
deleted file mode 100644
index c04f01f..0000000
--- a/report.Rmd
+++ /dev/null
@@ -1,504 +0,0 @@
----
-title: "spacesavers2 🚀 report"
-author: "CCR Collaborative Bioinformatics Resource"
-date: '`r lubridate::today()`'
-output:
-  html_document:
-    theme:
-      version: 5
-    code_folding: hide
-    toc: true
-    self_contained: true
-params:
-  input_dir: '/data/CCBR_Pipeliner/userdata/spacesavers2/'
-  n_top_users: 10
-knit: (function(inputFile, encoding) {
-  rmarkdown::render(inputFile, encoding = encoding, output_dir = "datashare/") })
----
-```{r setup}
-knitr::opts_chunk$set(message = FALSE, warning = FALSE)
-```
-
-Notice a bug or want to make a suggestion for this report? [Open an issue](https://github.com/CCBR/spacesavers2/issues) on GitHub.
-
-```{r load}
-library(bslib)
-library(dplyr)
-library(DT)
-library(fontawesome)
-library(ggplot2)
-library(glue)
-library(here)
-library(htmltools)
-library(knitr)
-library(lubridate)
-library(plotly)
-library(purrr)
-library(readr)
-library(rlang)
-library(scales)
-library(shiny)
-library(stringr)
-library(tidyr)
-theme_set(theme_bw())
-
-to_bytes <- function(x, from_unit) {
-  bytes_units <- list(
-    KiB = 1,
-    MiB = 2,
-    GiB = 3,
-    TiB = 4
-  )
-  return(x * (1024^bytes_units[[from_unit]]))
-}
-from_bytes <- function(x, to_unit) {
-  return(x * x / (to_bytes(x, to_unit)))
-}
-
-from_bytes_v <- Vectorize(from_bytes)
-to_bytes_v <- Vectorize(to_bytes)
-
-filter_users <- function(dat, usercol = username) {
-  non_people <- c("allusers", "rpcuser", "slurm")
-  dat %>%
-    filter(
-      !({{ usercol }} %in% non_people), # not actual people
-      !str_detect({{ usercol }}, "[0-9]") # entirely numeric usernames
-    )
-}
-
-is_large_range <- function(x, n_orders_magnitude = 5) {
-  xrange <- range(x)
-  return((xrange[2] - xrange[1]) >= 10^n_orders_magnitude)
-}
-
-plot_user_metric <- function(dat, x_metric) {
-  dat %>%
-    ggplot(aes(
-      x = eval_tidy(data_sym(x_metric)),
-      y = username,
-      fill = eval_tidy(data_sym(x_metric)),
-      text = glue("{username}\n{eval_tidy(data_sym(x_metric))} {x_metric}")
-    )) +
-    geom_col() +
-    # TODO: ggplotly doesn't know what to do with scale::label_log
-    # {if (is_large_range(dat %>% pull(x_metric))) scale_x_log10(labels = label_log(digits = 2)) } +
-    labs(x = x_metric, y = "") +
-    theme(legend.position = "none")
-}
-
-plot_metric_time <- function(dat, y_metric) {
-  dat %>%
-    ggplot(aes(
-      x = date,
-      y = eval_tidy(data_sym(y_metric)),
-      color = username
-    )) +
-    geom_line(alpha = 0.7) +
-    geom_point(aes(text = glue("{username}\n{eval_tidy(data_sym(y_metric))} {y_metric}"))) +
-    labs(y = y_metric)
-}
-
-min_user_bytes_GiB <- 10
-panel_summary <- function(dat,
-                          folder_path = "/data/CCBR",
-                          plot_fcn = plot_metric_time,
-                          min_bytes_GiB = min_user_bytes_GiB) {
-  summary_dat_folder <- dat %>%
-    filter(FolderPath == folder_path) %>%
-    mutate(TotalBytes_GiB = from_bytes(TotalBytes, 'GiB')) %>% 
-    # only keep users with at least 10 GiB total usage
-    filter(TotalBytes_GiB >= min_bytes_GiB) %>% 
-    select(-TotalBytes_GiB)
-  top_users <- summary_dat_folder %>%
-    pivot_longer(all_of(summary_metrics),
-      names_to = "metric"
-    ) %>%
-    mutate(value_adj = case_when(
-      metric == "OverallScore" ~ -value,
-      TRUE ~ value
-    )) %>%
-    group_by(metric) %>%
-    slice_max(order_by = value_adj, n = n_top_users) %>%
-    pull(username) %>%
-    unique()
-  plots <- summary_metrics %>% lapply(function(y_metric) {
-    user_order <- summary_dat_folder %>%
-      filter(username %in% top_users) %>%
-      pivot_longer(all_of(summary_metrics),
-        names_to = "metric"
-      ) %>%
-      mutate(value_adj = case_when(
-        metric == "OverallScore" ~ -value,
-        TRUE ~ value
-      )) %>%
-      filter(metric == y_metric) %>%
-      arrange(by = value_adj) %>%
-      pull(username) %>%
-      unique()
-    if (y_metric == "TotalBytes" | y_metric == "DuplicateBytes") {
-      to_unit <- "TiB" # TODO: dynamically set based on range of metric
-      new_metric_name <- glue("{y_metric}_{to_unit}")
-      summary_dat_folder <- summary_dat_folder %>%
-        mutate("{new_metric_name}" := from_bytes(eval_tidy(data_sym(y_metric)), to_unit))
-      y_metric <- new_metric_name
-    } else if (y_metric == "TotalMeanAge" | y_metric == "DuplicateMeanAge") {
-      new_metric_name <- glue("{y_metric}_Days")
-      summary_dat_folder <- summary_dat_folder %>%
-        rename("{new_metric_name}" := y_metric)
-      y_metric <- new_metric_name
-    } else if (y_metric == "TotalFiles" | y_metric == "DuplicateFiles") {
-      new_metric_name <- glue("{y_metric}_Millions")
-      summary_dat_folder <- summary_dat_folder %>%
-        mutate("{new_metric_name}" := eval_tidy(data_sym(y_metric)) / 10^6)
-      y_metric <- new_metric_name
-    }
-    p <- summary_dat_folder %>%
-      filter(username %in% user_order) %>%
-      mutate(username = factor(username, levels = user_order)) %>%
-      mutate(across(where(is.numeric), round, digits = 2)) %>%
-      plot_fcn(y_metric)
-    nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
-  })
-  nav_panel(
-    title = markdown(glue("`{folder_path}`")),
-    navset_pill_list(!!!plots)
-  )
-}
-```
-
-```{r read_data}
-n_top_users <- params$n_top_users
-input_dir <- params$input_dir # here("data")
-aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
-# TODO: only load last N weeks of data to keep RAM usage reasonably low
-all_files <- tibble(filename = list.dirs(input_dir) %>%
-  Filter(function(x) {
-    x != input_dir
-  }, .) %>%
-  lapply(function(x) {
-    list.files(x, full.names = TRUE)
-  }) %>%
-  unlist())
-user_dat <- all_files %>%
-  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext"),
-    too_few = "debug"
-  ) %>%
-  mutate(date = as_date(basename(date)))
-
-dates <- user_dat %>%
-  filter(!is.na(date)) %>%
-  pull(date) %>%
-  unique()
-most_recent_date <- dates %>% max()
-
-total_usage_tb <- user_dat %>%
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "summary",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_tsv() %>%
-  filter(FolderPath == "/data/CCBR") %>%
-  mutate(disk_usage_tb = from_bytes(TotalBytes, "TiB")) %>%
-  pull(disk_usage_tb)
-# TODO disk_usage_tb doesn't agree with output from `df`
-
-grubbers_allusers_err <- user_dat %>%
-  filter(
-    username == "allusers",
-    date == most_recent_date,
-    file == "grubbers",
-    ext == "err",
-    path == "_data_CCBR"
-  ) %>%
-  pull(filename) %>%
-  read_lines()
-grubbers_message <- grubbers_allusers_err[2] %>%
-  str_split(":") %>%
-  unlist() %>%
-  .[3]
-
-user_dat <- user_dat %>% filter_users()
-usernames <- user_dat %>%
-  pull(username) %>%
-  unique()
-
-summary_dat_recent <- user_dat %>%
-  filter(
-    date == most_recent_date, file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  )
-summary_metrics <- summary_dat_recent %>%
-  pivot_longer(where(is.numeric), names_to = "metric") %>%
-  pull(metric) %>%
-  unique()
-```
-
-## Total disk usage
-
-```{r disk_usage_latest}
-disk_usage <- read_tsv(here("results", "disk_usage.tsv")) %>% 
-      mutate(used_tib = from_bytes(to_bytes(Used, "KiB"), "TiB"),
-             avail_tib = from_bytes(to_bytes(Avail, "KiB"),"TiB"),
-             size_tib = used_tib + avail_tib)
-df_date <- disk_usage %>%
-  slice_max(datetime) %>% 
-  pull(datetime) %>%
-  as_date()
-
-layout_column_wrap(
-  width = 1 / 2,
-  value_box(
-    title = p(fa("hard-drive"), "  Disk space in /data/CCBR"),
-    value = markdown(disk_usage %>%
-                       slice_max(datetime) %>% 
-      mutate(Usage = glue("{round(used_tib,1)} / {size_tib}")) %>%
-      select(Usage, `Use%`) %>%
-      kable()),
-    theme = "warning"
-  ),
-  value_box(
-    title = p(fa("users", prefer_type = "regular"), "  Users"),
-    value = p(glue("{length(usernames)} users as of {format(df_date, '%b %d, %Y')}")),
-    theme = "primary"
-  )
-)
-```
-
-### Total usage over time
-
-```{r disk_usage_over_time}
-p <- disk_usage %>% 
-  mutate(datetime = lubridate::as_datetime(datetime)) %>%
-  rename(used = used_tib, size = size_tib, avail = avail_tib) %>% 
-  pivot_longer(c(used, size), names_to = 'metric') %>% 
-  mutate(value = round(value, 2)) %>%
-    ggplot(aes(
-      x = datetime,
-      y = value,
-      color = metric,
-      group = metric
-    )) +
-    geom_line(alpha = 0.7) +
-    geom_point(aes(text = glue("{value} TiB"))) +
-  scale_x_datetime(labels = date_format("%b %Y")) +
-  scale_color_brewer(palette = "Set2",
-                     breaks = c('size', 'used') # enforce order
-                     ) +
-    labs(y = 'TiB', x = '') + 
-  theme(legend.title = element_blank())
-
-card(ggplotly(p, tooltip = "text"))
-```
-
-
-## Summary over time
-
-Usage by top users for each spacesavers metric.
-Only users with at least `r min_user_bytes_GiB` GiB of total disk usage are shown.
-
-```{r summary_over_time}
-summary_dat_all <- user_dat %>%
-  filter(
-    file == "summary"
-  ) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x) %>% mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("basepath", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = str_replace(basepath, ".*/", "") %>% as_date())
-
-navset_tab(
-  summary_dat_all %>% panel_summary("/data/CCBR", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/rawdata", plot_metric_time),
-  summary_dat_all %>% panel_summary("/data/CCBR/projects", plot_metric_time),
-)
-```
-
-
-## Most recent summary (`r most_recent_date`)
-
-Usage by top users for each spacesavers metric.
-
-```{r summary_recent}
-navset_tab(
-  summary_dat_recent %>% panel_summary("/data/CCBR", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/rawdata", plot_user_metric),
-  summary_dat_recent %>% panel_summary("/data/CCBR/projects", plot_user_metric),
-)
-```
-
-## Summary table
-
-```{r allusers_summary}
-allusers_summary <- all_files %>%
-  filter(str_detect(filename, "_data_CCBR.allusers.summary.txt")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x)
-  }) %>%
-  list_rbind() %>%
-  mutate(
-    TotalBytes_GiB = round(from_bytes_v(TotalBytes, "GiB"), 2),
-    DuplicateBytes_GiB = round(from_bytes_v(DuplicateBytes, "GiB"), 2),
-    .before = "DuplicateBytes"
-  ) %>%
-  select(-c(TotalBytes, DuplicateBytes))
-
-card(
-  card_header("Summary across all users"),
-  datatable(allusers_summary, fillContainer = TRUE)
-)
-```
-
-
-## Blame matrix
-
-```{r blame}
-blame_matrix <- all_files %>%
-  filter(str_detect(filename, "blamematrix")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter(!is.na(date), file == "blamematrix", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x)
-  }) %>%
-  list_rbind()
-
-card(
-  card_header("Disk usage by user in subdirectories"),
-  datatable(blame_matrix, fillContainer = TRUE)
-)
-```
-
-
-## Duplicate files
-
-`r grubbers_message`
-
-### Potential savings per user
-
-```{r grub_err}
-grub_err <- user_dat %>%
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "err", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>%
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  filter(str_detect(X1, "Deleting")) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(
-    date = as_date(basename(date)),
-    grub_msg = str_replace_all(X1, regex("^.*:"), ""),
-    savings_value = as.numeric(
-      str_replace_all(
-        grub_msg,
-        regex(".*save ([\\d\\.]*) [\\w!]+"),
-        "\\1"
-      )
-    ),
-    savings_unit = str_replace_all(
-      grub_msg,
-      regex(".*save [\\d\\.]* ([\\w]+)!"),
-      "\\1"
-    ),
-    savings_bytes = to_bytes_v(savings_value, savings_unit)
-  )
-
-user_grub_table <- grub_err %>%
-  arrange(desc(savings_bytes)) %>%
-  select(username, savings_value, savings_unit)
-
-card(
-  card_header("Savings per user"),
-  datatable(user_grub_table, fillContainer = TRUE)
-)
-```
-
-
-### All high-value duplicates
-
-```{r grubbers}
-grub_dat <- user_dat %>%
-  filter_users() %>%
-  filter(!is.na(date), file == "grubbers", ext == "tsv", path == "_data_CCBR") %>%
-  slice_max(order_by = date) %>%
-  pull(filename) %>%
-  map(function(x) {
-    read_tsv(x, col_names = FALSE) %>%
-      mutate(filename = x)
-  }) %>%
-  list_rbind() %>%
-  rename(
-    file_hash = X1,
-    file_count = X2,
-    total_disk_usage = X3,
-    single_disk_usage = X4,
-    filepaths = X5
-  ) %>%
-  separate_wider_delim(filename,
-    delim = ".", cols_remove = FALSE,
-    names = c("date", "path", "username", "file", "ext")
-  ) %>%
-  mutate(date = as_date(basename(date))) %>%
-  filter_users() %>%
-  separate_wider_delim(total_disk_usage,
-    delim = " ",
-    names = c("total_disk_usage_value", "total_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  separate_wider_delim(single_disk_usage,
-    delim = " ",
-    names = c("single_disk_usage_value", "single_disk_usage_unit"),
-    cols_remove = FALSE
-  ) %>%
-  mutate(across(all_of(c("total_disk_usage_value", "single_disk_usage_value")), as.numeric))
-
-top_files <- grub_dat %>%
-  arrange(order_by = desc(total_disk_usage_value)) %>%
-  select(total_disk_usage_value, username, filepaths) %>%
-  rename(disk_usage_gb = total_disk_usage_value)
-
-card(card_header("Top files"), datatable(top_files, fillContainer = TRUE))
-```
-
-
-For instructions on how to replace duplicates with hard links, see the
-[`usurp` command in the spacesavers docs](https://ccbr.github.io/spacesavers2/usurp/).

From 6c92e66fb9f9ae7e7b46af243178ad56bd777e5e Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Wed, 14 Feb 2024 09:12:51 -0500
Subject: [PATCH 2/8] docs: link PR

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ca870d..749ca74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 ## spacesavers2 development version
 
-- Move the report to a separate internal repository (@kelly-sovacool)
+- Move the report to a separate internal repository (#79, @kelly-sovacool)
 
 ### New features
 

From 52a7fc84c0d87eb0fa174a8786fe61aa9dead138 Mon Sep 17 00:00:00 2001
From: kopardev <vishal.koparde@nih.gov>
Date: Tue, 27 Feb 2024 17:39:24 -0500
Subject: [PATCH 3/8] add quick assess command pdq; fix #89

---
 spacesavers2_pdq | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 src/pdq.py       |  63 ++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)
 create mode 100755 spacesavers2_pdq
 create mode 100644 src/pdq.py

diff --git a/spacesavers2_pdq b/spacesavers2_pdq
new file mode 100755
index 0000000..d1885da
--- /dev/null
+++ b/spacesavers2_pdq
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# pdq = pretty darn quick
+
+from src.VersionCheck import version_check
+from src.VersionCheck import __version__
+from src.utils import *
+
+version_check()
+
+# import required modules
+import textwrap
+import tqdm
+import sys
+from src.pdq import pdq
+from multiprocessing import Pool
+import argparse
+from pathlib import Path
+
+
+def task(f):  # Pool worker: called once per path by pool.imap_unordered in main()
+    record = pdq()
+    record.set(f)  # fill the record from the path via pdq.set
+    return record
+
+
+def main():
+    """Command-line entry point for spacesavers2_pdq.
+
+    Parses the options, examines every path under --folder in a worker pool and
+    writes one "username<TAB>files<TAB>bytes" line per owner.
+    """
+    usage_epilog = textwrap.dedent(
+        """\
+    Version:
+        {}
+    Example:
+        > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file
+        """.format(
+            __version__
+        )
+    )
+    parser = argparse.ArgumentParser(
+        description="spacesavers2_pdq: get quick per user info (number of files and bytes).",
+        epilog=usage_epilog,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "-f",
+        "--folder",
+        dest="folder",
+        required=True,
+        type=str,
+        help="spacesavers2_pdq will be run on all files in this folder and its subfolders",
+    )
+    parser.add_argument(
+        "-p",
+        "--threads",
+        dest="threads",
+        required=False,
+        type=int,
+        default=4,
+        help="number of threads to be used (default 4)",
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        dest="outfile",
+        required=False,
+        type=str,
+        help="outfile ... catalog file .. by default output is printed to screen",
+    )
+    parser.add_argument("-v", "--version", action="version", version=__version__)
+
+    global args
+    args = parser.parse_args()
+
+    # the folder itself is examined first, then everything beneath it
+    root = Path(args.folder)
+    paths = [root, *root.glob("**/*")]
+
+    # write to the requested file, otherwise to the screen
+    sink = open(args.outfile, 'w') if args.outfile else sys.stdout
+
+    # owner uid -> {inode: size}; an inode is recorded once per owner
+    usage = {}
+
+    with Pool(processes=args.threads) as pool:
+        for rec in tqdm.tqdm(pool.imap_unordered(task, paths),total=len(paths)):
+            if not rec.is_file():
+                continue
+            per_owner = usage.setdefault(rec.get_uid(), {})
+            per_owner.setdefault(rec.get_inode(), rec.get_size())
+
+    # one tab-delimited line per owner: name, file count, total bytes
+    for uid, sizes_by_inode in usage.items():
+        username = get_username_groupname(uid)
+        nfiles = len(sizes_by_inode)
+        nbytes = sum(sizes_by_inode.values())
+        sink.write(f"{username}\t{nfiles}\t{nbytes}\n")
+
+    if args.outfile:
+        sink.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/src/pdq.py b/src/pdq.py
new file mode 100644
index 0000000..604780b
--- /dev/null
+++ b/src/pdq.py
@@ -0,0 +1,63 @@
+from pathlib import Path
+import sys
+
+def get_type(p): # copy paste from FileDetails
+    """Return a one-letter type code for path p (a pathlib path object).
+
+    u = unknown, L = broken symlink, l = symlink, a = absent,
+    f = file, d = folder or directory
+
+    Broken symlinks and paths whose checks raise are reported on stderr.
+    """
+    x = "u" # unknown
+    try:
+        if p.is_symlink():
+            x = "l" # link or symlink
+            try:
+                broken = not p.exists() # follows the link: False when the target is missing
+            except Exception:
+                broken = True # target could not be examined; treated as broken, as before
+            if broken:
+                x = "L" # upper case L is broken symlink
+                sys.stderr.write("spacesavers2:Broken symlink found:{}\n".format(p))
+            return x
+        if not p.exists():
+            x = "a" # absent
+            return x
+        if p.is_dir():
+            x = "d" # directory
+            return x
+        if p.is_file():
+            x = "f" # file
+            return x
+    except Exception: # mainly to catch PermissionError:
+        sys.stderr.write("spacesavers2:File cannot be read:{}\n".format(p))
+    return x
+
+class pdq:
+    """Minimal per-path record: inode, owner uid, on-disk size and type code."""
+    def __init__(self):
+        self.inode  = -1
+        self.fld    = "u" # type code from get_type; stays "u" until set() succeeds
+        self.size   = -1
+        self.uid    = 0
+    def set(self,p,st_block_byte_size=512):
+        p           = Path(p).absolute()
+        try:
+            st          = p.stat(follow_symlinks=False) # the path itself, not a symlink target
+            self.size   = st.st_blocks * st_block_byte_size # allocated blocks, not apparent size
+            self.inode  = st.st_ino
+            self.uid    = st.st_uid
+            self.fld    = get_type(p)
+        except Exception: # best effort: diagnostics go to stderr, never into the stdout report
+            sys.stderr.write(f"spacesavers2_pdq: {p} File not found!\n")
+    def get_uid(self):
+        return self.uid
+    def get_fld(self):
+        return self.fld
+    def is_file(self):
+        return self.fld == "f"
+    def get_inode(self):
+        return self.inode
+    def get_size(self):
+        return self.size
\ No newline at end of file

From 9c2a355f9899522d2504408c2616a54796d4b123 Mon Sep 17 00:00:00 2001
From: kopardev <vishal.koparde@nih.gov>
Date: Tue, 27 Feb 2024 17:40:30 -0500
Subject: [PATCH 4/8] fix: add bin redirect for new command

---
 bin/spacesavers2_pdq | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 bin/spacesavers2_pdq

diff --git a/bin/spacesavers2_pdq b/bin/spacesavers2_pdq
new file mode 120000
index 0000000..577f1ce
--- /dev/null
+++ b/bin/spacesavers2_pdq
@@ -0,0 +1 @@
+redirect
\ No newline at end of file

From 338ddcab1babad91690e9b0fa152dec1d666c130 Mon Sep 17 00:00:00 2001
From: kopardev <vishal.koparde@nih.gov>
Date: Tue, 27 Feb 2024 17:40:58 -0500
Subject: [PATCH 5/8] docs: add new command documentation

---
 README.md   |  1 +
 docs/pdq.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 docs/pdq.md

diff --git a/README.md b/README.md
index c31b6a2..7e41ba2 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@ Welcome! `spacesavers2`:
 - spacesavers2_grubbers
 - spacesavers2_e2e
 - spacesavers2_usurp
+- spacesavers2_pdq
 
 ## `spacesavers2` typical workflow looks like this:
 
diff --git a/docs/pdq.md b/docs/pdq.md
new file mode 100644
index 0000000..efb9060
--- /dev/null
+++ b/docs/pdq.md
@@ -0,0 +1,60 @@
+## spacesavers2_pdq
+
+pdq = Pretty Darn Quick
+
+This uses `pathlib`'s `glob` to recursively list all files in a user-provided folder.
+
+For each user it gathers information like:
+ - total number of files
+ - total number of bytes
+
+It is a quick tool to gather datapoints to monitor filesystem usage. Typically, it can be run once daily and compared with the previous day's run to find large changes.
+
+### Inputs
+ - `--folder`: Path to the folder to run `spacesavers2_pdq` on.
+ - `--threads`: `spacesavers2_pdq` uses the `multiprocessing` library to examine paths in parallel. This defines the number of parallel workers (default 4).
+ - `--outfile`: If not supplied then the output is written to the screen.
+
+> NOTE: `spacesavers2_pdq` reports errors (eg. cannot read file) to STDERR
+
+```bash
+usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-v]
+
+spacesavers2_pdq: get quick per user info (number of files and bytes).
+
+options:
+  -h, --help            show this help message and exit
+  -f FOLDER, --folder FOLDER
+                        spacesavers2_pdq will be run on all files in this folder and its subfolders
+  -p THREADS, --threads THREADS
+                        number of threads to be used (default 4)
+  -o OUTFILE, --outfile OUTFILE
+                        outfile ... catalog file .. by default output is printed to screen
+  -v, --version         show program's version number and exit
+
+Version:
+    v0.12.0
+Example:
+    > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file
+```
+
+### Output
+
+#### tab-delimited output (file)
+
+`spacesavers2_pdq` creates one tab-separated output line per user:
+
+```bash
+% head -n3 test.out
+user1       1386138 6089531321856
+user2  230616  2835680212992
+user3      1499    126442496
+```
+The 3 items in the line are as follows:
+
+
+| Column | Description              | Example                                                                                        |
+| ------ | ------------------------ | ---------------------------------------------------------------------------------------------- |
+| 1      | username                 | "user1" |
+| 2      | total no. of files owned     | 1386138                                                                                          |
+| 3      | total no. of bytes occupied      | 6089531321856                                                                                        |

From b3a6b9f0c64be3e3bc508af35a5b815c4dc90985 Mon Sep 17 00:00:00 2001
From: kopardev <vishal.koparde@nih.gov>
Date: Tue, 27 Feb 2024 17:41:30 -0500
Subject: [PATCH 6/8] chore: update version number for next release

---
 CHANGELOG.md  | 6 ++++++
 docs/index.md | 1 +
 src/VERSION   | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b6efdb..18defdc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@
 
 ### Bug fixes
 
+## spacesavers2 0.11.5
+
+### New features
+
+- new command `spacesavers2_pdq` to get per-user number of files and number of bytes
+
 ## spacesavers2 0.11.4
 
 ### New features
diff --git a/docs/index.md b/docs/index.md
index 1b16c90..fc357a5 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -30,6 +30,7 @@
 - [spacesavers2_blamematrix](blamematrix.md)
 - [spacesavers2_usurp](usurp.md)
 - [spacesavers2_e2e](e2e.md)
+- [spacesavers2_pdq](pdq.md)
 
 ## Use case
 
diff --git a/src/VERSION b/src/VERSION
index 35ad344..d33c3a2 100644
--- a/src/VERSION
+++ b/src/VERSION
@@ -1 +1 @@
-0.11.4
+0.12.0
\ No newline at end of file

From d689730f4f076420e4c1e1785e47d1ddf8b349a8 Mon Sep 17 00:00:00 2001
From: kopardev <vishal.koparde@nih.gov>
Date: Tue, 27 Feb 2024 17:43:20 -0500
Subject: [PATCH 7/8] refact: using version 0.11.5

---
 src/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/VERSION b/src/VERSION
index d33c3a2..62d5dbd 100644
--- a/src/VERSION
+++ b/src/VERSION
@@ -1 +1 @@
-0.12.0
\ No newline at end of file
+0.11.5

From c9393a0d070a89d0829fec076ecdb019bab6e4c8 Mon Sep 17 00:00:00 2001
From: kopardev <vishal.koparde@nih.gov>
Date: Tue, 27 Feb 2024 17:43:55 -0500
Subject: [PATCH 8/8] chore: update docs to reflect new version

---
 docs/pdq.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/pdq.md b/docs/pdq.md
index efb9060..8f0bd45 100644
--- a/docs/pdq.md
+++ b/docs/pdq.md
@@ -33,7 +33,7 @@ options:
   -v, --version         show program's version number and exit
 
 Version:
-    v0.12.0
+    v0.11.5
 Example:
     > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file
 ```