From 2ad422d4b58286f73e441e8b45511ac18f14e115 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 14 Feb 2024 09:10:58 -0500 Subject: [PATCH 1/8] refactor: move report to https://github.com/CCBR/reports --- CHANGELOG.md | 6 +- bin/render.R | 5 - bin/render.sh | 11 - bin/render_report_biowulf.sh | 29 -- docker/spacesavers2/Dockerfile | 105 ------ docker/spacesavers2/environment.txt | 26 -- docker/spacesavers2/meta.yml | 4 - report.Rmd | 504 ---------------------------- 8 files changed, 4 insertions(+), 686 deletions(-) delete mode 100755 bin/render.R delete mode 100755 bin/render.sh delete mode 100755 bin/render_report_biowulf.sh delete mode 100644 docker/spacesavers2/Dockerfile delete mode 100644 docker/spacesavers2/environment.txt delete mode 100644 docker/spacesavers2/meta.yml delete mode 100644 report.Rmd diff --git a/CHANGELOG.md b/CHANGELOG.md index b65cf73..3ca870d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ ## spacesavers2 development version +- Move the report to a separate internal repository (@kelly-sovacool) + ### New features ### Bug fixes @@ -21,11 +23,11 @@ - `grubbers` `--limit` can be < 1 GiB (float) (#70, @kopardev) - `grubbers` output file format changed. New original file column added. Original file is required by `usurp`. - `mimeo` `--duplicateonly` now correctly handles duplicates owned by different UIDs. (#71, @kopardev) - - Update `blamematrix` and to account for corrected duplicate handling in `mimeo`. + - Update `blamematrix` and to account for corrected duplicate handling in `mimeo`. - `usurp` now uses the new "original file" column from `grubbers` while creating hard-links. - Total size now closely resembles `df` results (fix #75 @kopardev) - Files with future timestamps are handled correctly (fix #76, @kopardev) - + ## spacesavers2 0.10.2 - Now tracking user-facing changes with a changelog. 
(#61, @kelly-sovacool) diff --git a/bin/render.R b/bin/render.R deleted file mode 100755 index ebb8ae1..0000000 --- a/bin/render.R +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env Rscript -rmarkdown::render("report.Rmd", - output_file = "datashare/report.html", - params = list(input_dir = "data") -) diff --git a/bin/render.sh b/bin/render.sh deleted file mode 100755 index 6c3242a..0000000 --- a/bin/render.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -# to be executed from /data/CCBR_Pipeliner/Tools/spacesavers2/report -# Usage: bash bin/render_report_biowulf.sh -module load singularity -SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS - -# render report -echo "cd /mnt && \ - Rscript bin/render.R \ - " |\ - singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash diff --git a/bin/render_report_biowulf.sh b/bin/render_report_biowulf.sh deleted file mode 100755 index 714eaa9..0000000 --- a/bin/render_report_biowulf.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# to be executed from /data/CCBR_Pipeliner/Tools/spacesavers2/report -# Usage: bash bin/render_report_biowulf.sh -module load singularity -SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS - -today=$(date +'%Y-%m-%d') -year=$(date +'%Y') -mkdir -p datashare/$year -html_filename="datashare/${year}/spacesavers2-report_${today}.html" -recipient_email="kelly.sovacool@nih.gov,vishal.koparde@nih.gov" - -url=https://hpc.nih.gov/~CCBR_Pipeliner/spacesavers2/${year}/spacesavers2-report_${today}.html - -# update disk usage -bash bin/disk_usage.sh -# render report and send via email -echo "cd /mnt && \ - Rscript bin/render.R && \ - cp datashare/report.html $html_filename && \ - python src/send_email.py \ - $html_filename \ - $url \ - $recipient_email \ - " |\ - singularity exec -C -B $PWD:/mnt,/data/CCBR_Pipeliner/userdata/spacesavers2/:/mnt/data docker://nciccbr/spacesavers2:0.1.1 bash - -chmod -R a+r datashare/ -cp -r datashare/* 
/data/CCBR_Pipeliner/datashare/spacesavers2/ diff --git a/docker/spacesavers2/Dockerfile b/docker/spacesavers2/Dockerfile deleted file mode 100644 index 233b5fe..0000000 --- a/docker/spacesavers2/Dockerfile +++ /dev/null @@ -1,105 +0,0 @@ -FROM ubuntu:20.04 - -# build time variables -ARG BUILD_DATE="000000" -ENV BUILD_DATE=${BUILD_DATE} -ARG BUILD_TAG="000000" -ENV BUILD_TAG=${BUILD_TAG} -ARG REPONAME="000000" -ENV REPONAME=${REPONAME} - -RUN mkdir -p /opt2 && mkdir -p /data2 -ENV TZ=America/New_York -RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone - -RUN apt update && apt-get -y upgrade -# Set the locale -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - locales build-essential cmake cpanminus && \ - localedef -i en_US -f UTF-8 en_US.UTF-8 && \ - cpanm FindBin Term::ReadLine - -# install basic dependencies with apt-get -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - build-essential \ - figlet \ - g++ \ - gcc \ - gfortran \ - git \ - libatlas-base-dev \ - libblas-dev \ - libboost-dev \ - libbz2-dev \ - libcurl4-openssl-dev \ - libexpat1-dev \ - libfreetype6-dev \ - libgd-dev \ - libgd-perl \ - libglib2.0-dev \ - libgpgme11-dev \ - libgs-dev \ - libgsl-dev \ - libgsl0-dev \ - libhtml-template-compiled-perl \ - libicu-dev \ - libjudy-dev \ - liblapack-dev \ - liblzma-dev \ - libmysqlclient-dev \ - libncurses-dev \ - libopenmpi-dev \ - libpng-dev \ - librtmp-dev \ - libseccomp-dev \ - libssl-dev \ - libtool \ - libxml-libxml-debugging-perl \ - libxml-opml-simplegen-perl \ - libxml2-dev \ - libxslt-dev \ - make \ - manpages-dev \ - openjdk-17-jre-headless \ - parallel \ - pigz \ - pkg-config \ - python3-pip \ - python3-dev \ - rsync \ - squashfs-tools \ - unzip \ - uuid-dev \ - wget \ - zlib1g \ - zlib1g-dev \ - zlibc - -# Install conda and give write permissions to conda folder -RUN echo 'export PATH=/opt2/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ - wget --quiet 
"https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" -O ~/miniforge3.sh && \ - /bin/bash ~/miniforge3.sh -b -p /opt2/conda && \ - rm ~/miniforge3.sh && chmod 777 -R /opt2/conda/ -ENV PATH="/opt2/conda/bin:$PATH" - -# install pandoc & R packages -COPY environment.txt /data2/ -RUN mamba install -c conda-forge --file /data2/environment.txt -ENV R_LIBS_USER=/opt2/conda/lib/R/library/ - -# install quarto -ENV QUARTO_VERSION="1.3.450" -ADD https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-amd64.tar.gz /opt2 -WORKDIR /opt2 -RUN tar -xzvf quarto-${QUARTO_VERSION}-linux-amd64.tar.gz -ENV PATH="/opt2/quarto-${QUARTO_VERSION}/bin/:${PATH}" -RUN quarto check - -# Save Dockerfile in the docker -COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} -RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} - -# cleanup -WORKDIR /data2 -RUN apt-get clean && apt-get purge \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/spacesavers2/environment.txt b/docker/spacesavers2/environment.txt deleted file mode 100644 index 5334639..0000000 --- a/docker/spacesavers2/environment.txt +++ /dev/null @@ -1,26 +0,0 @@ -pandoc -r-base=4.3.1 -r-DT -r-RColorBrewer -r-bslib=0.5.1 -r-crosstalk -r-downlit -r-dplyr -r-fontawesome -r-ggplot2 -r-glue -r-here -r-janitor -r-knitr -r-lubridate -r-plotly -r-purrr -r-readr -r-rlang -r-rmarkdown -r-scales -r-shiny -r-stringr -r-tibble -r-tidyr -r-xml2 diff --git a/docker/spacesavers2/meta.yml b/docker/spacesavers2/meta.yml deleted file mode 100644 index d68cd0a..0000000 --- a/docker/spacesavers2/meta.yml +++ /dev/null @@ -1,4 +0,0 @@ -dockerhub_namespace: nciccbr -image_name: spacesavers2 -version: 0.1.1 -container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/report.Rmd b/report.Rmd deleted file mode 100644 index c04f01f..0000000 --- a/report.Rmd +++ /dev/null @@ -1,504 +0,0 @@ ---- -title: "spacesavers2 🚀 
report" -author: "CCR Collaborative Bioinformatics Resource" -date: '`r lubridate::today()`' -output: - html_document: - theme: - version: 5 - code_folding: hide - toc: true - self_contained: true -params: - input_dir: '/data/CCBR_Pipeliner/userdata/spacesavers2/' - n_top_users: 10 -knit: (function(inputFile, encoding) { - rmarkdown::render(inputFile, encoding = encoding, output_dir = "datashare/") }) ---- -```{r setup} -knitr::opts_chunk$set(message = FALSE, warning = FALSE) -``` - -Notice a bug or want to make a suggestion for this report? [Open an issue](https://github.com/CCBR/spacesavers2/issues) on GitHub. - -```{r load} -library(bslib) -library(dplyr) -library(DT) -library(fontawesome) -library(ggplot2) -library(glue) -library(here) -library(htmltools) -library(knitr) -library(lubridate) -library(plotly) -library(purrr) -library(readr) -library(rlang) -library(scales) -library(shiny) -library(stringr) -library(tidyr) -theme_set(theme_bw()) - -to_bytes <- function(x, from_unit) { - bytes_units <- list( - KiB = 1, - MiB = 2, - GiB = 3, - TiB = 4 - ) - return(x * (1024^bytes_units[[from_unit]])) -} -from_bytes <- function(x, to_unit) { - return(x * x / (to_bytes(x, to_unit))) -} - -from_bytes_v <- Vectorize(from_bytes) -to_bytes_v <- Vectorize(to_bytes) - -filter_users <- function(dat, usercol = username) { - non_people <- c("allusers", "rpcuser", "slurm") - dat %>% - filter( - !({{ usercol }} %in% non_people), # not actual people - !str_detect({{ usercol }}, "[0-9]") # entirely numeric usernames - ) -} - -is_large_range <- function(x, n_orders_magnitude = 5) { - xrange <- range(x) - return((xrange[2] - xrange[1]) >= 10^n_orders_magnitude) -} - -plot_user_metric <- function(dat, x_metric) { - dat %>% - ggplot(aes( - x = eval_tidy(data_sym(x_metric)), - y = username, - fill = eval_tidy(data_sym(x_metric)), - text = glue("{username}\n{eval_tidy(data_sym(x_metric))} {x_metric}") - )) + - geom_col() + - # TODO: ggplotly doesn't know what to do with scale::label_log 
- # {if (is_large_range(dat %>% pull(x_metric))) scale_x_log10(labels = label_log(digits = 2)) } + - labs(x = x_metric, y = "") + - theme(legend.position = "none") -} - -plot_metric_time <- function(dat, y_metric) { - dat %>% - ggplot(aes( - x = date, - y = eval_tidy(data_sym(y_metric)), - color = username - )) + - geom_line(alpha = 0.7) + - geom_point(aes(text = glue("{username}\n{eval_tidy(data_sym(y_metric))} {y_metric}"))) + - labs(y = y_metric) -} - -min_user_bytes_GiB <- 10 -panel_summary <- function(dat, - folder_path = "/data/CCBR", - plot_fcn = plot_metric_time, - min_bytes_GiB = min_user_bytes_GiB) { - summary_dat_folder <- dat %>% - filter(FolderPath == folder_path) %>% - mutate(TotalBytes_GiB = from_bytes(TotalBytes, 'GiB')) %>% - # only keep users with at least 10 GiB total usage - filter(TotalBytes_GiB >= min_bytes_GiB) %>% - select(-TotalBytes_GiB) - top_users <- summary_dat_folder %>% - pivot_longer(all_of(summary_metrics), - names_to = "metric" - ) %>% - mutate(value_adj = case_when( - metric == "OverallScore" ~ -value, - TRUE ~ value - )) %>% - group_by(metric) %>% - slice_max(order_by = value_adj, n = n_top_users) %>% - pull(username) %>% - unique() - plots <- summary_metrics %>% lapply(function(y_metric) { - user_order <- summary_dat_folder %>% - filter(username %in% top_users) %>% - pivot_longer(all_of(summary_metrics), - names_to = "metric" - ) %>% - mutate(value_adj = case_when( - metric == "OverallScore" ~ -value, - TRUE ~ value - )) %>% - filter(metric == y_metric) %>% - arrange(by = value_adj) %>% - pull(username) %>% - unique() - if (y_metric == "TotalBytes" | y_metric == "DuplicateBytes") { - to_unit <- "TiB" # TODO: dynamically set based on range of metric - new_metric_name <- glue("{y_metric}_{to_unit}") - summary_dat_folder <- summary_dat_folder %>% - mutate("{new_metric_name}" := from_bytes(eval_tidy(data_sym(y_metric)), to_unit)) - y_metric <- new_metric_name - } else if (y_metric == "TotalMeanAge" | y_metric == "DuplicateMeanAge") 
{ - new_metric_name <- glue("{y_metric}_Days") - summary_dat_folder <- summary_dat_folder %>% - rename("{new_metric_name}" := y_metric) - y_metric <- new_metric_name - } else if (y_metric == "TotalFiles" | y_metric == "DuplicateFiles") { - new_metric_name <- glue("{y_metric}_Millions") - summary_dat_folder <- summary_dat_folder %>% - mutate("{new_metric_name}" := eval_tidy(data_sym(y_metric)) / 10^6) - y_metric <- new_metric_name - } - p <- summary_dat_folder %>% - filter(username %in% user_order) %>% - mutate(username = factor(username, levels = user_order)) %>% - mutate(across(where(is.numeric), round, digits = 2)) %>% - plot_fcn(y_metric) - nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text")) - }) - nav_panel( - title = markdown(glue("`{folder_path}`")), - navset_pill_list(!!!plots) - ) -} -``` - -```{r read_data} -n_top_users <- params$n_top_users -input_dir <- params$input_dir # here("data") -aggregated_filetypes <- c("blamematrix", "catalog", "mimeo") -# TODO: only load last N weeks of data to keep RAM usage reasonably low -all_files <- tibble(filename = list.dirs(input_dir) %>% - Filter(function(x) { - x != input_dir - }, .) 
%>% - lapply(function(x) { - list.files(x, full.names = TRUE) - }) %>% - unlist()) -user_dat <- all_files %>% - filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>% - separate_wider_delim(filename, - delim = ".", cols_remove = FALSE, - names = c("date", "path", "username", "file", "ext"), - too_few = "debug" - ) %>% - mutate(date = as_date(basename(date))) - -dates <- user_dat %>% - filter(!is.na(date)) %>% - pull(date) %>% - unique() -most_recent_date <- dates %>% max() - -total_usage_tb <- user_dat %>% - filter( - username == "allusers", - date == most_recent_date, - file == "summary", - path == "_data_CCBR" - ) %>% - pull(filename) %>% - read_tsv() %>% - filter(FolderPath == "/data/CCBR") %>% - mutate(disk_usage_tb = from_bytes(TotalBytes, "TiB")) %>% - pull(disk_usage_tb) -# TODO disk_usage_tb doesn't agree with output from `df` - -grubbers_allusers_err <- user_dat %>% - filter( - username == "allusers", - date == most_recent_date, - file == "grubbers", - ext == "err", - path == "_data_CCBR" - ) %>% - pull(filename) %>% - read_lines() -grubbers_message <- grubbers_allusers_err[2] %>% - str_split(":") %>% - unlist() %>% - .[3] - -user_dat <- user_dat %>% filter_users() -usernames <- user_dat %>% - pull(username) %>% - unique() - -summary_dat_recent <- user_dat %>% - filter( - date == most_recent_date, file == "summary" - ) %>% - pull(filename) %>% - map(function(x) { - read_tsv(x) %>% mutate(filename = x) - }) %>% - list_rbind() %>% - separate_wider_delim(filename, - delim = ".", cols_remove = FALSE, - names = c("basepath", "path", "username", "file", "ext") - ) -summary_metrics <- summary_dat_recent %>% - pivot_longer(where(is.numeric), names_to = "metric") %>% - pull(metric) %>% - unique() -``` - -## Total disk usage - -```{r disk_usage_latest} -disk_usage <- read_tsv(here("results", "disk_usage.tsv")) %>% - mutate(used_tib = from_bytes(to_bytes(Used, "KiB"), "TiB"), - avail_tib = from_bytes(to_bytes(Avail, "KiB"),"TiB"), - size_tib = 
used_tib + avail_tib) -df_date <- disk_usage %>% - slice_max(datetime) %>% - pull(datetime) %>% - as_date() - -layout_column_wrap( - width = 1 / 2, - value_box( - title = p(fa("hard-drive"), " Disk space in /data/CCBR"), - value = markdown(disk_usage %>% - slice_max(datetime) %>% - mutate(Usage = glue("{round(used_tib,1)} / {size_tib}")) %>% - select(Usage, `Use%`) %>% - kable()), - theme = "warning" - ), - value_box( - title = p(fa("users", prefer_type = "regular"), " Users"), - value = p(glue("{length(usernames)} users as of {format(df_date, '%b %d, %Y')}")), - theme = "primary" - ) -) -``` - -### Total usage over time - -```{r disk_usage_over_time} -p <- disk_usage %>% - mutate(datetime = lubridate::as_datetime(datetime)) %>% - rename(used = used_tib, size = size_tib, avail = avail_tib) %>% - pivot_longer(c(used, size), names_to = 'metric') %>% - mutate(value = round(value, 2)) %>% - ggplot(aes( - x = datetime, - y = value, - color = metric, - group = metric - )) + - geom_line(alpha = 0.7) + - geom_point(aes(text = glue("{value} TiB"))) + - scale_x_datetime(labels = date_format("%b %Y")) + - scale_color_brewer(palette = "Set2", - breaks = c('size', 'used') # enforce order - ) + - labs(y = 'TiB', x = '') + - theme(legend.title = element_blank()) - -card(ggplotly(p, tooltip = "text")) -``` - - -## Summary over time - -Usage by top users for each spacesavers metric. -Only users with at least `r min_user_bytes_GiB` GiB of total disk usage are shown. 
- -```{r summary_over_time} -summary_dat_all <- user_dat %>% - filter( - file == "summary" - ) %>% - pull(filename) %>% - map(function(x) { - read_tsv(x) %>% mutate(filename = x) - }) %>% - list_rbind() %>% - separate_wider_delim(filename, - delim = ".", cols_remove = FALSE, - names = c("basepath", "path", "username", "file", "ext") - ) %>% - mutate(date = str_replace(basepath, ".*/", "") %>% as_date()) - -navset_tab( - summary_dat_all %>% panel_summary("/data/CCBR", plot_metric_time), - summary_dat_all %>% panel_summary("/data/CCBR/rawdata", plot_metric_time), - summary_dat_all %>% panel_summary("/data/CCBR/projects", plot_metric_time), -) -``` - - -## Most recent summary (`r most_recent_date`) - -Usage by top users for each spacesavers metric. - -```{r summary_recent} -navset_tab( - summary_dat_recent %>% panel_summary("/data/CCBR", plot_user_metric), - summary_dat_recent %>% panel_summary("/data/CCBR/rawdata", plot_user_metric), - summary_dat_recent %>% panel_summary("/data/CCBR/projects", plot_user_metric), -) -``` - -## Summary table - -```{r allusers_summary} -allusers_summary <- all_files %>% - filter(str_detect(filename, "_data_CCBR.allusers.summary.txt")) %>% - separate_wider_delim(filename, - delim = ".", cols_remove = FALSE, - names = c("date", "path", "username", "file", "ext") - ) %>% - mutate(date = as_date(basename(date))) %>% - slice_max(order_by = date) %>% - pull(filename) %>% - map(function(x) { - read_tsv(x) - }) %>% - list_rbind() %>% - mutate( - TotalBytes_GiB = round(from_bytes_v(TotalBytes, "GiB"), 2), - DuplicateBytes_GiB = round(from_bytes_v(DuplicateBytes, "GiB"), 2), - .before = "DuplicateBytes" - ) %>% - select(-c(TotalBytes, DuplicateBytes)) - -card( - card_header("Summary across all users"), - datatable(allusers_summary, fillContainer = TRUE) -) -``` - - -## Blame matrix - -```{r blame} -blame_matrix <- all_files %>% - filter(str_detect(filename, "blamematrix")) %>% - separate_wider_delim(filename, - delim = ".", cols_remove = FALSE, 
- names = c("date", "path", "file", "ext") - ) %>% - mutate(date = as_date(basename(date))) %>% - filter(!is.na(date), file == "blamematrix", ext == "tsv", path == "_data_CCBR") %>% - slice_max(order_by = date) %>% - pull(filename) %>% - map(function(x) { - read_tsv(x) - }) %>% - list_rbind() - -card( - card_header("Disk usage by user in subdirectories"), - datatable(blame_matrix, fillContainer = TRUE) -) -``` - - -## Duplicate files - -`r grubbers_message` - -### Potential savings per user - -```{r grub_err} -grub_err <- user_dat %>% - filter_users() %>% - filter(!is.na(date), file == "grubbers", ext == "err", path == "_data_CCBR") %>% - slice_max(order_by = date) %>% - pull(filename) %>% - map(function(x) { - read_tsv(x, col_names = FALSE) %>% - mutate(filename = x) - }) %>% - list_rbind() %>% - filter(str_detect(X1, "Deleting")) %>% - separate_wider_delim(filename, - delim = ".", cols_remove = FALSE, - names = c("date", "path", "username", "file", "ext") - ) %>% - mutate( - date = as_date(basename(date)), - grub_msg = str_replace_all(X1, regex("^.*:"), ""), - savings_value = as.numeric( - str_replace_all( - grub_msg, - regex(".*save ([\\d\\.]*) [\\w!]+"), - "\\1" - ) - ), - savings_unit = str_replace_all( - grub_msg, - regex(".*save [\\d\\.]* ([\\w]+)!"), - "\\1" - ), - savings_bytes = to_bytes_v(savings_value, savings_unit) - ) - -user_grub_table <- grub_err %>% - arrange(desc(savings_bytes)) %>% - select(username, savings_value, savings_unit) - -card( - card_header("Savings per user"), - datatable(user_grub_table, fillContainer = TRUE) -) -``` - - -### All high-value duplicates - -```{r grubbers} -grub_dat <- user_dat %>% - filter_users() %>% - filter(!is.na(date), file == "grubbers", ext == "tsv", path == "_data_CCBR") %>% - slice_max(order_by = date) %>% - pull(filename) %>% - map(function(x) { - read_tsv(x, col_names = FALSE) %>% - mutate(filename = x) - }) %>% - list_rbind() %>% - rename( - file_hash = X1, - file_count = X2, - total_disk_usage = X3, - 
single_disk_usage = X4, - filepaths = X5 - ) %>% - separate_wider_delim(filename, - delim = ".", cols_remove = FALSE, - names = c("date", "path", "username", "file", "ext") - ) %>% - mutate(date = as_date(basename(date))) %>% - filter_users() %>% - separate_wider_delim(total_disk_usage, - delim = " ", - names = c("total_disk_usage_value", "total_disk_usage_unit"), - cols_remove = FALSE - ) %>% - separate_wider_delim(single_disk_usage, - delim = " ", - names = c("single_disk_usage_value", "single_disk_usage_unit"), - cols_remove = FALSE - ) %>% - mutate(across(all_of(c("total_disk_usage_value", "single_disk_usage_value")), as.numeric)) - -top_files <- grub_dat %>% - arrange(order_by = desc(total_disk_usage_value)) %>% - select(total_disk_usage_value, username, filepaths) %>% - rename(disk_usage_gb = total_disk_usage_value) - -card(card_header("Top files"), datatable(top_files, fillContainer = TRUE)) -``` - - -For instructions on how to replace duplicates with hard links, see the -[`usurp` command in the spacesavers docs](https://ccbr.github.io/spacesavers2/usurp/). 
From 6c92e66fb9f9ae7e7b46af243178ad56bd777e5e Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 14 Feb 2024 09:12:51 -0500 Subject: [PATCH 2/8] docs: link PR --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ca870d..749ca74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## spacesavers2 development version -- Move the report to a separate internal repository (@kelly-sovacool) +- Move the report to a separate internal repository (#79, @kelly-sovacool) ### New features From 52a7fc84c0d87eb0fa174a8786fe61aa9dead138 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:39:24 -0500 Subject: [PATCH 3/8] add quick assess command pdq; fix #89 --- spacesavers2_pdq | 105 +++++++++++++++++++++++++++++++++++++++++++++++ src/pdq.py | 63 ++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100755 spacesavers2_pdq create mode 100644 src/pdq.py diff --git a/spacesavers2_pdq b/spacesavers2_pdq new file mode 100755 index 0000000..d1885da --- /dev/null +++ b/spacesavers2_pdq @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# pdq = pretty darn quick + +from src.VersionCheck import version_check +from src.VersionCheck import __version__ +from src.utils import * + +version_check() + +# import required modules +import textwrap +import tqdm +import sys +from src.pdq import pdq +from multiprocessing import Pool +import argparse +from pathlib import Path + + +def task(f): + fd = pdq() + fd.set(f) + return fd + + +def main(): + elog = textwrap.dedent( + """\ + Version: + {} + Example: + > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file + """.format( + __version__ + ) + ) + parser = argparse.ArgumentParser( + description="spacesavers2_pdq: get quick per user info (number of files and bytes).", + epilog=elog, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-f", + "--folder", + dest="folder", + required=True, + type=str, 
help="spacesavers2_pdq will be run on all files in this folder and its subfolders", + ) + parser.add_argument( + "-p", + "--threads", + dest="threads", + required=False, + type=int, + default=4, + help="number of threads to be used (default 4)", + ) + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + required=False, + type=str, + help="outfile ... catalog file .. by default output is printed to screen", + ) + parser.add_argument("-v", "--version", action="version", version=__version__) + + global args + args = parser.parse_args() + + folder = args.folder + p = Path(folder) + files = [p] + files2 = p.glob("**/*") + files.extend(files2) + + if args.outfile: + outfh = open(args.outfile, 'w') + else: + outfh = sys.stdout + + bigdict=dict() + + with Pool(processes=args.threads) as pool: + for fd in tqdm.tqdm(pool.imap_unordered(task, files),total=len(files)): + if not fd.is_file(): continue + uid = fd.get_uid() + if not uid in bigdict: bigdict[uid]=dict() + inode = fd.get_inode() + if not inode in bigdict[uid]: bigdict[uid][inode]=fd.get_size() + + for uid in bigdict.keys(): + username = get_username_groupname(uid) + nfiles = len(bigdict[uid]) + nbytes = 0 + for inode in bigdict[uid].keys(): + nbytes += bigdict[uid][inode] + outfh.write(f"{username}\t{nfiles}\t{nbytes}\n") + + if args.outfile: + outfh.close() + +if __name__ == "__main__": + main() diff --git a/src/pdq.py b/src/pdq.py new file mode 100644 index 0000000..604780b --- /dev/null +++ b/src/pdq.py @@ -0,0 +1,63 @@ +from pathlib import Path +import sys + +def get_type(p): # copy paste from FileDetails + # input: + # 1. PosixPath object + # output: + # 1. 
type of path + # u = unknown + # L = broken symlink + # l = symlink + # f = file + # d = folder or directory + x = "u" # unknown + try: + if p.is_symlink(): + x = "l" # link or symlink + try: + p.exists() + except: + x = "L" # upper case L is broken symlink + sys.stderr.write("spacesavers2:Broken symlink found:{}\n".format(p)) + return x + if not p.exists(): + x = "a" # absent + return x + if p.is_dir(): + x = "d" # directory + return x + if p.is_file(): + x = "f" # file + return x + except: # mainly to catch PermissionError: + sys.stderr.write("spacesavers2:File cannot be read:{}\n".format(p)) + return x + +class pdq: + def __init__(self): + self.inode = -1 + self.fld = "u" # u or f or l or d + self.size = -1 + self.uid = 0 + def set(self,p,st_block_byte_size=512): + p = Path(p).absolute() + try: + st = p.stat(follow_symlinks=False) + self.size = st.st_blocks * st_block_byte_size + self.inode = st.st_ino + self.uid = st.st_uid + self.fld = get_type(p) + except: + print(f"spacesavers2_pdq: {p} File not found!") + def get_uid(self): + return self.uid + def get_fld(self): + return self.fld + def is_file(self): + if self.fld == "f": return True + return False + def get_inode(self): + return self.inode + def get_size(self): + return self.size \ No newline at end of file From 9c2a355f9899522d2504408c2616a54796d4b123 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:40:30 -0500 Subject: [PATCH 4/8] fix: add bin redirect for new command --- bin/spacesavers2_pdq | 1 + 1 file changed, 1 insertion(+) create mode 120000 bin/spacesavers2_pdq diff --git a/bin/spacesavers2_pdq b/bin/spacesavers2_pdq new file mode 120000 index 0000000..577f1ce --- /dev/null +++ b/bin/spacesavers2_pdq @@ -0,0 +1 @@ +redirect \ No newline at end of file From 338ddcab1babad91690e9b0fa152dec1d666c130 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:40:58 -0500 Subject: [PATCH 5/8] docs: add new command documentation --- README.md | 1 + docs/pdq.md | 60 
+++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 docs/pdq.md diff --git a/README.md b/README.md index c31b6a2..7e41ba2 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Welcome! `spacesavers2`: - spacesavers2_grubbers - spacesavers2_e2e - spacesavers2_usurp +- spacesavers2_pdq ## `spacesavers2` typical workflow looks like this: diff --git a/docs/pdq.md b/docs/pdq.md new file mode 100644 index 0000000..efb9060 --- /dev/null +++ b/docs/pdq.md @@ -0,0 +1,60 @@ +## spacesavers2_pdq + +pdq = Pretty Darn Quick + +This uses `pathlib`'s `Path.glob` to list all files in a user-provided folder recursively. + +For each user it gathers information like: + - total number of files + - total number of bytes + +It is a quick tool to gather data points to monitor filesystem usage. Typically, it can be run once daily and compared with the previous day's run to find large changes. + +### Inputs + - `--folder`: Path to the folder to run `spacesavers2_pdq` on. + - `--threads`: `spacesavers2_pdq` uses the `multiprocessing` library to parallelize orchestration. This defines the number of threads to run in parallel. + - `--outfile`: If not supplied then the output is written to the screen. + +> NOTE: `spacesavers2_pdq` reports errors (e.g. cannot read file) to STDERR + +```bash +usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-v] + +spacesavers2_pdq: get quick per user info (number of files and bytes). + +options: + -h, --help show this help message and exit + -f FOLDER, --folder FOLDER + spacesavers2_pdq will be run on all files in this folder and its subfolders + -p THREADS, --threads THREADS + number of threads to be used (default 4) + -o OUTFILE, --outfile OUTFILE + outfile ... catalog file .. 
by default output is printed to screen + -v, --version show program's version number and exit + +Version: + v0.12.0 +Example: + > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file +``` + +### Output + +## tab-delimited output (file) + +`spacesavers2_pdq` creates one tab-separated output line per user: + +```bash +% head -n3 test.out +user1 1386138 6089531321856 +user2 230616 2835680212992 +user3 1499 126442496 +``` +The 3 items in the line are as follows: + + +| Column | Description | Example | +| ------ | ------------------------ | ---------------------------------------------------------------------------------------------- | +| 1 | username | "user1" | +| 2 | total no. of files owned | 1386138 | +| 3 | total no. of bytes occupied | 6089531321856 | From b3a6b9f0c64be3e3bc508af35a5b815c4dc90985 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:41:30 -0500 Subject: [PATCH 6/8] chore: update version number for next release --- CHANGELOG.md | 6 ++++++ docs/index.md | 1 + src/VERSION | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b6efdb..18defdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ ### Bug fixes +## spacesavers2 0.11.5 + +### New features + +- new command `spacesavers2_pdq` to get per-user number of files and number of bytes + ## spacesavers2 0.11.4 ### New features diff --git a/docs/index.md b/docs/index.md index 1b16c90..fc357a5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -30,6 +30,7 @@ - [spacesavers2_blamematrix](blamematrix.md) - [spacesavers2_usurp](usurp.md) - [spacesavers2_e2e](e2e.md) +- [spacesavers2_pdq](pdq.md) ## Use case diff --git a/src/VERSION b/src/VERSION index 35ad344..d33c3a2 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.11.4 +0.12.0 \ No newline at end of file From d689730f4f076420e4c1e1785e47d1ddf8b349a8 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:43:20 -0500 Subject: [PATCH 7/8] refact: using 
version 0.11.5 --- src/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VERSION b/src/VERSION index d33c3a2..62d5dbd 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.12.0 \ No newline at end of file +0.11.5 From c9393a0d070a89d0829fec076ecdb019bab6e4c8 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:43:55 -0500 Subject: [PATCH 8/8] chore: update docs to reflect new version --- docs/pdq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pdq.md b/docs/pdq.md index efb9060..8f0bd45 100644 --- a/docs/pdq.md +++ b/docs/pdq.md @@ -33,7 +33,7 @@ options: -v, --version show program's version number and exit Version: - v0.12.0 + v0.11.5 Example: > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file ```