def task(f):
    """Pool worker: build and return the populated ``pdq`` record for path *f*."""
    record = pdq()
    record.set(f)
    return record
def get_type(p):  # adapted from FileDetails
    """Classify a filesystem path with a one-letter type code.

    Args:
        p: ``pathlib.Path`` (PosixPath) to inspect.

    Returns:
        One of:
            "l"  symlink whose target can be reached
            "L"  broken symlink (target missing or not reachable)
            "a"  absent (path does not exist)
            "d"  folder or directory
            "f"  regular file
            "u"  unknown (anything else, or the path could not be read)
    """
    x = "u"  # unknown
    try:
        if p.is_symlink():
            # Path.exists() follows the link and returns False (it does not
            # raise) when the target is missing, so the result has to be
            # tested; errors such as PermissionError on the target are
            # treated as "broken" too, as before.
            try:
                target_ok = p.exists()
            except (OSError, ValueError):
                target_ok = False
            if target_ok:
                return "l"  # link or symlink
            sys.stderr.write("spacesavers2:Broken symlink found:{}\n".format(p))
            return "L"  # upper case L is broken symlink
        if not p.exists():
            return "a"  # absent
        if p.is_dir():
            return "d"  # directory
        if p.is_file():
            return "f"  # file
    except (OSError, ValueError):  # mainly to catch PermissionError
        sys.stderr.write("spacesavers2:File cannot be read:{}\n".format(p))
    return x


class pdq:
    """Lightweight per-path record: owner uid, inode, allocated size and type.

    All fields keep their placeholder values (inode -1, size -1, uid 0,
    type "u") until ``set`` succeeds, so a path that could not be stat'ed
    never reports itself as a file.
    """

    def __init__(self):
        self.inode = -1
        self.fld = "u"  # one-letter type code: u, f, l, L, d or a
        self.size = -1
        self.uid = 0

    def set(self, p, st_block_byte_size=512):
        """Populate the record from the filesystem entry at *p*.

        Args:
            p: path (str or Path); it is made absolute but symlinks are
                not resolved, so a link is described itself, not its target.
            st_block_byte_size: bytes per ``st_blocks`` unit (512 by default).

        The size stored is the allocated size (``st_blocks`` times the block
        unit), not the apparent file length. If the path cannot be stat'ed a
        message is written to STDERR and the record is left untouched.
        """
        p = Path(p).absolute()
        try:
            # lstat() is stat(follow_symlinks=False) without requiring
            # Python 3.10+, where that keyword was introduced.
            st = p.lstat()
        except (OSError, ValueError) as err:
            # Report on STDERR: STDOUT is the default destination of the
            # tab-delimited report and must stay machine readable.
            sys.stderr.write(f"spacesavers2_pdq: {p} File cannot be read: {err}\n")
            return
        self.size = st.st_blocks * st_block_byte_size
        self.inode = st.st_ino
        self.uid = st.st_uid
        self.fld = get_type(p)

    def get_uid(self):
        """Return the owner's numeric user id (0 until ``set`` succeeds)."""
        return self.uid

    def get_fld(self):
        """Return the one-letter type code of the path."""
        return self.fld

    def is_file(self):
        """Return True only when the path is a regular file (type "f")."""
        return self.fld == "f"

    def get_inode(self):
        """Return the inode number (-1 until ``set`` succeeds)."""
        return self.inode

    def get_size(self):
        """Return the allocated size in bytes (-1 until ``set`` succeeds)."""
        return self.size
+++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 docs/pdq.md diff --git a/README.md b/README.md index c31b6a2..7e41ba2 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Welcome! `spacesavers2`: - spacesavers2_grubbers - spacesavers2_e2e - spacesavers2_usurp +- spacesavers2_pdq ## `spacesavers2` typical workflow looks like this: diff --git a/docs/pdq.md b/docs/pdq.md new file mode 100644 index 0000000..efb9060 --- /dev/null +++ b/docs/pdq.md @@ -0,0 +1,60 @@ +## spacesavers2_pdq + +pdq = Pretty Darn Quick + +This uses the `glob` library to list all files in a user-provided folder recursively. + +For each user it gathers information like: + - total number of files + - total number of bytes + +It is a quick tool to gather datapoints to monitor filesystem usage. Typically, it can be run once daily and compared with the previous day's run to find large changes. + +### Inputs + - `--folder`: Path to the folder to run `spacesavers2_pdq` on. + - `--threads`: `spacesavers2_pdq` uses the multiprocessing library to parallelize orchestration. This defines the number of threads to run in parallel. + - `--outfile`: If not supplied then the output is written to the screen. + +> NOTE: `spacesavers2_pdq` reports errors (e.g. cannot read file) to STDERR + +```bash +usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-v] + +spacesavers2_pdq: get quick per user info (number of files and bytes). + +options: + -h, --help show this help message and exit + -f FOLDER, --folder FOLDER + spacesavers2_pdq will be run on all files in this folder and its subfolders + -p THREADS, --threads THREADS + number of threads to be used (default 4) + -o OUTFILE, --outfile OUTFILE + outfile ... catalog file .. 
by default output is printed to screen + -v, --version show program's version number and exit + +Version: + v0.12.0 +Example: + > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file +``` + +### Output + +## tab-delimited output (file) + +`spacesavers2_pdq` creates one tab-separated output line per user: + +```bash +% head -n3 test.out +user1 1386138 6089531321856 +user2 230616 2835680212992 +user3 1499 126442496 +``` +The 3 items in each line are as follows: + + +| Column | Description | Example | +| ------ | ------------------------ | ---------------------------------------------------------------------------------------------- | +| 1 | username | "user1" | +| 2 | total no. of files owned | 1386138 | +| 3 | total no. of bytes occupied | 6089531321856 | From b3a6b9f0c64be3e3bc508af35a5b815c4dc90985 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:41:30 -0500 Subject: [PATCH 4/6] chore: update version number for next release --- CHANGELOG.md | 6 ++++++ docs/index.md | 1 + src/VERSION | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b6efdb..18defdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ ### Bug fixes +## spacesavers2 0.11.5 + +### New features + +- new command `spacesavers2_pdq` to get per-user number of files and number of bytes + ## spacesavers2 0.11.4 ### New features diff --git a/docs/index.md b/docs/index.md index 1b16c90..fc357a5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -30,6 +30,7 @@ - [spacesavers2_blamematrix](blamematrix.md) - [spacesavers2_usurp](usurp.md) - [spacesavers2_e2e](e2e.md) +- [spacesavers2_pdq](pdq.md) ## Use case diff --git a/src/VERSION b/src/VERSION index 35ad344..d33c3a2 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.11.4 +0.12.0 \ No newline at end of file From d689730f4f076420e4c1e1785e47d1ddf8b349a8 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:43:20 -0500 Subject: [PATCH 5/6] refact: using 
version 0.11.5 --- src/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VERSION b/src/VERSION index d33c3a2..62d5dbd 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.12.0 \ No newline at end of file +0.11.5 From c9393a0d070a89d0829fec076ecdb019bab6e4c8 Mon Sep 17 00:00:00 2001 From: kopardev Date: Tue, 27 Feb 2024 17:43:55 -0500 Subject: [PATCH 6/6] chore: update docs to reflect new version --- docs/pdq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pdq.md b/docs/pdq.md index efb9060..8f0bd45 100644 --- a/docs/pdq.md +++ b/docs/pdq.md @@ -33,7 +33,7 @@ options: -v, --version show program's version number and exit Version: - v0.12.0 + v0.11.5 Example: > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file ```