Skip to content

Commit

Permalink
Merge pull request #90 from CCBR/quick_assess
Browse files Browse the repository at this point in the history
quick assess
  • Loading branch information
kopardev authored Feb 27, 2024
2 parents 5daaa31 + c9393a0 commit 990f252
Show file tree
Hide file tree
Showing 8 changed files with 238 additions and 1 deletion.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

### Bug fixes

## spacesavers2 0.11.5

### New features

- new command `spacesavers2_pdq` to get per-user number of files and number of bytes

## spacesavers2 0.11.4

### New features
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Welcome! `spacesavers2`:
- spacesavers2_grubbers
- spacesavers2_e2e
- spacesavers2_usurp
- spacesavers2_pdq

## `spacesavers2` typical workflow looks like this:

Expand Down
1 change: 1 addition & 0 deletions bin/spacesavers2_pdq
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- [spacesavers2_blamematrix](blamematrix.md)
- [spacesavers2_usurp](usurp.md)
- [spacesavers2_e2e](e2e.md)
- [spacesavers2_pdq](pdq.md)
## Use case
Expand Down
60 changes: 60 additions & 0 deletions docs/pdq.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
## spacesavers2_pdq

pdq = Pretty Darn Quick

This uses `Path.glob` from the `pathlib` library to recursively list all files in a user-provided folder.

For each user it gathers information like:
- total number of files
- total number of bytes

It is a quick tool to gather datapoints for monitoring filesystem usage. Typically, it can be run once daily and compared with the previous day's run to find large changes.

### Inputs
- `--folder`: Path to the folder to run `spacesavers2_pdq` on.
- `--threads`: `spacesavers2_pdq` uses the `multiprocessing` library to parallelize its work. This defines the number of worker processes to run in parallel (default 4).
- `--outfile`: Path to the output file. If not supplied, the output is written to the screen.

> NOTE: `spacesavers2_pdq` reports errors (eg. cannot read file) to STDERR
```bash
usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-v]

spacesavers2_pdq: get quick per user info (number of files and bytes).

options:
-h, --help show this help message and exit
-f FOLDER, --folder FOLDER
spacesavers2_pdq will be run on all files in this folder and its subfolders
-p THREADS, --threads THREADS
number of threads to be used (default 4)
-o OUTFILE, --outfile OUTFILE
outfile ... catalog file .. by default output is printed to screen
-v, --version show program's version number and exit
Version:
v0.11.5
Example:
> spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file
```
### Output
## tab-delimited output (file)
`spacesavers2_pdq` creates one tab-separated output line per user:
```bash
% head -n3 test.out
user1 1386138 6089531321856
user2 230616 2835680212992
user3 1499 126442496
```
The 3 items in the line are as follows:
| Column | Description | Example |
| ------ | ------------------------ | ---------------------------------------------------------------------------------------------- |
| 1 | username | "user1" |
| 2 | total no. of files owned | 1386138 |
| 3 | total no. of bytes occupied | 6089531321856 |
105 changes: 105 additions & 0 deletions spacesavers2_pdq
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
# pdq = pretty darn quick

from src.VersionCheck import version_check
from src.VersionCheck import __version__
from src.utils import *

# Run the project's version check before the remaining modules are imported.
# NOTE(review): presumably this aborts on an unsupported Python interpreter —
# confirm against src/VersionCheck.py.
version_check()

# import required modules
import textwrap
import tqdm
import sys
from src.pdq import pdq
from multiprocessing import Pool
import argparse
from pathlib import Path


def task(f):
    """Worker-pool job: build a ``pdq`` record for the single path ``f``.

    Args:
        f: path to inspect (handed straight to ``pdq.set``).

    Returns:
        The populated ``pdq`` instance.
    """
    record = pdq()
    record.set(f)
    return record


def main():
    """Command-line entry point for ``spacesavers2_pdq``.

    Parses the command line, lists ``--folder`` and everything beneath it,
    builds a ``pdq`` record for every path in a pool of ``--threads`` worker
    processes, and writes one tab-separated line per owner: username,
    number of files, number of bytes.

    Only records for which ``pdq.is_file()`` is true are counted. Within one
    owner each inode is counted once, so several hard links to the same
    inode contribute a single file and a single size.

    The report goes to ``--outfile`` when given, otherwise to STDOUT.
    """
    elog = textwrap.dedent(
        """\
        Version:
        {}
        Example:
        > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file
        """.format(
            __version__
        )
    )
    parser = argparse.ArgumentParser(
        description="spacesavers2_pdq: get quick per user info (number of files and bytes).",
        epilog=elog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-f",
        "--folder",
        dest="folder",
        required=True,
        type=str,
        help="spacesavers2_pdq will be run on all files in this folder and its subfolders",
    )
    parser.add_argument(
        "-p",
        "--threads",
        dest="threads",
        required=False,
        type=int,
        default=4,
        help="number of threads to be used (default 4)",
    )
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        required=False,
        type=str,
        help="outfile ... catalog file .. by default output is printed to screen",
    )
    parser.add_argument("-v", "--version", action="version", version=__version__)

    # Kept as a module-level global so the parsed options stay reachable
    # exactly as before.
    global args
    args = parser.parse_args()

    root = Path(args.folder)
    # The folder itself goes first, followed by everything beneath it. The
    # listing is materialised so tqdm can be told the total up front.
    files = [root, *root.glob("**/*")]

    # Open the destination before the (potentially long) scan so an
    # unwritable --outfile fails immediately.
    outfh = open(args.outfile, "w") if args.outfile else sys.stdout
    try:
        # per_user[uid][inode] -> size in bytes; the first record seen for an
        # inode wins, later hard links to it are ignored.
        per_user = {}

        with Pool(processes=args.threads) as pool:
            for fd in tqdm.tqdm(pool.imap_unordered(task, files), total=len(files)):
                if not fd.is_file():
                    continue
                inodes = per_user.setdefault(fd.get_uid(), {})
                inodes.setdefault(fd.get_inode(), fd.get_size())

        for uid, inodes in per_user.items():
            username = get_username_groupname(uid)
            nfiles = len(inodes)
            nbytes = sum(inodes.values())
            outfh.write(f"{username}\t{nfiles}\t{nbytes}\n")
    finally:
        # Close the report file even when the scan or a write fails; never
        # close STDOUT, which this function does not own.
        if outfh is not sys.stdout:
            outfh.close()

# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion src/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.11.4
0.11.5
63 changes: 63 additions & 0 deletions src/pdq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from pathlib import Path
import sys

def get_type(p):  # copy paste from FileDetails
    """Classify a path with a one-character type code.

    Args:
        p: ``pathlib`` path object to classify.

    Returns:
        One of:
            "u" - unknown (an OS error prevented classification, or the path
                  is none of the kinds below)
            "L" - broken symlink (target missing or not reachable)
            "l" - symlink whose target exists
            "a" - absent (path does not exist)
            "d" - folder or directory
            "f" - file

    Broken symlinks and unreadable paths are reported on STDERR.
    """
    try:
        if p.is_symlink():
            # Path.exists() follows the link and returns False for a dangling
            # target instead of raising, so the return value must be checked;
            # an OSError while resolving the target is treated the same way.
            try:
                target_exists = p.exists()
            except OSError:
                target_exists = False
            if target_exists:
                return "l"  # link or symlink
            sys.stderr.write("spacesavers2:Broken symlink found:{}\n".format(p))
            return "L"  # upper case L is broken symlink
        if not p.exists():
            return "a"  # absent
        if p.is_dir():
            return "d"  # directory
        if p.is_file():
            return "f"  # file
    except OSError:  # mainly to catch PermissionError
        sys.stderr.write("spacesavers2:File cannot be read:{}\n".format(p))
    return "u"  # unknown

class pdq:
def __init__(self):
self.inode = -1
self.fld = "u" # u or f or l or d
self.size = -1
self.uid = 0
def set(self,p,st_block_byte_size=512):
p = Path(p).absolute()
try:
st = p.stat(follow_symlinks=False)
self.size = st.st_blocks * st_block_byte_size
self.inode = st.st_ino
self.uid = st.st_uid
self.fld = get_type(p)
except:
print(f"spacesavers2_pdq: {p} File not found!")
def get_uid(self):
return self.uid
def get_fld(self):
return self.fld
def is_file(self):
if self.fld == "f": return True
return False
def get_inode(self):
return self.inode
def get_size(self):
return self.size

0 comments on commit 990f252

Please sign in to comment.