Skip to content

Commit

Permalink
--subtext and --hashed arguments to reduce size of report
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jul 14, 2024
1 parent a936b81 commit b122bf3
Show file tree
Hide file tree
Showing 15 changed files with 1,718 additions and 1,653 deletions.
7 changes: 7 additions & 0 deletions credsweeper/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ def get_arguments() -> Namespace:
const="output.xlsx",
dest="xlsx_filename",
metavar="PATH")
parser.add_argument("--subtext", help="only part of text will be outputted", action="store_const", const=True)
parser.add_argument("--hashed",
help="line, variable, value will be hashed in output",
action="store_const",
const=True)
parser.add_argument("--sort", help="enable output sorting", dest="sort_output", action="store_true")
parser.add_argument("--log",
"-l",
Expand Down Expand Up @@ -282,6 +287,8 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
api_validation=args.api_validation,
json_filename=json_filename,
xlsx_filename=xlsx_filename,
subtext=args.subtext,
hashed=args.hashed,
sort_output=args.sort_output,
use_filters=args.no_filters,
pool_count=args.jobs,
Expand Down
13 changes: 10 additions & 3 deletions credsweeper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def __init__(self,
api_validation: bool = False,
json_filename: Union[None, str, Path] = None,
xlsx_filename: Union[None, str, Path] = None,
subtext: bool = False,
hashed: bool = False,
sort_output: bool = False,
use_filters: bool = True,
pool_count: int = 1,
Expand Down Expand Up @@ -72,6 +74,8 @@ def __init__(self,
to json
xlsx_filename: optional string variable, path to save result
to xlsx
subtext: output only a subtext of the line around the value, as performed in ML
hashed: use hash of line, value and variable instead of plain text
use_filters: boolean variable, specifying the need of rule filters
pool_count: int value, number of parallel processes to use
ml_batch_size: int value, size of the batch for model inference
Expand Down Expand Up @@ -106,6 +110,8 @@ def __init__(self,
self.credential_manager = CredentialManager()
self.json_filename: Union[None, str, Path] = json_filename
self.xlsx_filename: Union[None, str, Path] = xlsx_filename
self.subtext = subtext
self.hashed = hashed
self.sort_output = sort_output
self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
self.ml_threshold = ml_threshold
Expand Down Expand Up @@ -390,16 +396,17 @@ def export_results(self) -> None:

if self.json_filename:
is_exported = True
Util.json_dump([credential.to_json() for credential in credentials], file_path=self.json_filename)
Util.json_dump([credential.to_json(subtext=self.subtext, hashed=self.hashed) for credential in credentials],
file_path=self.json_filename)

if self.xlsx_filename:
is_exported = True
data_list = []
for credential in credentials:
data_list.extend(credential.to_dict_list())
data_list.extend(credential.to_dict_list(subtext=self.subtext, hashed=self.hashed))
df = pd.DataFrame(data=data_list)
df.to_excel(self.xlsx_filename, index=False)

if is_exported is False:
for credential in credentials:
print(credential)
print(credential.to_str(subtext=self.subtext, hashed=self.hashed))
19 changes: 13 additions & 6 deletions credsweeper/credentials/candidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,22 @@ def is_api_validation_available(self) -> bool:
"""
return len(self.validations) > 0

def __str__(self) -> str:
def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
    """Represent candidate with subtext and/or hashed values.

    Args:
        subtext: when True, each line_data renders only a chunk of text around the value
        hashed: when True, sensitive fields of each line_data are replaced with their hashes

    Return:
        one-line, human-readable summary of the candidate
    """
    # subtext/hashed are forwarded to LineData.to_str for every item of line_data_list
    return (f"rule: {self.rule_name}"
            f" | severity: {self.severity.value}"
            f" | confidence: {self.confidence.value}"
            f" | line_data_list: [{', '.join(x.to_str(subtext, hashed) for x in self.line_data_list)}]"
            f" | api_validation: {self.api_validation.name}"
            f" | ml_validation: {self.ml_validation.name}")

def to_json(self) -> Dict:
def __str__(self):
    """Default human-readable form: full text, no subtext, no hashing."""
    return self.to_str(subtext=False, hashed=False)

def __repr__(self):
    """Debug representation: keep output short by rendering only subtext of lines."""
    return self.to_str(True)

def to_json(self, subtext: bool, hashed: bool) -> Dict:
"""Convert credential candidate object to dictionary.
Return:
Expand All @@ -113,23 +120,23 @@ def to_json(self) -> Dict:
"confidence": self.confidence.value,
"use_ml": self.use_ml,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json() for line_data in self.line_data_list],
"line_data_list": [line_data.to_json(subtext, hashed) for line_data in self.line_data_list],
}
if self.config is not None:
reported_output = {k: v for k, v in full_output.items() if k in self.config.candidate_output}
else:
reported_output = full_output
return reported_output

def to_dict_list(self) -> List[dict]:
def to_dict_list(self, subtext: bool, hashed: bool) -> List[dict]:
"""Convert credential candidate object to List[dict].
Return:
List[dict] object generated from current credential candidate
"""
reported_output = []
json_output = self.to_json()
json_output = self.to_json(subtext, hashed)
refined_data = copy.deepcopy(json_output)
del refined_data["line_data_list"]
for line_data in json_output["line_data_list"]:
Expand Down
40 changes: 30 additions & 10 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import contextlib
import hashlib
import re
import string
from functools import cached_property
from typing import Any, Dict, Optional, Tuple

from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, ML_HUNK
from credsweeper.config import Config
from credsweeper.utils import Util
from credsweeper.utils.entropy_validator import EntropyValidator
Expand Down Expand Up @@ -282,11 +283,29 @@ def is_source_file_with_quotes(self) -> bool:
return True
return False

def __repr__(self) -> str:
return f"line: '{self.line}' | line_num: {self.line_num} | path: {self.path}" \
f" | value: '{self.value}' | entropy_validation: {EntropyValidator(self.value)}"

def to_json(self) -> Dict:
@staticmethod
def get_subtext_or_hash(text: Optional[str], pos: int, subtext: bool, hashed: bool) -> Optional[str]:
"""Represent a text with subtext or|and hash if required"""
text = Util.subtext(text, pos, ML_HUNK) if subtext and text is not None else text
if hashed:
# text = hashlib.sha256(text.encode(UTF_8, errors="replace")).hexdigest() if text is not None else None
text = hashlib.sha256(text.encode(UTF_8, errors="strict")).hexdigest() if text is not None else None
return text

def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
    """Represent line_data with subtext and/or hashed values."""
    shown_line = self.get_subtext_or_hash(self.line, self.value_start, subtext, hashed)
    shown_value = self.get_subtext_or_hash(self.value, 0, subtext, hashed)
    parts = [
        f"line: '{shown_line}'",
        f"line_num: {self.line_num}",
        f"path: {self.path}",
        f"value: '{shown_value}'",
        f"entropy_validation: {EntropyValidator(self.value)}",
    ]
    return " | ".join(parts)

def __str__(self):
    """Default human-readable form: full line text, no subtext, no hashing."""
    return self.to_str(subtext=False, hashed=False)

def __repr__(self):
    """Debug representation: keep output short by rendering only subtext of the line."""
    return self.to_str(True)

def to_json(self, subtext: bool, hashed: bool) -> Dict:
"""Convert line data object to dictionary.
Return:
Expand All @@ -295,18 +314,19 @@ def to_json(self) -> Dict:
"""
full_output = {
"key": self.key,
"line": self.line,
"line": self.get_subtext_or_hash(self.line, self.value_start, subtext, hashed),
"line_num": self.line_num,
"path": self.path,
"info": self.info,
# info may contain variable name - so let it be hashed if requested
"info": hashlib.sha256(self.info.encode(UTF_8)).hexdigest() if hashed and self.info else self.info,
"pattern": self.pattern.pattern,
"separator": self.separator,
"separator_start": self.separator_start,
"separator_end": self.separator_end,
"value": self.value,
"value": self.get_subtext_or_hash(self.value, 0, subtext, hashed),
"value_start": self.value_start,
"value_end": self.value_end,
"variable": self.variable,
"variable": self.get_subtext_or_hash(self.variable, 0, subtext, hashed),
"variable_start": self.variable_start,
"variable_end": self.variable_end,
"value_leftquote": self.value_leftquote,
Expand Down
9 changes: 6 additions & 3 deletions credsweeper/deep_scanner/bzip2_scanner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import bz2
import logging
from abc import ABC
from pathlib import Path
from typing import List

from credsweeper.credentials import Candidate
Expand All @@ -22,10 +23,12 @@ def data_scan(
"""Extracts data from bzip2 archive and launches data_scan"""
candidates = []
try:
new_path = data_provider.file_path if ".bz2" != Util.get_extension(
data_provider.file_path) else data_provider.file_path[:-4]
file_path = Path(data_provider.file_path)
new_path = file_path.as_posix()
if ".bz2" == file_path.suffix:
new_path = new_path[:-4]
bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
file_path=data_provider.file_path,
file_path=new_path,
file_type=Util.get_extension(new_path),
info=f"{data_provider.info}|BZIP2|{new_path}")
new_limit = recursive_limit_size - len(bzip2_content_provider.data)
Expand Down
3 changes: 2 additions & 1 deletion credsweeper/deep_scanner/deep_scanner.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import logging
from pathlib import Path
from typing import List, Optional, Any, Tuple, Union

from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION
Expand Down Expand Up @@ -136,7 +137,7 @@ def scan(self,
data_provider = DataContentProvider(data=data,
file_path=content_provider.file_path,
file_type=content_provider.file_type,
info=content_provider.file_path)
info=Path(content_provider.file_path).as_posix())
# iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
scanner_classes = self.get_deep_scanners(data, content_provider.file_type)
for scan_class in scanner_classes:
Expand Down
9 changes: 6 additions & 3 deletions credsweeper/deep_scanner/gzip_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import io
import logging
from abc import ABC
from pathlib import Path
from typing import List

from credsweeper.credentials import Candidate
Expand All @@ -24,10 +25,12 @@ def data_scan(
candidates = []
try:
with gzip.open(io.BytesIO(data_provider.data)) as f:
new_path = data_provider.file_path if ".gz" != Util.get_extension(
data_provider.file_path) else data_provider.file_path[:-3]
file_path = Path(data_provider.file_path)
new_path = file_path.as_posix()
if ".gz" == file_path.suffix:
new_path = new_path[:-3]
gzip_content_provider = DataContentProvider(data=f.read(),
file_path=data_provider.file_path,
file_path=new_path,
file_type=Util.get_extension(new_path),
info=f"{data_provider.info}|GZIP|{new_path}")
new_limit = recursive_limit_size - len(gzip_content_provider.data)
Expand Down
4 changes: 3 additions & 1 deletion docs/source/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Get all argument list:
usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH]) [--rules [PATH]] [--severity SEVERITY] [--config [PATH]]
[--log_config [PATH]] [--denylist PATH] [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR] [--ml_batch_size POSITIVE_INT]
[--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
[--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--subtext] [--hashed] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
[--banner] [--version]
options:
-h, --help show this help message and exit
Expand Down Expand Up @@ -49,6 +49,8 @@ Get all argument list:
--skip_ignored parse .gitignore files and skip credentials from ignored objects
--save-json [PATH] save result to json file (default: output.json)
--save-xlsx [PATH] save result to xlsx file (default: output.xlsx)
--subtext only part of text will be outputted
--hashed line, variable, value will be hashed in output
--sort enable output sorting
--log LOG_LEVEL, -l LOG_LEVEL
provide logging level of ['DEBUG', 'INFO', 'WARN', 'WARNING', 'ERROR', 'FATAL', 'CRITICAL', 'SILENCE'](default: 'warning', case insensitive)
Expand Down
2 changes: 1 addition & 1 deletion fuzz/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def fuzz_credsweeper_scan(data: bytes):
elif validation.__class__.__name__ in [GoogleMultiValidation.__name__]:
for i in range(3):
mock_flow(i, candidate)
candidate.to_dict_list()
candidate.to_dict_list(False, False)


def main():
Expand Down
5 changes: 4 additions & 1 deletion tests/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Dict, Any, List

from tests import SAMPLES_POST_CRED_COUNT, SAMPLES_IN_DEEP_3, SAMPLES_CRED_COUNT, SAMPLES_IN_DOC, NEGLIGIBLE_ML_THRESHOLD
from tests import SAMPLES_POST_CRED_COUNT, SAMPLES_IN_DEEP_3, SAMPLES_CRED_COUNT, SAMPLES_IN_DOC, \
NEGLIGIBLE_ML_THRESHOLD

DATA_TEST_CFG: List[Dict[str, Any]] = [{
"__cred_count": SAMPLES_POST_CRED_COUNT,
Expand All @@ -9,11 +10,13 @@
}, {
"__cred_count": SAMPLES_CRED_COUNT,
"sort_output": True,
"hashed": True,
"json_filename": "ml_threshold.json",
"ml_threshold": NEGLIGIBLE_ML_THRESHOLD
}, {
"__cred_count": SAMPLES_IN_DOC,
"sort_output": True,
"subtext": True,
"json_filename": "doc.json",
"doc": True
}, {
Expand Down
Loading

0 comments on commit b122bf3

Please sign in to comment.