Export results transformation (subtext and hashed) #582

Merged: 30 commits, Aug 14, 2024

Commits:
8786414  square bracket workaround in keywort regex (babenek, Aug 10, 2024)
672342a  path filter (babenek, Aug 11, 2024)
ac6ee1a  BM score fix (babenek, Aug 11, 2024)
3497f06  ValueStringTypeCheck workaround for heterogenous source (babenek, Aug 12, 2024)
6c10bf6  wrap added to filter array definitions (babenek, Aug 12, 2024)
abc980c  TOML format sanitizer (babenek, Aug 12, 2024)
ddbda1a  YAML case (babenek, Aug 12, 2024)
164cdfd  BM fix (babenek, Aug 12, 2024)
b55af99  BM scores fix (babenek, Aug 12, 2024)
137f6b2  [skip actions] [subhashtext] 2024-08-12T21:32:30+03:00 (babenek, Aug 12, 2024)
ea404b3  variable is hashed too (babenek, Aug 12, 2024)
a076394  hash & subtext test (babenek, Aug 12, 2024)
04a15c3  testBM (babenek, Aug 12, 2024)
e271544  updBMscor (babenek, Aug 12, 2024)
d06c8a9  refactoring (babenek, Aug 13, 2024)
7930ff6  skip f* in BM experiment (babenek, Aug 13, 2024)
9851fb2  keep 0*-3* meta for experiment (babenek, Aug 13, 2024)
530b16e  less repos in test (babenek, Aug 13, 2024)
37d386d  refactoring2 (babenek, Aug 13, 2024)
653f10b  read_text.cache_clear() (babenek, Aug 13, 2024)
ea871c6  --subtext in benchmark (babenek, Aug 13, 2024)
3bb24a0  [skip actions] [subhashtext] 2024-08-13T12:52:11+03:00 (babenek, Aug 13, 2024)
bf4eb64  [skip actions] [subhashtext] 2024-08-13T12:55:14+03:00 (babenek, Aug 13, 2024)
1a09f85  fix (babenek, Aug 13, 2024)
8c6c30d  subtext (babenek, Aug 13, 2024)
09f813d  Merge branch 'main' into auxiliary (babenek, Aug 14, 2024)
1be4e7c  [skip actions] [subhashtext] 2024-08-14T07:46:29+03:00 (babenek, Aug 14, 2024)
feeefc3  experiment ml rollback (babenek, Aug 14, 2024)
95e0b1a  BM scores with hashes (babenek, Aug 14, 2024)
0ce84fc  some rollbacks (babenek, Aug 14, 2024)
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -121,7 +121,7 @@ jobs:

       - name: Run CredSweeper tool
         run: |
-          credsweeper --banner --log info --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
+          credsweeper --banner --log info --jobs $(nproc) --subtext --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log

       - name: Run Benchmark
         run: |
62 changes: 32 additions & 30 deletions cicd/benchmark.txt

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion credsweeper/__main__.py
@@ -8,7 +8,7 @@

 from credsweeper import __version__
 from credsweeper.app import APP_PATH, CredSweeper
-from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType
+from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType, ML_HUNK
 from credsweeper.file_handler.abstract_provider import AbstractProvider
 from credsweeper.file_handler.files_provider import FilesProvider
 from credsweeper.file_handler.patches_provider import PatchesProvider
@@ -215,6 +215,14 @@ def get_arguments() -> Namespace:
const="output.xlsx",
dest="xlsx_filename",
metavar="PATH")
parser.add_argument("--hashed",
help="line, variable, value will be hashed in output",
action="store_const",
const=True)
parser.add_argument("--subtext",
help=f"line text will be stripped in {2 * ML_HUNK} symbols but value and variable are kept",
action="store_const",
const=True)
parser.add_argument("--sort", help="enable output sorting", dest="sort_output", action="store_true")
parser.add_argument("--log",
"-l",
@@ -282,6 +290,8 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
         api_validation=args.api_validation,
         json_filename=json_filename,
         xlsx_filename=xlsx_filename,
+        hashed=args.hashed,
+        subtext=args.subtext,
         sort_output=args.sort_output,
         use_filters=args.no_filters,
         pool_count=args.jobs,
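
Note: the new flags flow through scan() into the CredSweeper constructor, so the same behavior is reachable from the Python API. A minimal sketch (the scan path is hypothetical; FilesProvider and run are used as in __main__.py):

    from credsweeper.app import CredSweeper
    from credsweeper.file_handler.files_provider import FilesProvider

    # scan ./src and write a report where line, variable and value are sha256 digests
    provider = FilesProvider(["./src"])
    sweeper = CredSweeper(json_filename="report.json", hashed=True, subtext=True)
    sweeper.run(provider)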
13 changes: 10 additions & 3 deletions credsweeper/app.py
@@ -42,6 +42,8 @@ def __init__(self,
                  api_validation: bool = False,
                  json_filename: Union[None, str, Path] = None,
                  xlsx_filename: Union[None, str, Path] = None,
+                 hashed: bool = False,
+                 subtext: bool = False,
                  sort_output: bool = False,
                  use_filters: bool = True,
                  pool_count: int = 1,
@@ -70,6 +72,8 @@ def __init__(self,
                 to json
             xlsx_filename: optional string variable, path to save result
                 to xlsx
+            hashed: use hash of line, value and variable instead plain text
+            subtext: use subtext of line near variable-value like it performed in ML
             use_filters: boolean variable, specifying the need of rule filters
             pool_count: int value, number of parallel processes to use
             ml_batch_size: int value, size of the batch for model inference
@@ -104,6 +108,8 @@ def __init__(self,
         self.credential_manager = CredentialManager()
         self.json_filename: Union[None, str, Path] = json_filename
         self.xlsx_filename: Union[None, str, Path] = xlsx_filename
+        self.hashed = hashed
+        self.subtext = subtext
         self.sort_output = sort_output
         self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
         self.ml_threshold = ml_threshold
@@ -400,16 +406,17 @@ def export_results(self) -> None:

         if self.json_filename:
             is_exported = True
-            Util.json_dump([credential.to_json() for credential in credentials], file_path=self.json_filename)
+            Util.json_dump([credential.to_json(hashed=self.hashed, subtext=self.subtext) for credential in credentials],
+                           file_path=self.json_filename)

         if self.xlsx_filename:
             is_exported = True
             data_list = []
             for credential in credentials:
-                data_list.extend(credential.to_dict_list())
+                data_list.extend(credential.to_dict_list(hashed=self.hashed, subtext=self.subtext))
             df = pd.DataFrame(data=data_list)
             df.to_excel(self.xlsx_filename, index=False)

         if is_exported is False:
             for credential in credentials:
-                print(credential)
+                print(credential.to_str(hashed=self.hashed, subtext=self.subtext))
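
Note: because the sha256 digest is deterministic, a report produced with hashed=True can still be audited against a known secret without exposing plain text. A sketch (assumes report.json came from a --hashed run with the default candidate output fields):

    import hashlib
    import json

    with open("report.json", "r", encoding="utf8") as f:
        report = json.load(f)
    # recompute the digest the same way get_hash_or_subtext does: sha256 over UTF-8 bytes
    digest = hashlib.sha256("my_known_secret".encode("utf-8")).hexdigest()
    hits = [cred for cred in report
            if any(ld.get("value") == digest for ld in cred["line_data_list"])]
    print(f"{len(hits)} findings match the known secret")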
3 changes: 3 additions & 0 deletions credsweeper/common/constants.py
@@ -1,4 +1,5 @@
 import re
+import typing
 from enum import Enum
 from typing import Optional, Union

@@ -167,6 +168,8 @@ class DiffRowType(Enum):
     DELETED = "deleted"


+StartEnd = typing.NamedTuple("StartEnd", [("start", int), ("end", int)])
+
 MIN_VARIABLE_LENGTH = 1
 MIN_SEPARATOR_LENGTH = 1
 MIN_VALUE_LENGTH = 4
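
Note: StartEnd is a plain named tuple of two indices, used below to pass cut positions into line_data.py. A quick sketch of the slicing it implies (values are illustrative):

    from credsweeper.common.constants import StartEnd

    pos = StartEnd(start=7, end=13)
    line = "token: deadbeef0000"
    assert line[pos.start:pos.end] == "deadbe"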
18 changes: 11 additions & 7 deletions credsweeper/credentials/candidate.py
@@ -88,18 +88,22 @@ def is_api_validation_available(self) -> bool:
"""
return len(self.validations) > 0

def __str__(self) -> str:
def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
"""Represent candidate with subtext or|and hashed values"""
return f"rule: {self.rule_name}" \
f" | severity: {self.severity.value}" \
f" | confidence: {self.confidence.value}" \
f" | line_data_list: {self.line_data_list}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
f" | api_validation: {self.api_validation.name}" \
f" | ml_validation: {self.ml_validation.name}"

def __str__(self):
return self.to_str()

def __repr__(self):
return str(self)
return self.to_str(subtext=True)

def to_json(self) -> Dict:
def to_json(self, hashed: bool, subtext: bool) -> Dict:
"""Convert credential candidate object to dictionary.

Return:
@@ -116,23 +120,23 @@ def to_json(self) -> Dict:
"confidence": self.confidence.value,
"use_ml": self.use_ml,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json() for line_data in self.line_data_list],
"line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
}
if self.config is not None:
reported_output = {k: v for k, v in full_output.items() if k in self.config.candidate_output}
else:
reported_output = full_output
return reported_output

def to_dict_list(self) -> List[dict]:
def to_dict_list(self, hashed: bool, subtext: bool) -> List[dict]:
"""Convert credential candidate object to List[dict].

Return:
List[dict] object generated from current credential candidate

"""
reported_output = []
json_output = self.to_json()
json_output = self.to_json(hashed, subtext)
refined_data = copy.deepcopy(json_output)
del refined_data["line_data_list"]
for line_data in json_output["line_data_list"]:
Expand Down
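
Note: a candidate now has three distinct renderings. A sketch, where candidate stands for any detected Candidate object:

    print(str(candidate))                 # __str__ -> to_str(): full plain text, as before
    print(repr(candidate))                # __repr__ -> to_str(subtext=True): line cut around the finding
    print(candidate.to_str(hashed=True))  # line data rendered as sha256 digests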
66 changes: 57 additions & 9 deletions credsweeper/credentials/line_data.py
@@ -1,10 +1,11 @@
 import contextlib
+import hashlib
 import re
 import string
 from functools import cached_property
 from typing import Any, Dict, Optional, Tuple

-from credsweeper.common.constants import MAX_LINE_LENGTH
+from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, StartEnd, ML_HUNK
 from credsweeper.config import Config
 from credsweeper.utils import Util
 from credsweeper.utils.entropy_validator import EntropyValidator
@@ -300,34 +301,81 @@ def is_source_file_with_quotes(self) -> bool:
                 return True
         return False

+    @staticmethod
+    def get_hash_or_subtext(
+            text: Optional[str],  #
+            hashed: bool,  #
+            cut_pos: Optional[StartEnd] = None,  #
+    ) -> Optional[str]:
+        """Represent not empty text with hash or a "beauty" subtext if required
+
+        Args:
+            text: str - input string
+            hashed: bool - whether the text will be hashed and returned
+            cut_pos: Optional[StartEnd] - start, end positions which text must be kept in output
+
+        Return:
+            sha256 hash in hex representation of input text with UTF-8 encodings
+            or
+            subtext from start to end, or original text as is
+
+        """
+        if text:
+            if hashed:
+                text = hashlib.sha256(text.encode(UTF_8, errors="strict")).hexdigest()
+            elif cut_pos is not None:
+                if 2 * ML_HUNK < cut_pos.end - cut_pos.start:
+                    # subtext positions exceed the limit
+                    text = text[cut_pos.start:cut_pos.end]
+                else:
+                    strip_text = text.strip()
+                    if 2 * ML_HUNK >= len(strip_text):
+                        # stripped text length meets the limit
+                        text = strip_text
+                    else:
+                        offset = len(text) - len(text.lstrip())
+                        center = (cut_pos.end + cut_pos.start - offset) >> 1
+                        text = Util.subtext(strip_text, center, ML_HUNK)
+        return text
+
+    def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
+        """Represent line_data with subtext or|and hashed values"""
+        cut_pos = StartEnd(self.variable_start, self.value_end) if subtext else None
+        return f"line: '{self.get_hash_or_subtext(self.line, hashed, cut_pos)}'" \
+               f" | line_num: {self.line_num} | path: {self.path}" \
+               f" | value: '{self.get_hash_or_subtext(self.value, hashed)}'" \
+               f" | entropy_validation: {EntropyValidator(self.value)}"
+
     def __str__(self):
-        return f"line: '{self.line}' | line_num: {self.line_num} | path: {self.path}" \
-               f" | value: '{self.value}' | entropy_validation: {EntropyValidator(self.value)}"
+        return self.to_str()

     def __repr__(self):
-        return str(self)
+        return self.to_str(subtext=True)

-    def to_json(self) -> Dict:
+    def to_json(self, hashed: bool, subtext: bool) -> Dict:
         """Convert line data object to dictionary.

         Return:
             Dictionary object generated from current line data

         """
+        cut_pos = StartEnd(self.variable_start if 0 <= self.variable_start else self.value_start,
+                           self.value_end) if subtext else None
         full_output = {
             "key": self.key,
-            "line": self.line,
+            "line": self.get_hash_or_subtext(self.line, hashed, cut_pos),
             "line_num": self.line_num,
             "path": self.path,
-            "info": self.info,
+            # info may contain variable name - so let it be hashed if requested
+            "info": self.get_hash_or_subtext(self.info, hashed),
             "pattern": self.pattern.pattern,
             "separator": self.separator,
             "separator_start": self.separator_start,
             "separator_end": self.separator_end,
-            "value": self.value,
+            "value": self.get_hash_or_subtext(self.value, hashed),
             "value_start": self.value_start,
             "value_end": self.value_end,
-            "variable": self.variable,
+            "variable": self.get_hash_or_subtext(self.variable, hashed),
             "variable_start": self.variable_start,
             "variable_end": self.variable_end,
             "value_leftquote": self.value_leftquote,
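
Note: get_hash_or_subtext is a staticmethod, so its two modes are easy to check in isolation. A sketch with an illustrative line (ML_HUNK is assumed to be 80, matching the 160-symbol limit in the help text):

    import hashlib

    from credsweeper.common.constants import StartEnd
    from credsweeper.credentials.line_data import LineData

    line = "    password = 'Xdj4jG7Ejb1x'    "
    # hashed=True: sha256 hex digest of the UTF-8 bytes, regardless of cut_pos
    assert LineData.get_hash_or_subtext(line, True) == \
           hashlib.sha256(line.encode("utf-8")).hexdigest()
    # hashed=False with a cut position: the stripped line fits in 2 * ML_HUNK symbols,
    # so the whole stripped text is kept
    assert LineData.get_hash_or_subtext(line, False, StartEnd(4, 29)) == "password = 'Xdj4jG7Ejb1x'"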
4 changes: 2 additions & 2 deletions credsweeper/utils/util.py
@@ -226,8 +226,8 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[
             text = content.decode(encoding, errors="strict")
             if content != text.encode(encoding, errors="strict"):
                 raise UnicodeError
-            # windows style workaround
-            lines = text.replace('\r\n', '\n').replace('\r', '\n').split("\n")
+            # windows & macos styles workaround
+            lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
             break
         except UnicodeError:
             binary_suggest = True
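
Note: the replacement chain normalizes all three line-break styles before splitting, so a lone carriage return no longer glues two lines together. A one-line check:

    text = "unix\nwindows\r\nmacos\rend"
    lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    assert lines == ["unix", "windows", "macos", "end"]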
4 changes: 3 additions & 1 deletion docs/source/guide.rst
@@ -15,7 +15,7 @@ Get all argument list:

 usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH]) [--rules [PATH]] [--severity SEVERITY] [--config [PATH]]
                              [--log_config [PATH]] [--denylist PATH] [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR] [--ml_batch_size POSITIVE_INT]
-                             [--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
+                             [--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
                              [--banner] [--version]
 options:
   -h, --help            show this help message and exit
@@ -49,6 +49,8 @@ Get all argument list:
   --skip_ignored        parse .gitignore files and skip credentials from ignored objects
   --save-json [PATH]    save result to json file (default: output.json)
   --save-xlsx [PATH]    save result to xlsx file (default: output.xlsx)
+  --hashed              line, variable, value will be hashed in output
+  --subtext             line text will be stripped in 160 symbols but value and variable are kept
   --sort                enable output sorting
   --log LOG_LEVEL, -l LOG_LEVEL
                         provide logging level of ['DEBUG', 'INFO', 'WARN', 'WARNING', 'ERROR', 'FATAL', 'CRITICAL', 'SILENCE'](default: 'warning', case insensitive)
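
For example, a scan whose report is both shortened and free of plain-text secrets (the path is hypothetical; the flags may be combined, and --hashed takes precedence for the line field):

    python -m credsweeper --path ./src --save-json output.json --subtext --hashed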
4 changes: 2 additions & 2 deletions experiment/main.py
@@ -20,7 +20,7 @@
 from experiment.src.features import prepare_data
 from experiment.src.lstm_model import get_model
 from experiment.src.model_config_preprocess import model_config_preprocess
-from experiment.src.prepare_data import prepare_train_data, meta_checksum
+from experiment.src.prepare_data import prepare_train_data, data_checksum


 def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray):
@@ -59,7 +59,7 @@ def main(cred_data_location: str, jobs: int) -> str:
     prepare_train_data(_cred_data_location, jobs)

     # detected data means which data is passed to ML validator of credsweeper after filters with RuleName
-    detected_data = read_detected_data(f"results/detected_data.{meta_checksum(cred_data_location)}.json")
+    detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location)}.json")
     print(f"CredSweeper detected {len(detected_data)} credentials without ML")
     # all markup data
     meta_data = read_metadata(f"{cred_data_location}/meta")
4 changes: 2 additions & 2 deletions experiment/main.sh
@@ -18,7 +18,7 @@ if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

 cd ${CREDSWEEPER_DIR}
 report_file=${RESULT_DIR}/${now}.json
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log info --job $(nproc) --save-json ${report_file}
+${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/abspos/data/ --log info --job $(nproc) --subtext --save-json ${report_file}

-cd ~/q/DataCred/auxiliary/
+cd ~/q/DataCred/abspos/
 .venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${report_file}.log
32 changes: 16 additions & 16 deletions experiment/src/data_loader.py
@@ -3,6 +3,7 @@
 import os
 import pathlib
 from copy import deepcopy
+from functools import cache
 from typing import Tuple, Dict, Set, Any

 import numpy as np
@@ -38,14 +39,7 @@ def read_detected_data(file_path: str) -> Dict[identifier, Dict]:
         line_data = deepcopy(cred["line_data_list"][0])
         line_data.pop("entropy_validation")
         line_data.pop("info")
-        line = line_data["line"].lstrip()
-        offset = len(line_data["line"]) - len(line)
-        line_data["line"] = line.rstrip()
-        line_data["value_start"] -= offset
-        line_data["value_end"] -= offset
-        line_data["variable_start"] -= offset
-        line_data["variable_end"] -= offset
-        assert line_data["value"] == line_data["line"][line_data["value_start"]:line_data["value_end"]], line_data
+        line_data["line"] = None  # will be read during join_label with data for ML input only
         meta_path = transform_to_meta_path(line_data["path"])
         line_data["path"] = meta_path
         line_data["RuleName"] = [rule_name]
@@ -143,11 +137,20 @@ def get_colored_line(line_data: Dict[str, Any]) -> str:

 def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier, Dict],
                cred_data_location: str) -> pd.DataFrame:
+
+    @cache
+    def read_text(path) -> list[str]:
+        with open(path, "r", encoding="utf8") as f:
+            return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')
+
     values = []
     detected_rules: Set[str] = set()
     for index, line_data in detected_data.items():
         for i in line_data["RuleName"]:
             detected_rules.add(i)
+        text = read_text(f'{cred_data_location}/{line_data["path"]}')
+        line = text[line_data["line_num"] - 1]
+        line_data["line"] = line
         if not line_data["value"]:
             print(f"WARNING: empty value\n{line_data}")
             continue
@@ -184,11 +187,9 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier
f"\nvariable:'{line_data['variable']}' value:'{line_data['value']}'"
f"\nsub_line:'{get_colored_line(line_data)}'")
continue
line = line_data["line"]
# the line in detected data must be striped
assert line == line.strip(), line_data
# check the value in detected data
assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"]
assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"], (
line_data, line[line_data["value_start"]:line_data["value_end"]], line_data["value"])
# todo: variable input has to be markup in meta too, or/and new feature "VariableExists" created ???
line_data["GroundTruth"] = label
line_data["ext"] = Util.get_extension(line_data["path"])
@@ -206,18 +207,17 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier
             print(','.join(markup.keys()))
             all_meta_found = False
             print(','.join(str(x) for x in markup.values()))
-            text = Util.read_file(f'{cred_data_location}/{markup["FilePath"]}')
-            line = text[markup["LineStart"] - 1].strip()
+            text = read_text(f'{cred_data_location}/{markup["FilePath"]}')
+            line = text[markup["LineStart"] - 1]
             if 0 <= markup["ValueStart"] and 0 <= markup["ValueEnd"]:
                 line = line[:markup["ValueStart"]] \
                        + Fore.LIGHTGREEN_EX \
                        + line[markup["ValueStart"]:markup["ValueEnd"]] \
                        + Style.RESET_ALL \
                        + line[markup["ValueEnd"]:]
             print(line)
-            # print(Util.subtext(line, markup['ValueStart'], ML_HUNK))
             break

+    read_text.cache_clear()
     df = pd.DataFrame(values)
     return df
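
Note: the cached reader keys on the path, so every markup row that points into the same file reuses one in-memory copy, and cache_clear() releases the contents once the DataFrame is built. The pattern as a standalone sketch (the path is illustrative):

    from functools import cache

    @cache
    def read_text(path) -> list[str]:
        with open(path, "r", encoding="utf8") as f:
            return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

    first = read_text("meta/sample.csv")  # reads from disk
    again = read_text("meta/sample.csv")  # served from the cache, same list object
    assert first is again
    read_text.cache_clear()               # drop cached file contents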