Skip to content

Commit

Permalink
optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jul 17, 2023
1 parent 562d611 commit 4111067
Show file tree
Hide file tree
Showing 38 changed files with 304 additions and 270 deletions.
12 changes: 8 additions & 4 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Add synthetic huge data
if: steps.cache-data.outputs.cache-hit == 'true'
run: python -c "for n in range(7654321):print(f'{n:08x}')" >data/test.text

- name: Update PIP
run: python -m pip install --upgrade pip

Expand All @@ -193,10 +197,10 @@ jobs:
# check the banner
credsweeper --banner
- name: Run performance benchmark
- name: Run performance benchmark RELEASE
run: |
START_TIME=$(date +%s)
/usr/bin/time --verbose credsweeper --path data --save-json /dev/null
/usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
FINISH_TIME=$(date +%s)
RELEASE_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
if [ 0 -lt ${RELEASE_TIME} ]; then
Expand Down Expand Up @@ -224,10 +228,10 @@ jobs:
# check the banner
credsweeper --banner
- name: Run performance benchmark
- name: Run performance benchmark CURRENT
run: |
START_TIME=$(date +%s)
/usr/bin/time --verbose credsweeper --path data --save-json /dev/null
/usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
FINISH_TIME=$(date +%s)
HEAD_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
if [ 0 -lt ${HEAD_TIME} ]; then
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
done
exit ${n}
# # # git workflow
# # # git workflow - informational only, because merges are performed with squash

- name: Get latest release tag name
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
Expand Down Expand Up @@ -85,7 +85,6 @@ jobs:
echo "GIT workflow OK"
else
echo "Please, rebase the branch after ${LATEST_RELEASE_TAG}"
exit 1
fi
# # # Python setup
Expand Down
4 changes: 2 additions & 2 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,8 @@ class DiffRowType(Enum):
"""Diff type of row"""
ADDED = "added"
DELETED = "deleted"
ADDED_ACCOMPANY = "added_accompany"
DELETED_ACCOMPANY = "deleted_accompany"
# ADDED_ACCOMPANY = "added_accompany"
# DELETED_ACCOMPANY = "deleted_accompany"


MIN_VARIABLE_LENGTH = 1
Expand Down
70 changes: 62 additions & 8 deletions credsweeper/credentials/candidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@ class Candidate:
use_ml: Should ML work on this credential or not. If not prediction based on regular expression and filter only
"""

__slots__ = [
"__api_validation",
"__ml_validation",
"__line_data_list",
"__patterns",
"__ml_probability",
"__rule_name",
"__severity",
"__validations",
"__use_ml",
"__config",
]

def __init__(self,
line_data_list: List[LineData],
patterns: List[re.Pattern],
Expand All @@ -32,16 +45,57 @@ def __init__(self,
config: Config,
validations: List[Validation] = None,
use_ml: bool = False) -> None:
self.line_data_list = line_data_list if line_data_list is not None else []
self.patterns = patterns if patterns is not None else []
self.rule_name = rule_name
self.severity = severity
self.config = config
self.validations = validations if validations else []
self.use_ml = use_ml

self.api_validation = KeyValidationOption.NOT_AVAILABLE
self.ml_validation = KeyValidationOption.NOT_AVAILABLE
self.line_data_list: List[LineData] = line_data_list if line_data_list else []
self.patterns: List[re.Pattern] = patterns if patterns else []
self.ml_probability = None
self.rule_name: str = rule_name
self.severity: Optional[Severity] = severity
self.validations: List[Validation] = validations if validations else []
self.use_ml: bool = use_ml
self.config = config

@property
def config(self) -> Config:
"""config getter"""
return self.__config

@config.setter
def config(self, config: Config) -> None:
"""config setter"""
self.__config = config

@property
def use_ml(self) -> bool:
"""use_ml getter"""
return self.__use_ml

@use_ml.setter
def use_ml(self, use_ml: bool) -> None:
"""use_ml setter"""
self.__use_ml = use_ml

@property
def ml_probability(self) -> Optional[float]:
"""ml_probability getter"""
return self.__ml_probability

@ml_probability.setter
def ml_probability(self, ml_probability: Optional[float]) -> None:
"""ml_probability setter"""
self.__ml_probability = ml_probability

@property
def validations(self) -> List[Validation]:
"""validations getter"""
return self.__validations

@validations.setter
def validations(self, validations: List[Validation]) -> None:
"""validations setter"""
self.__validations = validations

@property
def api_validation(self) -> KeyValidationOption:
Expand Down Expand Up @@ -182,7 +236,7 @@ def to_dict_list(self) -> List[dict]:
def get_dummy_candidate(cls, config: Config, file_path: str, file_type: str, info: str):
"""Create dummy instance to use in searching file by extension"""
return cls( #
line_data_list=[LineData(config, "dummy line", -1,0, file_path, file_type, info, re.compile(".*"))],
line_data_list=[LineData(config, "dummy line", -1, 0, file_path, file_type, info, re.compile(".*"))],
patterns=[re.compile(".*")], #
rule_name="Dummy candidate", #
severity=Severity.INFO, #
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def line_pos(self) -> int:
def line_pos(self, line_pos: int) -> None:
"""line_pos setter"""
self.__line_pos = line_pos

@property
def line_num(self) -> int:
"""line_num getter"""
Expand Down
3 changes: 1 addition & 2 deletions credsweeper/deep_scanner/pdf_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ def data_scan(
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|PDF")
analysis_targets = string_data_provider.get_analysis_target()
pdf_candidates = self.scanner.scan(analysis_targets)
pdf_candidates = self.scanner.scan(string_data_provider)
candidates.extend(pdf_candidates)
except Exception as pdf_exc:
logger.error(f"{data_provider.file_path}:{pdf_exc}")
Expand Down
13 changes: 7 additions & 6 deletions credsweeper/file_handler/analysis_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
class AnalysisTarget:
"""AnalysisTarget"""

def __init__(self,
line_pos: int,
lines: List[str],
line_nums: List[int],
descriptor: Descriptor,
):
def __init__(
self,
line_pos: int,
lines: List[str],
line_nums: List[int],
descriptor: Descriptor,
):
self.__line_pos = line_pos
self.__lines = lines
self.__line_nums = line_nums
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/file_handler/byte_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def lines(self, lines: List[str]) -> None:
"""lines setter for ByteContentProvider"""
self.__lines = lines

def yield_analysis_target(self) -> Generator[AnalysisTarget,None,None]:
def yield_analysis_target(self) -> Generator[AnalysisTarget, None, None]:
"""Return lines to scan.
Return:
Expand Down
9 changes: 6 additions & 3 deletions credsweeper/file_handler/content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def __init__(
self.__descriptor = Descriptor(_file_path, _file_type, _info)

@abstractmethod
def yield_analysis_target(self) -> Generator[AnalysisTarget,None,None]:
def yield_analysis_target(self) -> Generator[AnalysisTarget, None, None]:
"""Load and preprocess file diff data to scan.
Return:
Expand Down Expand Up @@ -72,7 +72,10 @@ def data(self, data: Optional[bytes]) -> None:
"""abstract data setter"""
raise NotImplementedError(__name__)

def lines_to_targets(self, lines: List[str], line_nums: Optional[List[int]] = None) -> Generator[AnalysisTarget,None,None]:
def lines_to_targets(
self, #
lines: List[str], #
line_nums: Optional[List[int]] = None) -> Generator[AnalysisTarget, None, None]:
"""Creates list of targets with multiline concatenation"""
if line_nums and len(line_nums) == len(lines):
for line_pos in range(len(lines)):
Expand All @@ -81,7 +84,7 @@ def lines_to_targets(self, lines: List[str], line_nums: Optional[List[int]] = No
else:
if line_nums and len(line_nums) != len(lines):
logger.warning(f"line numerations {len(line_nums)} does not match lines {len(lines)}")
_line_nums = [x for x in range(len(lines))]
_line_nums = [x + 1 for x in range(len(lines))]
for line_pos in range(len(lines)):
target = AnalysisTarget(line_pos, lines, _line_nums, self.descriptor)
yield target
2 changes: 1 addition & 1 deletion credsweeper/file_handler/data_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def represent_as_encoded(self) -> bool:
return self.decoded is not None and 0 < len(self.decoded)
return False

def yield_analysis_target(self) -> Generator[AnalysisTarget,None,None]:
def yield_analysis_target(self) -> Generator[AnalysisTarget, None, None]:
"""Return nothing. The class provides only data storage.
Raise:
Expand Down
17 changes: 6 additions & 11 deletions credsweeper/file_handler/diff_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ def parse_lines_data(self, lines_data: List[DiffRowData]) -> Tuple[List[int], Li
all_lines = []
for line_data in lines_data:
if line_data.line_type == self.change_type:
# or DiffRowType.ADDED_ACCOMPANY == line_data.line_type and DiffRowType.ADDED == self.change_type \
# or DiffRowType.DELETED_ACCOMPANY == line_data.line_type and DiffRowType.DELETED == self.change_type
change_numbs.append(line_data.line_numb)
all_lines.append(line_data.line)
return change_numbs, all_lines
Expand All @@ -75,14 +77,7 @@ def yield_analysis_target(self) -> Generator[AnalysisTarget, None, None]:
"""
lines_data = Util.preprocess_file_diff(self.diff)
try:
change_numbs, all_lines = self.parse_lines_data(lines_data)
for l_pos in range(len(change_numbs)):
target = AnalysisTarget(
l_pos, #
all_lines, #
change_numbs, #
self.descriptor) #
yield target
except Exception as exc:
logger.error(f"Wrong diff {type(exc)} {exc}")
change_numbs, all_lines = self.parse_lines_data(lines_data)
for l_pos in range(len(all_lines)):
target = AnalysisTarget(l_pos, all_lines, change_numbs, self.descriptor)
yield target
4 changes: 2 additions & 2 deletions credsweeper/file_handler/struct_content_provider.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import List, Optional, Any, Generator
from typing import Optional, Any, Generator

from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
Expand Down Expand Up @@ -44,7 +44,7 @@ def data(self, data: bytes) -> None:
"""data setter for StructContentProvider"""
raise NotImplementedError(__name__)

def yield_analysis_target(self) -> Generator[AnalysisTarget,None,None]:
def yield_analysis_target(self) -> Generator[AnalysisTarget, None, None]:
"""Return nothing. The class provides only data storage.
Raise:
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/file_handler/text_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def lines(self, lines: Optional[List[str]]) -> None:
"""lines setter for TextContentProvider"""
self.__lines = lines

def yield_analysis_target(self) -> Generator[AnalysisTarget,None,None]:
def yield_analysis_target(self) -> Generator[AnalysisTarget, None, None]:
"""Load and preprocess file content to scan.
Return:
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@
type: pattern
values:
- (^|[^.0-9A-Za-z_/+-])(?=[A-Za-z0-9]{64})(?P<value>[A-Za-z0-9]{10,12}[B-Za-z0-9]A{10,12}[B-Za-z0-9][A-Za-z0-9]{40,44})([^=0-9A-Za-z_/+-]|$)
filter_type: []
filter_type: [ ]
min_line_len: 43
required_substrings:
- AAAAAAAAAA
Expand Down
5 changes: 5 additions & 0 deletions credsweeper/scanner/scan_type/multi_pattern.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional

from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.config import Config
from credsweeper.credentials import Candidate
from credsweeper.file_handler.analysis_target import AnalysisTarget
Expand Down Expand Up @@ -76,6 +77,10 @@ def _scan(cls, config: Config, candidate: Candidate, candi_line_pos: int, target
"""
new_target = AnalysisTarget(candi_line_pos, target.lines, target.line_nums, target.descriptor)

if MAX_LINE_LENGTH < new_target.line_len:
return False

line_data = cls.get_line_data(config=config, target=new_target, pattern=rule.patterns[1], filters=rule.filters)

if line_data is None:
Expand Down
13 changes: 3 additions & 10 deletions credsweeper/scanner/scan_type/pem_key_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,23 +77,16 @@ def detect_pem_key(cls, config: Config, rule: Rule, target: AnalysisTarget) -> L
key_data = ""
# get line with -----BEGIN which may contain full key
first_line = LineData(config, target.line, target.line_pos, target.line_num, target.file_path, target.file_type,
target.info,
rule.patterns[0])
target.info, rule.patterns[0])
line_data.append(first_line)
# protection check for case when first line starts from 0
start_pos = target.line_pos if 0 <= target.line_pos else 0
finish_pos = min(start_pos + 200, target.lines_len)
for line_pos in range(start_pos, finish_pos):
line = target.lines[line_pos]
if target.line_pos != line_pos:
_line = LineData(config, #
line, #
line_pos, #
target.line_nums[line_pos], #
target.file_path, #
target.file_type, #
target.info, #
cls.re_value_pem)
_line = LineData(config, line, line_pos, target.line_nums[line_pos], target.file_path, target.file_type,
target.info, cls.re_value_pem)
line_data.append(_line)
# replace escaped line endings with real ones and process them - PEM does not contain the '\' character
sublines = line.replace("\\r", '\n').replace("\\n", '\n').splitlines()
Expand Down
10 changes: 2 additions & 8 deletions credsweeper/scanner/scan_type/scan_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,8 @@ def get_line_data(
return None
logger.debug("Valid line for pattern: %s in file: %s:%d in line: %s", pattern, target.file_path,
target.line_num, target.line)
line_data = LineData(config, #
target.line, #
target.line_pos, #
target.line_num, #
target.file_path, #
target.file_type, #
target.info, #
pattern)
line_data = LineData(config, target.line, target.line_pos, target.line_num, target.file_path, target.file_type,
target.info, pattern)

if cls.filtering(config, target, line_data, filters):
return None
Expand Down
7 changes: 2 additions & 5 deletions credsweeper/scanner/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,9 @@ def _is_available(self, rule: Rule) -> bool:
return True
return False

def yield_rule_scanner(self, #
line_len: int, #
matched_pattern: bool, #
matched_keyword: bool, #
def yield_rule_scanner(self, line_len: int, matched_pattern: bool, matched_keyword: bool,
matched_pem_key: bool) -> Generator[Tuple[Rule, Type[ScanType]], None, None]:

"""Returns a generator of rules and their corresponding scanners"""
for rule, scanner in self.rules_scanners:
if line_len >= rule.min_line_len \
and (RuleType.PATTERN == rule.rule_type and matched_pattern
Expand Down
Loading

0 comments on commit 4111067

Please sign in to comment.