From e6d2db0d162cf404c4b9b11b916e08d341ac621a Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Fri, 6 Sep 2024 11:30:48 +0300 Subject: [PATCH] Apply morpheme check for CamelCase and ValueFilePath filters --- credsweeper/common/keyword_checklist.py | 18 ++++++++++++++++++ credsweeper/common/morpheme_checklist.txt | 6 ++++++ credsweeper/filters/value_camel_case_check.py | 3 ++- .../filters/value_couple_keyword_check.py | 9 +-------- credsweeper/filters/value_file_path_check.py | 7 ++++--- tests/common/test_keyword_checklist.py | 2 +- tests/filters/test_value_file_path_check.py | 8 +++++--- 7 files changed, 37 insertions(+), 16 deletions(-) diff --git a/credsweeper/common/keyword_checklist.py b/credsweeper/common/keyword_checklist.py index 63f37c2e8..401a345e4 100644 --- a/credsweeper/common/keyword_checklist.py +++ b/credsweeper/common/keyword_checklist.py @@ -49,3 +49,21 @@ def morpheme_set(self) -> Set[str]: def morpheme_len(self) -> int: """Length of morpheme_set""" return len(self.__morpheme_set) + + def check_morphemes(self, line_lower:str, threshold:int)->bool: + """Checks whether the number of morphemes found in the line exceeds the threshold. 
+ + Args: + line_lower: input line - MUST be in lower case + threshold: number of matched morphemes that must be exceeded + + Return: + True - if number of morphemes exceeds the threshold + """ + matches = 0 + for keyword in self.morpheme_set: + if keyword in line_lower: + matches += 1 + if threshold < matches: + return True + return False \ No newline at end of file diff --git a/credsweeper/common/morpheme_checklist.txt b/credsweeper/common/morpheme_checklist.txt index 13fc7d4db..f2e5240ac 100644 --- a/credsweeper/common/morpheme_checklist.txt +++ b/credsweeper/common/morpheme_checklist.txt @@ -1,3 +1,6 @@ +../ +.com +.org 000 111 222 @@ -373,6 +376,7 @@ course court cove cpu_ +crac creat cred cript @@ -694,6 +698,7 @@ hybrid iabl ical icon +id_rsa iden idle ieee @@ -1307,6 +1312,7 @@ spot spray sql src_ +ssh ssl stack stan diff --git a/credsweeper/filters/value_camel_case_check.py b/credsweeper/filters/value_camel_case_check.py index 86a0f45d2..0fad1df90 100644 --- a/credsweeper/filters/value_camel_case_check.py +++ b/credsweeper/filters/value_camel_case_check.py @@ -1,6 +1,7 @@ import re from credsweeper.config import Config +from credsweeper.common import static_keyword_checklist from credsweeper.credentials import LineData from credsweeper.file_handler.analysis_target import AnalysisTarget from credsweeper.filters import Filter @@ -30,6 +31,6 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: if line_data.is_well_quoted_value: return False if self.CAMEL_CASE_PATTERN.match(line_data.value): - return True + return static_keyword_checklist.check_morphemes(line_data.value.lower(), 1) return False diff --git a/credsweeper/filters/value_couple_keyword_check.py b/credsweeper/filters/value_couple_keyword_check.py index 513823944..0f97f0678 100644 --- a/credsweeper/filters/value_couple_keyword_check.py +++ b/credsweeper/filters/value_couple_keyword_check.py @@ -22,11 +22,4 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: True, if need to filter candidate 
and False if left """ - value = line_data.value.lower() - matches = 0 - for keyword in static_keyword_checklist.morpheme_set: - if keyword in value: - matches += 1 - if 1 < matches: - return True - return False + return static_keyword_checklist.check_morphemes(line_data.value.lower(), 1) diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py index 11d9fb37a..b871547dd 100644 --- a/credsweeper/filters/value_file_path_check.py +++ b/credsweeper/filters/value_file_path_check.py @@ -1,4 +1,5 @@ from credsweeper.common.constants import Chars +from credsweeper.common import static_keyword_checklist from credsweeper.config import Config from credsweeper.credentials import LineData from credsweeper.file_handler.analysis_target import AnalysisTarget @@ -13,7 +14,7 @@ class ValueFilePathCheck(Filter): and do not have any special characters ( !$@`&*()+) """ base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value) - unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~" + unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~^" unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\" def __init__(self, config: Config = None) -> None: @@ -41,7 +42,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: or value.startswith("//") and ':' == line_data.separator): # common case for url definition or aliases # or _keyword_://example.com where : is the separator - return True + return static_keyword_checklist.check_morphemes(value.lower(), 1) # base64 encoded data might look like linux path min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value)) # get minimal entropy to compare with shannon entropy of found value @@ -70,5 +71,5 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: break else: if contains_unix_separator ^ contains_windows_separator: - return True + return static_keyword_checklist.check_morphemes(value.lower(), 
1) return False diff --git a/tests/common/test_keyword_checklist.py b/tests/common/test_keyword_checklist.py index b86754ec8..6951cac61 100644 --- a/tests/common/test_keyword_checklist.py +++ b/tests/common/test_keyword_checklist.py @@ -17,7 +17,7 @@ def test_morpheme_set_p(self): for i in KeywordChecklist().morpheme_set: self.assertLessEqual(3, len(i)) # valid symbols for variable names - self.assertRegex(i, r"[a-z0-9_]{3,500}") + self.assertRegex(i, r"[a-z0-9_/.\\:]{3,500}") def test_keyword_set_n(self): # checks whether the keywords are unique, in lower case and not shorter than 3 symbols diff --git a/tests/filters/test_value_file_path_check.py b/tests/filters/test_value_file_path_check.py index 3a1697014..48a039bd3 100644 --- a/tests/filters/test_value_file_path_check.py +++ b/tests/filters/test_value_file_path_check.py @@ -17,11 +17,13 @@ def test_value_file_path_check_p(self, file_path: pytest.fixture, line: str) -> @pytest.mark.parametrize( "line", [ + "~/.ssh/id_rsa", # path + "../key", # path + "../../log", # path + "/home/user/.ssh/id_rsa", # path + "../.ssh/id_rsa", # path "crackle/filepath.txt", "/home/user/tmp", # simple path - "../..", # path - "dir/..", # path - "../dir", # path "file:///Crackle/filepath/", # path from browser url "~/.custompass", # path with synonym "./sshpass.sh", # path with synonym