Samsung · babenek · Jul 4, 2023 · Jul 3, 2023 · Jul 4, 2023
@@ -1,7 +1,7 @@
-Detected Credentials: 4675
-result_cnt : 4164, lost_cnt : 96, true_cnt : 3703, false_cnt : 365
-credsweeper -> TP : 3703, FP : 365, TN : 19429483, FN : 907, FPR : 0.0000187855, FNR : 0.1967462039, ACC : 0.9999345492, PRC : 0.9102753196, RCL : 0.8032537961, F1 : 0.8534224476
-credsweeper Private Key -> TP : 952, FP : 0, TN : 4, FN : 49, FPR : None, FNR : 0.0489510490, ACC : 0.9512437811, PRC : 1.0000000000, RCL : 0.9510489510, F1 : 0.9749103943
+Detected Credentials: 4693
+result_cnt : 4182, lost_cnt : 96, true_cnt : 3718, false_cnt : 368
+credsweeper -> TP : 3718, FP : 368, TN : 19429480, FN : 892, FPR : 0.0000189399, FNR : 0.1934924078, ACC : 0.9999351667, PRC : 0.9099363681, RCL : 0.8065075922, F1 : 0.8551057958
+credsweeper Private Key -> TP : 967, FP : 0, TN : 4, FN : 34, FPR : None, FNR : 0.0339660340, ACC : 0.9661691542, PRC : 1.0000000000, RCL : 0.9660339660, F1 : 0.9827235772
 credsweeper Predefined Pattern -> TP : 309, FP : 2, TN : 40, FN : 17, FPR : 0.0476190476, FNR : 0.0521472393, ACC : 0.9483695652, PRC : 0.9935691318, RCL : 0.9478527607, F1 : 0.9701726845
 credsweeper Password -> TP : 974, FP : 116, TN : 4164, FN : 422, FPR : 0.0271028037, FNR : 0.3022922636, ACC : 0.9052149401, PRC : 0.8935779817, RCL : 0.6977077364, F1 : 0.7835880933
 credsweeper Generic Token -> TP : 284, FP : 6, TN : 597, FN : 49, FPR : 0.0099502488, FNR : 0.1471471471, ACC : 0.9412393162, PRC : 0.9793103448, RCL : 0.8528528529, F1 : 0.9117174960

@@ -143,3 +143,4 @@ class DiffRowType(Enum):
 
 # PEM x509 patterns
 PEM_BEGIN_PATTERN = "-----BEGIN"
+PEM_END_PATTERN = "-----END"
@@ -444,8 +444,7 @@ diod
 dir_
 direct
 disab
-discipl
-discon
+disc
 disk
 dismi
 dispos
@@ -952,6 +951,7 @@ obj
 oblique
 occur
 ocean
+ocess
 oder
 off
 often
@@ -1052,7 +1052,7 @@ priv
 pro_
 probe
 problem
-process
+proc
 prod
 prof
 prog
@@ -1204,9 +1204,12 @@ scali
 scen
 sched
 schem
+scipl
+scont
 scope
 scram
 screen
+scret
 scri
 scro
 seal

@@ -274,7 +274,7 @@
   severity: high
   type: pem_key
   values:
-    - (?P<value>-----BEGIN\s(?!ENCRYPTED|EC)[^-]*PRIVATE[^-]*KEY[^-]*-----)
+    - (?P<value>-----BEGIN\s(?!ENCRYPTED|EC)[^-]*PRIVATE[^-]*KEY[^-]*-----(.+-----END[^-]+-----)?)
   filter_type:
     - LineSpecificKeyCheck
   min_line_len: 27

@@ -1,14 +1,18 @@
-from typing import List, Optional
+import logging
+import re
+import string
+from typing import Optional, List
 
+from credsweeper.common.constants import Chars, PEM_BEGIN_PATTERN, PEM_END_PATTERN
 from credsweeper.config import Config
-from credsweeper.credentials import Candidate
+from credsweeper.credentials import Candidate, LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
-from credsweeper.filters import ValuePatternCheck
+from credsweeper.filters import ValuePatternCheck, ValuePemPatternCheck
 from credsweeper.rules import Rule
 from credsweeper.scanner.scan_type import ScanType
 from credsweeper.utils import Util
 
-PEM_END_PATTERN = "-----END"
+logger = logging.getLogger(__name__)
 
 
 class PemKeyPattern(ScanType):
@@ -21,12 +25,18 @@ class PemKeyPattern(ScanType):
 
     """
 
-    ignore_starts = ["Proc-Type", "Version", "DEK-Info"]
-    remove_characters = " '\";,[]\n\r\t\\+#*"
+    ignore_starts = [PEM_BEGIN_PATTERN, "Proc-Type", "Version", "DEK-Info"]
+    wrap_characters = "\\'\";,[]#*"
+    remove_characters = string.whitespace + wrap_characters
+    remove_characters_plus = remove_characters + '+'
+    pem_pattern_check: Optional[ValuePatternCheck] = None
+    # the last line contains at least 4 symbols
+    re_value_pem = re.compile(r"(?P<value>([^-]*" + PEM_END_PATTERN +
+                              r"[^-]+-----)|(([a-zA-Z0-9/+=]{64}.*)?[a-zA-Z0-9/+=]{4})+)")
 
     @classmethod
     def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> Optional[Candidate]:
-        """Check if current line is a start of a PEM key.
+        """Check whether the target is a PEM key.
 
         Args:
             config: user configs
@@ -40,47 +50,73 @@ def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> Optional[Can
         """
         assert rule.pattern_type == rule.PEM_KEY_PATTERN, \
             "Rules provided to PemKeyPattern.run should have pattern_type equal to PEM_KEY_PATTERN"
-
-        if cls.is_pem_key(target.lines[target.line_num:], config):
-            return cls._get_candidate(config, rule, target)
+        if not cls.pem_pattern_check:
+            cls.pem_pattern_check = ValuePemPatternCheck(config)
+        if candidate := cls._get_candidate(config, rule, target):
+            if pem_lines := cls.detect_pem_key(config, rule, target):
+                candidate.line_data_list = pem_lines
+                return candidate
 
         return None
 
     @classmethod
-    def is_pem_key(cls, lines: List[str], config: Config) -> bool:
-        """Check if provided lines is a PEM key.
+    def detect_pem_key(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[LineData]:
+        """Detect a PEM key on a single line or, iteratively, across the following lines, according to
+        https://www.rfc-editor.org/rfc/rfc7468
 
         Args:
-            lines: Lines to be checked
+            config: Config
+            rule: Rule
+            target: Analysis target
 
         Return:
-            Boolean. True if PEM key, False otherwise
+            List of LineData with found PEM
 
         """
-        lines = cls.strip_lines(lines)
-        lines = cls.remove_leading_config_lines(lines)
+        line_data: List[LineData] = []
         key_data = ""
-        for line_num, line in enumerate(lines):
-            if line_num >= 190:
-                return False
-            if PEM_END_PATTERN in line:
-                # Check if entropy is high enough
-                removed_by_entropy = not Util.is_entropy_validate(key_data)
-                # Check if have no substring with 5 same consecutive characters (like 'AAAAA')
-                pattern_check = ValuePatternCheck(config)
-                removed_by_filter = pattern_check.equal_pattern_check(key_data)
-                not_removed = not (removed_by_entropy or removed_by_filter)
-                return not_removed
-            # PEM key line should not contain spaces or . (and especially not ...)
-            elif " " in line or "..." in line:
-                return False
-            else:
-                key_data += line
-
-        return False  # Return false if no `-END` section in lines
+        # get line with -----BEGIN which may contain full key
+        first_line = LineData(config, target.line, target.line_num, target.file_path, target.file_type, target.info,
+                              rule.patterns[0])
+        line_data.append(first_line)
+        # guard for the case when target.line_num is not positive: start counting from line 1
+        line_num = target.line_num if 0 < target.line_num else 1
+        finish_line = line_num + 200
+        for line in target.lines[line_num - 1:]:
+            if finish_line < line_num:
+                return []
+            if 1 != line_num and target.line_num != line_num:
+                _line = LineData(config, line, line_num, target.file_path, target.file_type, target.info,
+                                 cls.re_value_pem)
+                line_data.append(_line)
+            line_num += 1
+            # replace escaped line endings with real ones and process each part - PEM does not contain the '\' sign
+            sublines = line.replace("\\r", '\n').replace("\\n", '\n').splitlines()
+            for subline in sublines:
+                if cls.is_leading_config_line(subline):
+                    continue
+                elif PEM_END_PATTERN in subline:
+                    # Check if entropy is high enough for base64 set with padding sign
+                    entropy = Util.get_shannon_entropy(key_data, Chars.BASE64_CHARS.value)
+                    if 4.85 > entropy:
+                        logger.debug("Filtered with entropy %f '%s'", entropy, key_data)
+                        return []
+                    # OPENSSH format has multiple AAAAA pattern
+                    if "OPENSSH" not in target.line and cls.pem_pattern_check.equal_pattern_check(key_data):
+                        logger.debug("Filtered with ValuePemPatternCheck %s", target)
+                        return []
+                    # all OK - return line data with all lines which include PEM
+                    return line_data
+                else:
+                    sanitized_line = cls.sanitize_line(subline)
+                    # PEM key line should not contain spaces or . (and especially not ...)
+                    if ' ' in sanitized_line or "..." in sanitized_line:
+                        return []
+                    key_data += sanitized_line
+        return []
 
     @classmethod
-    def strip_lines(cls, lines: List[str]) -> List[str]:
+    def sanitize_line(cls, line: str, recurse_level: int = 5) -> str:
         """Remove common symbols that can surround PEM keys inside code.
 
         Examples::
@@ -90,22 +126,45 @@ def strip_lines(cls, lines: List[str]) -> List[str]:
             `  "ZZAWarrA1\\n" + `
 
         Args:
-            lines: Lines to be striped
+            line: Line to be cleaned
+            recurse_level: recursion limit to avoid an infinite loop when a removed symbol is inside the encoded data
 
         Return:
-            lines with special characters removed from both ends
+            line with special characters removed from both ends
 
         """
+        recurse_level -= 1
+
+        if 0 > recurse_level:
+            return line
+
         # Note that this strip would remove `\n` but not `\\n`
-        stripped_lines = [line.strip(cls.remove_characters) for line in lines]
-        # If line still ends with "\n" - remove last 2 characters and strip again (case of `\\n` in the line)
-        stripped_lines = [
-            line[:-2].strip(cls.remove_characters) if line.endswith("\\n") else line for line in stripped_lines
-        ]
-        return stripped_lines
+        line = line.strip(string.whitespace)
+        if line.startswith("// "):
+            # assume a comment marker is followed by a space; otherwise "//" may be a part of the PEM data.
+            line = line[3:]
+        if line.startswith("/*"):
+            line = line[2:]
+        if line.endswith("*/"):
+            line = line[:-2]
+        if '"' in line or "'" in line:
+            # remove concatenation only when quotes present
+            line = line.strip(cls.remove_characters_plus)
+        else:
+            line = line.strip(cls.remove_characters)
+        # check whether another iteration is required
+        for x in string.whitespace:
+            if line.startswith(x) or line.endswith(x):
+                return cls.sanitize_line(line, recurse_level)
+
+        for x in cls.wrap_characters:
+            if x in line:
+                return cls.sanitize_line(line, recurse_level)
+
+        return line
 
     @classmethod
-    def remove_leading_config_lines(cls, lines: List[str]) -> List[str]:
+    def is_leading_config_line(cls, line: str) -> bool:
         """Remove non-key lines from the beginning of a list.
 
         Example lines with non-key leading lines:
@@ -118,23 +177,15 @@ def remove_leading_config_lines(cls, lines: List[str]) -> List[str]:
             ZZAWarrA1...
 
         Args:
-            lines: Lines to be checked
+            line: Line to be checked
 
         Return:
-            List of strings without leading non-key lines
+            True if the line is not a part of encoded data but leading config
 
         """
-        leading_lines = 0
-
-        for line in lines:
-            if len(line) == 0:
-                leading_lines += 1
-            else:
-                for ignore_string in cls.ignore_starts:
-                    if line.startswith(ignore_string):
-                        leading_lines += 1
-                        break
-                if not leading_lines:
-                    break
-
-        return lines[leading_lines:]
+        if 0 == len(line):
+            return True
+        for ignore_string in cls.ignore_starts:
+            if ignore_string in line:
+                return True
+        return False
@@ -88,9 +88,9 @@ def get_shannon_entropy(data: str, iterator: str) -> float:
             return 0
 
         entropy = 0.
-        data_len = len(data)
+        data_len = float(len(data))
         for x in iterator:
-            p_x = float(data.count(x)) / data_len
+            p_x = data.count(x) / data_len
             if p_x > 0:
                 entropy += -p_x * math.log(p_x, 2)
 

@@ -4,14 +4,14 @@
 SAMPLES_FILES_COUNT: int = 106
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 101
-SAMPLES_CRED_LINE_COUNT: int = 105
+SAMPLES_CRED_COUNT: int = 102
+SAMPLES_CRED_LINE_COUNT: int = 113
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 95
+SAMPLES_POST_CRED_COUNT: int = 96
 
 # with option --doc
-SAMPLES_IN_DOC = 72
+SAMPLES_IN_DOC = 73
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 17