Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PEM scanner refactoring #373

Merged
merged 2 commits into from
Jul 4, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cicd/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Detected Credentials: 4675
result_cnt : 4164, lost_cnt : 96, true_cnt : 3703, false_cnt : 365
credsweeper -> TP : 3703, FP : 365, TN : 19429483, FN : 907, FPR : 0.0000187855, FNR : 0.1967462039, ACC : 0.9999345492, PRC : 0.9102753196, RCL : 0.8032537961, F1 : 0.8534224476
credsweeper Private Key -> TP : 952, FP : 0, TN : 4, FN : 49, FPR : None, FNR : 0.0489510490, ACC : 0.9512437811, PRC : 1.0000000000, RCL : 0.9510489510, F1 : 0.9749103943
Detected Credentials: 4693
result_cnt : 4182, lost_cnt : 96, true_cnt : 3718, false_cnt : 368
credsweeper -> TP : 3718, FP : 368, TN : 19429480, FN : 892, FPR : 0.0000189399, FNR : 0.1934924078, ACC : 0.9999351667, PRC : 0.9099363681, RCL : 0.8065075922, F1 : 0.8551057958
credsweeper Private Key -> TP : 967, FP : 0, TN : 4, FN : 34, FPR : None, FNR : 0.0339660340, ACC : 0.9661691542, PRC : 1.0000000000, RCL : 0.9660339660, F1 : 0.9827235772
credsweeper Predefined Pattern -> TP : 309, FP : 2, TN : 40, FN : 17, FPR : 0.0476190476, FNR : 0.0521472393, ACC : 0.9483695652, PRC : 0.9935691318, RCL : 0.9478527607, F1 : 0.9701726845
credsweeper Password -> TP : 974, FP : 116, TN : 4164, FN : 422, FPR : 0.0271028037, FNR : 0.3022922636, ACC : 0.9052149401, PRC : 0.8935779817, RCL : 0.6977077364, F1 : 0.7835880933
credsweeper Generic Token -> TP : 284, FP : 6, TN : 597, FN : 49, FPR : 0.0099502488, FNR : 0.1471471471, ACC : 0.9412393162, PRC : 0.9793103448, RCL : 0.8528528529, F1 : 0.9117174960
Expand Down
1 change: 1 addition & 0 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,4 @@ class DiffRowType(Enum):

# PEM x509 patterns
PEM_BEGIN_PATTERN = "-----BEGIN"
PEM_END_PATTERN = "-----END"
9 changes: 6 additions & 3 deletions credsweeper/common/morpheme_checklist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,7 @@ diod
dir_
direct
disab
discipl
discon
disc
disk
dismi
dispos
Expand Down Expand Up @@ -952,6 +951,7 @@ obj
oblique
occur
ocean
ocess
oder
off
often
Expand Down Expand Up @@ -1052,7 +1052,7 @@ priv
pro_
probe
problem
process
proc
prod
prof
prog
Expand Down Expand Up @@ -1204,9 +1204,12 @@ scali
scen
sched
schem
scipl
scont
scope
scram
screen
scret
scri
scro
seal
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@
severity: high
type: pem_key
values:
- (?P<value>-----BEGIN\s(?!ENCRYPTED|EC)[^-]*PRIVATE[^-]*KEY[^-]*-----)
- (?P<value>-----BEGIN\s(?!ENCRYPTED|EC)[^-]*PRIVATE[^-]*KEY[^-]*-----(.+-----END[^-]+-----)?)
filter_type:
- LineSpecificKeyCheck
min_line_len: 27
Expand Down
171 changes: 111 additions & 60 deletions credsweeper/scanner/scan_type/pem_key_pattern.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from typing import List, Optional
import logging
import re
import string
from typing import Optional, List

from credsweeper.common.constants import Chars, PEM_BEGIN_PATTERN, PEM_END_PATTERN
from credsweeper.config import Config
from credsweeper.credentials import Candidate
from credsweeper.credentials import Candidate, LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import ValuePatternCheck
from credsweeper.filters import ValuePatternCheck, ValuePemPatternCheck
from credsweeper.rules import Rule
from credsweeper.scanner.scan_type import ScanType
from credsweeper.utils import Util

PEM_END_PATTERN = "-----END"
logger = logging.getLogger(__name__)


class PemKeyPattern(ScanType):
Expand All @@ -21,12 +25,18 @@ class PemKeyPattern(ScanType):

"""

ignore_starts = ["Proc-Type", "Version", "DEK-Info"]
remove_characters = " '\";,[]\n\r\t\\+#*"
ignore_starts = [PEM_BEGIN_PATTERN, "Proc-Type", "Version", "DEK-Info"]
wrap_characters = "\\'\";,[]#*"
remove_characters = string.whitespace + wrap_characters
remove_characters_plus = remove_characters + '+'
pem_pattern_check: Optional[ValuePatternCheck] = None
# the last line contains at least 4 symbols
re_value_pem = re.compile(r"(?P<value>([^-]*" + PEM_END_PATTERN +
r"[^-]+-----)|(([a-zA-Z0-9/+=]{64}.*)?[a-zA-Z0-9/+=]{4})+)")

@classmethod
def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> Optional[Candidate]:
"""Check if current line is a start of a PEM key.
"""Check if target is a PEM key

Args:
config: user configs
Expand All @@ -40,47 +50,73 @@ def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> Optional[Can
"""
assert rule.pattern_type == rule.PEM_KEY_PATTERN, \
"Rules provided to PemKeyPattern.run should have pattern_type equal to PEM_KEY_PATTERN"

if cls.is_pem_key(target.lines[target.line_num:], config):
return cls._get_candidate(config, rule, target)
if not cls.pem_pattern_check:
cls.pem_pattern_check = ValuePemPatternCheck(config)
if candidate := cls._get_candidate(config, rule, target):
if pem_lines := cls.detect_pem_key(config, rule, target):
candidate.line_data_list = pem_lines
return candidate

return None

@classmethod
def is_pem_key(cls, lines: List[str], config: Config) -> bool:
"""Check if provided lines is a PEM key.
def detect_pem_key(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[LineData]:
"""Detects PEM key in single line and with iterative for next lines according
https://www.rfc-editor.org/rfc/rfc7468

Args:
lines: Lines to be checked
config: Config
rule: Rule
target: Analysis target

Return:
Boolean. True if PEM key, False otherwise
List of LineData with found PEM

"""
lines = cls.strip_lines(lines)
lines = cls.remove_leading_config_lines(lines)
line_data: List[LineData] = []
key_data = ""
for line_num, line in enumerate(lines):
if line_num >= 190:
return False
if PEM_END_PATTERN in line:
# Check if entropy is high enough
removed_by_entropy = not Util.is_entropy_validate(key_data)
# Check if have no substring with 5 same consecutive characters (like 'AAAAA')
pattern_check = ValuePatternCheck(config)
removed_by_filter = pattern_check.equal_pattern_check(key_data)
not_removed = not (removed_by_entropy or removed_by_filter)
return not_removed
# PEM key line should not contain spaces or . (and especially not ...)
elif " " in line or "..." in line:
return False
else:
key_data += line

return False # Return false if no `-END` section in lines
# get line with -----BEGIN which may contain full key
first_line = LineData(config, target.line, target.line_num, target.file_path, target.file_type, target.info,
rule.patterns[0])
line_data.append(first_line)
# guard for the case when the target line number is 0: fall back to 1
line_num = target.line_num if 0 < target.line_num else 1
finish_line = line_num + 200
for line in target.lines[line_num - 1:]:
if finish_line < line_num:
return []
if 1 != line_num and target.line_num != line_num:
_line = LineData(config, line, line_num, target.file_path, target.file_type, target.info,
cls.re_value_pem)
line_data.append(_line)
line_num += 1
# replace escaped line endings with real ones and process them - PEM does not contain the '\' sign
sublines = line.replace("\\r", '\n').replace("\\n", '\n').splitlines()
for subline in sublines:
if cls.is_leading_config_line(subline):
continue
elif PEM_END_PATTERN in subline:
# Check if entropy is high enough for base64 set with padding sign
entropy = Util.get_shannon_entropy(key_data, Chars.BASE64_CHARS.value)
if 4.85 > entropy:
csh519 marked this conversation as resolved.
Show resolved Hide resolved
babenek marked this conversation as resolved.
Show resolved Hide resolved
logger.debug("Filtered with entropy %f '%s'", entropy, key_data)
return []
# the OPENSSH format contains multiple AAAAA patterns
if "OPENSSH" not in target.line and cls.pem_pattern_check.equal_pattern_check(key_data):
logger.debug("Filtered with ValuePemPatternCheck %s", target)
return []
# all OK - return line data with all lines which include PEM
return line_data
else:
sanitized_line = cls.sanitize_line(subline)
# PEM key line should not contain spaces or . (and especially not ...)
if ' ' in sanitized_line or "..." in sanitized_line:
return []
key_data += sanitized_line
return []

@classmethod
def strip_lines(cls, lines: List[str]) -> List[str]:
def sanitize_line(cls, line: str, recurse_level: int = 5) -> str:
"""Remove common symbols that can surround PEM keys inside code.

Examples::
Expand All @@ -90,22 +126,45 @@ def strip_lines(cls, lines: List[str]) -> List[str]:
` "ZZAWarrA1\\n" + `

Args:
lines: Lines to be striped
line: Line to be cleaned
recurse_level: remaining recursion depth; prevents an infinite loop when a removable symbol is inside the base64-encoded data

Return:
lines with special characters removed from both ends
line with special characters removed from both ends

"""
recurse_level -= 1

if 0 > recurse_level:
return line

# Note that this strip would remove `\n` but not `\\n`
stripped_lines = [line.strip(cls.remove_characters) for line in lines]
# If line still ends with "\n" - remove last 2 characters and strip again (case of `\\n` in the line)
stripped_lines = [
line[:-2].strip(cls.remove_characters) if line.endswith("\\n") else line for line in stripped_lines
]
return stripped_lines
line = line.strip(string.whitespace)
if line.startswith("// "):
# assume the comment marker is separated from the base64 data by a space; otherwise "//" may be a part of the PEM data.
line = line[3:]
if line.startswith("/*"):
line = line[2:]
if line.endswith("*/"):
line = line[:-2]
if '"' in line or "'" in line:
# remove concatenation only when quotes present
line = line.strip(cls.remove_characters_plus)
else:
line = line.strip(cls.remove_characters)
# check whether a new iteration is required
for x in string.whitespace:
if line.startswith(x) or line.endswith(x):
return cls.sanitize_line(line, recurse_level)

for x in cls.wrap_characters:
if x in line:
return cls.sanitize_line(line, recurse_level)

return line

@classmethod
def remove_leading_config_lines(cls, lines: List[str]) -> List[str]:
def is_leading_config_line(cls, line: str) -> bool:
"""Remove non-key lines from the beginning of a list.

Example lines with non-key leading lines:
Expand All @@ -118,23 +177,15 @@ def remove_leading_config_lines(cls, lines: List[str]) -> List[str]:
ZZAWarrA1...

Args:
lines: Lines to be checked
line: Line to be checked

Return:
List of strings without leading non-key lines
True if the line is not a part of encoded data but leading config

"""
leading_lines = 0

for line in lines:
if len(line) == 0:
leading_lines += 1
else:
for ignore_string in cls.ignore_starts:
if line.startswith(ignore_string):
leading_lines += 1
break
if not leading_lines:
break

return lines[leading_lines:]
if 0 == len(line):
return True
for ignore_string in cls.ignore_starts:
if ignore_string in line:
return True
return False
4 changes: 2 additions & 2 deletions credsweeper/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ def get_shannon_entropy(data: str, iterator: str) -> float:
return 0

entropy = 0.
data_len = len(data)
data_len = float(len(data))
for x in iterator:
p_x = float(data.count(x)) / data_len
p_x = data.count(x) / data_len
if p_x > 0:
entropy += -p_x * math.log(p_x, 2)

Expand Down
8 changes: 4 additions & 4 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
SAMPLES_FILES_COUNT: int = 106

# credentials count after scan
SAMPLES_CRED_COUNT: int = 101
SAMPLES_CRED_LINE_COUNT: int = 105
SAMPLES_CRED_COUNT: int = 102
SAMPLES_CRED_LINE_COUNT: int = 113

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 95
SAMPLES_POST_CRED_COUNT: int = 96

# with option --doc
SAMPLES_IN_DOC = 72
SAMPLES_IN_DOC = 73

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 17
Expand Down
Loading