Skip to content

Commit

Permalink
Merge branch 'main' into babenek-patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek authored Sep 19, 2024
2 parents f50b038 + 80da63d commit 544e578
Show file tree
Hide file tree
Showing 102 changed files with 8,369 additions and 6,081 deletions.
138 changes: 71 additions & 67 deletions .ci/benchmark.txt

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,8 @@ jobs:
- name: Check ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_config.json | grep 2b29c5e1aa199d14b788652bd542c7c0
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 88f37978fc0599ac8d1bf732ad40c077
md5sum --binary credsweeper/ml_model/ml_config.json | grep 49c4352ae9ec82ad432d49d7e51c27f1
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep ff66e97c446d0f2bbd8d37b7dfff7361
# # # line ending

Expand Down
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,10 +260,8 @@ Name | E-Mail
-- | --
[Jaeku Yun](https://github.com/silentearth) | [email protected]
[Shinhyung Choi](https://github.com/csh519) | [email protected]
[Yujeong Lee](https://github.com/yuzzyuzz) | [email protected]
[Oleksandra Sokol](https://github.com/meanrin) | [email protected]
[Dmytro Kuzmenko](https://github.com/Dmitriy-NK) | [email protected]
[Arkadiy Melkonyan](https://github.com/ARKAD97) | [email protected]
[Roman Babenko](https://github.com/babenek) | [email protected]
[Yuliia Tatarinova](https://github.com/Yullia) | [email protected]

## How to Contact

Expand Down
31 changes: 3 additions & 28 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,8 @@
import re
import typing
from enum import Enum
from typing import Optional, Union


class KeywordPattern:
    """Regex fragments which are glued around a keyword to build a search pattern.

    The final expression is ``key_left + keyword + key_right + separator + value``
    and exposes the named groups ``variable``, ``keyword``, ``separator``,
    ``wrap``, ``value_leftquote``, ``value`` and ``value_rightquote``.
    """

    # opens <variable> and <keyword>; the keyword text is inserted right after this fragment
    key_left = (
        r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)"
        r"(?P<keyword>"
    )
    # closes <keyword>, consumes the remainder of the variable name and closes <variable>
    key_right = (
        r")"
        r"[^:='\"`<>{?!&]*)[`'\"]*)"
    )
    # assignment-like separator followed by an optional <wrap> (call or bracket before the value)
    separator = (
        r"\s*\]?\s*"
        r"(?P<separator>:( [a-z]{3,9}[?]? )?="
        r"|:|=>|!=|===|==|=)"
        r"\s*(?P<wrap>((new\s*)?\w|\.|->|\(|\[)*[\[\(\{](\w{1,32}=)?\s*)?"
    )
    # optional opening quote, optional authentication scheme word
    # ( oauth | bot | basic | bearer | apikey | accesskey ), then the value and its closing part
    value = (
        r"(?P<value_leftquote>((b|r|br|rb|u|f|rf|fr|\\{0,8})?[`'\"]){1,4})?"
        r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
        r"(?P<value>"
        r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){1,8000}"
        r"|(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)"
        r")"
        r"(?(value_leftquote)(?P<value_rightquote>(\\{0,8}[`'\"]){1,4})?|(?(wrap)[\]\)\},;]))"
    )

    @classmethod
    def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
        """Build the case-insensitive pattern for the given keyword.

        Args:
            keyword: regex text placed verbatim into the ``keyword`` group

        Return:
            compiled regular expression
        """
        fragments = (cls.key_left, keyword, cls.key_right, cls.separator, cls.value)
        return re.compile("".join(fragments), flags=re.IGNORECASE)


class Severity(Enum):
"""Severity of candidate"""
CRITICAL = "critical"
Expand Down Expand Up @@ -89,6 +61,9 @@ def get(confidence: Union[str, "Confidence"]) -> Optional["Confidence"]:

class Base(Enum):
"""Stores types of character sets in lower case"""
digits = "digits"
ascii_uppercase = "ascii_uppercase"
ascii_lowercase = "ascii_lowercase"
base16upper = "base16upper"
base16lower = "base16lower"
base32 = "base32"
Expand Down
18 changes: 18 additions & 0 deletions credsweeper/common/keyword_checklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,21 @@ def morpheme_set(self) -> Set[str]:
def morpheme_len(self) -> int:
"""Length of morpheme_set"""
return len(self.__morpheme_set)

def check_morphemes(self, line_lower: str, threshold: int) -> bool:
    """Tell whether the line contains more known morphemes than allowed.

    Args:
        line_lower: input line - MUST already be lower-cased
        threshold: number of morphemes that is still tolerated

    Return:
        True - as soon as the number of morphemes found in the line exceeds the threshold
    """
    found = 0
    for morpheme in self.morpheme_set:
        if morpheme not in line_lower:
            continue
        found += 1
        # stop early - the remaining morphemes cannot change the answer
        if found > threshold:
            return True
    return False
58 changes: 58 additions & 0 deletions credsweeper/common/keyword_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re


class KeywordPattern:
    """Regex fragments which are glued around a keyword to build a search pattern.

    The fragments are concatenated in the order used by ``get_keyword_pattern``.
    Note: ``string_prefix`` opens a group which is closed inside ``left_quote``,
    so these two fragments are only valid when used together.
    """

    # optional literal \n, \r or \t sequence, then opens <variable> and <keyword>;
    # the keyword text is inserted right after this fragment
    key_left = (
        r"(\\[nrt])?"
        r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)"
        r"(?P<keyword>"
    )
    # closes <keyword>, consumes the remainder of the variable name and closes <variable>
    key_right = (
        r")"
        r"[^:='\"`<>{?!&]*)[`'\"]*)"
    )
    # assignment-like separator surrounded by optional whitespace or escaped \t \n \r
    separator = (
        r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*"
        r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=)"
        r"(\s|\\+[tnr])*"
    )
    # might be curly, square or parenthesis with words before
    wrap = (
        r"(?P<wrap>("
        r"(new(\s|\\+[tnr])+)?"
        r"([0-9a-z_.]|-(>|(&|\\\\*u0026)gt;))*"
        r"[\[\(\{]"
        r"(\s|\\+[tnr])*"
        r"([0-9a-z_]{1,32}=)?"
        r")+)?"
    )
    # optional string-literal prefix which must be followed by a (possibly escaped) quote
    string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
    # opening quote(s); <esq> keeps the backslashes which escape the quote
    left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?[`'\"]){1,4}))?"
    # Authentication scheme ( oauth | bot | basic | bearer | apikey | accesskey ) precedes the credential
    auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
    # quoted value runs until the opening quote sequence appears again;
    # unquoted value stops at whitespace, quotes, comma or semicolon
    value = (
        r"(?P<value>"
        r"(?(value_leftquote)"
        r"("
        r"(?!(?P=value_leftquote))"
        r"(?(esq)((?!(?P=esq)['`\"]).)|((?!(?P=value_leftquote)).)))"
        r"|"
        r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])"
        r"){3,8000}"
        r"|(\{[^}]{3,8000}\})"
        r"|(<[^>]{3,8000}>)"
        r")"
    )
    # closing part: the same quote sequence, a trailing backslash or end of text for quoted values,
    # otherwise a closing bracket or delimiter when <wrap> has matched
    right_quote = (
        r"(?(value_leftquote)"
        r"(?P<value_rightquote>(?<!\\)(?P=value_leftquote)|\\$|(?<=[0-9a-z+_/-])$)"
        r"|"
        r"(?(wrap)[\]\)\},;]))"
    )

    @classmethod
    def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
        """Build the pattern for the given keyword.

        Args:
            keyword: regex text placed verbatim into the ``keyword`` group

        Return:
            regular expression compiled with IGNORECASE and DOTALL flags
        """
        fragments = (
            cls.key_left,
            keyword,
            cls.key_right,
            cls.separator,
            cls.wrap,
            cls.string_prefix,
            cls.left_quote,
            cls.auth_keywords,
            cls.value,
            cls.right_quote,
        )
        return re.compile("".join(fragments), flags=re.IGNORECASE | re.DOTALL)
27 changes: 27 additions & 0 deletions credsweeper/common/morpheme_checklist.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
../
.com
.org
/bin
/dev
/etc
/lib
/mnt
/opt
/sbin
/srv
/tmp
/usr
/var
000
111
222
Expand Down Expand Up @@ -206,6 +220,7 @@ best
bias
big
bill
bin/
binar
bind
bio
Expand Down Expand Up @@ -373,6 +388,7 @@ course
court
cove
cpu_
crac
creat
cred
cript
Expand Down Expand Up @@ -428,6 +444,7 @@ dest
detach
detai
detect
dev/
dev_
develop
device
Expand Down Expand Up @@ -529,6 +546,7 @@ esam
esses
estima
esult
etc/
eth_
etic
eting
Expand Down Expand Up @@ -694,6 +712,7 @@ hybrid
iabl
ical
icon
id_rsa
iden
idle
ieee
Expand Down Expand Up @@ -808,6 +827,7 @@ lexeme
lexic
lianc
liant
lib/
library
licens
lies
Expand Down Expand Up @@ -893,6 +913,7 @@ mit
mix
mmon
mmun
mnt/
mobile
mock
mode
Expand Down Expand Up @@ -968,6 +989,7 @@ one
onfig
only
open
opt/
opted
opti
oracle
Expand Down Expand Up @@ -1307,6 +1329,8 @@ spot
spray
sql
src_
srv/
ssh
ssl
stack
stan
Expand Down Expand Up @@ -1400,6 +1424,7 @@ tio
tish
title
titud
tmp/
to_
tod
toke
Expand Down Expand Up @@ -1461,11 +1486,13 @@ url
usb
use
usin
usr/
uster
util
val_
valid
valu
var/
vari
vault
vect
Expand Down
51 changes: 37 additions & 14 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,17 @@ def clean_bash_parameters(self) -> None:
self.value = value_whsp[0]

def clean_toml_parameters(self) -> None:
    """Strip unbalanced closing brackets from the end of the value.

    Parenthesis, curly and square brackets may be caught in TOML format and bash.
    A trailing closing bracket is removed from ``self.value`` only when:
      * the value itself does not contain the matching opening bracket, and
      * the part of the line before the value has more opening than closing
        brackets of that kind.
    The check is repeated until nothing more can be removed.
    """
    cleaning_required = bool(self.value) and self.value[-1] in ('}', ']', ')')
    # a missing or zero value_start means there is no text before the value to inspect
    line_before_value = self.line[:self.value_start] if self.value_start and 0 <= self.value_start else ""
    while cleaning_required:
        cleaning_required = False
        for left, right in (('{', '}'), ('[', ']'), ('(', ')')):
            if self.value.endswith(right) and left not in self.value \
                    and line_before_value.count(left) > line_before_value.count(right):
                # a full bracket matching is not reasonable to implement
                # because the opening character may be in another line
                self.value = self.value[:-1]
                cleaning_required = True

def sanitize_variable(self) -> None:
"""Remove trailing spaces, dashes and quotations around the variable. Correct position."""
Expand Down Expand Up @@ -227,31 +235,45 @@ def is_comment(self) -> bool:

@cached_property
def is_well_quoted_value(self) -> bool:
"""Well quoted value - means the quotations must be equal"""
"""Well quoted value - means the value has been quoted or has line wrap"""
result = False
if self.value_leftquote and self.value_rightquote:
if self.value_leftquote == self.value_rightquote:
# regex caught well
return True

if 1 == len(self.value_leftquote):
leftquote = self.value_leftquote
else:
for q in self.quotation_marks:
if q in self.value_leftquote:
leftquote = q
break
else:
# right side symbol should be a quote
leftquote = self.value_leftquote[-1]
if leftquote not in self.quotation_marks:
leftquote = ""

if 1 == len(self.value_rightquote):
rightquote = self.value_rightquote
else:
for q in self.quotation_marks:
if q in self.value_rightquote:
# clean \ sign in escaping text
for q in self.value_rightquote:
if q in self.quotation_marks:
rightquote = q
break
else:
rightquote = ""

return bool(leftquote) and bool(rightquote) and leftquote == rightquote
result = bool(leftquote) and ( #
bool(rightquote) and (leftquote == rightquote) # normal case
or '\\' == self.value_rightquote and '\\' == self.line[-1] # line wrap
)

return False
elif self.value_leftquote:
result = ( #
('\\' == self.value_rightquote or '\\' == self.value[-1]) and '\\' == self.line[-1] # line wrap
or '.php' == self.file_type # php may use multiline string
or 3 == self.value_leftquote.count('"') or 3 == self.value_leftquote.count("'") # python multiline
)

return result

@cached_property
def is_quoted(self) -> bool:
Expand All @@ -273,7 +295,8 @@ def is_quoted(self) -> bool:
if i in ('"', "'", '`'):
right_quote = i
break
return bool(left_quote) and bool(right_quote) and left_quote == right_quote
result = bool(left_quote) and bool(right_quote) and left_quote == right_quote
return result

def is_source_file(self) -> bool:
"""Check if file with credential is a source code file or not (data, log, plain text).
Expand Down
2 changes: 0 additions & 2 deletions credsweeper/filters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,3 @@
from credsweeper.filters.value_token_base36_check import ValueTokenBase36Check
from credsweeper.filters.value_token_base64_check import ValueTokenBase64Check
from credsweeper.filters.value_token_check import ValueTokenCheck
from credsweeper.filters.value_useless_word_check import ValueUselessWordCheck
from credsweeper.filters.variable_not_allowed_pattern_check import VariableNotAllowedPatternCheck
4 changes: 2 additions & 2 deletions credsweeper/filters/group/general_keyword.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from credsweeper.common.constants import GroupType
from credsweeper.config import Config
from credsweeper.filters import ValueDictionaryKeywordCheck, ValueUselessWordCheck
from credsweeper.filters import ValueDictionaryKeywordCheck
from credsweeper.filters.group import Group


Expand All @@ -9,4 +9,4 @@ class GeneralKeyword(Group):

def __init__(self, config: Config) -> None:
    """Create the keyword group and register its value filter.

    Args:
        config: configuration passed through to the base ``Group`` initializer
    """
    super().__init__(config, GroupType.KEYWORD)
    # register the dictionary keyword check exactly once
    self.filters.extend([ValueDictionaryKeywordCheck()])
2 changes: 0 additions & 2 deletions credsweeper/filters/group/general_pattern.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from credsweeper.common.constants import GroupType
from credsweeper.config import Config
from credsweeper.filters import ValueUselessWordCheck
from credsweeper.filters.group import Group


Expand All @@ -9,4 +8,3 @@ class GeneralPattern(Group):

def __init__(self, config: Config) -> None:
    """Create the pattern group and register its value filter.

    Args:
        config: configuration passed through to the base ``Group`` initializer
    """
    super().__init__(config, GroupType.PATTERN)
    extra_filters = [ValueUselessWordCheck()]
    self.filters.extend(extra_filters)
Loading

0 comments on commit 544e578

Please sign in to comment.