Commit

[skip actions] [cmdpass] 2024-08-31T12:52:24+03:00
babenek committed Aug 31, 2024
1 parent c3f9b8b commit 2d39f03
Showing 13 changed files with 460 additions and 302 deletions.
47 changes: 47 additions & 0 deletions credsweeper/ml_model/char_set.py
@@ -0,0 +1,47 @@
"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
import contextlib
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Any, Dict, Tuple, Set

import numpy as np

from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
from credsweeper.credentials import Candidate
from credsweeper.ml_model.features import Feature
from credsweeper.utils import Util



class CharSet(Feature):
"""Feature is true when all characters of the value are from a set."""

# Constant dictionary to get characters set via name
CHARS: Dict[Base, str] = { #
Base.base16upper: Chars.BASE16UPPER.value, #
Base.base16lower: Chars.BASE16LOWER.value, #
Base.base32: Chars.BASE32_CHARS.value, #
Base.base36: Chars.BASE36_CHARS.value, #
Base.base64std: Chars.BASE64STD_CHARS.value + '=', #
Base.base64url: Chars.BASE64URL_CHARS.value + '=', #
}

def __init__(self, base: str) -> None:
"""CharSet class initializer.
Args:
base: base set ID
"""
super().__init__()
self.base: Base = getattr(Base, base)

def extract(self, candidate: Candidate) -> bool:
with contextlib.suppress(Exception):
for i in candidate.line_data_list[0].value:
if i not in self.CHARS[self.base]:
break
else:
return True
return False
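
A minimal, self-contained sketch of the check CharSet.extract performs; the alphabet below is an illustrative stand-in built from string constants, not the project's Chars enum:

import string

# Stand-in alphabet for illustration only; the real sets come from the Chars enum.
BASE64URL_LIKE = string.ascii_uppercase + string.ascii_lowercase + string.digits + "-_="

def all_chars_from_set(value: str, alphabet: str = BASE64URL_LIKE) -> bool:
    # Same idea as the for/else loop above: True when every character of the
    # value belongs to the alphabet (vacuously True for an empty string).
    return all(ch in alphabet for ch in value)

print(all_chars_from_set("c2VjcmV0X3Rva2Vu"))  # True: base64url characters only
print(all_chars_from_set("hunter2!"))          # False: '!' is outside the alphabet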

308 changes: 7 additions & 301 deletions credsweeper/ml_model/features.py
@@ -6,7 +6,6 @@

import numpy as np

from credsweeper.app import logger
from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
from credsweeper.credentials import Candidate
from credsweeper.utils import Util
@@ -51,144 +50,7 @@ def any_word_in_(self, a_string: str) -> bool:



class WordIn(Feature):
"""Abstract feature returns array with all matched words in a string"""

def __init__(self, words: List[str]):
super().__init__()
self.dimension = len(words)
self.words = sorted(list(set(words)))
self.enumerated_words = list(enumerate(self.words))
if len(self.enumerated_words) != self.dimension:
raise RuntimeError(f"Check duplicates: {words}")

@property
def enumerated_words(self) -> List[Tuple[int,str]]:
"""getter for speedup"""
return self.__enumerated_words

@enumerated_words.setter
def enumerated_words(self, enumerated_words: List[Tuple[int,str]]) -> None:
"""setter for speedup"""
self.__enumerated_words = enumerated_words

@property
def dimension(self) -> int:
"""getter"""
return self.__dimension

@dimension.setter
def dimension(self, dimension: int) -> None:
"""setter"""
self.__dimension = dimension

@abstractmethod
def extract(self, candidate: Candidate) -> Any:
raise NotImplementedError

def word_in_str(self, a_string: str) -> np.ndarray:
"""Returns array with words included in a string"""
result = np.zeros(shape=[self.dimension], dtype=np.int8)
for i, word in self.enumerated_words:
if word in a_string:
result[i] = 1
return np.array([result])

def word_in_set(self, a_strings_set: Set[str]) -> np.ndarray:
"""Returns array with words matches in a_strings_set"""
result = np.zeros(shape=[self.dimension], dtype=np.int8)
for i, word in self.enumerated_words:
if word in a_strings_set:
result[i] = 1
return np.array([result])
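
A short, self-contained sketch of the bit-vector encoding produced by word_in_str and word_in_set; the keyword list here is an illustrative stand-in for the configured word lists:

from typing import Set

import numpy as np

# Illustrative keywords; the real lists come from the model configuration.
words = sorted(set(["key", "token", "secret"]))
enumerated_words = list(enumerate(words))  # [(0, 'key'), (1, 'secret'), (2, 'token')]

def word_in_str(a_string: str) -> np.ndarray:
    """1 where the word occurs as a substring of the given string."""
    result = np.zeros(shape=[len(words)], dtype=np.int8)
    for i, word in enumerated_words:
        if word in a_string:
            result[i] = 1
    return np.array([result])

def word_in_set(strings: Set[str]) -> np.ndarray:
    """1 where the word is an exact member of the given set."""
    result = np.zeros(shape=[len(words)], dtype=np.int8)
    for i, word in enumerated_words:
        if word in strings:
            result[i] = 1
    return np.array([result])

print(word_in_str("aws_secret_access_key = '...'"))  # [[1 1 0]] - 'key' and 'secret' matched
print(word_in_set({"token", "py"}))                  # [[0 0 1]] - only the exact member 'token'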



class WordInVariable(WordIn):
"""Feature returns array of words matching in variable"""

def __init__(self, words: List[str]) -> None:
"""Feature is true if candidate value contains at least one predefined word.
Args:
words: list of predefined words - MUST BE IN LOWER CASE
"""
super().__init__(words)

def extract(self, candidate: Candidate) -> np.ndarray:
"""Returns array of matching words for first line"""
if candidate.line_data_list[0].variable:
return self.word_in_str(candidate.line_data_list[0].variable.lower())
else:
return np.zeros(shape=[self.dimension], dtype=np.int8)


class WordInSecret(WordIn):
"""Feature returns true if candidate value contains at least one word from predefined list."""

def __init__(self, words: List[str]) -> None:
"""Feature is true if candidate value contains at least one predefined word.
Args:
words: list of predefined words - MUST BE IN LOWER CASE
"""
super().__init__(words)

def extract(self, candidate: Candidate) -> np.ndarray:
"""Returns array of matching words for first line"""
value = candidate.line_data_list[0].value
if value:
return self.word_in_str(value.lower())
else:
return np.zeros(shape=[self.dimension], dtype=np.int8)


class WordInLine(WordIn):
"""Feature is true if line contains at least one word from predefined list."""

def __init__(self, words: List[str]) -> None:
"""Feature returns array of matching words
Args:
words: list of predefined words - MUST BE IN LOWER CASE
"""
super().__init__(words)

def extract(self, candidate: Candidate) -> np.ndarray:
"""Returns true if any words in first line"""
subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
if subtext:
return self.word_in_str(subtext.lower())
else:
return np.zeros(shape=[self.dimension], dtype=np.int8)



class WordInPath(WordIn):
"""Categorical feature that corresponds to words in path (POSIX, lowercase)"""

def __init__(self, words: List[str]) -> None:
"""WordInPath constructor
Args:
words: list of predefined words - MUST BE IN LOWER CASE & POSIX
"""
super().__init__(words)

def __call__(self, candidates: List[Candidate]) -> np.ndarray:
# there must be only one path because the candidates were grouped beforehand
candidate_path = Path(candidates[0].line_data_list[0].path).as_posix().lower()
if candidate_path:
return self.word_in_str(candidate_path.lower())
else:
return np.zeros(shape=[self.dimension], dtype=np.int8)

def extract(self, candidate: Candidate) -> Any:
raise NotImplementedError
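
For reference, the POSIX-plus-lowercase normalization applied to the path can be illustrated with a hypothetical Windows-style path; PureWindowsPath is used only to keep the example platform-independent:

from pathlib import PureWindowsPath

# Hypothetical path, for illustration only.
raw_path = r"C:\Users\dev\Projects\Config.YAML"
print(PureWindowsPath(raw_path).as_posix().lower())  # c:/users/dev/projects/config.yaml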


class HasHtmlTag(Feature):
@@ -203,12 +65,15 @@ def __init__(self) -> None:
def extract(self, candidate: Candidate) -> bool:
subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
candidate_line_data_list_0_line_lower = subtext.lower()
if '<' not in candidate_line_data_list_0_line_lower:
# early check
return False
if self.any_word_in_(candidate_line_data_list_0_line_lower):
return True
for i in ["<", "/>"]:
if i not in candidate_line_data_list_0_line_lower:
return False
return True
if "/>" in candidate_line_data_list_0_line_lower or "</" in candidate_line_data_list_0_line_lower:
# possible closed tag
return True
return False
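
A condensed sketch of the tag heuristic, assuming the early '<' check and the closing-tag test are the retained logic; keyword_hit stands in for the inherited any_word_in_ check, and the sample strings are illustrative:

def looks_like_html(line_lower: str, keyword_hit: bool) -> bool:
    # Early exit: without '<' no tag can be present.
    if '<' not in line_lower:
        return False
    # A known tag keyword is enough on its own.
    if keyword_hit:
        return True
    # Otherwise require a closing construct that hints at real markup.
    return "/>" in line_lower or "</" in line_lower

print(looks_like_html("<br/> token = 'x'".lower(), keyword_hit=False))  # True: '/>' present
print(looks_like_html("a < b and c > d", keyword_hit=False))            # False: no tag markers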


class PossibleComment(Feature):
@@ -231,162 +96,3 @@ def extract(self, candidate: Candidate) -> bool:
except ValueError:
return False


class RenyiEntropy(Feature):
"""Renyi entropy.
See the following link for details:
https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
Parameters:
alpha: entropy parameter
norm: set True to normalize output probabilities
"""

# Constant dictionary to get characters set via name
CHARS: Dict[Base, Chars] = { #
Base.base32: Chars.BASE32_CHARS, #
Base.base36: Chars.BASE36_CHARS, #
Base.base64: Chars.BASE64_CHARS, #
Base.hex: Chars.HEX_CHARS #
}

def __init__(self, base: str, alpha: float, norm=False) -> None:
"""Renyi entropy class initializer.
Args:
base: number base type
alpha: entropy parameter
norm: set True to normalize output probabilities, default is False
"""
super().__init__()
self.base: Base = getattr(Base, base)
self.alpha = alpha
self.norm = norm

def extract(self, candidate: Candidate) -> np.ndarray:
p_x = self.get_probabilities(candidate.line_data_list[0].value)
return np.array([self.estimate_entropy(p_x)])

def get_probabilities(self, data: str) -> np.ndarray:
"""Get list of alphabet's characters presented in inputted string."""
unique_elements = [x for x in RenyiEntropy.CHARS[self.base].value if data.count(x) > 0]

# perform estimation of probability of characters
p_x = np.array([float(data.count(x)) / len(data) for x in unique_elements])
# get probabilities for alphabet's characters present in data
p_x = p_x[p_x > 0]

# linear weighting of probabilities for their normalization
if self.norm:
p_x /= p_x.sum()

return p_x

def estimate_entropy(self, p_x: np.ndarray) -> float:
"""Calculate Renyi entropy of 'p_x' sequence.
The function is based on the definition of Renyi entropy for an arbitrary probability distribution.
See the following link for details:
https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
"""
if 0 == len(p_x):
entropy = 0
elif np.abs(0.0 - self.alpha) < np.finfo(np.float32).eps:
# corresponds to Hartley or max-entropy
entropy = np.log2(p_x.size)
elif np.abs(1.0 - self.alpha) < np.finfo(np.float32).eps:
# corresponds to Shannon entropy
entropy = np.sum(-p_x * np.log2(p_x))
else:
entropy = np.log2((p_x**self.alpha).sum()) / (1.0 - self.alpha)

return entropy


class ShannonEntropy(RenyiEntropy):
"""Shannon entropy feature."""

def __init__(self, base: str, norm: bool = False) -> None:
super().__init__(base, 1.0, norm)


class HartleyEntropy(RenyiEntropy):
"""Hartley entropy feature."""

def __init__(self, base: str, norm: bool = False) -> None:
super().__init__(base, 0.0, norm)
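
As a standalone illustration of the estimate above (not the project's API; the hex alphabet and sample value are assumptions), Renyi entropy H_a = log2(sum(p_i**a)) / (1 - a) reduces to Shannon entropy as a -> 1 and to Hartley entropy log2(N) at a = 0:

import numpy as np

def renyi_entropy(value: str, alphabet: str, alpha: float) -> float:
    """Standalone sketch of the estimation above for characters drawn from 'alphabet'."""
    if not value:
        return 0.0
    counts = np.array([value.count(ch) for ch in alphabet if ch in value], dtype=float)
    if 0 == counts.size:
        return 0.0
    p_x = counts / len(value)
    eps = np.finfo(np.float32).eps
    if abs(alpha) < eps:        # Hartley / max-entropy
        return float(np.log2(p_x.size))
    if abs(1.0 - alpha) < eps:  # Shannon
        return float(np.sum(-p_x * np.log2(p_x)))
    return float(np.log2(np.sum(p_x ** alpha)) / (1.0 - alpha))

hex_chars = "0123456789abcdef"
print(renyi_entropy("deadbeef", hex_chars, alpha=1.0))  # Shannon entropy of the sample
print(renyi_entropy("deadbeef", hex_chars, alpha=0.0))  # Hartley: log2 of 5 distinct characters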


class CharSet(Feature):
"""Feature is true when all characters of the value are from a set."""

# Constant dictionary to get characters set via name
CHARS: Dict[Base, str] = { #
Base.base16upper: Chars.BASE16UPPER.value, #
Base.base16lower: Chars.BASE16LOWER.value, #
Base.base32: Chars.BASE32_CHARS.value, #
Base.base36: Chars.BASE36_CHARS.value, #
Base.base64std: Chars.BASE64STD_CHARS.value + '=', #
Base.base64url: Chars.BASE64URL_CHARS.value + '=', #
}

def __init__(self, base: str) -> None:
"""CharSet class initializer.
Args:
base: base set ID
"""
super().__init__()
self.base: Base = getattr(Base, base)

def extract(self, candidate: Candidate) -> bool:
with contextlib.suppress(Exception):
for i in candidate.line_data_list[0].value:
if i not in self.CHARS[self.base]:
break
else:
return True
return False


class FileExtension(WordIn):
"""Categorical feature of file type.
Parameters:
extensions: extension labels
"""

def __init__(self, extensions: List[str]) -> None:
super().__init__(extensions)

def __call__(self, candidates: List[Candidate]) -> np.ndarray:
extension_set = set([candidate.line_data_list[0].file_type.lower() for candidate in candidates])
return self.word_in_set(extension_set)


def extract(self, candidate: Candidate) -> Any:
raise NotImplementedError


class RuleName(WordIn):
"""Categorical feature that corresponds to rule name.
Parameters:
rule_names: rule name labels
"""

def __init__(self, rule_names: List[str]) -> None:
super().__init__(rule_names)

def __call__(self, candidates: List[Candidate]) -> np.ndarray:
candidate_rule_set = set(x.rule_name for x in candidates)
return self.word_in_set(candidate_rule_set)

def extract(self, candidate: Candidate) -> Any:
raise NotImplementedError