diff --git a/credsweeper/ml_model/char_set.py b/credsweeper/ml_model/char_set.py
new file mode 100644
index 000000000..48bb78d58
--- /dev/null
+++ b/credsweeper/ml_model/char_set.py
@@ -0,0 +1,47 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.features import Feature
+from credsweeper.utils import Util
+
+
+class CharSet(Feature):
+    """Feature is true when all characters of the value are from a set."""
+
+    # Constant dictionary to get characters set via name
+    CHARS: Dict[Base, str] = {  #
+        Base.base16upper: Chars.BASE16UPPER.value,  #
+        Base.base16lower: Chars.BASE16LOWER.value,  #
+        Base.base32: Chars.BASE32_CHARS.value,  #
+        Base.base36: Chars.BASE36_CHARS.value,  #
+        Base.base64std: Chars.BASE64STD_CHARS.value + '=',  #
+        Base.base64url: Chars.BASE64URL_CHARS.value + '=',  #
+    }
+
+    def __init__(self, base: str) -> None:
+        """CharSet class initializer.
+
+        Args:
+            base: base set ID
+
+        """
+        super().__init__()
+        self.base: Base = getattr(Base, base)
+
+    def extract(self, candidate: Candidate) -> bool:
+        with contextlib.suppress(Exception):
+            for i in self.CHARS[self.base]:
+                if i not in candidate.line_data_list[0].value:
+                    break
+            else:
+                return True
+        return False
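A minimal standalone sketch of the for/else check that CharSet.extract performs, with value standing in for the candidate's string and alphabet for one of the Chars constants. Note the loop is equivalent to set(alphabet) <= set(value); the class docstring describes the reverse containment, set(value) <= set(alphabet), which may be worth double-checking:

    def charset_hit(value: str, alphabet: str) -> bool:
        # walk the alphabet; give up as soon as one of its characters is absent from the value
        for ch in alphabet:
            if ch not in value:
                break
        else:
            # the loop completed: every alphabet character occurs in the value
            return True
        return False

    print(charset_hit("0123456789ABCDEF" * 2, "0123456789ABCDEF"))  # True
    print(charset_hit("DEADBEEF", "0123456789ABCDEF"))              # False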
diff --git a/credsweeper/ml_model/features.py b/credsweeper/ml_model/features.py
index c28158fd4..f3a945c4f 100644
--- a/credsweeper/ml_model/features.py
+++ b/credsweeper/ml_model/features.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 
-from credsweeper.app import logger
 from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
 from credsweeper.credentials import Candidate
 from credsweeper.utils import Util
@@ -51,144 +50,7 @@ def any_word_in_(self, a_string: str) -> bool:
 
 
-class WordIn(Feature):
-    """Abstract feature returns array with all matched words in a string"""
-
-    def __init__(self,words:List[str]):
-        super().__init__()
-        self.dimension = len(words)
-        self.words=sorted(list(set(words)))
-        self.enumerated_words = list(enumerate(self.words))
-        if len(self.enumerated_words) != self.dimension:
-            raise RuntimeError(f"Check duplicates:{words}")
-
-    @property
-    def enumerated_words(self) -> List[Tuple[int,str]]:
-        """getter for speedup"""
-        return self.__enumerated_words
-
-    @enumerated_words.setter
-    def enumerated_words(self, enumerated_words: List[Tuple[int,str]]) -> None:
-        """setter for speedup"""
-        self.__enumerated_words = enumerated_words
-
-    @property
-    def dimension(self) -> int:
-        """getter"""
-        return self.__dimension
-
-    @dimension.setter
-    def dimension(self, dimension: int) -> None:
-        """setter"""
-        self.__dimension = dimension
-
-    @abstractmethod
-    def extract(self, candidate: Candidate) -> Any:
-        raise NotImplementedError
-
-    def word_in_str(self, a_string: str) -> np.ndarray:
-        """Returns array with words included in a string"""
-        result = np.zeros(shape=[self.dimension], dtype=np.int8)
-        for i, word in self.enumerated_words:
-            if word in a_string:
-                result[i] = 1
-        return np.array([result])
-
-    def word_in_set(self, a_strings_set: Set[str]) -> np.ndarray:
-        """Returns array with words matches in a_strings_set"""
-        result = np.zeros(shape=[self.dimension], dtype=np.int8)
-        for i, word in self.enumerated_words:
-            if word in a_strings_set:
-                result[i] = 1
-        return np.array([result])
-
-
-class WordInVariable(WordIn):
-    """Feature returns array of words matching in variable"""
-
-    def __init__(self, words: List[str]) -> None:
-        """Feature is true if candidate value contains at least one predefined word.
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE
-
-        """
-        super().__init__(words)
-
-    def extract(self, candidate: Candidate) -> np.ndarray:
-        """Returns array of matching words for first line"""
-        if candidate.line_data_list[0].variable:
-            return self.word_in_str(candidate.line_data_list[0].variable.lower())
-        else:
-            return np.zeros(shape=[self.dimension], dtype=np.int8)
-
-
-class WordInSecret(WordIn):
-    """Feature returns true if candidate value contains at least one word from predefined list."""
-
-    def __init__(self, words: List[str]) -> None:
-        """Feature is true if candidate value contains at least one predefined word.
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE
-
-        """
-        super().__init__(words)
-
-    def extract(self, candidate: Candidate) -> np.ndarray:
-        """Returns array of matching words for first line"""
-        value = candidate.line_data_list[0].value
-        if value:
-            return self.word_in_str(value.lower())
-        else:
-            return np.zeros(shape=[self.dimension], dtype=np.int8)
-
-
-class WordInLine(WordIn):
-    """Feature is true if line contains at least one word from predefined list."""
-
-    def __init__(self, words: List[str]) -> None:
-        """Feature returns array of matching words
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE
-
-        """
-        super().__init__(words)
-
-    def extract(self, candidate: Candidate) -> np.ndarray:
-        """Returns true if any words in first line"""
-        subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
-        if subtext:
-            return self.word_in_str(subtext.lower())
-        else:
-            return np.zeros(shape=[self.dimension], dtype=np.int8)
-
-
-class WordInPath(WordIn):
-    """Categorical feature that corresponds to words in path (POSIX, lowercase)"""
-
-    def __init__(self, words: List[str]) -> None:
-        """WordInPath constructor
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE & POSIX
-
-        """
-        super().__init__(words)
-
-    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
-        # actually there must be one path because the candidates are grouped before
-        candidate_path = Path( candidates[0].line_data_list[0].path).as_posix().lower()
-        if candidate_path:
-            return self.word_in_str(candidate_path.lower())
-        else:
-            return np.zeros(shape=[self.dimension], dtype=np.int8)
-
-    def extract(self, candidate: Candidate) -> Any:
-        raise NotImplementedError
 
 
 class HasHtmlTag(Feature):
@@ -203,12 +65,15 @@ def __init__(self) -> None:
 
     def extract(self, candidate: Candidate) -> bool:
         subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
         candidate_line_data_list_0_line_lower = subtext.lower()
+        if '<' not in candidate_line_data_list_0_line_lower:
+            # early check
+            return False
         if self.any_word_in_(candidate_line_data_list_0_line_lower):
             return True
-        for i in ["<", "/>"]:
-            if i not in candidate_line_data_list_0_line_lower:
-                return False
-        return True
+        if "/>" in candidate_line_data_list_0_line_lower or "</" in candidate_line_data_list_0_line_lower:
+            return True
+        return False
@@ ... @@ def extract(self, candidate: Candidate) -> bool:
         except ValueError:
             return False
 
-
-class RenyiEntropy(Feature):
-    """Renyi entropy.
-
-    See next link for details:
-    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
-
-    Parameters:
-        alpha: entropy parameter
-        norm: set True to normalize output probabilities
-
-    """
-
-    # Constant dictionary to get characters set via name
-    CHARS: Dict[Base, Chars] = {  #
-        Base.base32: Chars.BASE32_CHARS,  #
-        Base.base36: Chars.BASE36_CHARS,  #
-        Base.base64: Chars.BASE64_CHARS,  #
-        Base.hex: Chars.HEX_CHARS  #
-    }
-
-    def __init__(self, base: str, alpha: float, norm=False) -> None:
-        """Renyi entropy class initializer.
-
-        Args:
-            base: number base type
-            alpha: entropy parameter
-            norm: set True to normalize output probabilities, default is False
-
-        """
-        super().__init__()
-        self.base: Base = getattr(Base, base)
-        self.alpha = alpha
-        self.norm = norm
-
-    def extract(self, candidate: Candidate) -> np.ndarray:
-        p_x = self.get_probabilities(candidate.line_data_list[0].value)
-        return np.array([self.estimate_entropy(p_x)])
-
-    def get_probabilities(self, data: str) -> np.ndarray:
-        """Get list of alphabet's characters presented in inputted string."""
-        unique_elements = [x for x in RenyiEntropy.CHARS[self.base].value if data.count(x) > 0]
-
-        # perform estimation of probability of characters
-        p_x = np.array([float(data.count(x)) / len(data) for x in unique_elements])
-        # get probabilities for alphabet's characters presented in data
-        p_x = p_x[p_x > 0]
-
-        # linear weighting of probabilities for theirs normalization
-        if self.norm:
-            p_x /= p_x.sum()
-
-        return p_x
-
-    def estimate_entropy(self, p_x: np.ndarray) -> float:
-        """Calculate Renyi entropy of 'p_x' sequence.
-
-        Function is based on definition of Renyi entropy for arbitrary probability distribution.
-        Please see next link for details:
-        https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
-        """
-        if 0 == len(p_x):
-            entropy = 0
-        elif np.abs(0.0 - self.alpha) < np.finfo(np.float32).eps:
-            # corresponds to Hartley or max-entropy
-            entropy = np.log2(p_x.size)
-        elif np.abs(1.0 - self.alpha) < np.finfo(np.float32).eps:
-            # corresponds to Shannon entropy
-            entropy = np.sum(-p_x * np.log2(p_x))
-        else:
-            entropy = np.log2((p_x**self.alpha).sum()) / (1.0 - self.alpha)
-
-        return entropy
-
-
-class ShannonEntropy(RenyiEntropy):
-    """Shannon entropy feature."""
-
-    def __init__(self, base: str, norm: bool = False) -> None:
-        super().__init__(base, 1.0, norm)
-
-
-class HartleyEntropy(RenyiEntropy):
-    """Hartley entropy feature."""
-
-    def __init__(self, base: str, norm: bool = False) -> None:
-        super().__init__(base, 0.0, norm)
-
-
-class CharSet(Feature):
-    """Feature is true when all characters of the value are from a set."""
-
-    # Constant dictionary to get characters set via name
-    CHARS: Dict[Base, str] = {  #
-        Base.base16upper: Chars.BASE16UPPER.value,  #
-        Base.base16lower: Chars.BASE16LOWER.value,  #
-        Base.base32: Chars.BASE32_CHARS.value,  #
-        Base.base36: Chars.BASE36_CHARS.value,  #
-        Base.base64std: Chars.BASE64STD_CHARS.value + '=',  #
-        Base.base64url: Chars.BASE64URL_CHARS.value + '=',  #
-    }
-
-    def __init__(self, base: str) -> None:
-        """CharSet class initializer.
-
-        Args:
-            base: base set ID
-
-        """
-        super().__init__()
-        self.base: Base = getattr(Base, base)
-
-    def extract(self, candidate: Candidate) -> bool:
-        with contextlib.suppress(Exception):
-            for i in self.CHARS[self.base]:
-                if i not in candidate.line_data_list[0].value:
-                    break
-            else:
-                return True
-        return False
-
-
-class FileExtension(WordIn):
-    """Categorical feature of file type.
-
-    Parameters:
-        extensions: extension labels
-
-    """
-
-    def __init__(self, extensions: List[str]) -> None:
-        super().__init__(extensions)
-
-    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
-        extension_set = set([candidate.line_data_list[0].file_type.lower() for candidate in candidates])
-        return self.word_in_set(extension_set)
-
-    def extract(self, candidate: Candidate) -> Any:
-        raise NotImplementedError
-
-
-class RuleName(WordIn):
-    """Categorical feature that corresponds to rule name.
-
-    Parameters:
-        rule_names: rule name labels
-
-    """
-
-    def __init__(self, rule_names: List[str]) -> None:
-        super().__init__(rule_names)
-
-    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
-        candidate_rule_set = set(x.rule_name for x in candidates)
-        return self.word_in_set(candidate_rule_set)
-
-    def extract(self, candidate: Candidate) -> Any:
-        raise NotImplementedError
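The only behavioural change in features.py is the reworked HasHtmlTag.extract. A hedged standalone sketch of the new control flow, where keyword_hit stands in for self.any_word_in_ and the second closing-tag operand ("</") is reconstructed from context:

    def looks_like_html(subtext: str, keyword_hit: bool) -> bool:
        text = subtext.lower()
        if '<' not in text:
            # early exit: no tag can appear without an opening '<'
            return False
        if keyword_hit:
            # stands in for self.any_word_in_(text)
            return True
        # otherwise require an explicit closing marker
        return "/>" in text or "</" in text

    print(looks_like_html("token = '<b>secret</b>'", False))  # True
    print(looks_like_html("token = 'secret'", False))         # False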
diff --git a/credsweeper/ml_model/file_extension.py b/credsweeper/ml_model/file_extension.py
new file mode 100644
index 000000000..3833ab921
--- /dev/null
+++ b/credsweeper/ml_model/file_extension.py
@@ -0,0 +1,33 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.word_in import WordIn
+from credsweeper.utils import Util
+
+
+class FileExtension(WordIn):
+    """Categorical feature of file type.
+
+    Parameters:
+        extensions: extension labels
+
+    """
+
+    def __init__(self, extensions: List[str]) -> None:
+        super().__init__(extensions)
+
+    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
+        extension_set = set([candidate.line_data_list[0].file_type.lower() for candidate in candidates])
+        return self.word_in_set(extension_set)
+
+    def extract(self, candidate: Candidate) -> Any:
+        raise NotImplementedError
diff --git a/credsweeper/ml_model/hartley_entropy.py b/credsweeper/ml_model/hartley_entropy.py
new file mode 100644
index 000000000..95ef43ddb
--- /dev/null
+++ b/credsweeper/ml_model/hartley_entropy.py
@@ -0,0 +1,20 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.reny_entropy import RenyiEntropy
+from credsweeper.utils import Util
+
+
+class HartleyEntropy(RenyiEntropy):
+    """Hartley entropy feature."""
+
+    def __init__(self, base: str, norm: bool = False) -> None:
+        super().__init__(base, 0.0, norm)
diff --git a/credsweeper/ml_model/ml_config.json b/credsweeper/ml_model/ml_config.json
index 1d6fa53f4..f8270d6f0 100644
--- a/credsweeper/ml_model/ml_config.json
+++ b/credsweeper/ml_model/ml_config.json
@@ -17,7 +17,7 @@
             }
         },
         {
-            "type": "WordInSecret",
+            "type": "WordInValue",
             "kwargs": {
                 "words": [
                     "(",
diff --git a/credsweeper/ml_model/reny_entropy.py b/credsweeper/ml_model/reny_entropy.py
new file mode 100644
index 000000000..96caa662a
--- /dev/null
+++ b/credsweeper/ml_model/reny_entropy.py
@@ -0,0 +1,88 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.features import Feature
+from credsweeper.utils import Util
+
+
+class RenyiEntropy(Feature):
+    """Renyi entropy.
+
+    See next link for details:
+    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
+
+    Parameters:
+        alpha: entropy parameter
+        norm: set True to normalize output probabilities
+
+    """
+
+    # Constant dictionary to get characters set via name
+    CHARS: Dict[Base, Chars] = {  #
+        Base.base32: Chars.BASE32_CHARS,  #
+        Base.base36: Chars.BASE36_CHARS,  #
+        Base.base64: Chars.BASE64_CHARS,  #
+        Base.hex: Chars.HEX_CHARS  #
+    }
+
+    def __init__(self, base: str, alpha: float, norm=False) -> None:
+        """Renyi entropy class initializer.
+
+        Args:
+            base: number base type
+            alpha: entropy parameter
+            norm: set True to normalize output probabilities, default is False
+
+        """
+        super().__init__()
+        self.base: Base = getattr(Base, base)
+        self.alpha = alpha
+        self.norm = norm
+
+    def extract(self, candidate: Candidate) -> np.ndarray:
+        p_x = self.get_probabilities(candidate.line_data_list[0].value)
+        return np.array([self.estimate_entropy(p_x)])
+
+    def get_probabilities(self, data: str) -> np.ndarray:
+        """Get list of alphabet's characters presented in inputted string."""
+        unique_elements = [x for x in RenyiEntropy.CHARS[self.base].value if data.count(x) > 0]
+
+        # perform estimation of probability of characters
+        p_x = np.array([float(data.count(x)) / len(data) for x in unique_elements])
+        # get probabilities for alphabet's characters presented in data
+        p_x = p_x[p_x > 0]
+
+        # linear weighting of probabilities for their normalization
+        if self.norm:
+            p_x /= p_x.sum()
+
+        return p_x
+
+    def estimate_entropy(self, p_x: np.ndarray) -> float:
+        """Calculate Renyi entropy of 'p_x' sequence.
+
+        Function is based on definition of Renyi entropy for arbitrary probability distribution.
+        Please see next link for details:
+        https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
+        """
+        if 0 == len(p_x):
+            entropy = 0
+        elif np.abs(0.0 - self.alpha) < np.finfo(np.float32).eps:
+            # corresponds to Hartley or max-entropy
+            entropy = np.log2(p_x.size)
+        elif np.abs(1.0 - self.alpha) < np.finfo(np.float32).eps:
+            # corresponds to Shannon entropy
+            entropy = np.sum(-p_x * np.log2(p_x))
+        else:
+            entropy = np.log2((p_x**self.alpha).sum()) / (1.0 - self.alpha)
+
+        return entropy
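For a quick sanity check of the math, a self-contained sketch that mirrors get_probabilities and estimate_entropy on a plain string (the norm option is omitted; alpha near 0 gives Hartley entropy, alpha near 1 the Shannon limit):

    import numpy as np

    def renyi_entropy(value: str, alphabet: str, alpha: float) -> float:
        # probabilities of the alphabet characters that actually occur in the value
        counts = np.array([value.count(c) for c in alphabet if c in value], dtype=float)
        p_x = counts / len(value)
        if p_x.size == 0:
            return 0.0
        if abs(alpha - 0.0) < 1e-6:      # Hartley / max-entropy
            return float(np.log2(p_x.size))
        if abs(alpha - 1.0) < 1e-6:      # Shannon limit
            return float(np.sum(-p_x * np.log2(p_x)))
        return float(np.log2(np.sum(p_x ** alpha)) / (1.0 - alpha))

    print(round(renyi_entropy("deadbeef", "0123456789abcdef", 1.0), 3))  # 2.156
    print(round(renyi_entropy("deadbeef", "0123456789abcdef", 0.0), 3))  # 2.322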
diff --git a/credsweeper/ml_model/rule_name.py b/credsweeper/ml_model/rule_name.py
new file mode 100644
index 000000000..8e1c78355
--- /dev/null
+++ b/credsweeper/ml_model/rule_name.py
@@ -0,0 +1,32 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.word_in import WordIn
+from credsweeper.utils import Util
+
+
+class RuleName(WordIn):
+    """Categorical feature that corresponds to rule name.
+
+    Parameters:
+        rule_names: rule name labels
+
+    """
+
+    def __init__(self, rule_names: List[str]) -> None:
+        super().__init__(rule_names)
+
+    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
+        candidate_rule_set = set(x.rule_name for x in candidates)
+        return self.word_in_set(candidate_rule_set)
+
+    def extract(self, candidate: Candidate) -> Any:
+        raise NotImplementedError
diff --git a/credsweeper/ml_model/shannon_entropy.py b/credsweeper/ml_model/shannon_entropy.py
new file mode 100644
index 000000000..08db77b55
--- /dev/null
+++ b/credsweeper/ml_model/shannon_entropy.py
@@ -0,0 +1,22 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.reny_entropy import RenyiEntropy
+from credsweeper.utils import Util
+
+
+class ShannonEntropy(RenyiEntropy):
+    """Shannon entropy feature."""
+
+    def __init__(self, base: str, norm: bool = False) -> None:
+        super().__init__(base, 1.0, norm)
diff --git a/credsweeper/ml_model/word_in.py b/credsweeper/ml_model/word_in.py
new file mode 100644
index 000000000..f4a159072
--- /dev/null
+++ b/credsweeper/ml_model/word_in.py
@@ -0,0 +1,63 @@
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.features import Feature
+from credsweeper.utils import Util
+
+
+class WordIn(Feature):
+    """Abstract feature returns array with all matched words in a string"""
+
+    def __init__(self, words: List[str]):
+        super().__init__()
+        self.dimension = len(words)
+        self.words = sorted(list(set(words)))
+        self.enumerated_words = list(enumerate(self.words))
+        if len(self.enumerated_words) != self.dimension:
+            raise RuntimeError(f"Check duplicates:{words}")
+
+    @property
+    def enumerated_words(self) -> List[Tuple[int, str]]:
+        """getter for speedup"""
+        return self.__enumerated_words
+
+    @enumerated_words.setter
+    def enumerated_words(self, enumerated_words: List[Tuple[int, str]]) -> None:
+        """setter for speedup"""
+        self.__enumerated_words = enumerated_words
+
+    @property
+    def dimension(self) -> int:
+        """getter"""
+        return self.__dimension
+
+    @dimension.setter
+    def dimension(self, dimension: int) -> None:
+        """setter"""
+        self.__dimension = dimension
+
+    @abstractmethod
+    def extract(self, candidate: Candidate) -> Any:
+        raise NotImplementedError
+
+    def word_in_str(self, a_string: str) -> np.ndarray:
+        """Returns array with words included in a string"""
+        result = np.zeros(shape=[self.dimension], dtype=np.int8)
+        for i, word in self.enumerated_words:
+            if word in a_string:
+                result[i] = 1
+        return np.array([result])
+
+    def word_in_set(self, a_strings_set: Set[str]) -> np.ndarray:
+        """Returns array with words matches in a_strings_set"""
+        result = np.zeros(shape=[self.dimension], dtype=np.int8)
+        for i, word in self.enumerated_words:
+            if word in a_strings_set:
+                result[i] = 1
+        return np.array([result])
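A minimal sketch of the encoding WordIn.word_in_str produces: a fixed, sorted vocabulary mapped to a 0/1 row that marks which words occur in the probed string (the vocabulary and line below are made-up examples):

    import numpy as np

    words = sorted({"api", "key", "token"})
    line = "aws_api_key = '...'".lower()
    result = np.zeros(shape=[len(words)], dtype=np.int8)
    for i, word in enumerate(words):
        if word in line:
            result[i] = 1
    print(result)  # [1 1 0] -> "api" and "key" matched, "token" did not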
diff --git a/credsweeper/ml_model/word_in_line.py b/credsweeper/ml_model/word_in_line.py
new file mode 100644
index 000000000..e96a8c951
--- /dev/null
+++ b/credsweeper/ml_model/word_in_line.py
@@ -0,0 +1,37 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.word_in import WordIn
+from credsweeper.utils import Util
+
+
+class WordInLine(WordIn):
+    """Feature is true if line contains at least one word from predefined list."""
+
+    def __init__(self, words: List[str]) -> None:
+        """Feature returns array of matching words
+
+        Args:
+            words: list of predefined words - MUST BE IN LOWER CASE
+
+        """
+        super().__init__(words)
+
+    def extract(self, candidate: Candidate) -> np.ndarray:
+        """Returns array of matching words for the first line"""
+        subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
+        if subtext:
+            return self.word_in_str(subtext.lower())
+        else:
+            return np.zeros(shape=[self.dimension], dtype=np.int8)
diff --git a/credsweeper/ml_model/word_in_path.py b/credsweeper/ml_model/word_in_path.py
new file mode 100644
index 000000000..313755236
--- /dev/null
+++ b/credsweeper/ml_model/word_in_path.py
@@ -0,0 +1,38 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.word_in import WordIn
+from credsweeper.utils import Util
+
+
+class WordInPath(WordIn):
+    """Categorical feature that corresponds to words in path (POSIX, lowercase)"""
+
+    def __init__(self, words: List[str]) -> None:
+        """WordInPath constructor
+
+        Args:
+            words: list of predefined words - MUST BE IN LOWER CASE & POSIX
+
+        """
+        super().__init__(words)
+
+    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
+        # actually there must be one path because the candidates are grouped before
+        candidate_path = Path(candidates[0].line_data_list[0].path).as_posix().lower()
+        if candidate_path:
+            return self.word_in_str(candidate_path.lower())
+        else:
+            return np.zeros(shape=[self.dimension], dtype=np.int8)
+
+    def extract(self, candidate: Candidate) -> Any:
+        raise NotImplementedError
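Both WordInLine and HasHtmlTag probe only a window around the detected value rather than the whole line. A hypothetical stand-in for Util.subtext (the real helper lives in credsweeper.utils and may clip differently) to show the idea:

    def subtext(line: str, value_start: int, chunk_size: int) -> str:
        # take at most chunk_size characters centred near where the value starts
        begin = max(0, value_start - chunk_size // 2)
        return line[begin:begin + chunk_size]

    print(subtext("password = 'qwerty'  # rotate me", 12, 16))  # "word = 'qwerty' "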
diff --git a/credsweeper/ml_model/word_in_value.py b/credsweeper/ml_model/word_in_value.py
new file mode 100644
index 000000000..766e354c1
--- /dev/null
+++ b/credsweeper/ml_model/word_in_value.py
@@ -0,0 +1,37 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.word_in import WordIn
+from credsweeper.utils import Util
+
+
+class WordInValue(WordIn):
+    """Feature returns array of matching predefined words in the candidate value."""
+
+    def __init__(self, words: List[str]) -> None:
+        """Feature is true if candidate value contains at least one predefined word.
+
+        Args:
+            words: list of predefined words - MUST BE IN LOWER CASE
+
+        """
+        super().__init__(words)
+
+    def extract(self, candidate: Candidate) -> np.ndarray:
+        """Returns array of matching words for first line"""
+        value = candidate.line_data_list[0].value
+        if value:
+            return self.word_in_str(value.lower())
+        else:
+            return np.zeros(shape=[self.dimension], dtype=np.int8)
diff --git a/credsweeper/ml_model/word_in_variable.py b/credsweeper/ml_model/word_in_variable.py
new file mode 100644
index 000000000..b8b3e26b5
--- /dev/null
+++ b/credsweeper/ml_model/word_in_variable.py
@@ -0,0 +1,35 @@
+"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""
+import contextlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Any, Dict, Tuple, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, CHUNK_SIZE
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.word_in import WordIn
+from credsweeper.utils import Util
+
+
+class WordInVariable(WordIn):
+    """Feature returns array of words matching in variable"""
+
+    def __init__(self, words: List[str]) -> None:
+        """Feature is true if candidate variable contains at least one predefined word.
+
+        Args:
+            words: list of predefined words - MUST BE IN LOWER CASE
+
+        """
+        super().__init__(words)
+
+    def extract(self, candidate: Candidate) -> np.ndarray:
+        """Returns array of matching words for first line"""
+        if candidate.line_data_list[0].variable:
+            return self.word_in_str(candidate.line_data_list[0].variable.lower())
+        else:
+            return np.zeros(shape=[self.dimension], dtype=np.int8)
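Finally, a rough illustration (not CredSweeper's validator code, which is untouched by this diff) of how outputs like the ones above could sit side by side: array-valued features such as WordInVariable or WordInValue and scalar features such as RenyiEntropy concatenated into one row per candidate group:

    import numpy as np

    variable_hits = np.array([[0, 1, 0]], dtype=np.int8)  # e.g. a WordInVariable-style vector
    value_hits = np.array([[1, 0, 0]], dtype=np.int8)     # e.g. a WordInValue-style vector
    entropy = np.array([[3.58]])                          # e.g. a RenyiEntropy scalar
    row = np.concatenate([variable_hits, value_hits, entropy], axis=1)
    print(row.shape)  # (1, 7)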