bridgecrewio · pazbechor · Jul 22, 2024 · Jul 21, 2024 · Jul 21, 2024 · Jul 21, 2024
diff --git a/checkov/secrets/local_secrets_runner.py b/checkov/secrets/local_secrets_runner.py
@@ -0,0 +1,43 @@
+# flake8: noqa
+# type: ignore
+
+import json
+import os
+
+from checkov.main import secrets_runner
+from checkov.common.runners.runner_registry import RunnerRegistry
+from checkov.runner_filter import RunnerFilter
+from checkov.common.bridgecrew.platform_integration import bc_integration
+
+
+with open(os.environ['LOCAL_SECRETS_POLICIES_JSON']) as secrets_policies_file:  # runs at import time; raises KeyError if the env var is unset
+    default_regexes = json.load(secrets_policies_file)  # parsed JSON content; presumably the list of secrets policies - TODO confirm schema
+bc_integration.customer_run_config_response = {'secretsPolicies': default_regexes}  # hand the locally loaded policies to the imported bc_integration object
+
+
+def execute():  # Run the secrets runner over the folder named by LOCAL_SCANNING_FOLDER and print the resulting reports
+    runner = secrets_runner(entropy_limit=4)
+    # NOTE(review): the timeout described here (20 min less than the limit so processing can finish, else checkov's default 12h - 1200) is not computed; what looks like a placeholder string is passed as git_history_timeout below - confirm
+    runner_registry = RunnerRegistry(
+        '',  # first positional argument left empty; presumably the banner text - TODO confirm
+        RunnerFilter(
+            block_list_secret_scan=[],
+            enable_secret_scan_all_files=True,
+            enable_git_history_secret_scan=False,  # git history scanning is switched off for this local run
+            git_history_last_commit_scanned=None,
+            git_history_timeout="checkov_timeout_str",
+            checks=['BC_GIT_79']  # presumably limits the run to this single check ID - verify against RunnerFilter
+        ),
+        runner
+    )
+
+    scan_reports = runner_registry.run(
+        root_folder=os.environ["LOCAL_SCANNING_FOLDER"],  # raises KeyError if the env var is unset
+        external_checks_dir=list(),
+        collect_skip_comments=True)
+
+    print(scan_reports)  # the reports are only printed, not returned
+
+
+if __name__ == "__main__":
+    execute()
diff --git a/checkov/secrets/plugins/custom_regex_detector.py b/checkov/secrets/plugins/custom_regex_detector.py
@@ -2,6 +2,7 @@
 
 import logging
 from typing import Set, Any, Generator, Pattern, Optional, Dict, Tuple, TYPE_CHECKING, cast
+from collections import defaultdict
 
 from detect_secrets.constants import VerifiedResult
 from detect_secrets.core.potential_secret import PotentialSecret
@@ -29,16 +30,24 @@ def __init__(self) -> None:
         self.regex_to_metadata: dict[str, dict[str, Any]] = dict()
         self.denylist = set()
         self.multiline_deny_list = set()
+        self.multiline_pattern_by_prerun_compiled: dict[str, Pattern[str]] = dict()
         self.multiline_regex_to_metadata: dict[str, dict[str, Any]] = dict()
         self._analyzed_files: Set[str] = set()
+        self._analyzed_files_by_check: Dict[str, Set[str]] = defaultdict(lambda: set())
         self._multiline_regex_supported_file_types: Set[str] = set()
         detectors = load_detectors()
 
         for detector in detectors:
             try:
                 if detector.get("isMultiline"):
-                    self.multiline_deny_list.add(re.compile('{}'.format(detector["Regex"])))
-                    self.multiline_regex_to_metadata[detector["Regex"]] = detector
+                    # If a prerun regex exists, register it in the regular denylist; the full multiline regex is run later (handled specially in _find_potential_secret)
+                    if detector.get("prerun"):
+                        self.denylist.add(re.compile('{}'.format(detector["prerun"])))
+                        self.regex_to_metadata[detector["prerun"]] = detector
+                        self.multiline_pattern_by_prerun_compiled[detector["prerun"]] = re.compile('{}'.format(detector["Regex"]))
+                    else:
+                        self.multiline_deny_list.add(re.compile('{}'.format(detector["Regex"])))
+                        self.multiline_regex_to_metadata[detector["Regex"]] = detector
                     continue
                 self.denylist.add(re.compile('{}'.format(detector["Regex"])))
                 self.regex_to_metadata[detector["Regex"]] = detector
@@ -86,11 +95,14 @@ def analyze_line(
         if filename not in self._analyzed_files:
             self._analyzed_files.add(filename)
             # We only want to read file if: there is regex supporting it & file size is not over MAX_FILE_SIZE
+            # Note: _find_potential_secret checks, per multiline regex, whether it should run for this file type.
+            #   The check below is only a shortcut that avoids reading the file content when it is not supported at all.
             if not self.multiline_regex_to_metadata.values() or \
                     not self.multiline_regex_supported_file_types or \
                     not any([filename.endswith(str(file_type)) for file_type in self.multiline_regex_supported_file_types]) or \
                     not 0 < get_file_size_safe(filename) < CustomRegexDetector.MAX_FILE_SIZE:
                 return output
+
             file_content = read_file_safe(filename)
             if not file_content:
                 return output
@@ -138,6 +150,38 @@ def _find_potential_secret(
             except Exception:
                 is_verified = False
             regex_data = current_regex_to_metadata[regex.pattern]
+
+            # This match came from the prerun of a multiline regex, so the whole multiline pattern must now be executed.
+            # The multiline policy is run at most once per file for each check (and only if its prerun was found).
+            if regex_data.get("prerun") and filename not in self._analyzed_files_by_check[regex_data['Check_ID']]:
+                self._analyzed_files_by_check[regex_data['Check_ID']].add(filename)
+
+                # We are going to scan the whole file with the multiline regex
+                if not 0 < get_file_size_safe(filename) < CustomRegexDetector.MAX_FILE_SIZE:
+                    return
+                file_content = read_file_safe(filename)
+                if not file_content:
+                    return
+                multiline_regex = self.multiline_pattern_by_prerun_compiled.get(regex.pattern)
+                if multiline_regex is None:
+                    return
+                multiline_matches = multiline_regex.findall(file_content)
+                for mm in multiline_matches:
+                    mm = f"'{mm}'"
+                    ps = PotentialSecret(
+                        type=regex_data["Name"],
+                        filename=filename,
+                        secret=mm,
+                        line_number=line_number,
+                        is_verified=is_verified,
+                        is_added=is_added,
+                        is_removed=is_removed,
+                        is_multiline=True,
+                    )
+                    ps.check_id = regex_data["Check_ID"]
+                    output.add(ps)
+                return
+
             # Wrap multiline match with fstring + ''
             match = f"'{match}'" if is_multiline else match
             ps = PotentialSecret(

diff --git a/checkov/secrets/plugins/load_detectors.py b/checkov/secrets/plugins/load_detectors.py
@@ -32,15 +32,16 @@ def modify_secrets_policy_to_detectors(policies_list: List[dict[str, Any]]) -> L
     return secrets_list
 
 
-def add_to_custom_detectors(custom_detectors: List[Dict[str, Any]], name: str, check_id: str, regex: str,
+def add_to_custom_detectors(custom_detectors: List[Dict[str, Any]], name: str, check_id: str, regex: str, prerun: str,  # NOTE(review): prerun is a new required positional parameter placed before is_custom; every positional caller must pass it
                             is_custom: str, is_multiline: bool = False, supported_files: Optional[List[str]] = None) -> None:
     custom_detectors.append({
         'Name': name,
         'Check_ID': check_id,
         'Regex': regex,
         'isCustom': is_custom,
         'isMultiline': is_multiline,
-        'supportedFiles': supported_files if supported_files else []
+        'supportedFiles': supported_files if supported_files else [],
+        'prerun': prerun  # prerun regex stored with the detector; callers in this diff pass "" when there is none
     })
 
 
@@ -54,7 +55,7 @@ def add_detectors_from_condition_query(custom_detectors: List[Dict[str, Any]], c
             value = [value]
         for regex in value:
             parsed = True
-            add_to_custom_detectors(custom_detectors, secret_policy['title'], check_id, regex,
+            add_to_custom_detectors(custom_detectors, secret_policy['title'], check_id, regex, "",
                                     secret_policy['isCustom'])
     return parsed
 
@@ -77,6 +78,8 @@ def add_detectors_from_code(custom_detectors: List[Dict[str, Any]], code: str, s
                 secret_policy['title'],
                 check_id,
                 regex,
+                # Only one prerun per multiline regex, so only the first 'prerun' entry is used ('' when absent)
+                code_dict['definition'].get('prerun', [''])[0],
                 secret_policy['isCustom'],
                 code_dict['definition'].get("multiline", False),
                 code_dict['definition'].get("supported_files", [])