Discord bot token and Grafana service (#574)

* Refactoring&retrain * fix PYLINT.USELESS_ELSE_ON_LOOP * skip lambda usage * Apply suggestions from code review * test data fix * optimisation * style * discord_and_grafana * style * slack-token-regex-rollback * [no ci] renamed new rules * [no ci] default ml_batch_size * sk-proj- prefix for openai * [skip actions] [sanitizer] 2024-07-05T12:30:02+03:00 * FB token [no ci] * style * BM scores fix * keyword pattern - apply quotation first * value_array_dictionary_check skip wellquoted * [skip actions] [main] 2024-07-08T18:14:59+03:00 * custom BM * BM fix * python 3.8 is not applicable for benchmark * sample LF * rollback * fix full value
Samsung · Jul 9, 2024 · a936b81 · a936b81
1 parent 31dcd1d
commit a936b81
Show file tree

Hide file tree

Showing 23 changed files with 672 additions and 110 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -160,14 +160,14 @@ jobs:
   # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
   performance_benchmark:
-    # put the benchmark in single job to keep constant environment during test
+    # put the benchmark in a single job to keep a constant environment during the test; python 3.8 is not applicable
     needs: [ download_data ]
 
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.10", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.9", "3.10", "3.11" ]
 
     steps:
 

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
@@ -1,4 +1,4 @@
-DATA: 16988575 interested lines. MARKUP: 62853 items
+DATA: 16988573 interested lines. MARKUP: 62864 items
 FileType           FileNumber    ValidLines    Positives    Negatives    Templates
 ---------------  ------------  ------------  -----------  -----------  -----------
                           194         28318           65          430           89
@@ -54,14 +54,14 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .erl                        4            96                         8
 .ex                        25          4968            3          105            5
 .example                   17          1838           74           35           55
-.exs                       24          4842            3          187            4
+.exs                       24          4842            3          188            4
 .ext                        5           211            1            4            2
 .fsproj                     1            75                         1
 .g4                         2           201                         2
 .gd                         1            37                         1
 .gml                        3          3075                        26
 .gni                        3          5017                        18
-.go                      1084        569469          661         4330          742
+.go                      1084        569469          661         4344          742
 .golden                     5          1168            1           14           29
 .gradle                    45          3265            4           91          100
 .graphql                    8           445            1           13
@@ -83,7 +83,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .java                     621        134132          328         1341          170
 .jenkinsfile                1            58            1            7
 .jinja2                     1            64                         2
-.js                       659        536413          541         2630          336
+.js                       659        536413          541         2631          336
 .json                     861      13670751          914        10970          143
 .jsp                       13          3202            1           42
 .jsx                        7           857                        19
@@ -107,11 +107,10 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .lua                       10          1924            3           37            3
 .m                         16         13358            8          152            3
 .manifest                   3           102                         3
-.map                        2             2                         2
 .markdown                   3           139                         3            1
 .markerb                    3            12                         3
 .marko                      1            21                         2
-.md                       679        149755          784         2563          671
+.md                       679        149755          784         2565          671
 .mdx                        3           549                         7
 .mjml                       1            18                         1
 .mjs                       22          4424          108          310
@@ -223,16 +222,16 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .yml                      418         36162          467          917          384
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
-TOTAL:                  10335      16988575         8365        60294         5233
-credsweeper result_cnt : 7754, lost_cnt : 0, true_cnt : 7202, false_cnt : 552
+TOTAL:                  10333      16988573         8365        60310         5233
+credsweeper result_cnt : 7771, lost_cnt : 0, true_cnt : 7202, false_cnt : 569
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     123         3106          185         112   109     3   3288    14  0.000912  0.113821  0.995021  0.973214  0.886179  0.927660
 AWS Client ID                           168           13            0         160   160     0     13     8  0.000000  0.047619  0.955801  1.000000  0.952381  0.975610
 AWS Multi                                75           12            0          87    75    11      1     0  0.916667  0.000000  0.873563  0.872093  1.000000  0.931677
 AWS S3 Bucket                            61           25            0          87    61    24      1     0  0.960000  0.000000  0.720930  0.717647  1.000000  0.835616
 Atlassian Old PAT token                  27          212            3          12     3     8    207    24  0.037209  0.888889  0.867769  0.272727  0.111111  0.157895
-Auth                                    404         2745           77         369   349    20   2802    55  0.007087  0.136139  0.976751  0.945799  0.863861  0.902975
+Auth                                    404         2746           77         370   349    21   2802    55  0.007439  0.136139  0.976449  0.943243  0.863861  0.901809
 Azure Access Token                       19            0            0                 0     0      0    19            1.000000  0.000000            0.000000
 BASE64 Private Key                        7            2            0           7     7     0      2     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5     5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
@@ -257,15 +256,15 @@ JSON Web Token                          284           11            2         27
 Jira / Confluence PAT token               0            4            0                 0     0      4     0  0.000000            1.000000
 Jira 2FA                                 14            6            0          10    10     0      6     4  0.000000  0.285714  0.800000  1.000000  0.714286  0.833333
 Key                                     483         7844          464         444   435     9   8299    48  0.001083  0.099379  0.993516  0.979730  0.900621  0.938511
-Nonce                                    83           52            0          84    79     5     47     4  0.096154  0.048193  0.933333  0.940476  0.951807  0.946108
+Nonce                                    83           53            0          85    79     6     47     4  0.113208  0.048193  0.926471  0.929412  0.951807  0.940476
 Other                                     0            0            5                 0     0      5     0  0.000000            1.000000
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1836         7434         2754        1665  1613    52  10136   223  0.005104  0.121460  0.977129  0.968769  0.878540  0.921451
+Password                               1836         7450         2754        1680  1613    67  10137   223  0.006566  0.121460  0.975914  0.960119  0.878540  0.917520
 Salt                                     42           72            2          38    38     0     74     4  0.000000  0.095238  0.965517  1.000000  0.904762  0.950000
-Secret                                 1358        29107          868        1234  1229     5  29970   129  0.000167  0.094993  0.995723  0.995948  0.905007  0.948302
+Secret                                 1358        29105          868        1234  1229     5  29968   129  0.000167  0.094993  0.995723  0.995948  0.905007  0.948302
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 Token                                   585         3950          439         519   511     8   4381    74  0.001823  0.126496  0.983514  0.984586  0.873504  0.925725
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
 URL Credentials                         172          122          250         162   162     0    372    10  0.000000  0.058140  0.981618  1.000000  0.941860  0.970060
-                                       8365        60294         5233        7896  7202   552  59742  1163  0.009155  0.139032  0.975021  0.928811  0.860968  0.893604
+                                       8365        60310         5233        7913  7202   569  59741  1163  0.009435  0.139032  0.974780  0.926779  0.860968  0.892662
diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -47,7 +47,7 @@ def __init__(self,
                  sort_output: bool = False,
                  use_filters: bool = True,
                  pool_count: int = 1,
-                 ml_batch_size: Optional[int] = 16,
+                 ml_batch_size: Optional[int] = None,
                  ml_threshold: Union[float, ThresholdPreset] = ThresholdPreset.medium,
                  azure: bool = False,
                  cuda: bool = False,
@@ -107,7 +107,7 @@ def __init__(self,
         self.json_filename: Union[None, str, Path] = json_filename
         self.xlsx_filename: Union[None, str, Path] = xlsx_filename
         self.sort_output = sort_output
-        self.ml_batch_size = ml_batch_size
+        self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
         self.ml_threshold = ml_threshold
         self.azure = azure
         self.cuda = cuda

diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py
@@ -17,8 +17,10 @@ class KeywordPattern:
     # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
     value = r"(?P<value_leftquote>((b|r|br|rb|u|f|rf|fr|\\{0,8})?[`'\"]){1,4})?" \
             r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?" \
-            r"(?P<value>(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)|" \
-            r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){3,8000})" \
+            r"(?P<value>" \
+            r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){3,8000}" \
+            r"|(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)" \
+            r")" \
             r"(?(value_leftquote)(?P<value_rightquote>(\\{0,8}[`'\"]){1,4})?)"
 
     @classmethod

diff --git a/credsweeper/filters/__init__.py b/credsweeper/filters/__init__.py
@@ -14,13 +14,15 @@
 from credsweeper.filters.value_couple_keyword_check import ValueCoupleKeywordCheck
 from credsweeper.filters.value_dictionary_keyword_check import ValueDictionaryKeywordCheck
 from credsweeper.filters.value_dictionary_value_length_check import ValueDictionaryValueLengthCheck
+from credsweeper.filters.value_discord_bot_check import ValueDiscordBotCheck
 from credsweeper.filters.value_entropy_base32_check import ValueEntropyBase32Check
 from credsweeper.filters.value_entropy_base36_check import ValueEntropyBase36Check
 from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
 from credsweeper.filters.value_file_path_check import ValueFilePathCheck
 from credsweeper.filters.value_first_word_check import ValueFirstWordCheck
 from credsweeper.filters.value_github_check import ValueGitHubCheck
 from credsweeper.filters.value_grafana_check import ValueGrafanaCheck
+from credsweeper.filters.value_grafana_service_check import ValueGrafanaServiceCheck
 from credsweeper.filters.value_hex_number_check import ValueHexNumberCheck
 from credsweeper.filters.value_ip_check import ValueIPCheck
 from credsweeper.filters.value_jfrog_token_check import ValueJfrogTokenCheck

diff --git a/credsweeper/filters/value_array_dictionary_check.py b/credsweeper/filters/value_array_dictionary_check.py
@@ -30,7 +30,8 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
             True, if need to filter candidate and False if left
 
         """
-
+        if line_data.is_well_quoted_value:
+            return False
         if self.PATTERN.search(line_data.value):
             return True
 

diff --git a/credsweeper/filters/value_discord_bot_check.py b/credsweeper/filters/value_discord_bot_check.py
@@ -0,0 +1,31 @@
+import contextlib
+
+from credsweeper.config import Config
+from credsweeper.credentials import LineData
+from credsweeper.file_handler.analysis_target import AnalysisTarget
+from credsweeper.filters import Filter
+from credsweeper.utils import Util
+
+
+class ValueDiscordBotCheck(Filter):
+    """Check that the first part of a Discord bot token decodes from base64 to a number"""
+
+    def __init__(self, config: Config = None) -> None:
+        pass
+
+    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
+        """Run filter checks on received token which might be structured.
+
+        Args:
+            line_data: credential candidate data
+            target: multiline target from which line data was obtained
+
+        Return:
+            True if the candidate should be filtered out, False if it should be kept
+
+        """
+        with contextlib.suppress(Exception):
+            parts = line_data.value.split('.')
+            if int(Util.decode_base64(parts[0], padding_safe=True, urlsafe_detect=True)):
+                return False
+        return True
diff --git a/credsweeper/filters/value_grafana_service_check.py b/credsweeper/filters/value_grafana_service_check.py
@@ -0,0 +1,35 @@
+import binascii
+import contextlib
+import struct
+
+from credsweeper.common.constants import ASCII
+from credsweeper.config import Config
+from credsweeper.credentials import LineData
+from credsweeper.file_handler.analysis_target import AnalysisTarget
+from credsweeper.filters import Filter
+
+
+class ValueGrafanaServiceCheck(Filter):
+    """Check that the candidate has a known structure"""
+
+    def __init__(self, config: Config = None) -> None:
+        pass
+
+    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
+        """Run filter checks on received token which might be structured.
+
+        Args:
+            line_data: credential candidate data
+            target: multiline target from which line data was obtained
+
+        Return:
+            True if the candidate should be filtered out, False if it should be kept
+
+        """
+        with contextlib.suppress(Exception):
+            checksum = struct.unpack("<I", bytes.fromhex(line_data.value[38:]))[0]
+            data = line_data.value[:37].encode(ASCII)
+            crc32 = binascii.crc32(data)
+            if checksum == crc32:
+                return False
+        return True
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
@@ -232,11 +232,26 @@
   confidence: moderate
   type: pattern
   values:
-    - (?<![0-9A-Za-z_+-])(?P<value>EAAC[0-9A-Za-z]{27,80})
+    - (?<![0-9A-Za-z_+-])(?P<value>EAA[0-9A-Za-z]{80,800})
   filter_type: GeneralPattern
   required_substrings:
-    - EAAC
-  min_line_len: 31
+    - EAA
+  min_line_len: 80
+  target:
+    - code
+    - doc
+
+- name: Facebook App Token
+  severity: high
+  confidence: moderate
+  type: pattern
+  values:
+    - (?<![0-9A-Za-z_+-])(?P<value>[0-9]{12,18}\|[0-9A-Za-z_-]{24,28})(?![=0-9A-Za-z_+-])
+  filter_type: GeneralPattern
+  required_substrings:
+    - "|"
+  required_regex: "[0-9A-Za-z_/+-]{15}"
+  min_line_len: 33
   target:
     - code
     - doc
@@ -246,7 +261,7 @@
   confidence: moderate
   type: pattern
   values:
-    - (?i)((git)[0-9A-Za-z_-]{0,80}(token|key|api)[0-9A-Za-z_-]{0,80}(\s)*(=|:|:=)(\s)*(["']?)(?P<value>[a-z|\d]{40})(["']?))
+    - (?i)((git)[0-9A-Za-z_-]{0,80}(token|key|api)[0-9A-Za-z_-]{0,80}(\s)*(=|:|:=)(\s)*(["']?)(?P<value>[0-9a-z]{40})(["']?))
   filter_type: GeneralPattern
   use_ml: true
   validations:
@@ -644,7 +659,7 @@
   confidence: moderate
   type: pattern
   values:
-    - (?<![0-9A-Za-z_+-])(?P<value>SK[0-9a-fA-F]{32})(?![=0-9A-Za-z_+-])
+    - (?<![0-9A-Za-z_+-])(?P<value>SK[0-9A-Fa-f]{32})(?![=0-9A-Za-z_+-])
   filter_type: GeneralPattern
   required_substrings:
     - SK
@@ -1196,10 +1211,15 @@
   confidence: strong
   type: pattern
   values:
-    - (?<![.0-9A-Za-z_/+-])(?P<value>sk-[0-9A-Za-z_-]{20}T3BlbkFJ[0-9A-Za-z_-]{20})(?![=0-9A-Za-z_/+-])
+    - (?<![.0-9A-Za-z_/+-])(?P<value>sk-[0-9A-Za-z_-]{16,32}(T3BlbkFJ|9wZW5BS|PcGVuQU)[0-9A-Za-z_-]{16,32})(?![=0-9A-Za-z_/+-])
   min_line_len: 51
+  filter_type:
+    - ValuePatternCheck
+    - ValueEntropyBase64Check
   required_substrings:
     - T3BlbkFJ
+    - 9wZW5BS
+    - PcGVuQU
   target:
     - code
     - doc
@@ -1219,3 +1239,35 @@
     - code
     - doc
 
+- name: Discord Bot Token
+  severity: high
+  confidence: strong
+  type: pattern
+  values:
+    - (?<![0-9A-Za-z_/+-])(?P<value>[NMO][ADgjQTwz][a-zA-Z0-9_-]{22,26}\.[a-zA-Z0-9_-]{6}\.[a-zA-Z0-9_-]{30,40})(?![0-9A-Za-z_/+-])
+  min_line_len: 62
+  filter_type:
+    - ValueDiscordBotCheck
+  required_substrings:
+    - M
+    - N
+    - O
+  required_regex: "[0-9A-Za-z_/+-]{15}"
+  target:
+    - code
+    - doc
+
+- name: Grafana Service Account Token
+  severity: high
+  confidence: strong
+  type: pattern
+  values:
+    - (?<![0-9A-Za-z_+-])(?P<value>glsa_[0-9A-Za-z_-]{32}_[0-9A-Fa-f]{8})(?![=0-9A-Za-z_+-])
+  min_line_len: 46
+  filter_type:
+    - ValueGrafanaServiceCheck
+  required_substrings:
+    - glsa_
+  target:
+    - code
+    - doc
diff --git a/credsweeper/secret/config.json b/credsweeper/secret/config.json
@@ -32,6 +32,7 @@
             ".jar",
             ".jpeg",
             ".jpg",
+            ".map",
             ".m4a",
             ".mat",
             ".mo",
@@ -47,6 +48,8 @@
             ".pyc",
             ".pyd",
             ".pyo",
+            ".rc",
+            ".rc2",
             ".rar",
             ".realm",
             ".s7z",
@@ -58,6 +61,7 @@
             ".tiff",
             ".ttf",
             ".vcxproj",
+            ".vdproj",
             ".war",
             ".wav",
             ".webm",

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,20 +1,20 @@
 from pathlib import Path
 
 # total number of files in test samples
-SAMPLES_FILES_COUNT: int = 128
+SAMPLES_FILES_COUNT: int = 130
 
 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 421
-SAMPLES_CRED_LINE_COUNT: int = 438
+SAMPLES_CRED_COUNT: int = 425
+SAMPLES_CRED_LINE_COUNT: int = 442
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 379
+SAMPLES_POST_CRED_COUNT: int = 383
 
 # with option --doc
-SAMPLES_IN_DOC = 404
+SAMPLES_IN_DOC = 407
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 25