ML model retrain (#527)
* markup actual

* actual scores + one sample

* fix files count

* ml model upd

* style

* BM scores fix
babenek authored Mar 26, 2024
1 parent bb2a30d commit 348c6e1
Showing 25 changed files with 2,271 additions and 1,856 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/check.yml
@@ -58,7 +58,7 @@ jobs:
- name: Check ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 4774fdce802e940023316c32f14a68df
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 6f1480a8c1d3269b85fe77a565aad7f8
# # # Python setup

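The step above pins the MD5 digest of the retrained ONNX model, so the expected hash has to be bumped whenever ml_model.onnx is regenerated. As a hedged aside (not part of the commit), one way to compute the new digest locally before editing check.yml, assuming it is run from the repository root:

# Mirrors `md5sum --binary credsweeper/ml_model/ml_model.onnx` from the workflow step.
# The expected value is the one pinned in the diff above; the rest is illustrative.
import hashlib
from pathlib import Path

EXPECTED = "6f1480a8c1d3269b85fe77a565aad7f8"
digest = hashlib.md5(Path("credsweeper/ml_model/ml_model.onnx").read_bytes()).hexdigest()
print(digest)
assert digest == EXPECTED, f"ml_model.onnx digest mismatch: {digest}"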
52 changes: 26 additions & 26 deletions cicd/benchmark.txt
@@ -1,15 +1,15 @@
DATA: 19076375 valid lines. MARKUP: 64556 items
DATA: 19076375 valid lines. MARKUP: 64562 items
Category Positives Negatives Template
-------------------------- ----------- ----------- ----------
Authentication Credentials 96 2651 32
Cryptographic Primitives 54 171 1
Authentication Credentials 96 2631 32
Cryptographic Primitives 54 171 3
Generic Secret 1210 29585 618
Generic Token 341 3718 556
Other 715 3695 37
Password 1483 7145 4224
Generic Token 343 3713 556
Other 730 3703 37
Password 1489 7144 4223
Predefined Pattern 427 5290 11
Private Key 1019 1477
TOTAL: 5345 53732 5479
TOTAL: 5368 53714 5480
FileType FileNumber ValidLines Positives Negatives Template
--------------- ------------ ------------ ----------- ----------- ----------
190 36319 45 407 80
@@ -70,7 +70,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.gd 1 38 1
.gml 3 4011 16
.gni 3 6340 17
.go 1090 718367 504 4028 742
.go 1090 718367 503 4029 742
.golden 5 1246 1 12 31
.gradle 41 3647 2 79 59
.graphql 8 575 1 13
@@ -89,10 +89,10 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 210 4
.j 1 329 2
.j2 32 6327 8 175 11
.java 589 169939 177 1263 176
.java 589 169939 176 1263 176
.jenkinsfile 1 78 1 6
.jinja2 1 64 2
.js 665 705090 321 2445 363
.js 665 705090 319 2447 363
.json 856 15025976 337 10628 185
.jsp 13 4101 1 38 1
.jsx 7 1162 19
@@ -120,7 +120,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.markdown 3 146 2 2
.markerb 3 12 2 1
.marko 1 32 2
.md 659 172418 398 2401 719
.md 659 172418 417 2382 719
.mdx 3 723 7
.mjml 2 183 3
.mjs 22 5853 85 309
@@ -132,7 +132,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.mqh 1 1390 2
.msg 1 26646 1 1
.mysql 1 40 2
.ndjson 2 5006 28 261 1
.ndjson 2 5006 31 258 1
.nix 4 280 1 12
.nolint 1 2 1
.odd 1 1304 43
@@ -161,7 +161,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.pug 3 379 3
.purs 1 73 4
.pxd 1 153 5 1
.py 896 327022 353 3367 861
.py 896 327022 360 3366 860
.pyi 4 1418 9
.pyp 1 193 1
.pyx 2 1175 21
@@ -208,7 +208,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.testsettings 1 21 5
.tf 21 1667 29 5
.tfstate 4 407 18 10 1
.tfvars 1 32 3
.tfvars 1 32 3 2
.tl 2 2161 155 2
.tmpl 5 345 3 9
.token 1 1 1
@@ -228,20 +228,20 @@ FileType FileNumber ValidLines Positives Negatives Templat
.xib 11 504 164
.xsl 1 315 1
.yaml 151 23500 92 379 52
.yml 450 41925 294 961 359
.yml 450 41925 292 963 359
.zsh 7 1109 13
.zsh-theme 1 121 1
TOTAL: 10214 19076375 5345 53732 5479
Detected Credentials: 6022
credsweeper result_cnt : 5093, lost_cnt : 0, true_cnt : 4677, false_cnt : 416
TOTAL: 10214 19076375 5368 53714 5480
Detected Credentials: 6221
credsweeper result_cnt : 5195, lost_cnt : 0, true_cnt : 4827, false_cnt : 368
Category TP FP TN FN FPR FNR ACC PRC RCL F1
-------------------------- ---- ---- -------- ---- -------- -------- -------- -------- -------- --------
Authentication Credentials 74 25 2658 22 0.009318 0.229167 0.983087 0.747475 0.770833 0.758974
Cryptographic Primitives 38 3 169 16 0.017442 0.296296 0.915929 0.926829 0.703704 0.800000
Generic Secret 1108 23 30180 102 0.000762 0.084298 0.996021 0.979664 0.915702 0.946604
Generic Token 299 7 4267 42 0.001638 0.123167 0.989382 0.977124 0.876833 0.924266
Other 534 317 3415 181 0.084941 0.253147 0.888014 0.627497 0.746853 0.681992
Password 1195 37 11332 288 0.003254 0.194201 0.974712 0.969968 0.805799 0.880295
Predefined Pattern 410 4 5297 17 0.000755 0.039813 0.996334 0.990338 0.960187 0.975030
Authentication Credentials 84 7 2656 12 0.002629 0.125000 0.993113 0.923077 0.875000 0.898396
Cryptographic Primitives 50 0 174 4 0.074074 0.982456 1.000000 0.925926 0.961538
Generic Secret 1123 10 30193 87 0.000331 0.071901 0.996912 0.991174 0.928099 0.958600
Generic Token 313 0 4269 30 0.087464 0.993495 1.000000 0.912536 0.954268
Other 551 322 3418 179 0.086096 0.245205 0.887919 0.631157 0.754795 0.687461
Password 1274 28 11339 215 0.002463 0.144392 0.981098 0.978495 0.855608 0.912934
Predefined Pattern 413 1 5300 14 0.000189 0.032787 0.997381 0.997585 0.967213 0.982164
Private Key 1019 0 1477 0 1.000000 1.000000 1.000000 1.000000
4677 416 19070614 668 0.000022 0.124977 0.999943 0.918319 0.875023 0.896149
4827 368 19070639 541 0.000019 0.100782 0.999952 0.929163 0.899218 0.913945
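For readers comparing the old and new rows above, the score columns follow the standard confusion-matrix definitions. A minimal sketch (not from the commit) that reproduces the new Password row from its TP/FP/TN/FN counts; the zero-division guards are an assumption, since some cells in the table are simply left blank:

# Assumed derivation of the benchmark score columns; the printed values match
# the new Password row (TP=1274, FP=28, TN=11339, FN=215).
def scores(tp, fp, tn, fn):
    prc = tp / (tp + fp) if tp + fp else 1.0
    rcl = tp / (tp + fn) if tp + fn else 1.0
    return {
        "FPR": fp / (fp + tn) if fp + tn else 0.0,
        "FNR": fn / (fn + tp) if fn + tp else 0.0,
        "ACC": (tp + tn) / (tp + tn + fp + fn),
        "PRC": prc,
        "RCL": rcl,
        "F1": 2 * prc * rcl / (prc + rcl) if prc + rcl else 0.0,
    }

print(scores(1274, 28, 11339, 215))
# FPR 0.002463, FNR 0.144392, ACC 0.981098, PRC 0.978495, RCL 0.855608, F1 0.912934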
Binary file modified credsweeper/ml_model/ml_model.onnx
2 changes: 1 addition & 1 deletion credsweeper/ml_model/ml_validator.py
@@ -172,6 +172,6 @@ def validate_groups(self, group_list: List[Tuple[str, List[Candidate]]],
probability[head:tail] = self._batch_call_model(line_input_list, features_list)
is_cred = probability > self.threshold
for i in range(len(is_cred)):
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 3),
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 8),
group_list[i][0])
return is_cred, probability
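The only functional change in this file is the debug precision (3 digits to 8), which keeps per-candidate probabilities distinguishable when they cluster near the decision threshold. A self-contained sketch of the surrounding pattern; the threshold value and candidate names are assumptions, not CredSweeper's actual configuration:

# Standalone illustration of the threshold comparison and debug logging above.
import logging
import numpy as np

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

threshold = 0.92                                  # placeholder value
probability = np.array([0.91999871, 0.92000135])  # two candidates near the threshold
is_cred = probability > threshold
for i in range(len(is_cred)):
    # round(..., 3) would print 0.92 for both candidates; 8 digits keeps them apart
    logger.debug("ML decision: %s with prediction: %s for value: %s",
                 is_cred[i], round(probability[i], 8), f"candidate_{i}")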
Binary file removed experiment/augmentation/dictionaries/secrets.pickle
78 changes: 40 additions & 38 deletions experiment/augmentation/main.py
@@ -7,10 +7,8 @@

import pandas as pd

from .obfuscation import get_obfuscated_value, generate_value, SecretCreds

logging.basicConfig(format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s", level="DEBUG")
logger = logging.getLogger(__name__)
from credsweeper.utils import Util
from .obfuscation import get_obfuscated_value, obfuscate_value, SecretCreds

BASE_PATH = ["test", "src", "other"]
COLUMN_TYPES = {
@@ -56,29 +54,30 @@ def load_meta(meta_path, directory):
return df


def obfuscate_row(row, meta, secret_creds):
category = meta.Category
def obfuscate_row(row, meta, secret_creds: SecretCreds):
try:
position = int(meta.ValueStart)
pos_end = int(meta.ValueEnd)
except ValueError:
return row
space_len = len(row) - len(row.lstrip())
value = row[position + space_len:pos_end + space_len]
if category == "Predefined Pattern":
if "Password" == meta.Category:
obfuscated_value = secret_creds.get_password()
elif "Predefined Pattern" == meta.Category:
pattern = meta.PredefinedPattern
obfuscated_value = get_obfuscated_value(value, pattern)
elif "Cryptographic Primitives" == meta.Category:
obfuscated_value = secret_creds.generate_secret()
elif meta.Category in [
"Authentication Credentials", #
"Generic Secret", #
"Generic Token" #
]:
obfuscated_value = obfuscate_value(value)
else:
if meta.WithWords == "1" and meta.Category not in [
"Authentication Credentials", #
"Generic Secret", #
"Generic Token" #
]:
obfuscated_value = secret_creds.get_word_secret()
elif meta.Category == "Password":
obfuscated_value = secret_creds.get_password()
else:
obfuscated_value = generate_value(value)
print(f"Unusual category '{meta.Category}' in {row}")
obfuscated_value = obfuscate_value(value)

if position > 0:
obfuscated_line = row[:position + space_len] + obfuscated_value + row[position + space_len + len(value):]
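The slicing above relies on ValueStart/ValueEnd being offsets into the left-stripped line, so the leading-whitespace length is added back before cutting the value out and splicing the obfuscated one in. A worked example with a hypothetical line and offsets:

# Hypothetical markup: "hunter2" occupies offsets 12..19 of the stripped line.
row = '    password = "hunter2"\n'
position, pos_end = 12, 19
space_len = len(row) - len(row.lstrip())
value = row[position + space_len:pos_end + space_len]            # 'hunter2'
obfuscated_value = "s3cr3tX"                                      # stand-in for the generators above
obfuscated_line = row[:position + space_len] + obfuscated_value + row[position + space_len + len(value):]
print(obfuscated_line)                                            #     password = "s3cr3tX"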
@@ -96,53 +95,50 @@ def add_raw_lines(meta_df, filepath, content):
false_df = temp_df[temp_df.GroundTruth == "F"]
# Get line for row with "false" label
for index, row in false_df.iterrows():
line_numb = int(row["LineStart:LineEnd"].split(":")[0])
meta_df.loc[index, "RawLine"] = content[line_numb - 1]
line_numb = row["LineStart:LineEnd"].split(":")
assert line_numb[0] == line_numb[1], row
# line_numb = int(row["LineStart:LineEnd"].split(":")[0])
meta_df.loc[index, "RawLine"] = f"{content[int(line_numb[0]) - 1]}\n"
# Get line for row with "true" label
true_df = temp_df[temp_df.GroundTruth == "T"]
for index, row in true_df.iterrows():
line_numb = row["LineStart:LineEnd"].split(":")
assert line_numb[0] == line_numb[1], row
line = ""
for l_n in range(int(line_numb[0]), int(line_numb[0]) + 1):
obf_row = obfuscate_row(content[l_n - 1], row, secret_creds)
line += obf_row
meta_df.loc[index, "RawLine"] = line
meta_df.loc[index, "RawLine"] = f"{line}\n"
# Get line for row with "Template" label(temporary solution)
template_df = temp_df[temp_df.GroundTruth == "Template"]
for index, row in template_df.iterrows():
line_numb = row["LineStart:LineEnd"].split(":")
assert line_numb[0] == line_numb[1], row
line = ""
for l_n in range(int(line_numb[0]), int(line_numb[0]) + 1):
obf_row = obfuscate_row(content[l_n - 1], row, secret_creds)
line += obf_row
meta_df.loc[index, "RawLine"] = line
meta_df.loc[index, "RawLine"] = f"{line}\n"


def write2aug_file(repo_local_path, meta_df, aug_file):
fls_path = list(set(meta_df.FilePath))
for filepath in fls_path:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
content = Util.read_file(repo_local_path / filepath)
add_raw_lines(meta_df, filepath, content)
with open(repo_local_path / aug_file, "w", encoding="utf8") as writer:
Rows = meta_df.RawLine
writer.writelines(Rows)


def write_meta_file(meta_df, meta_file):
save_df = meta_df[meta_df.GroundTruth != "F"]
save_df.to_csv(meta_file)


def join_series(series):
meta_df = pd.DataFrame(series)
return meta_df


def write_meta(aug_df, aug_metapath):
aug_df = pd.concat(aug_df)
aug_df = aug_df[aug_df["GroundTruth"] != "F"]
aug_df["GroundTruth"] = "T"
aug_df.loc[aug_df['GroundTruth'] != "F", 'GroundTruth'] = 'T'
aug_df.rename(columns=RENAME_OLD_COLUMNS, inplace=True)
aug_df.rename(columns=RENAME_NEW_COLUMNS, inplace=True)
aug_df.to_csv(aug_metapath)
Expand All @@ -153,8 +149,9 @@ def get_linage(repo_local_path, df):
files_length = {}
overall_linage = 0
for filepath in fls_path:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
# with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
# content = reader.read().splitlines()
content = Util.read_file(repo_local_path / filepath)
overall_linage += len(content)
files_length[filepath] = len(content)
return files_length, overall_linage
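The move from readlines() to Util.read_file (which, judging by the commented-out replacement above, behaves like read().splitlines()) is also why the RawLine assignments earlier in this diff now append an explicit "\n". A tiny illustration of the difference:

# splitlines() drops the terminators that readlines() keeps, so callers that
# later write the lines back out must re-append "\n" themselves.
text = "line one\nline two\n"
print(text.splitlines())               # ['line one', 'line two']
print(text.splitlines(keepends=True))  # ['line one\n', 'line two\n'] (readlines-like)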
@@ -239,23 +236,28 @@ def generate_rows(repo_local_path, aug_filename, df, true_stake, scale):
aug_file_linage = int(scale * overall_linage)
fl_true_lines, true_cred_count = get_true_lines(df)
aug_file_linage = int(true_cred_count * scale / true_stake)
idx = 0
for row_numb in range(1, aug_file_linage):
old_idx = idx
idx += 1
rand = random.uniform(0, 1)
if rand < true_stake:
ground_trues, idx = get_true_row(df, row_numb, aug_filename)
row_numb = idx
ground_trues, idx = get_true_row(df, idx, aug_filename)
else:
ground_trues, idx = get_false_row(df, row_numb, aug_filename)
ground_trues, idx = get_false_row(df, idx, aug_filename)
if ground_trues is None:
row_numb -= 1
# suppose, markup has F & T values for all filename cases
idx = old_idx
if 0 > idx:
idx = 0
continue
new_series.append(ground_trues)
return new_series
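generate_rows mixes credential and non-credential lines with a Bernoulli draw: each synthetic line comes from the true pool with probability true_stake and from the false pool otherwise, and the cursor is rolled back when no row can be produced. A simplified, hypothetical sketch of the idea (not the project's API):

# Simplified stand-in for the sampling loop above; the real code walks an index
# over the per-file markup and rolls it back when a row cannot be produced.
import random

def mix_rows(true_rows, false_rows, true_stake, total):
    out = []
    for _ in range(total):
        pool = true_rows if random.uniform(0, 1) < true_stake else false_rows
        if pool:
            out.append(random.choice(pool))
    return out

print(mix_rows(['password = "Xy9kQ2w1"'], ["timeout = 30", "retries = 5"], 0.2, 10))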


def aug_data(repo_local_path, meta_data, true_stake, scale):
augument_list = [
"Authentication Credentials" #
"Authentication Credentials", #
"Cryptographic Primitives", #
"Generic Secret", #
"Generic Token", #
@@ -266,7 +268,7 @@ def aug_data(repo_local_path, meta_data, true_stake, scale):
new_meta = []
aug_meta = str(repo_local_path / "aug_data" / "meta" / base) + ".csv"
aug_file_template = repo_local_path / "aug_data" / "data" / base
meta_df = meta_data[meta_data["FilePath"].str.contains(base)]
meta_df = meta_data[meta_data["FilePath"].str.contains(f"/{base}/")]
meta_df = meta_df[meta_df["Category"].isin(augument_list)]
exts = get_extentions(meta_df)
for extension in exts:
