ML model retrain (#527)
* markup actual

* actual scores + one sample

* fix files count

* ml model upd

* style

* BM scores fix
babenek authored Mar 26, 2024
1 parent bb2a30d commit 348c6e1
Showing 25 changed files with 2,271 additions and 1,856 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/check.yml
@@ -58,7 +58,7 @@ jobs:
- name: Check ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 4774fdce802e940023316c32f14a68df
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 6f1480a8c1d3269b85fe77a565aad7f8
# # # Python setup

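The step above pins the MD5 digest of the retrained ONNX model, so the expected hash has to be bumped whenever ml_model.onnx is regenerated. As a hedged aside (not part of the commit), one way to compute the new digest locally before editing check.yml, assuming it is run from the repository root:

# Mirrors `md5sum --binary credsweeper/ml_model/ml_model.onnx` from the workflow step.
# The expected value is the one pinned in the diff above; the rest is illustrative.
import hashlib
from pathlib import Path

EXPECTED = "6f1480a8c1d3269b85fe77a565aad7f8"
digest = hashlib.md5(Path("credsweeper/ml_model/ml_model.onnx").read_bytes()).hexdigest()
print(digest)
assert digest == EXPECTED, f"ml_model.onnx digest mismatch: {digest}"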
52 changes: 26 additions & 26 deletions cicd/benchmark.txt
@@ -1,15 +1,15 @@
DATA: 19076375 valid lines. MARKUP: 64556 items
DATA: 19076375 valid lines. MARKUP: 64562 items
Category Positives Negatives Template
-------------------------- ----------- ----------- ----------
Authentication Credentials 96 2651 32
Cryptographic Primitives 54 171 1
Authentication Credentials 96 2631 32
Cryptographic Primitives 54 171 3
Generic Secret 1210 29585 618
Generic Token 341 3718 556
Other 715 3695 37
Password 1483 7145 4224
Generic Token 343 3713 556
Other 730 3703 37
Password 1489 7144 4223
Predefined Pattern 427 5290 11
Private Key 1019 1477
TOTAL: 5345 53732 5479
TOTAL: 5368 53714 5480
FileType FileNumber ValidLines Positives Negatives Template
--------------- ------------ ------------ ----------- ----------- ----------
190 36319 45 407 80
@@ -70,7 +70,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.gd 1 38 1
.gml 3 4011 16
.gni 3 6340 17
.go 1090 718367 504 4028 742
.go 1090 718367 503 4029 742
.golden 5 1246 1 12 31
.gradle 41 3647 2 79 59
.graphql 8 575 1 13
@@ -89,10 +89,10 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 210 4
.j 1 329 2
.j2 32 6327 8 175 11
.java 589 169939 177 1263 176
.java 589 169939 176 1263 176
.jenkinsfile 1 78 1 6
.jinja2 1 64 2
.js 665 705090 321 2445 363
.js 665 705090 319 2447 363
.json 856 15025976 337 10628 185
.jsp 13 4101 1 38 1
.jsx 7 1162 19
@@ -120,7 +120,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.markdown 3 146 2 2
.markerb 3 12 2 1
.marko 1 32 2
.md 659 172418 398 2401 719
.md 659 172418 417 2382 719
.mdx 3 723 7
.mjml 2 183 3
.mjs 22 5853 85 309
@@ -132,7 +132,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.mqh 1 1390 2
.msg 1 26646 1 1
.mysql 1 40 2
.ndjson 2 5006 28 261 1
.ndjson 2 5006 31 258 1
.nix 4 280 1 12
.nolint 1 2 1
.odd 1 1304 43
@@ -161,7 +161,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.pug 3 379 3
.purs 1 73 4
.pxd 1 153 5 1
.py 896 327022 353 3367 861
.py 896 327022 360 3366 860
.pyi 4 1418 9
.pyp 1 193 1
.pyx 2 1175 21
@@ -208,7 +208,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.testsettings 1 21 5
.tf 21 1667 29 5
.tfstate 4 407 18 10 1
.tfvars 1 32 3
.tfvars 1 32 3 2
.tl 2 2161 155 2
.tmpl 5 345 3 9
.token 1 1 1
@@ -228,20 +228,20 @@ FileType FileNumber ValidLines Positives Negatives Templat
.xib 11 504 164
.xsl 1 315 1
.yaml 151 23500 92 379 52
.yml 450 41925 294 961 359
.yml 450 41925 292 963 359
.zsh 7 1109 13
.zsh-theme 1 121 1
TOTAL: 10214 19076375 5345 53732 5479
Detected Credentials: 6022
credsweeper result_cnt : 5093, lost_cnt : 0, true_cnt : 4677, false_cnt : 416
TOTAL: 10214 19076375 5368 53714 5480
Detected Credentials: 6221
credsweeper result_cnt : 5195, lost_cnt : 0, true_cnt : 4827, false_cnt : 368
Category TP FP TN FN FPR FNR ACC PRC RCL F1
-------------------------- ---- ---- -------- ---- -------- -------- -------- -------- -------- --------
Authentication Credentials 74 25 2658 22 0.009318 0.229167 0.983087 0.747475 0.770833 0.758974
Cryptographic Primitives 38 3 169 16 0.017442 0.296296 0.915929 0.926829 0.703704 0.800000
Generic Secret 1108 23 30180 102 0.000762 0.084298 0.996021 0.979664 0.915702 0.946604
Generic Token 299 7 4267 42 0.001638 0.123167 0.989382 0.977124 0.876833 0.924266
Other 534 317 3415 181 0.084941 0.253147 0.888014 0.627497 0.746853 0.681992
Password 1195 37 11332 288 0.003254 0.194201 0.974712 0.969968 0.805799 0.880295
Predefined Pattern 410 4 5297 17 0.000755 0.039813 0.996334 0.990338 0.960187 0.975030
Authentication Credentials 84 7 2656 12 0.002629 0.125000 0.993113 0.923077 0.875000 0.898396
Cryptographic Primitives 50 0 174 4 0.074074 0.982456 1.000000 0.925926 0.961538
Generic Secret 1123 10 30193 87 0.000331 0.071901 0.996912 0.991174 0.928099 0.958600
Generic Token 313 0 4269 30 0.087464 0.993495 1.000000 0.912536 0.954268
Other 551 322 3418 179 0.086096 0.245205 0.887919 0.631157 0.754795 0.687461
Password 1274 28 11339 215 0.002463 0.144392 0.981098 0.978495 0.855608 0.912934
Predefined Pattern 413 1 5300 14 0.000189 0.032787 0.997381 0.997585 0.967213 0.982164
Private Key 1019 0 1477 0 1.000000 1.000000 1.000000 1.000000
4677 416 19070614 668 0.000022 0.124977 0.999943 0.918319 0.875023 0.896149
4827 368 19070639 541 0.000019 0.100782 0.999952 0.929163 0.899218 0.913945
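For readers comparing the old and new rows above, the score columns follow the standard confusion-matrix definitions. A minimal sketch (not from the commit) that reproduces the new Password row from its TP/FP/TN/FN counts; the zero-division guards are an assumption, since some cells in the table are simply left blank:

# Assumed derivation of the benchmark score columns; the printed values match
# the new Password row (TP=1274, FP=28, TN=11339, FN=215).
def scores(tp, fp, tn, fn):
    prc = tp / (tp + fp) if tp + fp else 1.0
    rcl = tp / (tp + fn) if tp + fn else 1.0
    return {
        "FPR": fp / (fp + tn) if fp + tn else 0.0,
        "FNR": fn / (fn + tp) if fn + tp else 0.0,
        "ACC": (tp + tn) / (tp + tn + fp + fn),
        "PRC": prc,
        "RCL": rcl,
        "F1": 2 * prc * rcl / (prc + rcl) if prc + rcl else 0.0,
    }

print(scores(1274, 28, 11339, 215))
# FPR 0.002463, FNR 0.144392, ACC 0.981098, PRC 0.978495, RCL 0.855608, F1 0.912934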
Binary file modified credsweeper/ml_model/ml_model.onnx
2 changes: 1 addition & 1 deletion credsweeper/ml_model/ml_validator.py
@@ -172,6 +172,6 @@ def validate_groups(self, group_list: List[Tuple[str, List[Candidate]]],
probability[head:tail] = self._batch_call_model(line_input_list, features_list)
is_cred = probability > self.threshold
for i in range(len(is_cred)):
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 3),
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 8),
group_list[i][0])
return is_cred, probability
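The only functional change in this file is the debug precision (3 digits to 8), which keeps per-candidate probabilities distinguishable when they cluster near the decision threshold. A self-contained sketch of the surrounding pattern; the threshold value and candidate names are assumptions, not CredSweeper's actual configuration:

# Standalone illustration of the threshold comparison and debug logging above.
import logging
import numpy as np

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

threshold = 0.92                                  # placeholder value
probability = np.array([0.91999871, 0.92000135])  # two candidates near the threshold
is_cred = probability > threshold
for i in range(len(is_cred)):
    # round(..., 3) would print 0.92 for both candidates; 8 digits keeps them apart
    logger.debug("ML decision: %s with prediction: %s for value: %s",
                 is_cred[i], round(probability[i], 8), f"candidate_{i}")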
Binary file removed experiment/augmentation/dictionaries/secrets.pickle
78 changes: 40 additions & 38 deletions experiment/augmentation/main.py
@@ -7,10 +7,8 @@

import pandas as pd

from .obfuscation import get_obfuscated_value, generate_value, SecretCreds

logging.basicConfig(format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s", level="DEBUG")
logger = logging.getLogger(__name__)
from credsweeper.utils import Util
from .obfuscation import get_obfuscated_value, obfuscate_value, SecretCreds

BASE_PATH = ["test", "src", "other"]
COLUMN_TYPES = {
@@ -56,29 +54,30 @@ def load_meta(meta_path, directory):
return df


def obfuscate_row(row, meta, secret_creds):
category = meta.Category
def obfuscate_row(row, meta, secret_creds: SecretCreds):
try:
position = int(meta.ValueStart)
pos_end = int(meta.ValueEnd)
except ValueError:
return row
space_len = len(row) - len(row.lstrip())
value = row[position + space_len:pos_end + space_len]
if category == "Predefined Pattern":
if "Password" == meta.Category:
obfuscated_value = secret_creds.get_password()
elif "Predefined Pattern" == meta.Category:
pattern = meta.PredefinedPattern
obfuscated_value = get_obfuscated_value(value, pattern)
elif "Cryptographic Primitives" == meta.Category:
obfuscated_value = secret_creds.generate_secret()
elif meta.Category in [
"Authentication Credentials", #
"Generic Secret", #
"Generic Token" #
]:
obfuscated_value = obfuscate_value(value)
else:
if meta.WithWords == "1" and meta.Category not in [
"Authentication Credentials", #
"Generic Secret", #
"Generic Token" #
]:
obfuscated_value = secret_creds.get_word_secret()
elif meta.Category == "Password":
obfuscated_value = secret_creds.get_password()
else:
obfuscated_value = generate_value(value)
print(f"Unusual category '{meta.Category}' in {row}")
obfuscated_value = obfuscate_value(value)

if position > 0:
obfuscated_line = row[:position + space_len] + obfuscated_value + row[position + space_len + len(value):]
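The slicing above relies on ValueStart/ValueEnd being offsets into the left-stripped line, so the leading-whitespace length is added back before cutting the value out and splicing the obfuscated one in. A worked example with a hypothetical line and offsets:

# Hypothetical markup: "hunter2" occupies offsets 12..19 of the stripped line.
row = '    password = "hunter2"\n'
position, pos_end = 12, 19
space_len = len(row) - len(row.lstrip())
value = row[position + space_len:pos_end + space_len]            # 'hunter2'
obfuscated_value = "s3cr3tX"                                      # stand-in for the generators above
obfuscated_line = row[:position + space_len] + obfuscated_value + row[position + space_len + len(value):]
print(obfuscated_line)                                            #     password = "s3cr3tX"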
@@ -96,53 +95,50 @@ def add_raw_lines(meta_df, filepath, content):
false_df = temp_df[temp_df.GroundTruth == "F"]
# Get line for row with "false" label
for index, row in false_df.iterrows():
line_numb = int(row["LineStart:LineEnd"].split(":")[0])
meta_df.loc[index, "RawLine"] = content[line_numb - 1]
line_numb = row["LineStart:LineEnd"].split(":")
assert line_numb[0] == line_numb[1], row
# line_numb = int(row["LineStart:LineEnd"].split(":")[0])
meta_df.loc[index, "RawLine"] = f"{content[int(line_numb[0]) - 1]}\n"
# Get line for row with "true" label
true_df = temp_df[temp_df.GroundTruth == "T"]
for index, row in true_df.iterrows():
line_numb = row["LineStart:LineEnd"].split(":")
assert line_numb[0] == line_numb[1], row
line = ""
for l_n in range(int(line_numb[0]), int(line_numb[0]) + 1):
obf_row = obfuscate_row(content[l_n - 1], row, secret_creds)
line += obf_row
meta_df.loc[index, "RawLine"] = line
meta_df.loc[index, "RawLine"] = f"{line}\n"
# Get line for row with "Template" label(temporary solution)
template_df = temp_df[temp_df.GroundTruth == "Template"]
for index, row in template_df.iterrows():
line_numb = row["LineStart:LineEnd"].split(":")
assert line_numb[0] == line_numb[1], row
line = ""
for l_n in range(int(line_numb[0]), int(line_numb[0]) + 1):
obf_row = obfuscate_row(content[l_n - 1], row, secret_creds)
line += obf_row
meta_df.loc[index, "RawLine"] = line
meta_df.loc[index, "RawLine"] = f"{line}\n"


def write2aug_file(repo_local_path, meta_df, aug_file):
fls_path = list(set(meta_df.FilePath))
for filepath in fls_path:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
content = Util.read_file(repo_local_path / filepath)
add_raw_lines(meta_df, filepath, content)
with open(repo_local_path / aug_file, "w", encoding="utf8") as writer:
Rows = meta_df.RawLine
writer.writelines(Rows)


def write_meta_file(meta_df, meta_file):
save_df = meta_df[meta_df.GroundTruth != "F"]
save_df.to_csv(meta_file)


def join_series(series):
meta_df = pd.DataFrame(series)
return meta_df


def write_meta(aug_df, aug_metapath):
aug_df = pd.concat(aug_df)
aug_df = aug_df[aug_df["GroundTruth"] != "F"]
aug_df["GroundTruth"] = "T"
aug_df.loc[aug_df['GroundTruth'] != "F", 'GroundTruth'] = 'T'
aug_df.rename(columns=RENAME_OLD_COLUMNS, inplace=True)
aug_df.rename(columns=RENAME_NEW_COLUMNS, inplace=True)
aug_df.to_csv(aug_metapath)
Expand All @@ -153,8 +149,9 @@ def get_linage(repo_local_path, df):
files_length = {}
overall_linage = 0
for filepath in fls_path:
with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
content = reader.readlines()
# with open(repo_local_path / filepath, "r", encoding="utf8") as reader:
# content = reader.read().splitlines()
content = Util.read_file(repo_local_path / filepath)
overall_linage += len(content)
files_length[filepath] = len(content)
return files_length, overall_linage
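The move from readlines() to Util.read_file (which, judging by the commented-out replacement above, behaves like read().splitlines()) is also why the RawLine assignments earlier in this diff now append an explicit "\n". A tiny illustration of the difference:

# splitlines() drops the terminators that readlines() keeps, so callers that
# later write the lines back out must re-append "\n" themselves.
text = "line one\nline two\n"
print(text.splitlines())               # ['line one', 'line two']
print(text.splitlines(keepends=True))  # ['line one\n', 'line two\n'] (readlines-like)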
@@ -239,23 +236,28 @@ def generate_rows(repo_local_path, aug_filename, df, true_stake, scale):
aug_file_linage = int(scale * overall_linage)
fl_true_lines, true_cred_count = get_true_lines(df)
aug_file_linage = int(true_cred_count * scale / true_stake)
idx = 0
for row_numb in range(1, aug_file_linage):
old_idx = idx
idx += 1
rand = random.uniform(0, 1)
if rand < true_stake:
ground_trues, idx = get_true_row(df, row_numb, aug_filename)
row_numb = idx
ground_trues, idx = get_true_row(df, idx, aug_filename)
else:
ground_trues, idx = get_false_row(df, row_numb, aug_filename)
ground_trues, idx = get_false_row(df, idx, aug_filename)
if ground_trues is None:
row_numb -= 1
# suppose, markup has F & T values for all filename cases
idx = old_idx
if 0 > idx:
idx = 0
continue
new_series.append(ground_trues)
return new_series
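generate_rows mixes credential and non-credential lines with a Bernoulli draw: each synthetic line comes from the true pool with probability true_stake and from the false pool otherwise, and the cursor is rolled back when no row can be produced. A simplified, hypothetical sketch of the idea (not the project's API):

# Simplified stand-in for the sampling loop above; the real code walks an index
# over the per-file markup and rolls it back when a row cannot be produced.
import random

def mix_rows(true_rows, false_rows, true_stake, total):
    out = []
    for _ in range(total):
        pool = true_rows if random.uniform(0, 1) < true_stake else false_rows
        if pool:
            out.append(random.choice(pool))
    return out

print(mix_rows(['password = "Xy9kQ2w1"'], ["timeout = 30", "retries = 5"], 0.2, 10))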


def aug_data(repo_local_path, meta_data, true_stake, scale):
augument_list = [
"Authentication Credentials" #
"Authentication Credentials", #
"Cryptographic Primitives", #
"Generic Secret", #
"Generic Token", #
@@ -266,7 +268,7 @@ def aug_data(repo_local_path, meta_data, true_stake, scale):
new_meta = []
aug_meta = str(repo_local_path / "aug_data" / "meta" / base) + ".csv"
aug_file_template = repo_local_path / "aug_data" / "data" / base
meta_df = meta_data[meta_data["FilePath"].str.contains(base)]
meta_df = meta_data[meta_data["FilePath"].str.contains(f"/{base}/")]
meta_df = meta_df[meta_df["Category"].isin(augument_list)]
exts = get_extentions(meta_df)
for extension in exts:
