From 45e064346ec7926727c1dbcf2bd8647d32f49bf7 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Tue, 13 Aug 2024 18:35:38 +0300 Subject: [PATCH] ValuePathFilter and keyword regex enhancement (#595) * square bracket workaround in keyword regex * path filter * BM score fix * ValueStringTypeCheck workaround for heterogeneous source * wrap added to filter array definitions * TOML format sanitizer * YAML case * BM fix * BM scores fix * skip f* in BM experiment * keep 0*-3* meta for experiment * less repos in test --- .github/workflows/benchmark.yml | 6 +-- cicd/benchmark.txt | 52 +++++++++---------- credsweeper/common/constants.py | 6 +-- credsweeper/credentials/line_data.py | 11 ++++ .../filters/value_array_dictionary_check.py | 2 + credsweeper/filters/value_file_path_check.py | 8 ++- .../filters/value_string_type_check.py | 4 +- experiment/main.py | 4 +- experiment/main.sh | 15 ++---- experiment/src/data_loader.py | 13 +++-- experiment/src/prepare_data.py | 29 +++++++++-- tests/__init__.py | 8 +-- tests/data/depth_3.json | 27 ++++++++++ tests/data/doc.json | 27 ++++++++++ tests/data/ml_threshold.json | 27 ++++++++++ tests/data/output.json | 27 ++++++++++ .../test_value_array_dictionary_check.py | 2 +- tests/samples/nonce.py | 3 ++ tests/samples/url_cred.js | 4 ++ tests/test_main.py | 6 +++ 20 files changed, 221 insertions(+), 60 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 78556d277..8fe50e19d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -374,8 +374,8 @@ jobs: - name: Exclude some sets for speed-up run: | - rm -rf data/2* data/8* data/b* - rm -rf meta/2* meta/8* meta/b* + rm -rf data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/b* data/c* data/d* data/e* data/f* + rm -rf meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/b* meta/c* meta/d* meta/e* meta/f* mkdir -vp ${{ github.workspace }}/CredData mv data ${{ github.workspace }}/CredData/ mv meta ${{ github.workspace }}/CredData/ @@ -424,7 +424,7 @@ jobs: # run quick scan python -m credsweeper --log debug --path ../tests/samples --save-json NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json) - if [ 100 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then + if [ 10 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then echo "Failure: found ${NEW_MODEL_FOUND_SAMPLES} credentials" exit 1 fi diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index 72246106f..62d122f04 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,4 +1,4 @@ -DATA: 16348035 interested lines. MARKUP: 62567 items +DATA: 16348035 interested lines. 
MARKUP: 62632 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- 194 28318 66 427 87 @@ -33,7 +33,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .creds 1 10 1 1 .crlf 1 27 1 .crt 2 4979 253 -.cs 268 82410 158 907 94 +.cs 268 82410 158 910 94 .cshtml 5 180 12 .csp 3 379 11 .csproj 1 14 1 @@ -61,7 +61,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .gd 1 37 1 .gml 3 3075 26 .gni 3 5017 18 -.go 1080 566476 673 4319 741 +.go 1080 566476 690 4330 741 .golden 5 1168 1 14 29 .gradle 45 3265 4 91 100 .graphql 7 420 13 @@ -80,10 +80,10 @@ FileType FileNumber ValidLines Positives Negatives Templat .ipynb 1 134 5 .j 1 241 2 2 .j2 30 5530 6 213 10 -.java 621 134132 359 1360 170 +.java 621 134132 362 1359 170 .jenkinsfile 1 58 2 7 .jinja2 1 64 2 -.js 659 536413 536 2635 330 +.js 659 536413 536 2636 330 .json 850 13046270 1074 10778 140 .jsp 13 3202 1 42 .jsx 7 857 19 @@ -105,12 +105,12 @@ FileType FileNumber ValidLines Positives Negatives Templat .lock 24 160912 144 .log 2 199 38 52 .lua 10 1924 37 3 -.m 16 13358 11 152 3 +.m 16 13358 11 154 3 .manifest 3 102 9 3 .markdown 3 139 3 1 .markerb 3 12 3 .marko 1 21 2 -.md 674 149399 722 2365 662 +.md 674 149399 722 2370 662 .mdx 3 549 7 .mjml 1 18 1 .mjs 22 4424 78 343 @@ -122,7 +122,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .mqh 1 1023 2 .msg 1 26644 1 1 .mysql 1 36 2 -.ndjson 2 5006 70 266 2 +.ndjson 2 5006 71 265 2 .nix 4 211 12 .nolint 1 2 1 .odd 1 1281 57 @@ -150,10 +150,10 @@ FileType FileNumber ValidLines Positives Negatives Templat .pug 2 193 2 .purs 1 69 4 .pxd 1 150 5 2 -.py 890 291553 685 3456 729 +.py 890 291553 682 3462 729 .pyi 4 1361 9 .pyp 1 167 1 -.pyx 2 1094 21 +.pyx 2 1094 23 .r 4 62 6 3 1 .rake 2 51 2 .rb 860 131838 259 3451 612 @@ -217,32 +217,32 @@ FileType FileNumber ValidLines Positives Negatives Templat .xml 9 689 9 .xsl 1 311 1 .yaml 137 19004 128 356 44 -.yml 418 36162 515 910 384 +.yml 418 36162 550 910 384 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10259 16348035 8706 59679 5182 -credsweeper result_cnt : 7664, lost_cnt : 0, true_cnt : 7472, false_cnt : 192 +TOTAL: 10259 16348035 8759 59707 5182 +credsweeper result_cnt : 7749, lost_cnt : 0, true_cnt : 7530, false_cnt : 219 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- -API 131 3126 185 111 109 2 3309 22 0.000604 0.167939 0.993027 0.981982 0.832061 0.900826 +API 128 3130 185 111 109 2 3313 19 0.000603 0.148438 0.993901 0.981982 0.851562 0.912134 AWS Client ID 167 18 0 160 160 0 18 7 0.000000 0.041916 0.962162 1.000000 0.958084 0.978593 AWS Multi 75 14 0 87 75 11 3 0 0.785714 0.000000 0.876404 0.872093 1.000000 0.931677 AWS S3 Bucket 66 24 0 92 66 24 0 0 1.000000 0.000000 0.733333 0.733333 1.000000 0.846154 Atlassian Old PAT token 27 208 3 12 3 8 203 24 0.037915 0.888889 0.865546 0.272727 0.111111 0.157895 -Auth 412 2723 76 371 353 18 2781 59 0.006431 0.143204 0.976020 0.951482 0.856796 0.901660 +Auth 412 2724 76 373 355 18 2782 57 0.006429 0.138350 0.976650 0.951743 0.861650 0.904459 Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 BASE64 Private Key 7 2 0 7 7 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 Bitbucket 
Client ID 142 1807 9 46 27 18 1798 115 0.009912 0.809859 0.932074 0.600000 0.190141 0.288770 Bitbucket Client Secret 230 527 10 44 33 11 526 197 0.020484 0.856522 0.728814 0.750000 0.143478 0.240876 -Certificate 25 460 1 21 20 1 460 5 0.002169 0.200000 0.987654 0.952381 0.800000 0.869565 -Credential 94 154 74 90 90 0 228 4 0.000000 0.042553 0.987578 1.000000 0.957447 0.978261 +Certificate 25 466 1 27 20 7 460 5 0.014989 0.200000 0.975610 0.740741 0.800000 0.769231 +Credential 94 154 74 83 83 0 228 11 0.000000 0.117021 0.965839 1.000000 0.882979 0.937853 Docker Swarm Token 2 0 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Dropbox App secret 62 114 0 46 36 9 105 26 0.078947 0.419355 0.801136 0.800000 0.580645 0.672897 Facebook Access Token 0 1 0 0 0 1 0 0.000000 1.000000 Firebase Domain 6 1 0 7 6 1 0 0 1.000000 0.000000 0.857143 0.857143 1.000000 0.923077 Github Old Token 1 0 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Gitlab Feed Token 188 451 87 60 47 12 526 141 0.022305 0.750000 0.789256 0.796610 0.250000 0.380567 +Gitlab Feed Token 189 450 87 60 48 11 526 141 0.020484 0.746032 0.790634 0.813559 0.253968 0.387097 Gitlab Incoming Email Token 37 3 0 21 19 2 1 18 0.666667 0.486486 0.500000 0.904762 0.513514 0.655172 Google API Key 12 0 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381 @@ -251,16 +251,16 @@ Grafana Provisioned API Key 22 1 0 JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432 Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 Jira 2FA 14 6 0 10 10 0 6 4 0.000000 0.285714 0.800000 1.000000 0.714286 0.833333 -Key 522 8453 464 452 447 5 8912 75 0.000561 0.143678 0.991525 0.988938 0.856322 0.917864 -Nonce 91 47 0 84 83 1 46 8 0.021277 0.087912 0.934783 0.988095 0.912088 0.948571 +Key 538 8456 464 468 461 7 8913 77 0.000785 0.143123 0.991119 0.985043 0.856877 0.916501 +Nonce 91 48 0 85 83 2 46 8 0.041667 0.087912 0.928058 0.976471 0.912088 0.943182 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1841 7468 2724 1691 1637 54 10138 204 0.005298 0.110809 0.978559 0.968066 0.889191 0.926954 -Salt 45 73 2 39 39 0 75 6 0.000000 0.133333 0.950000 1.000000 0.866667 0.928571 -Secret 1365 28359 868 1237 1233 4 29223 132 0.000137 0.096703 0.995554 0.996766 0.903297 0.947733 +Password 1842 7476 2724 1725 1656 69 10131 186 0.006765 0.100977 0.978824 0.960000 0.899023 0.928511 +Salt 45 74 2 40 39 1 75 6 0.013158 0.133333 0.942149 0.975000 0.866667 0.917647 +Secret 1367 28360 868 1240 1234 6 29222 133 0.000205 0.097293 0.995457 0.995161 0.902707 0.946682 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 612 3949 437 516 511 5 4381 101 0.001140 0.165033 0.978792 0.990310 0.834967 0.906028 +Token 648 3952 437 545 539 6 4383 109 0.001367 0.168210 0.977169 0.988991 0.831790 0.903604 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 209 127 240 200 200 0 367 9 0.000000 0.043062 0.984375 1.000000 0.956938 0.977995 +URL Credentials 209 128 240 200 200 0 368 9 0.000000 0.043062 0.984402 1.000000 0.956938 0.977995 UUID 1068 1 0 1058 1057 1 0 11 1.000000 0.010300 0.988775 0.999055 0.989700 0.994356 - 8706 59679 5182 7671 7472 192 59487 1234 0.003217 0.141741 0.979147 0.974948 0.858259 0.912889 + 8759 59707 5182 7756 7530 219 59488 1229 0.003668 0.140313 0.978851 0.971738 
0.859687 0.912285 diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py index 292fee633..bfe869e5f 100644 --- a/credsweeper/common/constants.py +++ b/credsweeper/common/constants.py @@ -13,15 +13,15 @@ class KeywordPattern: separator = r"\s*\]?\s*" \ r"(?P:( [a-z]{3,9}[?]? )?=" \ r"|:|=>|!=|===|==|=)" \ - r"((?!\s*ENC(\(|\[))(\s|\w)*\((\s|\w|=|\()*|\s*)" + r"\s*(?P(\w|\.|->|\(|\[)*[\[\(\{](\w{1,32}=)?\s*)?" # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential value = r"(?P((b|r|br|rb|u|f|rf|fr|\\{0,8})?[`'\"]){1,4})?" \ r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?" \ r"(?P" \ - r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){3,8000}" \ + r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){1,8000}" \ r"|(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)" \ r")" \ - r"(?(value_leftquote)(?P(\\{0,8}[`'\"]){1,4})?)" + r"(?(value_leftquote)(?P(\\{0,8}[`'\"]){1,4})?|(?(wrap)[\]\)\},;]))" @classmethod def get_keyword_pattern(cls, keyword: str) -> re.Pattern: diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py index d407b657a..0822f99e3 100644 --- a/credsweeper/credentials/line_data.py +++ b/credsweeper/credentials/line_data.py @@ -79,6 +79,7 @@ def __init__( self.value_rightquote: Optional[str] = None # is set when variable & value are in URL for any source type self.url_part = False + self.wrap = None self.initialize(match_obj) @@ -121,6 +122,7 @@ def get_span_from_match_obj(_match_obj: re.Match, group: str) -> Tuple[int, int] self.variable_start, self.variable_end = get_span_from_match_obj(match_obj, "variable") self.value_leftquote = get_group_from_match_obj(match_obj, "value_leftquote") self.value_rightquote = get_group_from_match_obj(match_obj, "value_rightquote") + self.wrap = get_group_from_match_obj(match_obj, "wrap") self.sanitize_value() self.sanitize_variable() @@ -131,6 +133,7 @@ def sanitize_value(self): _value = self.value self.clean_url_parameters() self.clean_bash_parameters() + self.clean_toml_parameters() if 0 <= self.value_start and 0 <= self.value_end and len(self.value) < len(_value): start = _value.find(self.value) self.value_start += start @@ -186,6 +189,11 @@ def clean_bash_parameters(self) -> None: if len(value_whsp) > 1: self.value = value_whsp[0] + def clean_toml_parameters(self) -> None: + """Curly brackets may be caught in TOML format""" + while self.value.endswith('}') and '{' in self.line[:self.value_start]: + self.value = self.value[:-1] + def sanitize_variable(self) -> None: """Remove trailing spaces, dashes and quotations around the variable. 
Correct position.""" sanitized_var_len = 0 @@ -195,6 +203,9 @@ def sanitize_variable(self) -> None: self.variable = self.variable.strip(self.variable_strip_pattern) if self.variable.endswith('\\'): self.variable = self.variable[:-1] + if self.variable.startswith('{') and '}' in self.line[self.variable_end:]: + # TOML case + self.variable = self.variable[1:] if variable and len(self.variable) < len(variable) and 0 <= self.variable_start and 0 <= self.variable_end: start = variable.find(self.variable) self.variable_start += start diff --git a/credsweeper/filters/value_array_dictionary_check.py b/credsweeper/filters/value_array_dictionary_check.py index 4aa9ced0c..34415a6a3 100644 --- a/credsweeper/filters/value_array_dictionary_check.py +++ b/credsweeper/filters/value_array_dictionary_check.py @@ -34,5 +34,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: return False if self.PATTERN.search(line_data.value): return True + if line_data.wrap and not line_data.is_well_quoted_value and ('[' in line_data.wrap or '(' in line_data.wrap): + return True return False diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py index 61aa96b46..11d9fb37a 100644 --- a/credsweeper/filters/value_file_path_check.py +++ b/credsweeper/filters/value_file_path_check.py @@ -33,8 +33,14 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: value = line_data.value contains_unix_separator = '/' in value if contains_unix_separator: - if "://" in value or value.startswith("~/") or value.startswith("./") or "../" in value or "/.." in value: + if ("://" in value # + or value.startswith("~/") # + or value.startswith("./") # + or "../" in value # + or "/.." in value # + or value.startswith("//") and ':' == line_data.separator): # common case for url definition or aliases + # or _keyword_://example.com where : is the separator return True # base64 encoded data might look like linux path min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value)) diff --git a/credsweeper/filters/value_string_type_check.py b/credsweeper/filters/value_string_type_check.py index eec1f12cc..b573fa47f 100644 --- a/credsweeper/filters/value_string_type_check.py +++ b/credsweeper/filters/value_string_type_check.py @@ -40,7 +40,9 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: not_quoted = not line_data.is_well_quoted_value not_comment = not line_data.is_comment() - if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted: + if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted \ + and '=' in line_data.separator: + # heterogeneous code e.g. 
YAML in Python uses colon sign instead equals return True return False diff --git a/experiment/main.py b/experiment/main.py index 747b4ae98..e52ce1f14 100644 --- a/experiment/main.py +++ b/experiment/main.py @@ -20,7 +20,7 @@ from experiment.src.features import prepare_data from experiment.src.lstm_model import get_model from experiment.src.model_config_preprocess import model_config_preprocess -from experiment.src.prepare_data import prepare_train_data +from experiment.src.prepare_data import prepare_train_data, meta_checksum def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray): @@ -59,7 +59,7 @@ def main(cred_data_location: str, jobs: int) -> str: prepare_train_data(_cred_data_location, jobs) # detected data means which data is passed to ML validator of credsweeper after filters with RuleName - detected_data = read_detected_data("results/detected_data.json") + detected_data = read_detected_data(f"results/detected_data.{meta_checksum(cred_data_location)}.json") print(f"CredSweeper detected {len(detected_data)} credentials without ML") # all markup data meta_data = read_metadata(f"{cred_data_location}/meta") diff --git a/experiment/main.sh b/experiment/main.sh index 2d50e5ee5..97e6fc811 100755 --- a/experiment/main.sh +++ b/experiment/main.sh @@ -12,20 +12,13 @@ now=$(date +%Y%m%d_%H%M%S) RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results mkdir -vp ${RESULT_DIR} -${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData --jobs 32 | tee ${RESULT_DIR}/train.${now}.log +${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData --jobs $(nproc) | tee ${RESULT_DIR}/train.${now}.log +error_code=${PIPESTATUS} +if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi cd ${CREDSWEEPER_DIR} report_file=${RESULT_DIR}/${now}.json -${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log error --job 32 --save-json ${report_file} +${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log info --job $(nproc) --save-json ${report_file} cd ~/q/DataCred/auxiliary/ .venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${report_file}.log - -#last_tf_model=$(cat train.log | tail -n1) - -#echo $last_tf_model - -#pwd - -#python -m tf2onnx.convert --saved-model results/$last_tf_model --output ../credsweeper/ml_model/ml_model.onnx --verbose - diff --git a/experiment/src/data_loader.py b/experiment/src/data_loader.py index d5742d042..919378d54 100644 --- a/experiment/src/data_loader.py +++ b/experiment/src/data_loader.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from colorama import Fore, Style +from colorama import Fore, Style, Back from credsweeper.common.constants import ML_HUNK from credsweeper.utils import Util @@ -95,7 +95,8 @@ def read_metadata(meta_dir: str) -> Dict[identifier, Dict]: df.loc[df["GroundTruth"] == "Template", "GroundTruth"] = 'F' for _, row in df.iterrows(): j += 1 - if row["LineStart"] != row["LineEnd"] or any(x in row["Category"] for x in ["AWS Multi", "Google Multi"]): + if row["LineStart"] != row["LineEnd"] \ + or all(x in ["AWS Multi", "Google Multi"] for x in row["Category"].split(':')): # print(f"WARNING: skip not ml category {row['FilePath']},{line_start},{line_end}" # f",{row['GroundTruth']},{row['Category']}") continue @@ -194,11 +195,17 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier line_data["type"] = line_data["path"].split('/')[-2] values.append(line_data) + 
all_meta_found = True for markup in meta_data.values(): if 'T' == markup["GroundTruth"] and not markup["Used"]: for markup_rule in markup["Category"].split(':'): if markup_rule in detected_rules: - print(f"WARNING: Not found! {markup}") + if all_meta_found: + # print header of the markup once + print(f"{Back.MAGENTA}{Fore.BLACK}WARNING: Not all TRUE meta found!{Style.RESET_ALL}") + print(','.join(markup.keys())) + all_meta_found = False + print(','.join(str(x) for x in markup.values())) text = Util.read_file(f'{cred_data_location}/{markup["FilePath"]}') line = text[markup["LineStart"] - 1].strip() if 0 <= markup["ValueStart"] and 0 <= markup["ValueEnd"]: diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py index 48d8e9075..5d00ab31a 100644 --- a/experiment/src/prepare_data.py +++ b/experiment/src/prepare_data.py @@ -1,6 +1,9 @@ +import binascii +import hashlib import os import subprocess import sys +from pathlib import Path from credsweeper.utils import Util @@ -9,9 +12,21 @@ def execute_scanner(dataset_location: str, result_location_str, j): """Execute CredSweeper as a separate process to make sure no global states is shared with training script""" dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.." command = f"{sys.executable} -m credsweeper --path {dataset_location}/data" \ - f" --save-json {result_location_str} " \ - f"--job {j} --sort --rules results/train_config.yaml --ml_threshold 0" - subprocess.check_call(command, shell=True, cwd=dir_path, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) + f" --save-json {result_location_str} --log info" \ + f" --job {j} --sort --rules results/train_config.yaml --ml_threshold 0" + error_code = subprocess.check_call(command, shell=True, cwd=dir_path) + if 0 != error_code: + sys.exit(error_code) + + +def meta_checksum(cred_data_location: str) -> str: + checksum = hashlib.md5(b'').digest() + for root, dirs, files in os.walk(Path(cred_data_location) / "meta"): + for file in files: + with open(os.path.join(root, file), "rb") as f: + cvs_checksum = hashlib.md5(f.read()).digest() + checksum = bytes(a ^ b for a, b in zip(checksum, cvs_checksum)) + return binascii.hexlify(checksum).decode() def prepare_train_data(cred_data_location: str, j: int): @@ -23,8 +38,12 @@ def prepare_train_data(cred_data_location: str, j: int): new_rules = [x for x in rules if x.get("use_ml")] Util.yaml_dump(new_rules, "results/train_config.yaml") - if not os.path.exists("results/detected_data.json"): + detected_data_filename = f"results/detected_data.{meta_checksum(cred_data_location)}.json" + + if not os.path.exists(detected_data_filename): print(f"Get CredSweeper results from {cred_data_location}. 
May take some time") - execute_scanner(cred_data_location, "results/detected_data.json", j) + execute_scanner(cred_data_location, detected_data_filename, j) + else: + print(f"Get cached result {meta_checksum(cred_data_location)}") print("Train data prepared!") diff --git a/tests/__init__.py b/tests/__init__.py index 85a275175..73b4cd837 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,14 +7,14 @@ NEGLIGIBLE_ML_THRESHOLD = 0.0001 # credentials count after scan -SAMPLES_CRED_COUNT: int = 363 -SAMPLES_CRED_LINE_COUNT: int = 380 +SAMPLES_CRED_COUNT: int = 364 +SAMPLES_CRED_LINE_COUNT: int = 381 # credentials count after post-processing -SAMPLES_POST_CRED_COUNT: int = 322 +SAMPLES_POST_CRED_COUNT: int = 323 # with option --doc -SAMPLES_IN_DOC = 416 +SAMPLES_IN_DOC = 417 # archived credentials that are not found without --depth SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 8fd52ab2c..8ea01c932 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -7648,6 +7648,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "tests/samples/nonce.py|RAW", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/doc.json b/tests/data/doc.json index 0311797b5..b0b19396d 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -11824,6 +11824,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "PASSWD_PAIR", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "tests/samples/nonce.py|RAW", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index 01c1a0929..8fb14c91f 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -8255,6 +8255,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/output.json b/tests/data/output.json index f6dcde344..de20dcb49 100644 
--- a/tests/data/output.json +++ b/tests/data/output.json @@ -7337,6 +7337,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/filters/test_value_array_dictionary_check.py b/tests/filters/test_value_array_dictionary_check.py index 3ea0cbd7f..afdc30d3e 100644 --- a/tests/filters/test_value_array_dictionary_check.py +++ b/tests/filters/test_value_array_dictionary_check.py @@ -11,7 +11,7 @@ class TestValueArrayDictionaryCheck: @pytest.fixture def token_rule(self, config) -> Rule: token_rule_without_filters = { - "name": "Pass", + "name": "Password", "severity": "medium", "confidence": "moderate", "type": "keyword", diff --git a/tests/samples/nonce.py b/tests/samples/nonce.py index 4e2edd092..0a63f7a1b 100644 --- a/tests/samples/nonce.py +++ b/tests/samples/nonce.py @@ -1 +1,4 @@ nonce = 'bsfcvir57nt40rydvtbhs8lzbgljmet5' +secure_yaml = """ +password: F1lT3ReDw17hQoT3s +""" diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js index d555f7233..2bcbeb844 100644 --- a/tests/samples/url_cred.js +++ b/tests/samples/url_cred.js @@ -11,3 +11,7 @@ url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut */ email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465"; + +/* +@"otpauth://host/port?set=VNMXQKAZFVOYOJCDNBIYXYIWX2&info=should_not_be_found_even_in_ml_threshold +*/ diff --git a/tests/test_main.py b/tests/test_main.py index 7e664004b..7064e3a11 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -774,6 +774,9 @@ def prepare(report: List[Dict[str, Any]]): def test_param_n(self) -> None: # internal parametrized tests for quick debug - no itms should be found items = [ # + ('x3.txt', b'passwd = values[token_id]'), + ('t.py', b'new_params = {"dsn": new_params["dsn"], "password": new_params["password"]}'), + ('t.m', b'@"otpauth://host/port?set=VNMXQKAZFVOYOJCDNBIYXYIWX2&algorithm=F4KE",'), ("test.c", b" *keylen = X448_KEYLEN;"), ("test.php", b"$yourls_user_passwords = $copy;"), ("", b"passwords = List"), @@ -800,6 +803,9 @@ def test_param_n(self) -> None: def test_param_p(self) -> None: # internal parametrized tests for quick debug items = [ # + ('my.toml', b'{nkey: XMIGDHSYNSJQ0XNR}', "nkey", "XMIGDHSYNSJQ0XNR"), + ('my.yaml', b'password: 3287#JQ0XX@IG}', "password", "3287#JQ0XX@IG}"), + ("creds.py", b'"tokens": ["xabsjhdbasu7d9g", "ashbjhdifufhsds"]', "tokens", "xabsjhdbasu7d9g"), ("slt.py", b'\\t\\tsalt = "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"', "salt", "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"), ("log.txt", b'json\\nAuthorization: Basic jfhlksadjiu9813ryiuhdfskadjlkjh34\\n\\u003c/code\\u003e\\u003c/pre\\u003e"',