From 45e064346ec7926727c1dbcf2bd8647d32f49bf7 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Tue, 13 Aug 2024 18:35:38 +0300 Subject: [PATCH] ValuePathFilter and keyword regex enhancement (#595) * square bracket workaround in keyword regex * path filter * BM score fix * ValueStringTypeCheck workaround for heterogeneous source * wrap added to filter array definitions * TOML format sanitizer * YAML case * BM fix * BM scores fix * skip f* in BM experiment * keep 0*-3* meta for experiment * less repos in test --- .github/workflows/benchmark.yml | 6 +-- cicd/benchmark.txt | 52 +++++++++---------- credsweeper/common/constants.py | 6 +-- credsweeper/credentials/line_data.py | 11 ++++ .../filters/value_array_dictionary_check.py | 2 + credsweeper/filters/value_file_path_check.py | 8 ++- .../filters/value_string_type_check.py | 4 +- experiment/main.py | 4 +- experiment/main.sh | 15 ++---- experiment/src/data_loader.py | 13 +++-- experiment/src/prepare_data.py | 29 +++++++++-- tests/__init__.py | 8 +-- tests/data/depth_3.json | 27 ++++++++++ tests/data/doc.json | 27 ++++++++++ tests/data/ml_threshold.json | 27 ++++++++++ tests/data/output.json | 27 ++++++++++ .../test_value_array_dictionary_check.py | 2 +- tests/samples/nonce.py | 3 ++ tests/samples/url_cred.js | 4 ++ tests/test_main.py | 6 +++ 20 files changed, 221 insertions(+), 60 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 78556d277..8fe50e19d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -374,8 +374,8 @@ jobs: - name: Exclude some sets for speed-up run: | - rm -rf data/2* data/8* data/b* - rm -rf meta/2* meta/8* meta/b* + rm -rf data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/b* data/c* data/d* data/e* data/f* + rm -rf meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/b* meta/c* meta/d* meta/e* meta/f* mkdir -vp ${{ github.workspace }}/CredData mv data ${{ github.workspace }}/CredData/ mv meta ${{ github.workspace }}/CredData/ @@ -424,7 +424,7 @@ jobs: # run quick scan python -m credsweeper --log debug --path ../tests/samples --save-json NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json) - if [ 100 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then + if [ 10 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then echo "Failure: found ${NEW_MODEL_FOUND_SAMPLES} credentials" exit 1 fi diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index 72246106f..62d122f04 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,4 +1,4 @@ -DATA: 16348035 interested lines. MARKUP: 62567 items +DATA: 16348035 interested lines. 
MARKUP: 62632 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- 194 28318 66 427 87 @@ -33,7 +33,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .creds 1 10 1 1 .crlf 1 27 1 .crt 2 4979 253 -.cs 268 82410 158 907 94 +.cs 268 82410 158 910 94 .cshtml 5 180 12 .csp 3 379 11 .csproj 1 14 1 @@ -61,7 +61,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .gd 1 37 1 .gml 3 3075 26 .gni 3 5017 18 -.go 1080 566476 673 4319 741 +.go 1080 566476 690 4330 741 .golden 5 1168 1 14 29 .gradle 45 3265 4 91 100 .graphql 7 420 13 @@ -80,10 +80,10 @@ FileType FileNumber ValidLines Positives Negatives Templat .ipynb 1 134 5 .j 1 241 2 2 .j2 30 5530 6 213 10 -.java 621 134132 359 1360 170 +.java 621 134132 362 1359 170 .jenkinsfile 1 58 2 7 .jinja2 1 64 2 -.js 659 536413 536 2635 330 +.js 659 536413 536 2636 330 .json 850 13046270 1074 10778 140 .jsp 13 3202 1 42 .jsx 7 857 19 @@ -105,12 +105,12 @@ FileType FileNumber ValidLines Positives Negatives Templat .lock 24 160912 144 .log 2 199 38 52 .lua 10 1924 37 3 -.m 16 13358 11 152 3 +.m 16 13358 11 154 3 .manifest 3 102 9 3 .markdown 3 139 3 1 .markerb 3 12 3 .marko 1 21 2 -.md 674 149399 722 2365 662 +.md 674 149399 722 2370 662 .mdx 3 549 7 .mjml 1 18 1 .mjs 22 4424 78 343 @@ -122,7 +122,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .mqh 1 1023 2 .msg 1 26644 1 1 .mysql 1 36 2 -.ndjson 2 5006 70 266 2 +.ndjson 2 5006 71 265 2 .nix 4 211 12 .nolint 1 2 1 .odd 1 1281 57 @@ -150,10 +150,10 @@ FileType FileNumber ValidLines Positives Negatives Templat .pug 2 193 2 .purs 1 69 4 .pxd 1 150 5 2 -.py 890 291553 685 3456 729 +.py 890 291553 682 3462 729 .pyi 4 1361 9 .pyp 1 167 1 -.pyx 2 1094 21 +.pyx 2 1094 23 .r 4 62 6 3 1 .rake 2 51 2 .rb 860 131838 259 3451 612 @@ -217,32 +217,32 @@ FileType FileNumber ValidLines Positives Negatives Templat .xml 9 689 9 .xsl 1 311 1 .yaml 137 19004 128 356 44 -.yml 418 36162 515 910 384 +.yml 418 36162 550 910 384 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10259 16348035 8706 59679 5182 -credsweeper result_cnt : 7664, lost_cnt : 0, true_cnt : 7472, false_cnt : 192 +TOTAL: 10259 16348035 8759 59707 5182 +credsweeper result_cnt : 7749, lost_cnt : 0, true_cnt : 7530, false_cnt : 219 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- -API 131 3126 185 111 109 2 3309 22 0.000604 0.167939 0.993027 0.981982 0.832061 0.900826 +API 128 3130 185 111 109 2 3313 19 0.000603 0.148438 0.993901 0.981982 0.851562 0.912134 AWS Client ID 167 18 0 160 160 0 18 7 0.000000 0.041916 0.962162 1.000000 0.958084 0.978593 AWS Multi 75 14 0 87 75 11 3 0 0.785714 0.000000 0.876404 0.872093 1.000000 0.931677 AWS S3 Bucket 66 24 0 92 66 24 0 0 1.000000 0.000000 0.733333 0.733333 1.000000 0.846154 Atlassian Old PAT token 27 208 3 12 3 8 203 24 0.037915 0.888889 0.865546 0.272727 0.111111 0.157895 -Auth 412 2723 76 371 353 18 2781 59 0.006431 0.143204 0.976020 0.951482 0.856796 0.901660 +Auth 412 2724 76 373 355 18 2782 57 0.006429 0.138350 0.976650 0.951743 0.861650 0.904459 Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 BASE64 Private Key 7 2 0 7 7 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 Bitbucket 
Client ID 142 1807 9 46 27 18 1798 115 0.009912 0.809859 0.932074 0.600000 0.190141 0.288770 Bitbucket Client Secret 230 527 10 44 33 11 526 197 0.020484 0.856522 0.728814 0.750000 0.143478 0.240876 -Certificate 25 460 1 21 20 1 460 5 0.002169 0.200000 0.987654 0.952381 0.800000 0.869565 -Credential 94 154 74 90 90 0 228 4 0.000000 0.042553 0.987578 1.000000 0.957447 0.978261 +Certificate 25 466 1 27 20 7 460 5 0.014989 0.200000 0.975610 0.740741 0.800000 0.769231 +Credential 94 154 74 83 83 0 228 11 0.000000 0.117021 0.965839 1.000000 0.882979 0.937853 Docker Swarm Token 2 0 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Dropbox App secret 62 114 0 46 36 9 105 26 0.078947 0.419355 0.801136 0.800000 0.580645 0.672897 Facebook Access Token 0 1 0 0 0 1 0 0.000000 1.000000 Firebase Domain 6 1 0 7 6 1 0 0 1.000000 0.000000 0.857143 0.857143 1.000000 0.923077 Github Old Token 1 0 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Gitlab Feed Token 188 451 87 60 47 12 526 141 0.022305 0.750000 0.789256 0.796610 0.250000 0.380567 +Gitlab Feed Token 189 450 87 60 48 11 526 141 0.020484 0.746032 0.790634 0.813559 0.253968 0.387097 Gitlab Incoming Email Token 37 3 0 21 19 2 1 18 0.666667 0.486486 0.500000 0.904762 0.513514 0.655172 Google API Key 12 0 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381 @@ -251,16 +251,16 @@ Grafana Provisioned API Key 22 1 0 JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432 Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 Jira 2FA 14 6 0 10 10 0 6 4 0.000000 0.285714 0.800000 1.000000 0.714286 0.833333 -Key 522 8453 464 452 447 5 8912 75 0.000561 0.143678 0.991525 0.988938 0.856322 0.917864 -Nonce 91 47 0 84 83 1 46 8 0.021277 0.087912 0.934783 0.988095 0.912088 0.948571 +Key 538 8456 464 468 461 7 8913 77 0.000785 0.143123 0.991119 0.985043 0.856877 0.916501 +Nonce 91 48 0 85 83 2 46 8 0.041667 0.087912 0.928058 0.976471 0.912088 0.943182 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1841 7468 2724 1691 1637 54 10138 204 0.005298 0.110809 0.978559 0.968066 0.889191 0.926954 -Salt 45 73 2 39 39 0 75 6 0.000000 0.133333 0.950000 1.000000 0.866667 0.928571 -Secret 1365 28359 868 1237 1233 4 29223 132 0.000137 0.096703 0.995554 0.996766 0.903297 0.947733 +Password 1842 7476 2724 1725 1656 69 10131 186 0.006765 0.100977 0.978824 0.960000 0.899023 0.928511 +Salt 45 74 2 40 39 1 75 6 0.013158 0.133333 0.942149 0.975000 0.866667 0.917647 +Secret 1367 28360 868 1240 1234 6 29222 133 0.000205 0.097293 0.995457 0.995161 0.902707 0.946682 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 612 3949 437 516 511 5 4381 101 0.001140 0.165033 0.978792 0.990310 0.834967 0.906028 +Token 648 3952 437 545 539 6 4383 109 0.001367 0.168210 0.977169 0.988991 0.831790 0.903604 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 209 127 240 200 200 0 367 9 0.000000 0.043062 0.984375 1.000000 0.956938 0.977995 +URL Credentials 209 128 240 200 200 0 368 9 0.000000 0.043062 0.984402 1.000000 0.956938 0.977995 UUID 1068 1 0 1058 1057 1 0 11 1.000000 0.010300 0.988775 0.999055 0.989700 0.994356 - 8706 59679 5182 7671 7472 192 59487 1234 0.003217 0.141741 0.979147 0.974948 0.858259 0.912889 + 8759 59707 5182 7756 7530 219 59488 1229 0.003668 0.140313 0.978851 0.971738 
0.859687 0.912285 diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py index 292fee633..bfe869e5f 100644 --- a/credsweeper/common/constants.py +++ b/credsweeper/common/constants.py @@ -13,15 +13,15 @@ class KeywordPattern: separator = r"\s*\]?\s*" \ r"(?P:( [a-z]{3,9}[?]? )?=" \ r"|:|=>|!=|===|==|=)" \ - r"((?!\s*ENC(\(|\[))(\s|\w)*\((\s|\w|=|\()*|\s*)" + r"\s*(?P(\w|\.|->|\(|\[)*[\[\(\{](\w{1,32}=)?\s*)?" # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential value = r"(?P((b|r|br|rb|u|f|rf|fr|\\{0,8})?[`'\"]){1,4})?" \ r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?" \ r"(?P" \ - r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){3,8000}" \ + r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){1,8000}" \ r"|(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)" \ r")" \ - r"(?(value_leftquote)(?P(\\{0,8}[`'\"]){1,4})?)" + r"(?(value_leftquote)(?P(\\{0,8}[`'\"]){1,4})?|(?(wrap)[\]\)\},;]))" @classmethod def get_keyword_pattern(cls, keyword: str) -> re.Pattern: diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py index d407b657a..0822f99e3 100644 --- a/credsweeper/credentials/line_data.py +++ b/credsweeper/credentials/line_data.py @@ -79,6 +79,7 @@ def __init__( self.value_rightquote: Optional[str] = None # is set when variable & value are in URL for any source type self.url_part = False + self.wrap = None self.initialize(match_obj) @@ -121,6 +122,7 @@ def get_span_from_match_obj(_match_obj: re.Match, group: str) -> Tuple[int, int] self.variable_start, self.variable_end = get_span_from_match_obj(match_obj, "variable") self.value_leftquote = get_group_from_match_obj(match_obj, "value_leftquote") self.value_rightquote = get_group_from_match_obj(match_obj, "value_rightquote") + self.wrap = get_group_from_match_obj(match_obj, "wrap") self.sanitize_value() self.sanitize_variable() @@ -131,6 +133,7 @@ def sanitize_value(self): _value = self.value self.clean_url_parameters() self.clean_bash_parameters() + self.clean_toml_parameters() if 0 <= self.value_start and 0 <= self.value_end and len(self.value) < len(_value): start = _value.find(self.value) self.value_start += start @@ -186,6 +189,11 @@ def clean_bash_parameters(self) -> None: if len(value_whsp) > 1: self.value = value_whsp[0] + def clean_toml_parameters(self) -> None: + """Curly brackets may be caught in TOML format""" + while self.value.endswith('}') and '{' in self.line[:self.value_start]: + self.value = self.value[:-1] + def sanitize_variable(self) -> None: """Remove trailing spaces, dashes and quotations around the variable. 
Correct position.""" sanitized_var_len = 0 @@ -195,6 +203,9 @@ def sanitize_variable(self) -> None: self.variable = self.variable.strip(self.variable_strip_pattern) if self.variable.endswith('\\'): self.variable = self.variable[:-1] + if self.variable.startswith('{') and '}' in self.line[self.variable_end:]: + # TOML case + self.variable = self.variable[1:] if variable and len(self.variable) < len(variable) and 0 <= self.variable_start and 0 <= self.variable_end: start = variable.find(self.variable) self.variable_start += start diff --git a/credsweeper/filters/value_array_dictionary_check.py b/credsweeper/filters/value_array_dictionary_check.py index 4aa9ced0c..34415a6a3 100644 --- a/credsweeper/filters/value_array_dictionary_check.py +++ b/credsweeper/filters/value_array_dictionary_check.py @@ -34,5 +34,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: return False if self.PATTERN.search(line_data.value): return True + if line_data.wrap and not line_data.is_well_quoted_value and ('[' in line_data.wrap or '(' in line_data.wrap): + return True return False diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py index 61aa96b46..11d9fb37a 100644 --- a/credsweeper/filters/value_file_path_check.py +++ b/credsweeper/filters/value_file_path_check.py @@ -33,8 +33,14 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: value = line_data.value contains_unix_separator = '/' in value if contains_unix_separator: - if "://" in value or value.startswith("~/") or value.startswith("./") or "../" in value or "/.." in value: + if ("://" in value # + or value.startswith("~/") # + or value.startswith("./") # + or "../" in value # + or "/.." in value # + or value.startswith("//") and ':' == line_data.separator): # common case for url definition or aliases + # or _keyword_://example.com where : is the separator return True # base64 encoded data might look like linux path min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value)) diff --git a/credsweeper/filters/value_string_type_check.py b/credsweeper/filters/value_string_type_check.py index eec1f12cc..b573fa47f 100644 --- a/credsweeper/filters/value_string_type_check.py +++ b/credsweeper/filters/value_string_type_check.py @@ -40,7 +40,9 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: not_quoted = not line_data.is_well_quoted_value not_comment = not line_data.is_comment() - if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted: + if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted \ + and '=' in line_data.separator: + # heterogeneous code e.g. 
YAML in Python uses colon sign instead equals return True return False diff --git a/experiment/main.py b/experiment/main.py index 747b4ae98..e52ce1f14 100644 --- a/experiment/main.py +++ b/experiment/main.py @@ -20,7 +20,7 @@ from experiment.src.features import prepare_data from experiment.src.lstm_model import get_model from experiment.src.model_config_preprocess import model_config_preprocess -from experiment.src.prepare_data import prepare_train_data +from experiment.src.prepare_data import prepare_train_data, meta_checksum def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray): @@ -59,7 +59,7 @@ def main(cred_data_location: str, jobs: int) -> str: prepare_train_data(_cred_data_location, jobs) # detected data means which data is passed to ML validator of credsweeper after filters with RuleName - detected_data = read_detected_data("results/detected_data.json") + detected_data = read_detected_data(f"results/detected_data.{meta_checksum(cred_data_location)}.json") print(f"CredSweeper detected {len(detected_data)} credentials without ML") # all markup data meta_data = read_metadata(f"{cred_data_location}/meta") diff --git a/experiment/main.sh b/experiment/main.sh index 2d50e5ee5..97e6fc811 100755 --- a/experiment/main.sh +++ b/experiment/main.sh @@ -12,20 +12,13 @@ now=$(date +%Y%m%d_%H%M%S) RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results mkdir -vp ${RESULT_DIR} -${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData --jobs 32 | tee ${RESULT_DIR}/train.${now}.log +${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData --jobs $(nproc) | tee ${RESULT_DIR}/train.${now}.log +error_code=${PIPESTATUS} +if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi cd ${CREDSWEEPER_DIR} report_file=${RESULT_DIR}/${now}.json -${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log error --job 32 --save-json ${report_file} +${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log info --job $(nproc) --save-json ${report_file} cd ~/q/DataCred/auxiliary/ .venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${report_file}.log - -#last_tf_model=$(cat train.log | tail -n1) - -#echo $last_tf_model - -#pwd - -#python -m tf2onnx.convert --saved-model results/$last_tf_model --output ../credsweeper/ml_model/ml_model.onnx --verbose - diff --git a/experiment/src/data_loader.py b/experiment/src/data_loader.py index d5742d042..919378d54 100644 --- a/experiment/src/data_loader.py +++ b/experiment/src/data_loader.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from colorama import Fore, Style +from colorama import Fore, Style, Back from credsweeper.common.constants import ML_HUNK from credsweeper.utils import Util @@ -95,7 +95,8 @@ def read_metadata(meta_dir: str) -> Dict[identifier, Dict]: df.loc[df["GroundTruth"] == "Template", "GroundTruth"] = 'F' for _, row in df.iterrows(): j += 1 - if row["LineStart"] != row["LineEnd"] or any(x in row["Category"] for x in ["AWS Multi", "Google Multi"]): + if row["LineStart"] != row["LineEnd"] \ + or all(x in ["AWS Multi", "Google Multi"] for x in row["Category"].split(':')): # print(f"WARNING: skip not ml category {row['FilePath']},{line_start},{line_end}" # f",{row['GroundTruth']},{row['Category']}") continue @@ -194,11 +195,17 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier line_data["type"] = line_data["path"].split('/')[-2] values.append(line_data) + 
all_meta_found = True for markup in meta_data.values(): if 'T' == markup["GroundTruth"] and not markup["Used"]: for markup_rule in markup["Category"].split(':'): if markup_rule in detected_rules: - print(f"WARNING: Not found! {markup}") + if all_meta_found: + # print header of the markup once + print(f"{Back.MAGENTA}{Fore.BLACK}WARNING: Not all TRUE meta found!{Style.RESET_ALL}") + print(','.join(markup.keys())) + all_meta_found = False + print(','.join(str(x) for x in markup.values())) text = Util.read_file(f'{cred_data_location}/{markup["FilePath"]}') line = text[markup["LineStart"] - 1].strip() if 0 <= markup["ValueStart"] and 0 <= markup["ValueEnd"]: diff --git a/experiment/src/prepare_data.py b/experiment/src/prepare_data.py index 48d8e9075..5d00ab31a 100644 --- a/experiment/src/prepare_data.py +++ b/experiment/src/prepare_data.py @@ -1,6 +1,9 @@ +import binascii +import hashlib import os import subprocess import sys +from pathlib import Path from credsweeper.utils import Util @@ -9,9 +12,21 @@ def execute_scanner(dataset_location: str, result_location_str, j): """Execute CredSweeper as a separate process to make sure no global states is shared with training script""" dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.." command = f"{sys.executable} -m credsweeper --path {dataset_location}/data" \ - f" --save-json {result_location_str} " \ - f"--job {j} --sort --rules results/train_config.yaml --ml_threshold 0" - subprocess.check_call(command, shell=True, cwd=dir_path, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) + f" --save-json {result_location_str} --log info" \ + f" --job {j} --sort --rules results/train_config.yaml --ml_threshold 0" + error_code = subprocess.check_call(command, shell=True, cwd=dir_path) + if 0 != error_code: + sys.exit(error_code) + + +def meta_checksum(cred_data_location: str) -> str: + checksum = hashlib.md5(b'').digest() + for root, dirs, files in os.walk(Path(cred_data_location) / "meta"): + for file in files: + with open(os.path.join(root, file), "rb") as f: + cvs_checksum = hashlib.md5(f.read()).digest() + checksum = bytes(a ^ b for a, b in zip(checksum, cvs_checksum)) + return binascii.hexlify(checksum).decode() def prepare_train_data(cred_data_location: str, j: int): @@ -23,8 +38,12 @@ def prepare_train_data(cred_data_location: str, j: int): new_rules = [x for x in rules if x.get("use_ml")] Util.yaml_dump(new_rules, "results/train_config.yaml") - if not os.path.exists("results/detected_data.json"): + detected_data_filename = f"results/detected_data.{meta_checksum(cred_data_location)}.json" + + if not os.path.exists(detected_data_filename): print(f"Get CredSweeper results from {cred_data_location}. 
May take some time") - execute_scanner(cred_data_location, "results/detected_data.json", j) + execute_scanner(cred_data_location, detected_data_filename, j) + else: + print(f"Get cached result {meta_checksum(cred_data_location)}") print("Train data prepared!") diff --git a/tests/__init__.py b/tests/__init__.py index 85a275175..73b4cd837 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,14 +7,14 @@ NEGLIGIBLE_ML_THRESHOLD = 0.0001 # credentials count after scan -SAMPLES_CRED_COUNT: int = 363 -SAMPLES_CRED_LINE_COUNT: int = 380 +SAMPLES_CRED_COUNT: int = 364 +SAMPLES_CRED_LINE_COUNT: int = 381 # credentials count after post-processing -SAMPLES_POST_CRED_COUNT: int = 322 +SAMPLES_POST_CRED_COUNT: int = 323 # with option --doc -SAMPLES_IN_DOC = 416 +SAMPLES_IN_DOC = 417 # archived credentials that are not found without --depth SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 8fd52ab2c..8ea01c932 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -7648,6 +7648,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "tests/samples/nonce.py|RAW", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/doc.json b/tests/data/doc.json index 0311797b5..b0b19396d 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -11824,6 +11824,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "PASSWD_PAIR", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "tests/samples/nonce.py|RAW", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index 01c1a0929..8fb14c91f 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -8255,6 +8255,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/output.json b/tests/data/output.json index f6dcde344..de20dcb49 100644 
--- a/tests/data/output.json +++ b/tests/data/output.json @@ -7337,6 +7337,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password: F1lT3ReDw17hQoT3s", + "line_num": 3, + "path": "tests/samples/nonce.py", + "info": "", + "value": "F1lT3ReDw17hQoT3s", + "value_start": 10, + "value_end": 27, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.734521664779752, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/filters/test_value_array_dictionary_check.py b/tests/filters/test_value_array_dictionary_check.py index 3ea0cbd7f..afdc30d3e 100644 --- a/tests/filters/test_value_array_dictionary_check.py +++ b/tests/filters/test_value_array_dictionary_check.py @@ -11,7 +11,7 @@ class TestValueArrayDictionaryCheck: @pytest.fixture def token_rule(self, config) -> Rule: token_rule_without_filters = { - "name": "Pass", + "name": "Password", "severity": "medium", "confidence": "moderate", "type": "keyword", diff --git a/tests/samples/nonce.py b/tests/samples/nonce.py index 4e2edd092..0a63f7a1b 100644 --- a/tests/samples/nonce.py +++ b/tests/samples/nonce.py @@ -1 +1,4 @@ nonce = 'bsfcvir57nt40rydvtbhs8lzbgljmet5' +secure_yaml = """ +password: F1lT3ReDw17hQoT3s +""" diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js index d555f7233..2bcbeb844 100644 --- a/tests/samples/url_cred.js +++ b/tests/samples/url_cred.js @@ -11,3 +11,7 @@ url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut */ email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465"; + +/* +@"otpauth://host/port?set=VNMXQKAZFVOYOJCDNBIYXYIWX2&info=should_not_be_found_even_in_ml_threshold +*/ diff --git a/tests/test_main.py b/tests/test_main.py index 7e664004b..7064e3a11 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -774,6 +774,9 @@ def prepare(report: List[Dict[str, Any]]): def test_param_n(self) -> None: # internal parametrized tests for quick debug - no itms should be found items = [ # + ('x3.txt', b'passwd = values[token_id]'), + ('t.py', b'new_params = {"dsn": new_params["dsn"], "password": new_params["password"]}'), + ('t.m', b'@"otpauth://host/port?set=VNMXQKAZFVOYOJCDNBIYXYIWX2&algorithm=F4KE",'), ("test.c", b" *keylen = X448_KEYLEN;"), ("test.php", b"$yourls_user_passwords = $copy;"), ("", b"passwords = List"), @@ -800,6 +803,9 @@ def test_param_n(self) -> None: def test_param_p(self) -> None: # internal parametrized tests for quick debug items = [ # + ('my.toml', b'{nkey: XMIGDHSYNSJQ0XNR}', "nkey", "XMIGDHSYNSJQ0XNR"), + ('my.yaml', b'password: 3287#JQ0XX@IG}', "password", "3287#JQ0XX@IG}"), + ("creds.py", b'"tokens": ["xabsjhdbasu7d9g", "ashbjhdifufhsds"]', "tokens", "xabsjhdbasu7d9g"), ("slt.py", b'\\t\\tsalt = "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"', "salt", "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"), ("log.txt", b'json\\nAuthorization: Basic jfhlksadjiu9813ryiuhdfskadjlkjh34\\n\\u003c/code\\u003e\\u003c/pre\\u003e"',