Samsung · babenek · Aug 9, 2024 · Aug 9, 2024 · Aug 9, 2024
@@ -23,7 +23,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid
 
       - name: Markup hashing
         run: |
@@ -74,7 +74,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid
 
       - name: Markup hashing
         run: |
@@ -172,7 +172,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid
 
       - name: Markup hashing
         run: |
@@ -342,6 +342,7 @@ jobs:
           exit ${exit_code}
 
   # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
   experiment:
     # the ml train test is placed here to use cached data set
     needs: [ download_data ]
@@ -354,7 +355,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid
 
       - name: Markup hashing
         run: |

@@ -126,6 +126,23 @@
   target:
     - code
 
+- name: UUID
+  severity: info
+  confidence: strong
+  type: pattern
+  values:
+    - (?<![0-9A-Za-z_+-])(?P<value>[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12})(?![=0-9A-Za-z_+-])
+  min_line_len: 36
+  required_substrings:
+    - "-"
+  required_regex: "[0-9A-Za-z_/+-]{15}"
+  filter_type:
+    - ValuePatternCheck
+  use_ml: false
+  target:
+    - code
+    - doc
+
 - name: AWS Client ID
   severity: high
   confidence: moderate

@@ -1,20 +1,20 @@
 from pathlib import Path
 
 # total number of files in test samples
-SAMPLES_FILES_COUNT: int = 129
+SAMPLES_FILES_COUNT: int = 130
 
 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 362
-SAMPLES_CRED_LINE_COUNT: int = 379
+SAMPLES_CRED_COUNT: int = 363
+SAMPLES_CRED_LINE_COUNT: int = 380
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 321
+SAMPLES_POST_CRED_COUNT: int = 322
 
 # with option --doc
-SAMPLES_IN_DOC = 415
+SAMPLES_IN_DOC = 416
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24

@@ -10919,6 +10919,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "tests/samples/uuid|RAW",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",

@@ -13064,6 +13064,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "tests/samples/uuid|RAW",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",

@@ -9981,6 +9981,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",

@@ -8901,6 +8901,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",

@@ -65,6 +65,32 @@ def validate(_candidate: Candidate) -> Tuple[bool, float]:
         decision, probability = validate(candidate)
         self.assertAlmostEqual(0.9980608820915222, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
 
+    def test_ml_validator_auxiliary_p(self):
+        candidate = Candidate.get_dummy_candidate(self.config, "secret", "", "")
+        candidate.rule_name = "Secret"
+        candidate.line_data_list[0].line = "secret=bace4d19-dead-beef-cafe-9129474bcd81"
+        candidate.line_data_list[0].variable = "secret"
+        candidate.line_data_list[0].value_start = 7
+        candidate.line_data_list[0].value_end = 43
+        candidate.line_data_list[0].value = "bace4d19-dead-beef-cafe-9129474bcd81"
+        # auxiliary candidate for a pattern rule - without variable
+        aux_candidate = copy.deepcopy(candidate)
+        aux_candidate.line_data_list[0].variable = None
+
+        # TODO: the scores are low for the current ML model - they will change after retraining
+
+        candidate_key = CandidateKey(candidate.line_data_list[0])
+        sample_as_batch = [(candidate_key, [candidate])]
+        is_cred_batch, probability_batch = self.ml_validator.validate_groups(sample_as_batch, 2)
+        self.assertAlmostEqual(0.16333681344985962, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD)
+
+        # the auxiliary rule does not increase the ML probability yet - it will be used after the next training run
+
+        aux_candidate.rule_name = "UUID"
+        sample_as_batch = [(candidate_key, [candidate, aux_candidate])]
+        is_cred_batch, probability_batch = self.ml_validator.validate_groups(sample_as_batch, 2)
+        self.assertAlmostEqual(0.16333681344985962, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD)
+
     def test_extract_features_p(self):
         candidate1 = Candidate.get_dummy_candidate(self.config, "main.py", ".py", "info")
         candidate1.line_data_list[0].line = 'ABC123'

@@ -0,0 +1,2 @@
+bace4d19-fa7e-beef-cafe-9129474bcd81 # tp
+12345678-1234-1234-1234-1234567890ab # fp
@@ -484,7 +484,7 @@ def test_pdf_p(self) -> None:
         # may be tested with
         # https://www.dcc.edu/documents/administration/offices/information-technology/password-examples.pdf
         content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "sample.pdf"])
-        cred_sweeper = CredSweeper(depth=33)
+        cred_sweeper = CredSweeper(depth=7)
         cred_sweeper.run(content_provider=content_provider)
         found_credentials = cred_sweeper.credential_manager.get_credentials()
         self.assertSetEqual({"AWS Client ID", "Password", "Github Classic Token", "Key"},
@@ -786,6 +786,7 @@ def test_param_n(self) -> None:
             ("pager.rs", b"token: impl AsRef<str>,"),  #
             ("pager.rs", b"    let tokens = quote::quote! {"),  #
             ("pager.rs", b"  let cert_chain = x509_rx"),  #
+            ("my.kt", b'val password: String? = null'),  #
         ]
         content_provider: AbstractProvider = FilesProvider([(file_name, io.BytesIO(data_line))
                                                             for file_name, data_line in items])
@@ -819,7 +820,7 @@ def test_param_p(self) -> None:
             ("accept.py", b"password='Ahga%$FiQ@Ei8'", "password", "Ahga%$FiQ@Ei8"),  #
             ("test.template", b" NAMED_API_KEY=qii7t1m6423127xto389xc914l34451qz5135865564sg ", "NAMED_API_KEY",
              "qii7t1m6423127xto389xc914l34451qz5135865564sg"),  #
-            ("my.kt", b'val password: String? = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"),  #
+            ("my.kt", b'val password: String = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"),  #
         ]
         for file_name, data_line, variable, value in items:
             content_provider: AbstractProvider = FilesProvider([