diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 8b418ecf1..90dfa8f8d 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -23,7 +23,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid

       - name: Markup hashing
         run: |
@@ -74,7 +74,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid

       - name: Markup hashing
         run: |
@@ -172,7 +172,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid

       - name: Markup hashing
         run: |
@@ -342,6 +342,7 @@ jobs:
           exit ${exit_code}

   # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

+  experiment:
     # the ml train test is placed here to use cached data set
     needs: [ download_data ]
@@ -354,7 +355,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: auxiliary
+          ref: uuid

       - name: Markup hashing
         run: |
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
index afe66d028..29c266f8e 100644
--- a/credsweeper/rules/config.yaml
+++ b/credsweeper/rules/config.yaml
@@ -126,6 +126,23 @@
   target:
     - code

+- name: UUID
+  severity: info
+  confidence: strong
+  type: pattern
+  values:
+    - (?<![0-9A-Za-z_+-])(?P<value>[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12})(?![=0-9A-Za-z_+-])
+  min_line_len: 36
+  required_substrings:
+    - "-"
+  required_regex: "[0-9A-Za-z_/+-]{15}"
+  filter_type:
+    - ValuePatternCheck
+  use_ml: false
+  target:
+    - code
+    - doc
+
 - name: AWS Client ID
   severity: high
   confidence: moderate
diff --git a/tests/__init__.py b/tests/__init__.py
index a501eae08..85a275175 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,20 +1,20 @@
 from pathlib import Path

 # total number of files in test samples
-SAMPLES_FILES_COUNT: int = 129
+SAMPLES_FILES_COUNT: int = 130

 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001

 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 362
-SAMPLES_CRED_LINE_COUNT: int = 379
+SAMPLES_CRED_COUNT: int = 363
+SAMPLES_CRED_LINE_COUNT: int = 380

 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 321
+SAMPLES_POST_CRED_COUNT: int = 322

 # with option --doc
-SAMPLES_IN_DOC = 415
+SAMPLES_IN_DOC = 416

 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
index 01cdc64ff..8fd52ab2c 100644
--- a/tests/data/depth_3.json
+++ b/tests/data/depth_3.json
@@ -10919,6 +10919,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "tests/samples/uuid|RAW",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/data/doc.json b/tests/data/doc.json
index 46c8cdb7d..0311797b5 100644
--- a/tests/data/doc.json
+++ b/tests/data/doc.json
@@ -13064,6 +13064,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "tests/samples/uuid|RAW",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json
index d905cd4b5..01c1a0929 100644
--- a/tests/data/ml_threshold.json
+++ b/tests/data/ml_threshold.json
@@ -9981,6 +9981,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/data/output.json b/tests/data/output.json
index 1d75227d5..f6dcde344 100644
--- a/tests/data/output.json
+++ b/tests/data/output.json
@@ -8901,6 +8901,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "UUID",
+        "severity": "info",
+        "confidence": "strong",
+        "line_data_list": [
+            {
+                "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
+                "line_num": 1,
+                "path": "tests/samples/uuid",
+                "info": "",
+                "value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
+                "value_start": 0,
+                "value_end": 36,
+                "variable": null,
+                "variable_start": -2,
+                "variable_end": -2,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2373263071270246,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/ml_model/test_ml_validator.py b/tests/ml_model/test_ml_validator.py
index ee7083ae9..0c2d33c64 100644
--- a/tests/ml_model/test_ml_validator.py
+++ b/tests/ml_model/test_ml_validator.py
@@ -48,22 +48,53 @@ def validate(_candidate: Candidate) -> Tuple[bool, float]:

         candidate.line_data_list[0].value = "Ahga%$FiQ@Ei8"
         decision, probability = validate(candidate)
-        self.assertAlmostEqual(0.9997520446777344, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
+        self.assertAlmostEqual(0.9996967911720276, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

-        candidate.line_data_list[0].path = "sample.py"
+        candidate.line_data_list[0].path = "sample.yaml"
         candidate.line_data_list[0].file_type = ".yaml"
         decision, probability = validate(candidate)
         self.assertAlmostEqual(0.9994515776634216, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

-        candidate.line_data_list[0].path = "test.zip"
-        candidate.line_data_list[0].file_type = ".zip"
+        candidate.line_data_list[0].path = "test.cc"
+        candidate.line_data_list[0].file_type = ".cc"
         decision, probability = validate(candidate)
-        self.assertAlmostEqual(0.9994281530380249, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
+        self.assertAlmostEqual(0.9989229440689087, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

-        candidate.line_data_list[0].path = "other.txt"
-        candidate.line_data_list[0].file_type = ".txt"
+        candidate.line_data_list[0].path = "other.unknown"
+        candidate.line_data_list[0].file_type = ".unknown"
         decision, probability = validate(candidate)
-        self.assertAlmostEqual(0.9980608820915222, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
+        self.assertAlmostEqual(0.999495267868042, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
+
+    def test_ml_validator_auxiliary_p(self):
+        candidate = Candidate.get_dummy_candidate(self.config, "mycred", "", "")
+        candidate.rule_name = "Secret"
+        candidate.line_data_list[0].line = "secret=bace4d19-dead-beef-cafe-9129474bcd81"
+        candidate.line_data_list[0].variable = "secret"
+        candidate.line_data_list[0].value_start = 7
+        candidate.line_data_list[0].value_end = 43
+        candidate.line_data_list[0].value = "bace4d19-dead-beef-cafe-9129474bcd81"
+        # auxiliary candidate for a pattern rule - without variable
+        aux_candidate = copy.deepcopy(candidate)
+        aux_candidate.line_data_list[0].variable = None
+
+        # todo: the scores are low for current ML model - will be changed after train
+
+        candidate_key = CandidateKey(candidate.line_data_list[0])
+        sample_as_batch = [(candidate_key, [candidate])]
+        is_cred_batch, probability_batch = self.ml_validator.validate_groups(sample_as_batch, 2)
+        self.assertAlmostEqual(0.16333681344985962, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD)
+
+        # auxiliary rule which was not trained - keeps the same ML probability
+        aux_candidate.rule_name = "PASSWD_PAIR"
+        sample_as_batch = [(candidate_key, [candidate, aux_candidate])]
+        is_cred_batch, probability_batch = self.ml_validator.validate_groups(sample_as_batch, 2)
+        self.assertAlmostEqual(0.16333681344985962, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD)
+
+        # auxiliary rule in train increases ML probability
+        aux_candidate.rule_name = "UUID"
+        sample_as_batch = [(candidate_key, [candidate, aux_candidate])]
+        is_cred_batch, probability_batch = self.ml_validator.validate_groups(sample_as_batch, 2)
+        self.assertAlmostEqual(0.16333681344985962, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD)

     def test_extract_features_p(self):
         candidate1 = Candidate.get_dummy_candidate(self.config, "main.py", ".py", "info")
diff --git a/tests/samples/uuid b/tests/samples/uuid
new file mode 100644
index 000000000..0ce05f451
--- /dev/null
+++ b/tests/samples/uuid
@@ -0,0 +1,2 @@
+bace4d19-fa7e-beef-cafe-9129474bcd81 # tp
+12345678-1234-1234-1234-1234567890ab # fp
diff --git a/tests/test_main.py b/tests/test_main.py
index 6c774c4f9..7e664004b 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -484,7 +484,7 @@ def test_pdf_p(self) -> None:
         # may be tested with
         # https://www.dcc.edu/documents/administration/offices/information-technology/password-examples.pdf
         content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "sample.pdf"])
-        cred_sweeper = CredSweeper(depth=33)
+        cred_sweeper = CredSweeper(depth=7)
         cred_sweeper.run(content_provider=content_provider)
         found_credentials = cred_sweeper.credential_manager.get_credentials()
         self.assertSetEqual({"AWS Client ID", "Password", "Github Classic Token", "Key"},
@@ -786,6 +786,7 @@ def test_param_n(self) -> None:
             ("pager.rs", b"token: impl AsRef<str>,"),  #
             ("pager.rs", b" let tokens = quote::quote! {"),  #
             ("pager.rs", b" let cert_chain = x509_rx"),  #
+            ("my.kt", b'val password: String? = null'),  #
         ]
         content_provider: AbstractProvider = FilesProvider([(file_name, io.BytesIO(data_line))
                                                             for file_name, data_line in items])
@@ -819,7 +820,7 @@ def test_param_p(self) -> None:
             ("accept.py", b"password='Ahga%$FiQ@Ei8'", "password", "Ahga%$FiQ@Ei8"),  #
             ("test.template", b" NAMED_API_KEY=qii7t1m6423127xto389xc914l34451qz5135865564sg ", "NAMED_API_KEY",
              "qii7t1m6423127xto389xc914l34451qz5135865564sg"),  #
-            ("my.kt", b'val password: String? = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"),  #
+            ("my.kt", b'val password: String = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"),  #
         ]
         for file_name, data_line, variable, value in items:
             content_provider: AbstractProvider = FilesProvider([
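
For reference, a minimal sketch of how the new UUID pattern from the config.yaml hunk behaves against the two lines added in tests/samples/uuid. It assumes plain Python re outside of CredSweeper; rejection of the sequential "fp" value is left to the ValuePatternCheck filter named in the rule's filter_type, which this sketch does not model.

import re

# Pattern copied from the UUID rule added in credsweeper/rules/config.yaml.
UUID_PATTERN = re.compile(
    r"(?<![0-9A-Za-z_+-])(?P<value>[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12})(?![=0-9A-Za-z_+-])")

# Both sample lines match the raw regex; only the "tp"-marked value is expected
# to survive CredSweeper's ValuePatternCheck post-filtering.
for line in ("bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
             "12345678-1234-1234-1234-1234567890ab # fp"):
    match = UUID_PATTERN.search(line)
    print(line, "->", match.group("value") if match else None)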