Skip to content

Commit

Permalink
UUID pattern added
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Aug 9, 2024
1 parent e31ef71 commit b0339a8
Show file tree
Hide file tree
Showing 10 changed files with 179 additions and 19 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -74,7 +74,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -172,7 +172,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -342,6 +342,7 @@ jobs:
exit ${exit_code}
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

experiment:
# the ML training test is placed here so that it can reuse the cached dataset
needs: [ download_data ]
Expand All @@ -354,7 +355,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down
17 changes: 17 additions & 0 deletions credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,23 @@
target:
- code

# Pattern rule that reports canonical 8-4-4-4-12 hexadecimal UUID strings.
- name: UUID
severity: info
confidence: strong
type: pattern
values:
# The named group <value> is 8 hex digits, three hyphen-prefixed groups of 4 hex digits,
# and a final hyphen-prefixed group of 12 hex digits.
# The lookbehind rejects a match preceded by any of [0-9A-Za-z_+-];
# the lookahead rejects a match followed by "=" or any of [0-9A-Za-z_+-].
- (?<![0-9A-Za-z_+-])(?P<value>[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12})(?![=0-9A-Za-z_+-])
# 36 = 32 hex digits + 4 hyphens, i.e. the exact length of the value matched above
min_line_len: 36
required_substrings:
# every string matched by the pattern above contains a hyphen
- "-"
# NOTE(review): presumably a cheap pre-check applied before the full pattern - confirm against the rule schema
required_regex: "[0-9A-Za-z_/+-]{15}"
filter_type:
- ValuePatternCheck
# NOTE(review): presumably excludes this rule from ML validation - confirm against the rule schema
use_ml: false
target:
- code
- doc

- name: AWS Client ID
severity: high
confidence: moderate
Expand Down
10 changes: 5 additions & 5 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 129
SAMPLES_FILES_COUNT: int = 130

# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 362
SAMPLES_CRED_LINE_COUNT: int = 379
SAMPLES_CRED_COUNT: int = 363
SAMPLES_CRED_LINE_COUNT: int = 380

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 321
SAMPLES_POST_CRED_COUNT: int = 322

# with option --doc
SAMPLES_IN_DOC = 415
SAMPLES_IN_DOC = 416

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
Expand Down
27 changes: 27 additions & 0 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -10919,6 +10919,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "tests/samples/uuid|RAW",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/doc.json
Original file line number Diff line number Diff line change
Expand Up @@ -13064,6 +13064,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "tests/samples/uuid|RAW",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/ml_threshold.json
Original file line number Diff line number Diff line change
Expand Up @@ -9981,6 +9981,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/output.json
Original file line number Diff line number Diff line change
Expand Up @@ -8901,6 +8901,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
47 changes: 39 additions & 8 deletions tests/ml_model/test_ml_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,53 @@ def validate(_candidate: Candidate) -> Tuple[bool, float]:
candidate.line_data_list[0].value = "Ahga%$FiQ@Ei8"

decision, probability = validate(candidate)
self.assertAlmostEqual(0.9997520446777344, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
self.assertAlmostEqual(0.9996967911720276, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

candidate.line_data_list[0].path = "sample.py"
candidate.line_data_list[0].path = "sample.yaml"
candidate.line_data_list[0].file_type = ".yaml"
decision, probability = validate(candidate)
self.assertAlmostEqual(0.9994515776634216, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

candidate.line_data_list[0].path = "test.zip"
candidate.line_data_list[0].file_type = ".zip"
candidate.line_data_list[0].path = "test.cc"
candidate.line_data_list[0].file_type = ".cc"
decision, probability = validate(candidate)
self.assertAlmostEqual(0.9994281530380249, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
self.assertAlmostEqual(0.9989229440689087, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

candidate.line_data_list[0].path = "other.txt"
candidate.line_data_list[0].file_type = ".txt"
candidate.line_data_list[0].path = "other.unknown"
candidate.line_data_list[0].file_type = ".unknown"
decision, probability = validate(candidate)
self.assertAlmostEqual(0.9980608820915222, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
self.assertAlmostEqual(0.999495267868042, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

def test_ml_validator_auxiliary_p(self):
    """Check the ML probability of a candidate grouped with auxiliary candidates.

    A "Secret" candidate whose value is a UUID-shaped string is validated
    on its own, then in a group together with an auxiliary candidate (a deep
    copy with the variable removed) reported as "PASSWD_PAIR" and as "UUID".
    """
    # todo: the scores are low for current ML model - will be changed after train
    expected_probability = 0.16333681344985962

    candidate = Candidate.get_dummy_candidate(self.config, "mycred", "", "")
    candidate.rule_name = "Secret"
    primary_line = candidate.line_data_list[0]
    primary_line.line = "secret=bace4d19-dead-beef-cafe-9129474bcd81"
    primary_line.variable = "secret"
    primary_line.value_start = 7
    primary_line.value_end = 43
    primary_line.value = "bace4d19-dead-beef-cafe-9129474bcd81"

    # auxiliary candidate for a pattern rule - same line data, but without a variable
    aux_candidate = copy.deepcopy(candidate)
    aux_candidate.line_data_list[0].variable = None

    candidate_key = CandidateKey(candidate.line_data_list[0])

    # baseline: the candidate is the only member of its group
    _, probability_batch = self.ml_validator.validate_groups([(candidate_key, [candidate])], 2)
    self.assertAlmostEqual(expected_probability, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD)

    # "PASSWD_PAIR" is an auxiliary rule which was not trained - it keeps the same ML probability.
    # "UUID" is an auxiliary rule in train and is meant to increase the ML probability, but the
    # value asserted for it is currently the same as the baseline (see the todo above).
    for aux_rule_name in ("PASSWD_PAIR", "UUID"):
        aux_candidate.rule_name = aux_rule_name
        grouped_sample = [(candidate_key, [candidate, aux_candidate])]
        _, probability_batch = self.ml_validator.validate_groups(grouped_sample, 2)
        self.assertAlmostEqual(expected_probability, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD)

def test_extract_features_p(self):
candidate1 = Candidate.get_dummy_candidate(self.config, "main.py", ".py", "info")
Expand Down
2 changes: 2 additions & 0 deletions tests/samples/uuid
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
bace4d19-fa7e-beef-cafe-9129474bcd81 # tp
12345678-1234-1234-1234-1234567890ab # fp
5 changes: 3 additions & 2 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def test_pdf_p(self) -> None:
# may be tested with
# https://www.dcc.edu/documents/administration/offices/information-technology/password-examples.pdf
content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "sample.pdf"])
cred_sweeper = CredSweeper(depth=33)
cred_sweeper = CredSweeper(depth=7)
cred_sweeper.run(content_provider=content_provider)
found_credentials = cred_sweeper.credential_manager.get_credentials()
self.assertSetEqual({"AWS Client ID", "Password", "Github Classic Token", "Key"},
Expand Down Expand Up @@ -786,6 +786,7 @@ def test_param_n(self) -> None:
("pager.rs", b"token: impl AsRef<str>,"), #
("pager.rs", b" let tokens = quote::quote! {"), #
("pager.rs", b" let cert_chain = x509_rx"), #
("my.kt", b'val password: String? = null'), #
]
content_provider: AbstractProvider = FilesProvider([(file_name, io.BytesIO(data_line))
for file_name, data_line in items])
Expand Down Expand Up @@ -819,7 +820,7 @@ def test_param_p(self) -> None:
("accept.py", b"password='Ahga%$FiQ@Ei8'", "password", "Ahga%$FiQ@Ei8"), #
("test.template", b" NAMED_API_KEY=qii7t1m6423127xto389xc914l34451qz5135865564sg ", "NAMED_API_KEY",
"qii7t1m6423127xto389xc914l34451qz5135865564sg"), #
("my.kt", b'val password: String? = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"), #
("my.kt", b'val password: String = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"), #
]
for file_name, data_line, variable, value in items:
content_provider: AbstractProvider = FilesProvider([
Expand Down

0 comments on commit b0339a8

Please sign in to comment.