Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UUID pattern #577

Merged
merged 2 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -74,7 +74,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -172,7 +172,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -342,6 +342,7 @@ jobs:
exit ${exit_code}

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

experiment:
# the ML training test is placed here so that it can reuse the cached dataset
needs: [ download_data ]
Expand All @@ -354,7 +355,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: auxiliary
ref: uuid

- name: Markup hashing
run: |
Expand Down
150 changes: 74 additions & 76 deletions cicd/benchmark.txt

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,23 @@
target:
- code

- name: UUID
severity: info
confidence: strong
type: pattern
values:
- (?<![0-9A-Za-z_+-])(?P<value>[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12})(?![=0-9A-Za-z_+-])
min_line_len: 36
required_substrings:
- "-"
required_regex: "[0-9A-Za-z_/+-]{15}"
filter_type:
- ValuePatternCheck
use_ml: false
target:
- code
- doc

- name: AWS Client ID
severity: high
confidence: moderate
Expand Down
10 changes: 5 additions & 5 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 129
SAMPLES_FILES_COUNT: int = 130

# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 362
SAMPLES_CRED_LINE_COUNT: int = 379
SAMPLES_CRED_COUNT: int = 363
SAMPLES_CRED_LINE_COUNT: int = 380

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 321
SAMPLES_POST_CRED_COUNT: int = 322

# with option --doc
SAMPLES_IN_DOC = 415
SAMPLES_IN_DOC = 416

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
Expand Down
27 changes: 27 additions & 0 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -10919,6 +10919,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "tests/samples/uuid|RAW",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/doc.json
Original file line number Diff line number Diff line change
Expand Up @@ -13064,6 +13064,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "tests/samples/uuid|RAW",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/ml_threshold.json
Original file line number Diff line number Diff line change
Expand Up @@ -9981,6 +9981,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/output.json
Original file line number Diff line number Diff line change
Expand Up @@ -8901,6 +8901,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
26 changes: 26 additions & 0 deletions tests/ml_model/test_ml_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,32 @@ def validate(_candidate: Candidate) -> Tuple[bool, float]:
decision, probability = validate(candidate)
self.assertAlmostEqual(0.9980608820915222, probability, delta=NEGLIGIBLE_ML_THRESHOLD)

def test_ml_validator_auxiliary_p(self):
    """Check the ML probability of a "Secret" candidate with and without an auxiliary "UUID" candidate.

    A dummy candidate is filled in to describe the line
    ``secret=bace4d19-dead-beef-cafe-9129474bcd81``. A deep copy of it, with
    the variable removed and the rule renamed to "UUID", plays the role of an
    auxiliary pattern-rule candidate. The group is validated twice - first
    with the main candidate alone, then with both candidates - and the same
    probability is expected in both runs.
    """
    # todo: the scores are low for the current ML model - will be changed after the next train
    expected_probability = 0.16333681344985962

    main_candidate = Candidate.get_dummy_candidate(self.config, "secret", "", "")
    main_candidate.rule_name = "Secret"
    main_line_data = main_candidate.line_data_list[0]
    main_line_data.line = "secret=bace4d19-dead-beef-cafe-9129474bcd81"
    main_line_data.variable = "secret"
    main_line_data.value_start = 7
    main_line_data.value_end = 43
    main_line_data.value = "bace4d19-dead-beef-cafe-9129474bcd81"

    # auxiliary candidate for a pattern rule: same line data, but without a variable
    auxiliary_candidate = copy.deepcopy(main_candidate)
    auxiliary_candidate.line_data_list[0].variable = None
    auxiliary_candidate.rule_name = "UUID"

    group_key = CandidateKey(main_candidate.line_data_list[0])

    # first run: the group holds only the main candidate
    _, probabilities = self.ml_validator.validate_groups([(group_key, [main_candidate])], 2)
    self.assertAlmostEqual(expected_probability, probabilities[0], delta=NEGLIGIBLE_ML_THRESHOLD)

    # second run: the auxiliary rule is added to the group; it does not increase
    # the ML probability yet - it will be used after the next train
    _, probabilities = self.ml_validator.validate_groups(
        [(group_key, [main_candidate, auxiliary_candidate])], 2)
    self.assertAlmostEqual(expected_probability, probabilities[0], delta=NEGLIGIBLE_ML_THRESHOLD)

def test_extract_features_p(self):
candidate1 = Candidate.get_dummy_candidate(self.config, "main.py", ".py", "info")
candidate1.line_data_list[0].line = 'ABC123'
Expand Down
2 changes: 2 additions & 0 deletions tests/samples/uuid
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
bace4d19-fa7e-beef-cafe-9129474bcd81 # tp
12345678-1234-1234-1234-1234567890ab # fp
5 changes: 3 additions & 2 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def test_pdf_p(self) -> None:
# may be tested with
# https://www.dcc.edu/documents/administration/offices/information-technology/password-examples.pdf
content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "sample.pdf"])
cred_sweeper = CredSweeper(depth=33)
cred_sweeper = CredSweeper(depth=7)
cred_sweeper.run(content_provider=content_provider)
found_credentials = cred_sweeper.credential_manager.get_credentials()
self.assertSetEqual({"AWS Client ID", "Password", "Github Classic Token", "Key"},
Expand Down Expand Up @@ -786,6 +786,7 @@ def test_param_n(self) -> None:
("pager.rs", b"token: impl AsRef<str>,"), #
("pager.rs", b" let tokens = quote::quote! {"), #
("pager.rs", b" let cert_chain = x509_rx"), #
("my.kt", b'val password: String? = null'), #
]
content_provider: AbstractProvider = FilesProvider([(file_name, io.BytesIO(data_line))
for file_name, data_line in items])
Expand Down Expand Up @@ -819,7 +820,7 @@ def test_param_p(self) -> None:
("accept.py", b"password='Ahga%$FiQ@Ei8'", "password", "Ahga%$FiQ@Ei8"), #
("test.template", b" NAMED_API_KEY=qii7t1m6423127xto389xc914l34451qz5135865564sg ", "NAMED_API_KEY",
"qii7t1m6423127xto389xc914l34451qz5135865564sg"), #
("my.kt", b'val password: String? = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"), #
("my.kt", b'val password: String = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"), #
]
for file_name, data_line, variable, value in items:
content_provider: AbstractProvider = FilesProvider([
Expand Down
Loading