Skip to content

Commit

Permalink
ValuePathFilter and keyword regex enhancement (#595)
Browse files Browse the repository at this point in the history
* square bracket workaround in keyword regex

* path filter

* BM score fix

* ValueStringTypeCheck workaround for heterogeneous source

* wrap added to filter array definitions

* TOML format sanitizer

* YAML case

* BM fix

* BM scores fix

* skip f* in BM experiment

* keep 0*-3* meta for experiment

* less repos in test
  • Loading branch information
babenek authored Aug 13, 2024
1 parent 061d0d5 commit 45e0643
Show file tree
Hide file tree
Showing 20 changed files with 221 additions and 60 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,8 @@ jobs:

- name: Exclude some sets for speed-up
run: |
rm -rf data/2* data/8* data/b*
rm -rf meta/2* meta/8* meta/b*
rm -rf data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/b* data/c* data/d* data/e* data/f*
rm -rf meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/b* meta/c* meta/d* meta/e* meta/f*
mkdir -vp ${{ github.workspace }}/CredData
mv data ${{ github.workspace }}/CredData/
mv meta ${{ github.workspace }}/CredData/
Expand Down Expand Up @@ -424,7 +424,7 @@ jobs:
# run quick scan
python -m credsweeper --log debug --path ../tests/samples --save-json
NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
if [ 100 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then
if [ 10 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then
echo "Failure: found ${NEW_MODEL_FOUND_SAMPLES} credentials"
exit 1
fi
Expand Down
52 changes: 26 additions & 26 deletions cicd/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
DATA: 16348035 interested lines. MARKUP: 62567 items
DATA: 16348035 interested lines. MARKUP: 62632 items
FileType FileNumber ValidLines Positives Negatives Templates
--------------- ------------ ------------ ----------- ----------- -----------
194 28318 66 427 87
Expand Down Expand Up @@ -33,7 +33,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.creds 1 10 1 1
.crlf 1 27 1
.crt 2 4979 253
.cs 268 82410 158 907 94
.cs 268 82410 158 910 94
.cshtml 5 180 12
.csp 3 379 11
.csproj 1 14 1
Expand Down Expand Up @@ -61,7 +61,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.gd 1 37 1
.gml 3 3075 26
.gni 3 5017 18
.go 1080 566476 673 4319 741
.go 1080 566476 690 4330 741
.golden 5 1168 1 14 29
.gradle 45 3265 4 91 100
.graphql 7 420 13
Expand All @@ -80,10 +80,10 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 134 5
.j 1 241 2 2
.j2 30 5530 6 213 10
.java 621 134132 359 1360 170
.java 621 134132 362 1359 170
.jenkinsfile 1 58 2 7
.jinja2 1 64 2
.js 659 536413 536 2635 330
.js 659 536413 536 2636 330
.json 850 13046270 1074 10778 140
.jsp 13 3202 1 42
.jsx 7 857 19
Expand All @@ -105,12 +105,12 @@ FileType FileNumber ValidLines Positives Negatives Templat
.lock 24 160912 144
.log 2 199 38 52
.lua 10 1924 37 3
.m 16 13358 11 152 3
.m 16 13358 11 154 3
.manifest 3 102 9 3
.markdown 3 139 3 1
.markerb 3 12 3
.marko 1 21 2
.md 674 149399 722 2365 662
.md 674 149399 722 2370 662
.mdx 3 549 7
.mjml 1 18 1
.mjs 22 4424 78 343
Expand All @@ -122,7 +122,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.mqh 1 1023 2
.msg 1 26644 1 1
.mysql 1 36 2
.ndjson 2 5006 70 266 2
.ndjson 2 5006 71 265 2
.nix 4 211 12
.nolint 1 2 1
.odd 1 1281 57
Expand Down Expand Up @@ -150,10 +150,10 @@ FileType FileNumber ValidLines Positives Negatives Templat
.pug 2 193 2
.purs 1 69 4
.pxd 1 150 5 2
.py 890 291553 685 3456 729
.py 890 291553 682 3462 729
.pyi 4 1361 9
.pyp 1 167 1
.pyx 2 1094 21
.pyx 2 1094 23
.r 4 62 6 3 1
.rake 2 51 2
.rb 860 131838 259 3451 612
Expand Down Expand Up @@ -217,32 +217,32 @@ FileType FileNumber ValidLines Positives Negatives Templat
.xml 9 689 9
.xsl 1 311 1
.yaml 137 19004 128 356 44
.yml 418 36162 515 910 384
.yml 418 36162 550 910 384
.zsh 6 872 12
.zsh-theme 1 97 1
TOTAL: 10259 16348035 8706 59679 5182
credsweeper result_cnt : 7664, lost_cnt : 0, true_cnt : 7472, false_cnt : 192
TOTAL: 10259 16348035 8759 59707 5182
credsweeper result_cnt : 7749, lost_cnt : 0, true_cnt : 7530, false_cnt : 219
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- --------
API 131 3126 185 111 109 2 3309 22 0.000604 0.167939 0.993027 0.981982 0.832061 0.900826
API 128 3130 185 111 109 2 3313 19 0.000603 0.148438 0.993901 0.981982 0.851562 0.912134
AWS Client ID 167 18 0 160 160 0 18 7 0.000000 0.041916 0.962162 1.000000 0.958084 0.978593
AWS Multi 75 14 0 87 75 11 3 0 0.785714 0.000000 0.876404 0.872093 1.000000 0.931677
AWS S3 Bucket 66 24 0 92 66 24 0 0 1.000000 0.000000 0.733333 0.733333 1.000000 0.846154
Atlassian Old PAT token 27 208 3 12 3 8 203 24 0.037915 0.888889 0.865546 0.272727 0.111111 0.157895
Auth 412 2723 76 371 353 18 2781 59 0.006431 0.143204 0.976020 0.951482 0.856796 0.901660
Auth 412 2724 76 373 355 18 2782 57 0.006429 0.138350 0.976650 0.951743 0.861650 0.904459
Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194
BASE64 Private Key 7 2 0 7 7 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333
Bitbucket Client ID 142 1807 9 46 27 18 1798 115 0.009912 0.809859 0.932074 0.600000 0.190141 0.288770
Bitbucket Client Secret 230 527 10 44 33 11 526 197 0.020484 0.856522 0.728814 0.750000 0.143478 0.240876
Certificate 25 460 1 21 20 1 460 5 0.002169 0.200000 0.987654 0.952381 0.800000 0.869565
Credential 94 154 74 90 90 0 228 4 0.000000 0.042553 0.987578 1.000000 0.957447 0.978261
Certificate 25 466 1 27 20 7 460 5 0.014989 0.200000 0.975610 0.740741 0.800000 0.769231
Credential 94 154 74 83 83 0 228 11 0.000000 0.117021 0.965839 1.000000 0.882979 0.937853
Docker Swarm Token 2 0 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Dropbox App secret 62 114 0 46 36 9 105 26 0.078947 0.419355 0.801136 0.800000 0.580645 0.672897
Facebook Access Token 0 1 0 0 0 1 0 0.000000 1.000000
Firebase Domain 6 1 0 7 6 1 0 0 1.000000 0.000000 0.857143 0.857143 1.000000 0.923077
Github Old Token 1 0 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Gitlab Feed Token 188 451 87 60 47 12 526 141 0.022305 0.750000 0.789256 0.796610 0.250000 0.380567
Gitlab Feed Token 189 450 87 60 48 11 526 141 0.020484 0.746032 0.790634 0.813559 0.253968 0.387097
Gitlab Incoming Email Token 37 3 0 21 19 2 1 18 0.666667 0.486486 0.500000 0.904762 0.513514 0.655172
Google API Key 12 0 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381
Expand All @@ -251,16 +251,16 @@ Grafana Provisioned API Key 22 1 0
JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432
Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000
Jira 2FA 14 6 0 10 10 0 6 4 0.000000 0.285714 0.800000 1.000000 0.714286 0.833333
Key 522 8453 464 452 447 5 8912 75 0.000561 0.143678 0.991525 0.988938 0.856322 0.917864
Nonce 91 47 0 84 83 1 46 8 0.021277 0.087912 0.934783 0.988095 0.912088 0.948571
Key 538 8456 464 468 461 7 8913 77 0.000785 0.143123 0.991119 0.985043 0.856877 0.916501
Nonce 91 48 0 85 83 2 46 8 0.041667 0.087912 0.928058 0.976471 0.912088 0.943182
PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041
Password 1841 7468 2724 1691 1637 54 10138 204 0.005298 0.110809 0.978559 0.968066 0.889191 0.926954
Salt 45 73 2 39 39 0 75 6 0.000000 0.133333 0.950000 1.000000 0.866667 0.928571
Secret 1365 28359 868 1237 1233 4 29223 132 0.000137 0.096703 0.995554 0.996766 0.903297 0.947733
Password 1842 7476 2724 1725 1656 69 10131 186 0.006765 0.100977 0.978824 0.960000 0.899023 0.928511
Salt 45 74 2 40 39 1 75 6 0.013158 0.133333 0.942149 0.975000 0.866667 0.917647
Secret 1367 28360 868 1240 1234 6 29222 133 0.000205 0.097293 0.995457 0.995161 0.902707 0.946682
Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000
Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
Token 612 3949 437 516 511 5 4381 101 0.001140 0.165033 0.978792 0.990310 0.834967 0.906028
Token 648 3952 437 545 539 6 4383 109 0.001367 0.168210 0.977169 0.988991 0.831790 0.903604
Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000
URL Credentials 209 127 240 200 200 0 367 9 0.000000 0.043062 0.984375 1.000000 0.956938 0.977995
URL Credentials 209 128 240 200 200 0 368 9 0.000000 0.043062 0.984402 1.000000 0.956938 0.977995
UUID 1068 1 0 1058 1057 1 0 11 1.000000 0.010300 0.988775 0.999055 0.989700 0.994356
8706 59679 5182 7671 7472 192 59487 1234 0.003217 0.141741 0.979147 0.974948 0.858259 0.912889
8759 59707 5182 7756 7530 219 59488 1229 0.003668 0.140313 0.978851 0.971738 0.859687 0.912285
6 changes: 3 additions & 3 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ class KeywordPattern:
separator = r"\s*\]?\s*" \
r"(?P<separator>:( [a-z]{3,9}[?]? )?=" \
r"|:|=>|!=|===|==|=)" \
r"((?!\s*ENC(\(|\[))(\s|\w)*\((\s|\w|=|\()*|\s*)"
r"\s*(?P<wrap>(\w|\.|->|\(|\[)*[\[\(\{](\w{1,32}=)?\s*)?"
# Authentication scheme ( oauth | basic | bearer | apikey ) precedes the credential
value = r"(?P<value_leftquote>((b|r|br|rb|u|f|rf|fr|\\{0,8})?[`'\"]){1,4})?" \
r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?" \
r"(?P<value>" \
r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){3,8000}" \
r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){1,8000}" \
r"|(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)" \
r")" \
r"(?(value_leftquote)(?P<value_rightquote>(\\{0,8}[`'\"]){1,4})?)"
r"(?(value_leftquote)(?P<value_rightquote>(\\{0,8}[`'\"]){1,4})?|(?(wrap)[\]\)\},;]))"

@classmethod
def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
Expand Down
11 changes: 11 additions & 0 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def __init__(
self.value_rightquote: Optional[str] = None
# is set when variable & value are in URL for any source type
self.url_part = False
self.wrap = None

self.initialize(match_obj)

Expand Down Expand Up @@ -121,6 +122,7 @@ def get_span_from_match_obj(_match_obj: re.Match, group: str) -> Tuple[int, int]
self.variable_start, self.variable_end = get_span_from_match_obj(match_obj, "variable")
self.value_leftquote = get_group_from_match_obj(match_obj, "value_leftquote")
self.value_rightquote = get_group_from_match_obj(match_obj, "value_rightquote")
self.wrap = get_group_from_match_obj(match_obj, "wrap")
self.sanitize_value()
self.sanitize_variable()

Expand All @@ -131,6 +133,7 @@ def sanitize_value(self):
_value = self.value
self.clean_url_parameters()
self.clean_bash_parameters()
self.clean_toml_parameters()
if 0 <= self.value_start and 0 <= self.value_end and len(self.value) < len(_value):
start = _value.find(self.value)
self.value_start += start
Expand Down Expand Up @@ -186,6 +189,11 @@ def clean_bash_parameters(self) -> None:
if len(value_whsp) > 1:
self.value = value_whsp[0]

def clean_toml_parameters(self) -> None:
    """Drop trailing curly brackets that may be caught in TOML format.

    When the captured value ends with ``}`` and an opening ``{`` occurs in
    the line before the value start position, every trailing ``}`` is
    removed from ``self.value``. Otherwise the value is left untouched.
    """
    # Neither the line nor value_start changes while stripping, so the
    # prefix check is needed only once; rstrip removes the whole run of
    # closing brackets in a single pass.
    if self.value.endswith('}') and '{' in self.line[:self.value_start]:
        self.value = self.value.rstrip('}')

def sanitize_variable(self) -> None:
"""Remove trailing spaces, dashes and quotations around the variable. Correct position."""
sanitized_var_len = 0
Expand All @@ -195,6 +203,9 @@ def sanitize_variable(self) -> None:
self.variable = self.variable.strip(self.variable_strip_pattern)
if self.variable.endswith('\\'):
self.variable = self.variable[:-1]
if self.variable.startswith('{') and '}' in self.line[self.variable_end:]:
# TOML case
self.variable = self.variable[1:]
if variable and len(self.variable) < len(variable) and 0 <= self.variable_start and 0 <= self.variable_end:
start = variable.find(self.variable)
self.variable_start += start
Expand Down
2 changes: 2 additions & 0 deletions credsweeper/filters/value_array_dictionary_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
return False
if self.PATTERN.search(line_data.value):
return True
if line_data.wrap and not line_data.is_well_quoted_value and ('[' in line_data.wrap or '(' in line_data.wrap):
return True

return False
Loading

0 comments on commit 45e0643

Please sign in to comment.