Skip to content

Commit

Permalink
Filters optimisation (#588)
Browse files Browse the repository at this point in the history
* Filters optimisation

* style

* unicode cases in filter

* tmp markup loan

* BM scores fix

* docs upd

* removed unused filter

* doc upd

* [no ci] upd2

* test counters fix

* reduce whitespaces during extracting subtext

* aux BM ref

* BM scores fix

* Rollback BM

* JWT fix

* customBMref

* JWT fix BC scor

* BM scores fix
  • Loading branch information
babenek authored Aug 9, 2024
1 parent af4e9b0 commit e31ef71
Show file tree
Hide file tree
Showing 33 changed files with 318 additions and 223 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: jwt
ref: auxiliary

- name: Markup hashing
run: |
Expand Down Expand Up @@ -74,7 +74,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: jwt
ref: auxiliary

- name: Markup hashing
run: |
Expand Down Expand Up @@ -172,7 +172,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: jwt
ref: auxiliary

- name: Markup hashing
run: |
Expand Down Expand Up @@ -354,7 +354,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: babenek/CredData
ref: jwt
ref: auxiliary

- name: Markup hashing
run: |
Expand Down
66 changes: 33 additions & 33 deletions cicd/benchmark.txt

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ def sanitize_variable(self) -> None:
while self.variable and sanitized_var_len != len(self.variable):
sanitized_var_len = len(self.variable)
self.variable = self.variable.strip(self.variable_strip_pattern)
if self.variable.endswith('\\'):
self.variable = self.variable[:-1]
if variable and len(self.variable) < len(variable) and 0 <= self.variable_start and 0 <= self.variable_end:
start = variable.find(self.variable)
self.variable_start += start
Expand Down
2 changes: 0 additions & 2 deletions credsweeper/filters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,11 @@
from credsweeper.filters.value_jfrog_token_check import ValueJfrogTokenCheck
from credsweeper.filters.value_json_web_token_check import ValueJsonWebTokenCheck
from credsweeper.filters.value_last_word_check import ValueLastWordCheck
from credsweeper.filters.value_length_check import ValueLengthCheck
from credsweeper.filters.value_method_check import ValueMethodCheck
from credsweeper.filters.value_not_allowed_pattern_check import ValueNotAllowedPatternCheck
from credsweeper.filters.value_not_part_encoded_check import ValueNotPartEncodedCheck
from credsweeper.filters.value_number_check import ValueNumberCheck
from credsweeper.filters.value_pattern_check import ValuePatternCheck
from credsweeper.filters.value_pattern_length_check import ValuePatternLengthCheck
from credsweeper.filters.value_similarity_check import ValueSimilarityCheck
from credsweeper.filters.value_split_keyword_check import ValueSplitKeywordCheck
from credsweeper.filters.value_string_type_check import ValueStringTypeCheck
Expand Down
2 changes: 2 additions & 0 deletions credsweeper/filters/group/general_pattern.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from credsweeper.common.constants import GroupType
from credsweeper.config import Config
from credsweeper.filters import ValueUselessWordCheck
from credsweeper.filters.group import Group


Expand All @@ -8,3 +9,4 @@ class GeneralPattern(Group):

def __init__(self, config: Config) -> None:
super().__init__(config, GroupType.PATTERN)
self.filters.extend([ValueUselessWordCheck()])
8 changes: 3 additions & 5 deletions credsweeper/filters/group/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from credsweeper.config import Config
from credsweeper.filters import (Filter, LineSpecificKeyCheck, ValueAllowlistCheck, ValueArrayDictionaryCheck,
ValueBlocklistCheck, ValueCamelCaseCheck, ValueFilePathCheck, ValueFirstWordCheck,
ValueLastWordCheck, ValueLengthCheck, ValueMethodCheck, ValueNotAllowedPatternCheck,
ValuePatternCheck, ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck,
VariableNotAllowedPatternCheck, ValuePatternLengthCheck, ValueHexNumberCheck)
ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, ValuePatternCheck,
ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck,
VariableNotAllowedPatternCheck, ValueHexNumberCheck)


class Group(ABC):
Expand Down Expand Up @@ -43,7 +43,6 @@ def get_keyword_base_filters(config: Config) -> List[Filter]:
ValueFirstWordCheck(),
ValueHexNumberCheck(),
ValueLastWordCheck(),
ValueLengthCheck(config),
ValueMethodCheck(),
ValueSimilarityCheck(),
ValueStringTypeCheck(config),
Expand All @@ -60,5 +59,4 @@ def get_pattern_base_filters(config: Config) -> List[Filter]:
return [ #
LineSpecificKeyCheck(), #
ValuePatternCheck(config), #
ValuePatternLengthCheck(config), #
]
7 changes: 3 additions & 4 deletions credsweeper/filters/group/url_credentials_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from credsweeper.config import Config
from credsweeper.filters import (ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck,
ValueCamelCaseCheck, ValueDictionaryValueLengthCheck, ValueFilePathCheck,
ValueFirstWordCheck, ValueLastWordCheck, ValueLengthCheck, ValueMethodCheck,
ValueNotAllowedPatternCheck, ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck)
ValueFirstWordCheck, ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck,
ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck)
from credsweeper.filters.group import Group


Expand All @@ -25,11 +25,10 @@ def __init__(self, config: Config) -> None:
ValueFilePathCheck(),
ValueFirstWordCheck(),
ValueLastWordCheck(),
ValueLengthCheck(config),
ValueMethodCheck(),
ValueStringTypeCheck(config),
ValueNotAllowedPatternCheck(),
ValueTokenCheck(),
ValueDictionaryValueLengthCheck(),
ValueDictionaryValueLengthCheck(min_len=4, max_len=80),
ValuePatternCheck(config)
]
12 changes: 9 additions & 3 deletions credsweeper/filters/line_specific_key_check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re

from credsweeper.common.constants import ML_HUNK
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
Expand All @@ -10,8 +11,8 @@
class LineSpecificKeyCheck(Filter):
"""Check that none of the values from the list below appear in the candidate line."""

NOT_ALLOWED = [r"example", r"enc\(", r"enc\[", r"true", r"false"]
NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED))
NOT_ALLOWED = [r"example", r"\benc[\(\[]", r"\btrue\b", r"\bfalse\b"]
NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED), re.IGNORECASE)

def __init__(self, config: Config = None) -> None:
pass
Expand All @@ -29,8 +30,13 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
"""
if line_data.line is None:
return True
if 0 <= line_data.variable_start:
# variable may be defined too
sub_line_start = 0 if ML_HUNK >= line_data.variable_start else line_data.variable_start - ML_HUNK
else:
sub_line_start = 0 if ML_HUNK >= line_data.value_start else line_data.value_start - ML_HUNK

if self.NOT_ALLOWED_PATTERN.search(target.line_lower):
if self.NOT_ALLOWED_PATTERN.search(line_data.line, sub_line_start, line_data.value_end + ML_HUNK):
return True

return False
49 changes: 0 additions & 49 deletions credsweeper/filters/separator_unusual_check.py

This file was deleted.

7 changes: 4 additions & 3 deletions credsweeper/filters/value_dictionary_value_length_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
class ValueDictionaryValueLengthCheck(Filter):
"""Check that candidate length is between min_len and max_len inclusive (defaults: 4 and 31)."""

def __init__(self, config: Config = None) -> None:
pass
def __init__(self, config: Config = None, min_len: int = 4, max_len: int = 31) -> None:
self.min_len = min_len
self.max_len = max_len

def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
"""Run filter checks on received credential candidate data 'line_data'.
Expand All @@ -21,7 +22,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
True, if need to filter candidate and False if left
"""
if 4 <= len(line_data.value) <= 31:
if self.min_len <= len(line_data.value) <= self.max_len:
return False
else:
return True
2 changes: 1 addition & 1 deletion credsweeper/filters/value_json_web_token_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
if not header_check:
header_check = bool(ValueJsonWebTokenCheck.header_keys.intersection(json_keys))
# payload follows the header
if not payload_check:
elif not payload_check:
payload_check = bool(ValueJsonWebTokenCheck.payload_keys.intersection(json_keys))
# any other payloads are allowed
elif header_check and payload_check and not signature_check:
Expand Down
26 changes: 0 additions & 26 deletions credsweeper/filters/value_length_check.py

This file was deleted.

2 changes: 1 addition & 1 deletion credsweeper/filters/value_not_allowed_pattern_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class ValueNotAllowedPatternCheck(Filter):
"""Check that secret doesn't open or closes brackets or a new line."""

NOT_ALLOWED = [r"[<>\[\]{}]\s+", r"^\s*\\", r"^\s*\\n\s*"]
NOT_ALLOWED = [r"[<>\[\]{}]\s+", r"\\u00(26|3c)gt;?(\s|\\+[nrt])?", r"^\s*\\", r"^\s*\\n\s*"]
NOT_ALLOWED_PATTERN = re.compile( #
f"{Util.get_regex_combine_or(NOT_ALLOWED)}$", #
flags=re.IGNORECASE)
Expand Down
10 changes: 0 additions & 10 deletions credsweeper/filters/value_pattern_length_check.py

This file was deleted.

7 changes: 3 additions & 4 deletions credsweeper/filters/value_useless_word_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,10 @@ class ValueUselessWordCheck(Filter):
"""Check whether the candidate value contains sub-rows with operators (like ->)."""

NOT_ALLOWED = [
"((\\{)?(0x)+([0-9a-f]|\\%){1}.*)", # Check is contain \{0x or 0x
"(\\-\\>.*)", # Check if contain ->
"(xxxx.*)", # Check if contain xxxxx
"((\\{)?(0x)+([0-9a-f]|\\%){1})", # Check is contain \{0x or 0x
r"((\w+)?->)", # Check if contain ->
"(.*example)", # Check if contain `example` word
"(\\$\\w+)", # Check whether it looks like a variable e.g. $word
"(\\s).*" # Check if contain \s
]
NOT_ALLOWED_PATTERN = re.compile( #
Util.get_regex_combine_or(NOT_ALLOWED), #
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@
confidence: moderate
type: pattern
values:
- (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}@:/]{0,80}:){1,3}(?P<value>[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
- (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}:/]{0,80}:){1,3}(?P<value>[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
filter_type: UrlCredentialsGroup
use_ml: true
required_substrings:
Expand Down
10 changes: 9 additions & 1 deletion credsweeper/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import math
import os
import string
import struct
import tarfile
from dataclasses import dataclass
Expand Down Expand Up @@ -690,6 +691,13 @@ def subtext(text: str, pos: int, hunk_size: int) -> str:
else:
left_quota = hunk_size - pos
left_pos = 0
# skip leading whitespaces in result string
for i in range(left_pos, pos):
if text[i] in string.whitespace:
left_quota += 1
left_pos += 1
else:
break
right_remain = len(text) - pos
if hunk_size <= right_remain:
right_quota = 0
Expand All @@ -703,4 +711,4 @@ def subtext(text: str, pos: int, hunk_size: int) -> str:
left_pos -= right_quota
if 0 > left_pos:
left_pos = 0
return text[left_pos:right_pos]
return text[left_pos:right_pos].rstrip()
24 changes: 0 additions & 24 deletions docs/source/credsweeper.filters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,6 @@ credsweeper.filters.line\_specific\_key\_check module
:undoc-members:
:show-inheritance:

credsweeper.filters.separator\_unusual\_check module
----------------------------------------------------

.. automodule:: credsweeper.filters.separator_unusual_check
:members:
:undoc-members:
:show-inheritance:

credsweeper.filters.value\_allowlist\_check module
--------------------------------------------------

Expand Down Expand Up @@ -260,14 +252,6 @@ credsweeper.filters.value\_last\_word\_check module
:undoc-members:
:show-inheritance:

credsweeper.filters.value\_length\_check module
-----------------------------------------------

.. automodule:: credsweeper.filters.value_length_check
:members:
:undoc-members:
:show-inheritance:

credsweeper.filters.value\_method\_check module
-----------------------------------------------

Expand Down Expand Up @@ -308,14 +292,6 @@ credsweeper.filters.value\_pattern\_check module
:undoc-members:
:show-inheritance:

credsweeper.filters.value\_pattern\_length\_check module
--------------------------------------------------------

.. automodule:: credsweeper.filters.value_pattern_length_check
:members:
:undoc-members:
:show-inheritance:

credsweeper.filters.value\_similarity\_check module
---------------------------------------------------

Expand Down
Loading

0 comments on commit e31ef71

Please sign in to comment.