Skip to content

Commit

Permalink
Add support for merging and matching results as well as some aggregat…
Browse files Browse the repository at this point in the history
…ed stats printed to the console
  • Loading branch information
austimkelly committed Jan 29, 2024
1 parent c51af61 commit ee0aef2
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 12 deletions.
8 changes: 4 additions & 4 deletions org-scan/csv_coalesce.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def unify_csv_files(trufflehog_file, gitleaks_file, output_file):
def find_matches(input_file, output_file):
# Define the headers for the input and output CSV files
input_headers = ['source', 'owner', 'repo_name', 'file', 'line', 'secret', 'th_source_id', 'th_source_type', 'th_source_name', 'th_detector_type', 'th_detector_name', 'th_decoder_name', 'th_verified', 'th_raw', 'th_raw_v2', 'th_redacted', 'gl_owner', 'gl_commit', 'gl_symlink_file', 'gl_secret', 'gl_match', 'gl_start_line', 'gl_end_line', 'gl_start_column', 'gl_end_column', 'gl_author', 'gl_message', 'gl_date', 'gl_email', 'gl_fingerprint', 'gl_tags']
output_headers = ['match', 'match_score', 'match_source', 'matched_on_source', 'match_reason'] + input_headers
output_headers = ['match', 'match_score', 'match_source', 'matched_on_source', 'match_reason'] + input_headers + [f'matched_{key}' for key in input_headers if key.startswith('gl_') or key.startswith('th_')]

# Open the input CSV file and read the data into a list of dictionaries
with open(input_file, 'r') as f_in:
Expand Down Expand Up @@ -93,6 +93,9 @@ def find_matches(input_file, output_file):
if match_score > 50:
# Create a new dictionary for the output row, copying all fields from the input row
output_row = {k: row[k] for k in input_headers}
for key in other_row:
if key.startswith('gl_') or key.startswith('th_'):
output_row[f'matched_{key}'] = other_row[key]
# Replace the 'source' field with 'match'
output_row['match'] = output_row.pop('source')
# Add the match score, match source, matched on source, and match reason fields
Expand All @@ -112,7 +115,4 @@ def find_matches(input_file, output_file):
writer.writeheader()
writer.writerows(matches)

# Script entry point.
# Guarded so that `from csv_coalesce import *` (done by gitleaks-org-scan.py)
# does not re-run the merge/match pipeline as an import side effect.
# NOTE(review): the trufflehog input filename embeds a hard-coded timestamp —
# presumably left over from a manual run; confirm before relying on this.
if __name__ == "__main__":
    unify_csv_files('trufflehog_results_202401260938.csv', 'gitleaks_report_concat.csv', 'unified.csv')
    find_matches('unified.csv', 'matches.csv')

57 changes: 50 additions & 7 deletions org-scan/gitleaks-org-scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
import json
import sys

# import all functions from csv_coalesce.py
from csv_coalesce import *

# Add command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--clean", action="store_true", help="delete the directories ./checkouts and ./reports. When --clean is present all other commands are ignored.")
Expand Down Expand Up @@ -63,7 +66,7 @@ def check_commands():
exit(0)

# Function to concatenate CSV files
def concatenate_csv_files():
def concatenate_csv_files(report_filename):
# Get a list of all CSV files in the {REPORTS_DIR} directory
csv_files = glob.glob(f'{REPORTS_DIR}/*.csv')

Expand Down Expand Up @@ -108,12 +111,12 @@ def concatenate_csv_files():

# Check if concatenated_df is empty
if concatenated_df.empty:
print("WARNING: No results to write to gitleaks_report_concat.csv")
print(f"WARNING: No results to write to {report_filename}")
else:
# Write the concatenated DataFrame to a new CSV file
print(f"Writing concatenated CSV file to {REPORTS_DIR}/gitleaks_report_concat.csv...")
print(f"Writing concatenated CSV file to ./{report_filename}...")
if not DRY_RUN:
concatenated_df.to_csv('gitleaks_report_concat.csv', index=False)
concatenated_df.to_csv(f"{report_filename}", index=False)

def fetch_repos(account_type, account, headers, page=1, per_page=100):
repos = []
Expand Down Expand Up @@ -181,6 +184,34 @@ def do_trufflehog_scan(target, repo_name, repo_path, report_filename):
else:
print(f"Unexpected structure in finding: {finding}")

def analyze_merged_results(merged_results):
    """Print aggregate statistics for a merged secrets-scan report.

    Args:
        merged_results: Path to a CSV file with at least the columns
            'owner', 'source', 'repo_name', and 'secret'.

    Returns:
        None. All statistics are printed to stdout.
    """
    df = pd.read_csv(merged_results)

    # Distinct owners (orgs/users) represented in the report.
    distinct_owners = df['owner'].unique()
    print(f"Owners: {distinct_owners}")

    # Distinct scanning tools that contributed results (e.g. trufflehog, gitleaks).
    distinct_sources = df['source'].unique()
    print(f"Scanning Source Tools: {distinct_sources}")

    # Count the total distinct repo_name.
    # Bug fix: .count() counted every non-null row (one per finding),
    # inflating the repo total; .nunique() counts distinct repos.
    total_repos = df['repo_name'].nunique()
    print(f"Total Repos: {total_repos}")

    # Per-tool totals of (non-null) secret rows.
    total_secrets_by_source = df.groupby('source')['secret'].count().to_dict()
    print(f"Total Secrets by Source: {total_secrets_by_source}")

    # Total non-null secret rows across all tools (duplicates counted).
    total_secrets = df['secret'].count()
    print(f"Total Secrets: {total_secrets}")

    # Distinct secret values; a secret found by both tools counts once.
    total_distinct_secrets = df['secret'].nunique()
    print(f"Total Distinct Secrets: {total_distinct_secrets}")


# make ./reports directory if it doesn't exist
if not os.path.exists(REPORTS_DIR):
os.makedirs(REPORTS_DIR)
Expand Down Expand Up @@ -226,7 +257,19 @@ def do_trufflehog_scan(target, repo_name, repo_path, report_filename):
do_trufflehog_scan(target, repo_bare_name, repo_checkout_path, trufflehog_report_filename)

# Concatenate all CSV files into a single CSV file
print("Concatenating CSV files...")
concatenate_csv_files()
print("Concatenating gitleaks report CSV files...")
timestamp = datetime.now().strftime('%Y%m%d%H%M')
gitleaks_merged_report_filename = f'gitleaks_report_merged_filename_{timestamp}.csv'
concatenate_csv_files(gitleaks_merged_report_filename)

print("Secrets scanning execution completed.")
print("Creating merge and match reports.")

# Create a unified report of all secrets, plus a match report restricted
# to only the (fuzzy) matches found among the secrets results
merged_report_name = 'merged_scan_results_report.csv'
unify_csv_files(trufflehog_report_filename, gitleaks_merged_report_filename, merged_report_name)
find_matches(merged_report_name, 'scanning_tool_matches_only.csv')

print("Script execution completed.")
# Aggregate report results
analyze_merged_results(merged_report_name)
4 changes: 3 additions & 1 deletion org-scan/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
requests
pandas
pandas
python-Levenshtein
fuzzywuzzy

0 comments on commit ee0aef2

Please sign in to comment.