Skip to content

Commit

Permalink
Add support for merging and matching results as well as some aggregat…
Browse files Browse the repository at this point in the history
…ed stats printed to the console
  • Loading branch information
austimkelly committed Jan 29, 2024
1 parent c51af61 commit ee0aef2
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 12 deletions.
8 changes: 4 additions & 4 deletions org-scan/csv_coalesce.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def unify_csv_files(trufflehog_file, gitleaks_file, output_file):
def find_matches(input_file, output_file):
# Define the headers for the input and output CSV files
input_headers = ['source', 'owner', 'repo_name', 'file', 'line', 'secret', 'th_source_id', 'th_source_type', 'th_source_name', 'th_detector_type', 'th_detector_name', 'th_decoder_name', 'th_verified', 'th_raw', 'th_raw_v2', 'th_redacted', 'gl_owner', 'gl_commit', 'gl_symlink_file', 'gl_secret', 'gl_match', 'gl_start_line', 'gl_end_line', 'gl_start_column', 'gl_end_column', 'gl_author', 'gl_message', 'gl_date', 'gl_email', 'gl_fingerprint', 'gl_tags']
output_headers = ['match', 'match_score', 'match_source', 'matched_on_source', 'match_reason'] + input_headers
output_headers = ['match', 'match_score', 'match_source', 'matched_on_source', 'match_reason'] + input_headers + [f'matched_{key}' for key in input_headers if key.startswith('gl_') or key.startswith('th_')]

# Open the input CSV file and read the data into a list of dictionaries
with open(input_file, 'r') as f_in:
Expand Down Expand Up @@ -93,6 +93,9 @@ def find_matches(input_file, output_file):
if match_score > 50:
# Create a new dictionary for the output row, copying all fields from the input row
output_row = {k: row[k] for k in input_headers}
for key in other_row:
if key.startswith('gl_') or key.startswith('th_'):
output_row[f'matched_{key}'] = other_row[key]
# Replace the 'source' field with 'match'
output_row['match'] = output_row.pop('source')
# Add the match score, match source, matched on source, and match reason fields
Expand All @@ -112,7 +115,4 @@ def find_matches(input_file, output_file):
writer.writeheader()
writer.writerows(matches)

# Script entry point.
# Guarded so that `from csv_coalesce import *` (done by gitleaks-org-scan.py)
# does not re-run the merge/match pipeline as an import side effect.
# NOTE(review): the trufflehog input filename embeds a hard-coded timestamp —
# presumably left over from a manual run; confirm before relying on this.
if __name__ == "__main__":
    unify_csv_files('trufflehog_results_202401260938.csv', 'gitleaks_report_concat.csv', 'unified.csv')
    find_matches('unified.csv', 'matches.csv')

57 changes: 50 additions & 7 deletions org-scan/gitleaks-org-scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
import json
import sys

# import all functions from csv_coalesce.py
from csv_coalesce import *

# Add command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--clean", action="store_true", help="delete the directories ./checkouts and ./reports. When --clean is present all other commands are ignored.")
Expand Down Expand Up @@ -63,7 +66,7 @@ def check_commands():
exit(0)

# Function to concatenate CSV files
def concatenate_csv_files():
def concatenate_csv_files(report_filename):
# Get a list of all CSV files in the {REPORTS_DIR} directory
csv_files = glob.glob(f'{REPORTS_DIR}/*.csv')

Expand Down Expand Up @@ -108,12 +111,12 @@ def concatenate_csv_files():

# Check if concatenated_df is empty
if concatenated_df.empty:
print("WARNING: No results to write to gitleaks_report_concat.csv")
print(f"WARNING: No results to write to {report_filename}")
else:
# Write the concatenated DataFrame to a new CSV file
print(f"Writing concatenated CSV file to {REPORTS_DIR}/gitleaks_report_concat.csv...")
print(f"Writing concatenated CSV file to ./{report_filename}...")
if not DRY_RUN:
concatenated_df.to_csv('gitleaks_report_concat.csv', index=False)
concatenated_df.to_csv(f"{report_filename}", index=False)

def fetch_repos(account_type, account, headers, page=1, per_page=100):
repos = []
Expand Down Expand Up @@ -181,6 +184,34 @@ def do_trufflehog_scan(target, repo_name, repo_path, report_filename):
else:
print(f"Unexpected structure in finding: {finding}")

def analyze_merged_results(merged_results):
    """Print aggregate statistics for a merged secrets-scan report.

    Args:
        merged_results: Path to a CSV file with at least the columns
            'owner', 'source', 'repo_name', and 'secret'.

    Returns:
        None. All statistics are printed to stdout.
    """
    df = pd.read_csv(merged_results)

    # Distinct owners (orgs/users) represented in the report.
    distinct_owners = df['owner'].unique()
    print(f"Owners: {distinct_owners}")

    # Distinct scanning tools that contributed results (e.g. trufflehog, gitleaks).
    distinct_sources = df['source'].unique()
    print(f"Scanning Source Tools: {distinct_sources}")

    # Count the total distinct repo_name.
    # Bug fix: .count() counted every non-null row (one per finding),
    # inflating the repo total; .nunique() counts distinct repos.
    total_repos = df['repo_name'].nunique()
    print(f"Total Repos: {total_repos}")

    # Per-tool totals of (non-null) secret rows.
    total_secrets_by_source = df.groupby('source')['secret'].count().to_dict()
    print(f"Total Secrets by Source: {total_secrets_by_source}")

    # Total non-null secret rows across all tools (duplicates counted).
    total_secrets = df['secret'].count()
    print(f"Total Secrets: {total_secrets}")

    # Distinct secret values; a secret found by both tools counts once.
    total_distinct_secrets = df['secret'].nunique()
    print(f"Total Distinct Secrets: {total_distinct_secrets}")


# make ./reports directory if it doesn't exist
if not os.path.exists(REPORTS_DIR):
os.makedirs(REPORTS_DIR)
Expand Down Expand Up @@ -226,7 +257,19 @@ def do_trufflehog_scan(target, repo_name, repo_path, report_filename):
do_trufflehog_scan(target, repo_bare_name, repo_checkout_path, trufflehog_report_filename)

# Concatenate all CSV files into a single CSV file
print("Concatenating CSV files...")
concatenate_csv_files()
print("Concatenating gitleaks report CSV files...")
timestamp = datetime.now().strftime('%Y%m%d%H%M')
gitleaks_merged_report_filename = f'gitleaks_report_merged_filename_{timestamp}.csv'
concatenate_csv_files(gitleaks_merged_report_filename)

print("Secrets scanning execution completed.")
print("Creating merge and match reports.")

# Create a unified report of all secrets, plus a match report restricted
# to only the (fuzzy) matches found among the secrets results
merged_report_name = 'merged_scan_results_report.csv'
unify_csv_files(trufflehog_report_filename, gitleaks_merged_report_filename, merged_report_name)
find_matches(merged_report_name, 'scanning_tool_matches_only.csv')

print("Script execution completed.")
# Aggregate report results
analyze_merged_results(merged_report_name)
4 changes: 3 additions & 1 deletion org-scan/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
requests
pandas
pandas
python-Levenshtein
fuzzywuzzy

0 comments on commit ee0aef2

Please sign in to comment.