"""
Shannon Entropy Calculator for Secret Detection
High entropy strings are likely to be secrets
"""
import math
import re
from collections import Counter
def calculate_shannon_entropy(data: str) -> float:
    """
    Measure the Shannon entropy of a string in bits per character.

    Args:
        data: String whose character distribution is measured

    Returns:
        0.0 for empty input; otherwise a non-negative value that grows as
        the characters become more evenly distributed. The maximum is
        log2(number of distinct characters), i.e. 8.0 for 256 symbols.
    """
    if not data:
        return 0.0

    total = len(data)
    bits = 0.0
    # Each distinct character contributes -p * log2(p), where p is its
    # relative frequency in the string.
    for occurrences in Counter(data).values():
        share = occurrences / total
        bits -= share * math.log2(share)
    return bits
def calculate_base64_entropy(data: str) -> float:
    """
    Return the Shannon entropy of a string, but only if it looks like base64.

    Args:
        data: String to analyze

    Returns:
        0.0 when the string contains any character outside the standard
        base64 alphabet (A-Z, a-z, 0-9, '+', '/') and the '=' padding
        character; otherwise the plain Shannon entropy of the string.
    """
    alphabet = frozenset("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=")

    # A single foreign character disqualifies the whole string.
    if not alphabet.issuperset(data):
        return 0.0

    return calculate_shannon_entropy(data)
def is_high_entropy_string(data: str, threshold: float = 4.5) -> bool:
    """
    Tell whether a string's Shannon entropy reaches a threshold.

    Args:
        data: String to check
        threshold: Entropy at or above which the string counts as
            high entropy (default 4.5)

    Returns:
        True when the entropy is greater than or equal to the threshold,
        False otherwise
    """
    score = calculate_shannon_entropy(data)
    return score >= threshold
def extract_high_entropy_strings(
    text: str,
    min_length: int = 20,
    threshold: float = 4.5,
) -> list[dict]:
    """
    Extract all high-entropy strings from text.

    Candidates are maximal runs of letters, digits and the characters
    '+', '/', '=', '_' and '-' that are at least ``min_length`` long.

    Args:
        text: Text to search
        min_length: Minimum candidate length; values below 1 are treated as 1
        threshold: Minimum entropy a candidate must reach to be reported

    Returns:
        List of dicts, in order of appearance, each with the keys
        'string', 'entropy' (rounded to 2 decimals), 'start', 'end'
        (match offsets into ``text``) and 'length'.
    """
    # A bound below 1 would either let the pattern match empty strings
    # ("{0,}") or stop being a valid repetition count at all ("{-1,}"),
    # so clamp it before building the pattern.
    min_length = max(1, min_length)
    pattern = re.compile(r'[A-Za-z0-9+/=_\-]{' + str(min_length) + r',}')

    findings = []
    for match in pattern.finditer(text):
        candidate = match.group()
        entropy = calculate_shannon_entropy(candidate)
        if entropy >= threshold:
            findings.append({
                'string': candidate,
                'entropy': round(entropy, 2),
                'start': match.start(),
                'end': match.end(),
                'length': len(candidate),
            })
    return findings
def get_entropy_category(entropy: float) -> str:
    """
    Map an entropy value onto a named band.

    Args:
        entropy: Entropy value

    Returns:
        "VERY_HIGH" for values of at least 5.5, "HIGH" for at least 4.5,
        "MEDIUM" for at least 3.5, and "LOW" for everything else
    """
    # Lower bounds in descending order; the first one reached wins.
    bands = (
        (5.5, "VERY_HIGH"),
        (4.5, "HIGH"),
        (3.5, "MEDIUM"),
    )
    for floor, label in bands:
        if entropy >= floor:
            return label
    return "LOW"
def analyze_string_randomness(data: str) -> dict:
    """
    Comprehensive analysis of string randomness.

    Args:
        data: String to analyze

    Returns:
        Dict with the keys:
            'entropy': Shannon entropy rounded to 2 decimals
            'category': band name from get_entropy_category
            'diversity': distinct characters / total characters, rounded
                to 2 decimals (0 for an empty string)
            'has_repeating_chars': a character repeated 4+ times in a row
            'has_sequential_pattern': contains a short run such as
                "abc" or "123" (case-insensitive)
            'has_common_words': contains test/demo/example/sample/key/
                secret (case-insensitive)
            'likely_secret': entropy >= 4.5 with no repeated-character
                run and no common word
            'confidence': entropy/8 * diversity as a percentage, rounded
                to 1 decimal and capped at 100.0
    """
    entropy = calculate_shannon_entropy(data)

    # Check for patterns that reduce randomness
    has_repeating_chars = bool(re.search(r'(.)\1{3,}', data))
    has_sequential = bool(re.search(r'(abc|bcd|cde|def|012|123|234|345)', data, re.IGNORECASE))
    has_pattern = bool(re.search(r'(test|demo|example|sample|key|secret)', data, re.IGNORECASE))

    # Character diversity
    unique_chars = len(set(data))
    total_chars = len(data)
    diversity = unique_chars / total_chars if total_chars > 0 else 0

    # The 8.0 divisor assumes at most 8 bits per character; a string with
    # more than 256 distinct characters exceeds that, so cap the percentage.
    confidence = min((entropy / 8.0) * diversity * 100, 100.0)

    return {
        'entropy': round(entropy, 2),
        'category': get_entropy_category(entropy),
        'diversity': round(diversity, 2),
        'has_repeating_chars': has_repeating_chars,
        'has_sequential_pattern': has_sequential,
        'has_common_words': has_pattern,
        # The sequential-pattern flag is reported but does not affect this verdict.
        'likely_secret': entropy >= 4.5 and not (has_repeating_chars or has_pattern),
        'confidence': round(confidence, 1),
    }