/**
* Data Leak Prevention Layer for Database Operations
* Prevents accidental exposure of sensitive data during ingestion and querying
* Uses a trie (prefix tree) for keyword matching, alongside regex-based pattern rules
*/
import { logger } from './logger.js';
import { redactObject, containsSensitiveData } from './redaction.js';
/**
* A single detection rule: a regular expression plus the metadata that is
* copied onto any finding the rule produces.
*/
export interface DataLeakRule {
/** Regular expression matched against string data; any match triggers the rule */
pattern: RegExp;
/** Sensitivity level reported on the resulting finding */
severity: 'low' | 'medium' | 'high' | 'critical';
/** Category label for the kind of sensitive data (e.g. 'PII', 'Credentials') */
category: string;
/**
* Action to take when detected:
* 'block' marks the scan result as blocked, 'redact' replaces matches with a
* placeholder, 'warn' only records a finding
*/
action: 'redact' | 'block' | 'warn';
}
/**
* Outcome of scanning a string or an object tree.
*/
export interface ScanResult {
/** Whether sensitive data was detected (true when at least one finding exists) */
detected: boolean;
/** List of detected issues */
findings: Finding[];
/**
* Sanitized copy of the scanned data, when one was produced.
* scanString sets it only if a redaction changed the text; scanObject sets it
* whenever any finding was recorded. Otherwise it is undefined.
*/
sanitized?: any;
/** Whether operation should be blocked (a rule with action 'block' matched) */
blocked: boolean;
}
/**
* One detected issue, as reported in ScanResult.findings.
*/
export interface Finding {
/** Sensitivity level */
severity: 'low' | 'medium' | 'high' | 'critical';
/** Category of finding (the rule's category, or 'Sensitive Field Name') */
category: string;
/**
* Field/location where found: a path such as `root.user.email` or `root[0]`
* for object scans, or the location string passed to scanString
*/
location: string;
/** Description of what was found */
description: string;
/** Recommended action */
action: 'redact' | 'block' | 'warn';
}
/**
 * Trie node used by SensitiveKeywordTrie.
 * Each edge is labelled with one lower-cased Unicode code point.
 */
class TrieNode {
  /** Child nodes keyed by the next character of a keyword. */
  children: Map<string, TrieNode> = new Map();
  /** True when the path from the root to this node spells a complete keyword. */
  isEndOfWord: boolean = false;
  /** Rule associated with the keyword ending at this node, if any. */
  rule?: DataLeakRule;
}

/**
 * Trie-based, case-insensitive matcher for sensitive keywords.
 *
 * Time complexity: insert is O(k) for a keyword of length k. search is
 * O(n * m) for a text of n characters, where m is the length of the longest
 * stored keyword, because a trie walk is started at every text position.
 */
class SensitiveKeywordTrie {
  private root: TrieNode = new TrieNode();

  /**
   * Inserts a keyword into the trie.
   * The keyword is lower-cased first; inserting the same keyword again
   * replaces the rule stored for it.
   *
   * @param keyword Keyword to register.
   * @param rule Rule returned by search() whenever the keyword occurs.
   */
  insert(keyword: string, rule: DataLeakRule): void {
    let node = this.root;
    // for...of iterates by code point, so astral characters stay intact.
    for (const char of keyword.toLowerCase()) {
      let next = node.children.get(char);
      if (next === undefined) {
        next = new TrieNode();
        node.children.set(char, next);
      }
      node = next;
    }
    node.isEndOfWord = true;
    node.rule = rule;
  }

  /**
   * Searches for keywords anywhere in the text (substring match, ignoring case).
   *
   * @param text Text to scan.
   * @returns The rules of all keywords that occur in the text. Each rule
   * appears once, ordered by its first occurrence.
   */
  search(text: string): DataLeakRule[] {
    // Split into code points exactly like insert() does; indexing the string
    // directly would yield UTF-16 code units and never match astral characters.
    const chars = Array.from(text.toLowerCase());
    // A Set keeps insertion order and drops repeats of the same rule.
    const found = new Set<DataLeakRule>();
    for (let start = 0; start < chars.length; start++) {
      for (const rule of this.searchFrom(chars, start)) {
        found.add(rule);
      }
    }
    return [...found];
  }

  /**
   * Walks the trie from the root along the characters starting at startIndex
   * and collects the rule of every keyword that ends along the way.
   */
  private searchFrom(chars: readonly string[], startIndex: number): DataLeakRule[] {
    const found: DataLeakRule[] = [];
    let node = this.root;
    for (let index = startIndex; index < chars.length; index++) {
      const next = node.children.get(chars[index]);
      if (next === undefined) {
        break;
      }
      node = next;
      if (node.isEndOfWord && node.rule) {
        found.push(node.rule);
      }
    }
    return found;
  }
}
/**
 * Built-in data leak prevention rules.
 * Every pattern carries the `g` flag so that all occurrences in a string are
 * matched and, for 'redact' rules, replaced.
 */
const DATA_LEAK_RULES: DataLeakRule[] = [
  // PII - Personally Identifiable Information
  {
    // US Social Security number, e.g. 123-45-6789
    pattern: /\b\d{3}-\d{2}-\d{4}\b/g,
    severity: 'critical',
    category: 'PII',
    action: 'block'
  },
  {
    // Email address
    pattern: /\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b/g,
    severity: 'medium',
    category: 'PII',
    action: 'redact'
  },
  {
    // 16-digit card number, optionally grouped by spaces or hyphens
    pattern: /\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b/g,
    severity: 'critical',
    category: 'PCI-DSS',
    action: 'block'
  },
  // Credentials
  {
    // password=..., pwd: ... and JSON-style "password": "..." (the optional
    // quote after the key name covers serialized objects)
    pattern: /\b(password|passwd|pwd)['"]?\s*[:=]\s*['"]?([^'"\s]+)['"]?/gi,
    severity: 'critical',
    category: 'Credentials',
    action: 'block'
  },
  {
    // api_key=..., api-key: ..., apikey=... and the JSON-style quoted form
    pattern: /\b(api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([^'"\s]+)['"]?/gi,
    severity: 'critical',
    category: 'Credentials',
    action: 'block'
  },
  {
    // HTTP bearer token
    pattern: /(Bearer\s+[a-zA-Z0-9\-._~+\/]+=*)/gi,
    severity: 'critical',
    category: 'Credentials',
    action: 'block'
  },
  // Azure-specific
  {
    // AccountKey=... or SharedAccessSignature=... connection-string segment
    pattern: /(AccountKey|SharedAccessSignature)=([^;]+)/gi,
    severity: 'critical',
    category: 'Azure Credentials',
    action: 'block'
  },
  {
    // Full storage connection string
    pattern: /DefaultEndpointsProtocol=https;AccountName=([^;]+);AccountKey=([^;]+)/gi,
    severity: 'critical',
    category: 'Azure Connection String',
    action: 'block'
  },
  // IP addresses (can be sensitive)
  {
    // Dotted quad; octet ranges are not validated
    pattern: /\b(?:\d{1,3}\.){3}\d{1,3}\b/g,
    severity: 'low',
    category: 'Network',
    action: 'warn'
  },
  // Private keys
  {
    // PEM private key block. Besides the bare "PRIVATE KEY" label this accepts
    // the RSA, DSA, EC, OPENSSH and ENCRYPTED variants.
    pattern: /(-----BEGIN\s+(?:(?:RSA|DSA|EC|OPENSSH|ENCRYPTED)\s+)?PRIVATE\s+KEY-----)([\s\S]*?)(-----END\s+(?:(?:RSA|DSA|EC|OPENSSH|ENCRYPTED)\s+)?PRIVATE\s+KEY-----)/gi,
    severity: 'critical',
    category: 'Cryptographic Keys',
    action: 'block'
  }
];
/**
 * Data Leak Prevention Scanner
 * Scans data for sensitive information before storage/transmission.
 *
 * Detection combines three mechanisms: regex rules (the built-in rules plus
 * any custom rules), a keyword trie that flags sensitive words inside string
 * values, and a list of sensitive field names checked against object keys.
 */
export class DataLeakScanner {
  /** Placeholder written in place of redacted matches and sensitive field values. */
  private static readonly REDACTED = '***REDACTED***';

  private readonly rules: DataLeakRule[];
  private readonly keywordTrie: SensitiveKeywordTrie;
  // Field names treated as sensitive; an object key is flagged when its
  // lower-cased form contains any of these as a substring.
  private readonly sensitiveFieldNames: Set<string> = new Set([
    'password', 'secret', 'token', 'key', 'credential',
    'ssn', 'social_security', 'credit_card', 'cvv',
    'api_key', 'apikey', 'access_token', 'refresh_token',
    'connection_string', 'connectionstring', 'private_key'
  ]);

  /**
   * @param customRules Additional rules evaluated after the built-in ones.
   */
  constructor(customRules: DataLeakRule[] = []) {
    this.rules = [...DATA_LEAK_RULES, ...customRules];
    this.keywordTrie = new SensitiveKeywordTrie();
    // Build trie with sensitive keywords
    const keywords = [
      'password', 'secret', 'token', 'key', 'credential',
      'ssn', 'credit', 'bearer', 'accountkey'
    ];
    for (const keyword of keywords) {
      this.keywordTrie.insert(keyword, {
        pattern: new RegExp(keyword, 'i'),
        severity: 'high',
        category: 'Keyword Match',
        action: 'warn'
      });
    }
  }

  /**
   * Scans a string for sensitive data.
   *
   * @param data Text to scan.
   * @param location Label recorded on each finding (defaults to 'unknown').
   * @returns Findings for every matching rule. `blocked` is true when a
   * 'block' rule matched; `sanitized` is set only when a 'redact' rule
   * actually changed the text.
   */
  scanString(data: string, location: string = 'unknown'): ScanResult {
    const findings: Finding[] = [];
    let sanitized = data;
    let blocked = false;

    // Pattern-based scanning. Detection always runs against the original
    // text so that one rule's redaction cannot hide a match from another.
    for (const rule of this.rules) {
      if (data.match(rule.pattern) === null) {
        continue;
      }
      findings.push({
        severity: rule.severity,
        category: rule.category,
        location,
        description: `${rule.category} detected in ${location}`,
        action: rule.action
      });
      if (rule.action === 'block') {
        blocked = true;
      } else if (rule.action === 'redact') {
        // Redact on top of earlier redactions; starting over from `data`
        // would discard what previous 'redact' rules already removed.
        sanitized = sanitized.replace(
          DataLeakScanner.toGlobalPattern(rule.pattern),
          DataLeakScanner.REDACTED
        );
      }
    }

    // Keyword-based scanning using Trie. Only added when no finding with the
    // same category exists yet, so at most one keyword finding is reported.
    for (const rule of this.keywordTrie.search(data)) {
      if (!findings.some(f => f.category === rule.category)) {
        findings.push({
          severity: rule.severity,
          category: rule.category,
          location,
          description: `Sensitive keyword detected in ${location}`,
          action: rule.action
        });
      }
    }

    return {
      detected: findings.length > 0,
      findings,
      sanitized: sanitized !== data ? sanitized : undefined,
      blocked
    };
  }

  /**
   * Scans a value recursively for sensitive data (depth-first).
   *
   * Strings are delegated to scanString; other primitives and null are
   * reported as clean. Arrays and objects are copied, never mutated.
   * A key that looks sensitive has its value replaced with the redaction
   * placeholder, and that value is not scanned further.
   *
   * NOTE: there is no cycle detection; a self-referencing structure recurses
   * until the call stack overflows.
   *
   * @param obj Value to scan.
   * @param parentPath Path prefix used for finding locations (defaults to 'root').
   * @returns Aggregated findings; `sanitized` holds the copy when at least
   * one finding was recorded.
   */
  scanObject(obj: any, parentPath: string = 'root'): ScanResult {
    if (typeof obj === 'string') {
      return this.scanString(obj, parentPath);
    }
    if (typeof obj !== 'object' || obj === null) {
      return { detected: false, findings: [], blocked: false };
    }

    const findings: Finding[] = [];
    let blocked = false;
    let sanitized: any;

    if (Array.isArray(obj)) {
      // Handle arrays
      const sanitizedArray: any[] = [];
      for (const [index, item] of obj.entries()) {
        const result = this.scanObject(item, `${parentPath}[${index}]`);
        findings.push(...result.findings);
        if (result.blocked) {
          blocked = true;
        }
        sanitizedArray.push(result.sanitized !== undefined ? result.sanitized : item);
      }
      sanitized = sanitizedArray;
    } else {
      // Handle objects
      const sanitizedObj: any = { ...obj };
      for (const [key, value] of Object.entries(obj)) {
        const fieldPath = `${parentPath}.${key}`;

        // Check if field name itself is sensitive
        if (this.isSensitiveFieldName(key.toLowerCase())) {
          findings.push({
            severity: 'high',
            category: 'Sensitive Field Name',
            location: fieldPath,
            description: `Sensitive field name detected: ${key}`,
            action: 'redact'
          });
          sanitizedObj[key] = DataLeakScanner.REDACTED;
          continue;
        }

        // Recursively scan value
        const result = this.scanObject(value, fieldPath);
        findings.push(...result.findings);
        if (result.blocked) {
          blocked = true;
        }
        if (result.sanitized !== undefined) {
          sanitizedObj[key] = result.sanitized;
        }
      }
      sanitized = sanitizedObj;
    }

    return {
      detected: findings.length > 0,
      findings,
      sanitized: findings.length > 0 ? sanitized : undefined,
      blocked
    };
  }

  /**
   * Validates data before database ingestion.
   * This method never throws for unsafe data; it reports it via `safe: false`.
   *
   * @param data Value to validate.
   * @param options.allowWarnings When false, any finding makes the data
   * unsafe (default true).
   * @param options.blockOnDetection When true, any finding makes the data
   * unsafe and is logged as an error (default false).
   * @returns `safe: false` with the findings when the data must not be
   * ingested; otherwise `safe: true` with the findings and the sanitized
   * copy, if one was produced.
   */
  validateForIngestion(data: any, options: {
    allowWarnings?: boolean;
    blockOnDetection?: boolean;
  } = {}): { safe: boolean; sanitized?: any; findings: Finding[] } {
    const { allowWarnings = true, blockOnDetection = false } = options;
    const result = this.scanObject(data);

    if (blockOnDetection && result.detected) {
      logger.error('Data leak prevention: Sensitive data detected', undefined, {
        findingCount: result.findings.length,
        categories: [...new Set(result.findings.map(f => f.category))]
      });
      return { safe: false, findings: result.findings };
    }

    // Critical severity or an explicit 'block' action always rejects the data.
    const criticalFindings = result.findings.filter(f => f.severity === 'critical');
    const blockingFindings = result.findings.filter(f => f.action === 'block');
    if (criticalFindings.length > 0 || blockingFindings.length > 0) {
      logger.error('Data leak prevention: Critical data detected', undefined, {
        criticalCount: criticalFindings.length,
        blockingCount: blockingFindings.length
      });
      return { safe: false, findings: result.findings };
    }

    if (!allowWarnings && result.detected) {
      return { safe: false, findings: result.findings };
    }

    if (result.detected) {
      logger.warn('Data leak prevention: Warnings detected', {
        findingCount: result.findings.length,
        categories: [...new Set(result.findings.map(f => f.category))]
      });
    }

    return {
      safe: true,
      sanitized: result.sanitized,
      findings: result.findings
    };
  }

  /**
   * Generates a plain-text report of all findings, grouped by severity from
   * most to least severe. Severities without findings are omitted.
   *
   * @param findings Findings to include.
   * @returns The report as a newline-separated string.
   */
  generateReport(findings: Finding[]): string {
    const lines: string[] = [];
    lines.push('=== Data Leak Prevention Report ===');
    lines.push('');

    const bySeverity = new Map<Finding['severity'], Finding[]>();
    for (const finding of findings) {
      const bucket = bySeverity.get(finding.severity) || [];
      bucket.push(finding);
      bySeverity.set(finding.severity, bucket);
    }

    const severityOrder: Finding['severity'][] = ['critical', 'high', 'medium', 'low'];
    for (const severity of severityOrder) {
      const group = bySeverity.get(severity) || [];
      if (group.length === 0) continue;
      lines.push(`${severity.toUpperCase()} (${group.length}):`);
      for (const finding of group) {
        lines.push(` - ${finding.category} at ${finding.location}`);
        lines.push(` ${finding.description}`);
        lines.push(` Action: ${finding.action}`);
      }
      lines.push('');
    }

    lines.push('===================================');
    return lines.join('\n');
  }

  /** Returns true when the lower-cased key contains any sensitive field name. */
  private isSensitiveFieldName(lowerKey: string): boolean {
    for (const name of this.sensitiveFieldNames) {
      if (lowerKey.includes(name)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns a pattern that replaces every occurrence: the pattern itself when
   * it is already global, otherwise a copy with the `g` flag added. Without
   * this a non-global custom rule would redact only its first match.
   */
  private static toGlobalPattern(pattern: RegExp): RegExp {
    return pattern.global ? pattern : new RegExp(pattern.source, `${pattern.flags}g`);
  }
}
/**
* Global data leak scanner instance
* Constructed without custom rules, so only the built-in rules apply.
* Used by scanBeforeIngestion.
*/
export const globalDataLeakScanner = new DataLeakScanner();
/**
 * Convenience function to scan data before database operations.
 * Runs the global scanner with warnings allowed and blockOnDetection off.
 *
 * @param data Value about to be ingested.
 * @returns `safe: true` together with the sanitized copy, or the original
 * data when the scanner produced no sanitized version.
 * @throws Error when validation reports the data as unsafe; a findings
 * report is logged first.
 */
export function scanBeforeIngestion(data: any): { safe: boolean; sanitized: any } {
  const { safe, sanitized, findings } = globalDataLeakScanner.validateForIngestion(data, {
    allowWarnings: true,
    blockOnDetection: false
  });

  if (safe) {
    return { safe: true, sanitized: sanitized || data };
  }

  logger.error('Data ingestion blocked due to sensitive data', undefined, {
    report: globalDataLeakScanner.generateReport(findings)
  });
  throw new Error('Sensitive data detected. Ingestion blocked for security.');
}