// evidenceParser.ts • 8.01 kB
import { Person, Connection, EvidenceItem } from './types.js';
import { OpenAIClient } from './client.js';
/**
 * Heuristic parser that turns free-form evidence text into structured data:
 * evidence items, the people mentioned, and pairwise connections between them.
 *
 * Evidence items and people are found with regular expressions. Connections
 * are delegated to the injected client, with a keyword-based fallback when the
 * client returns nothing or throws.
 */
export class EvidenceParser {
  /** Default number of relationship analyses that run concurrently. */
  private static readonly DEFAULT_BATCH_SIZE = 5;
  /** Default pause between batches, in milliseconds. */
  private static readonly DEFAULT_BATCH_DELAY_MS = 100;
  /** Maximum character distance between a name and an organization mention. */
  private static readonly ORGANIZATION_PROXIMITY_CHARS = 200;
  /** Role titles recognised after a person's name (regex alternation). */
  private static readonly ROLE_ALTERNATION = 'CEO|CTO|Director|Manager|President|Founder';
  /** Capitalised three-letter words that the name pattern would otherwise pick up. */
  private static readonly COMMON_WORDS: ReadonlySet<string> = new Set([
    'The', 'And', 'For', 'Are', 'But', 'Not', 'You', 'All', 'Can', 'Had', 'Her', 'Was', 'One',
    'Our', 'Out', 'Day', 'Get', 'Has', 'Him', 'His', 'How', 'Man', 'New', 'Now', 'Old', 'See',
    'Two', 'Way', 'Who', 'Boy', 'Did', 'Its', 'Let', 'Put', 'Say', 'She', 'Too', 'Use',
  ]);
  /**
   * Section classification rules, checked in order. The specific headers come
   * before the loose `Slack|Email` test so that, for example, meeting notes
   * that merely mention an email are not classified as a communication.
   */
  private static readonly SECTION_TYPE_RULES: ReadonlyArray<readonly [RegExp, EvidenceItem['type']]> = [
    [/Police Report/i, 'police_report'],
    [/Witness Statement/i, 'witness_statement'],
    [/Security Camera/i, 'security_log'],
    [/Meeting Notes/i, 'meeting_notes'],
    [/Slack|Email/i, 'communication'],
  ];

  private readonly namePattern = /(?:^|[^a-zA-Z])([A-Z][a-z]+ [A-Z][a-z]+|[A-Z][a-z]+(?:\s[A-Z][a-z]+)*)/g;
  private readonly emailPattern = /([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/g;
  // Word boundaries keep the one-letter "X" from matching inside ordinary words.
  private readonly organizationPattern = /\b(OpenAI|Microsoft|Meta|X|Tesla|SpaceX|Google|Apple|Amazon)\b/gi;
  private readonly openaiClient: OpenAIClient;
  private readonly batchSize: number;
  private readonly batchDelayMs: number;

  /**
   * @param openaiClient Client used to analyse the relationship between two people.
   * @param options Optional tuning: `batchSize` is the number of concurrent
   *   analyses (default 5); `batchDelayMs` is the pause between batches
   *   (default 100, `0` disables the pause). Invalid values fall back to the defaults.
   */
  constructor(openaiClient: OpenAIClient, options: { batchSize?: number; batchDelayMs?: number } = {}) {
    this.openaiClient = openaiClient;
    const { batchSize, batchDelayMs } = options;
    this.batchSize =
      batchSize !== undefined && Number.isFinite(batchSize) && batchSize >= 1
        ? Math.floor(batchSize)
        : EvidenceParser.DEFAULT_BATCH_SIZE;
    this.batchDelayMs =
      batchDelayMs !== undefined && Number.isFinite(batchDelayMs) && batchDelayMs >= 0
        ? batchDelayMs
        : EvidenceParser.DEFAULT_BATCH_DELAY_MS;
  }

  /**
   * Parses raw evidence text into evidence items, people and connections.
   *
   * @param evidenceText Full text of the evidence document.
   * @returns The extracted people, their pairwise connections and the evidence items.
   */
  async parseEvidence(evidenceText: string): Promise<{ people: Person[], connections: Connection[], evidenceItems: EvidenceItem[] }> {
    const evidenceItems = this.extractEvidenceItems(evidenceText);
    const people = this.extractPeople(evidenceText);
    const connections = await this.extractConnections(evidenceText, people);
    return { people, connections, evidenceItems };
  }

  /**
   * Splits the text at each recognised section header and builds one evidence
   * item per non-blank section, with a type, source label and, when present,
   * the first timestamp and location found in the section.
   */
  private extractEvidenceItems(text: string): EvidenceItem[] {
    const items: EvidenceItem[] = [];
    // The lookahead keeps each header at the start of its own section.
    const sections = text.split(/(?=Police Report|Witness Statement|Security Camera|Internal Slack|Email Chain|Meeting Notes)/i);
    for (const section of sections) {
      const content = section.trim();
      if (content.length === 0) continue;

      let type: EvidenceItem['type'] = 'other';
      for (const [pattern, sectionType] of EvidenceParser.SECTION_TYPE_RULES) {
        if (pattern.test(section)) {
          type = sectionType;
          break;
        }
      }

      const timestampMatch = section.match(/(?:on|at)\s+([A-Z][a-z]+ \d{1,2}, \d{4}|(?:\d{1,2}:\d{2}\s*[AP]M))/);
      const locationMatch = section.match(/(?:at|from)\s+([0-9]+ [A-Za-z\s]+ St|[A-Za-z\s]+ building)/i);
      items.push({
        type,
        source: this.extractSource(section),
        content,
        timestamp: timestampMatch?.[1],
        location: locationMatch?.[1]
      });
    }
    return items;
  }

  /**
   * Returns the label that precedes the first colon on the section's first
   * line, or `'Unknown'` when that line has no such label. Only the first line
   * is inspected so that a colon deeper in the body (a time such as "10:30")
   * cannot turn paragraphs of text into the source.
   */
  private extractSource(section: string): string {
    const firstLine = section.trim().split(/\r?\n/, 1)[0] ?? '';
    const label = firstLine.match(/^([^:]+):/)?.[1]?.trim();
    return label || 'Unknown';
  }

  /**
   * Collects one person per distinct name matched by the name pattern, then
   * uses e-mail addresses in the text to fill in a missing organization for
   * people whose name overlaps the address's local part.
   */
  private extractPeople(text: string): Person[] {
    const peopleByName = new Map<string, Person>();

    for (const match of text.matchAll(this.namePattern)) {
      const name = match[1]?.trim();
      if (!name || name.length <= 2 || this.isCommonWord(name)) continue;
      // Organization and role depend only on (text, name); skip repeated mentions.
      if (peopleByName.has(name)) continue;
      peopleByName.set(name, {
        name,
        organization: this.extractOrganization(text, name),
        role: this.extractRole(text, name)
      });
    }

    for (const match of text.matchAll(this.emailPattern)) {
      const email = match[1];
      if (!email) continue;
      const atIndex = email.lastIndexOf('@');
      const nameFromEmail = email.slice(0, atIndex).replace(/[._]/g, ' ').toLowerCase();
      const organization = this.organizationFromDomain(email.slice(atIndex + 1));

      const existing = Array.from(peopleByName.values()).find(person => {
        const personName = person.name.toLowerCase();
        return personName.includes(nameFromEmail) || nameFromEmail.includes(personName);
      });
      if (existing) {
        existing.organization = existing.organization || organization;
      }
    }

    return Array.from(peopleByName.values());
  }

  /**
   * Maps an e-mail domain to a known organization name, or `''` when none applies.
   * Matching is case-insensitive. The short tokens are matched strictly
   * (`meta` as a whole domain label, `x.com` as the domain or a parent domain)
   * so that domains such as `dropbox.com` or `metadata.io` are not misattributed.
   */
  private organizationFromDomain(domain: string): string {
    const normalized = domain.toLowerCase();
    if (normalized.includes('microsoft')) return 'Microsoft';
    if (normalized.split('.').includes('meta')) return 'Meta';
    if (normalized === 'x.com' || normalized.endsWith('.x.com')) return 'X';
    if (normalized.includes('openai')) return 'OpenAI';
    return '';
  }

  /**
   * Analyses every unordered pair of people, running at most `batchSize`
   * analyses concurrently and pausing `batchDelayMs` between batches.
   *
   * @returns The non-null connections, ordered by first person, then second.
   */
  private async extractConnections(text: string, people: Person[]): Promise<Connection[]> {
    const names = people.map(person => person.name);

    // List all pairs up front so that batching limits concurrency only and
    // never drops a pair.
    const pairs: Array<[string, string]> = [];
    names.forEach((first, index) => {
      for (const second of names.slice(index + 1)) {
        pairs.push([first, second]);
      }
    });

    const connections: Connection[] = [];
    for (let start = 0; start < pairs.length; start += this.batchSize) {
      // Small delay between batches to respect rate limits.
      if (start > 0 && this.batchDelayMs > 0) {
        await new Promise(resolve => setTimeout(resolve, this.batchDelayMs));
      }
      const batch = pairs.slice(start, start + this.batchSize);
      const results = await Promise.all(
        batch.map(([first, second]) => this.analyzeRelationshipWithLLM(text, first, second))
      );
      for (const result of results) {
        if (result !== null) connections.push(result);
      }
    }
    return connections;
  }

  /**
   * Asks the client to describe the relationship between two people. Falls
   * back to keyword pattern matching when the client returns a falsy result
   * or throws; a thrown error is logged as a warning.
   */
  private async analyzeRelationshipWithLLM(text: string, person1: string, person2: string): Promise<Connection | null> {
    try {
      const analysisResult = await this.openaiClient.analyzeRelationship(text, person1, person2);
      if (!analysisResult) {
        // Fallback to pattern matching
        return this.fallbackPatternAnalysis(text, person1, person2);
      }
      return {
        from: person1,
        to: person2,
        relationship: analysisResult.relationship,
        evidence: analysisResult.evidence,
        strength: analysisResult.strength,
        type: analysisResult.type
      };
    } catch (error) {
      console.warn(`Error analyzing relationship between ${person1} and ${person2}:`, error);
      // Fallback to simple pattern matching
      return this.fallbackPatternAnalysis(text, person1, person2);
    }
  }

  /**
   * Keyword-based relationship guess. Returns `null` unless both names occur
   * in the text (case-insensitively). Otherwise it reports the first rule
   * whose keywords appear between `person1` and `person2` on one line, and
   * defaults to a weak "mentioned together" connection.
   */
  private fallbackPatternAnalysis(text: string, person1: string, person2: string): Connection | null {
    const lowerText = text.toLowerCase();
    if (!lowerText.includes(person1.toLowerCase()) || !lowerText.includes(person2.toLowerCase())) {
      return null;
    }

    // Names are escaped so that punctuation in them is matched literally.
    const first = EvidenceParser.escapeRegExp(person1);
    const second = EvidenceParser.escapeRegExp(person2);
    const linkedBy = (keywords: string): boolean =>
      new RegExp(`${first}.*(?:${keywords}).*${second}`, 'i').test(text);

    let relationship = 'mentioned together';
    let strength = 3;
    let type: Connection['type'] = 'other';
    let evidence = 'Both mentioned in same document';

    if (linkedBy('email|message|slack')) {
      relationship = 'communication';
      strength = 7;
      type = 'communication';
      evidence = 'Email/message exchange';
    } else if (linkedBy('meeting|dinner')) {
      relationship = 'met in person';
      strength = 8;
      type = 'meeting';
      evidence = 'Meeting attendance';
    } else if (linkedBy('concerns|worried')) {
      relationship = 'expressed concerns about';
      strength = 6;
      type = 'other';
      evidence = 'Expressed concerns';
    }

    return {
      from: person1,
      to: person2,
      relationship,
      evidence,
      strength,
      type
    };
  }

  /**
   * Returns the first organization mention (in text order) that lies within
   * `ORGANIZATION_PROXIMITY_CHARS` characters of the first occurrence of
   * `name`, or `undefined` when there is none or the name is absent.
   */
  private extractOrganization(text: string, name: string): string | undefined {
    const nameIndex = text.indexOf(name);
    if (nameIndex === -1) return undefined;

    for (const match of text.matchAll(this.organizationPattern)) {
      // Use the position of this match, not the first occurrence of its text.
      if (
        match.index !== undefined &&
        Math.abs(nameIndex - match.index) < EvidenceParser.ORGANIZATION_PROXIMITY_CHARS
      ) {
        return match[0];
      }
    }
    return undefined;
  }

  /**
   * Returns the first role title that follows `name` on the same line, as
   * written in the text, or `undefined`. Titles must be whole words, so "CTO"
   * is not found inside words such as "October".
   */
  private extractRole(text: string, name: string): string | undefined {
    const rolePattern = new RegExp(
      `${EvidenceParser.escapeRegExp(name)}.*?\\b(${EvidenceParser.ROLE_ALTERNATION})\\b`,
      'i'
    );
    return text.match(rolePattern)?.[1];
  }

  /** Reports whether `word` is one of the capitalised common words to exclude from names. */
  private isCommonWord(word: string): boolean {
    return EvidenceParser.COMMON_WORDS.has(word);
  }

  /** Escapes regular-expression metacharacters so `value` matches literally. */
  private static escapeRegExp(value: string): string {
    return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}