import {
pipeline,
ZeroShotClassificationPipeline,
FeatureExtractionPipeline,
} from '@huggingface/transformers';
export interface SemanticAnalysisResult {
label: 'CONTRADICTION' | 'ENTAILMENT' | 'NEUTRAL';
score: number;
confidence: number;
}
export interface ActionAssessmentResult {
category: 'success' | 'failure' | 'neutral';
confidence: number;
reasoning: string;
}
export interface SemanticSimilarityResult {
similarity: number;
confidence: number;
reasoning: string;
}
export class SemanticAnalyzer {
private nliClassifier: ZeroShotClassificationPipeline | null = null;
private embeddingModel: FeatureExtractionPipeline | null = null;
private isInitialized = false;
private sessionCaches: Map<string, Map<string, number[]>> = new Map();
private readonly maxCacheSize = 500;
private readonly maxSessions = 100;
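/**
* Load the sentence-embedding and NLI pipelines.
* Safe to call more than once; after the first successful initialization, subsequent calls return immediately.
*/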
async initialize(): Promise<void> {
if (this.isInitialized) {
return;
}
console.log('Initializing semantic models...');
try {
// Initialize fast embedding model for similarity comparisons
console.log('Loading sentence embedding model...');
this.embeddingModel = await pipeline<'feature-extraction'>(
'feature-extraction',
'Xenova/all-MiniLM-L6-v2',
{
cache_dir: process.env.HF_HUB_CACHE || '.hf_cache',
}
);
// Preload NLI model to avoid first-call delay
console.log('Loading NLI model for precision analysis...');
this.nliClassifier = await pipeline<'zero-shot-classification'>(
'zero-shot-classification',
'MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33',
{ cache_dir: process.env.HF_HUB_CACHE || '.hf_cache' }
);
this.isInitialized = true;
console.log('Semantic models initialized successfully');
} catch (error) {
console.error('Failed to initialize semantic models:', error);
throw error;
}
}
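/**
* Run NLI-style analysis on a premise/hypothesis pair via the zero-shot pipeline.
* Returns the top-scoring label (CONTRADICTION, ENTAILMENT, or NEUTRAL) and its score.
*/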
async analyzeTextPair(premise: string, hypothesis: string): Promise<SemanticAnalysisResult> {
if (!this.isInitialized || !this.nliClassifier) {
throw new Error('SemanticAnalyzer not initialized. Call initialize() first.');
}
try {
const labels = ['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL'];
const result = await this.nliClassifier(premise, labels, {
hypothesis_template: hypothesis,
});
const output = Array.isArray(result) ? result[0] : result;
const topLabel = output.labels[0];
const topScore = output.scores[0];
return {
label: topLabel as 'CONTRADICTION' | 'ENTAILMENT' | 'NEUTRAL',
score: topScore,
confidence: topScore,
};
} catch (error) {
console.error('Error analyzing text pair:', error);
throw error;
}
}
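/**
* Map an NLI result onto a success/failure/neutral outcome.
* ENTAILMENT with confidence above 0.7 counts as success, CONTRADICTION above 0.7 as failure; anything else is neutral.
*/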
async assessActionOutcome(
action: string,
expectedOutcome: string
): Promise<ActionAssessmentResult> {
const analysis = await this.analyzeTextPair(action, expectedOutcome);
let category: 'success' | 'failure' | 'neutral';
let reasoning: string;
if (analysis.label === 'ENTAILMENT' && analysis.confidence > 0.7) {
category = 'success';
reasoning = `Action aligns with expected outcome (${analysis.confidence.toFixed(3)})`;
} else if (analysis.label === 'CONTRADICTION' && analysis.confidence > 0.7) {
category = 'failure';
reasoning = `Action contradicts expected outcome (${analysis.confidence.toFixed(3)})`;
} else {
category = 'neutral';
reasoning = `Uncertain relationship: ${analysis.label} (${analysis.confidence.toFixed(3)})`;
}
return {
category,
confidence: analysis.confidence,
reasoning,
};
}
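/**
* Score each candidate intent against the action via NLI and return the best match.
* ENTAILMENT keeps its full confidence, NEUTRAL is weighted at 0.5x, and CONTRADICTION at 0.1x.
* Expects a non-empty possibleIntents array.
*/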
async classifyActionIntent(
action: string,
possibleIntents: string[]
): Promise<{
bestMatch: string;
confidence: number;
allScores: Array<{ intent: string; score: number }>;
}> {
const scores: Array<{ intent: string; score: number }> = [];
for (const intent of possibleIntents) {
const analysis = await this.analyzeTextPair(action, intent);
scores.push({
intent,
score:
analysis.label === 'ENTAILMENT'
? analysis.confidence
: analysis.label === 'NEUTRAL'
? analysis.confidence * 0.5
: analysis.confidence * 0.1,
});
}
scores.sort((a, b) => b.score - a.score);
return {
bestMatch: scores[0].intent,
confidence: scores[0].score,
allScores: scores,
};
}
/**
* Calculate semantic similarity between two texts using cosine similarity of sentence embeddings
* Higher similarity indicates more related content
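* @example
* // A minimal sketch (assumes initialize() has already resolved)
* const { similarity } = await semanticAnalyzer.calculateSemanticSimilarity(
*   'open the settings page',
*   'navigate to preferences'
* );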
*/
async calculateSemanticSimilarity(
text1: string,
text2: string,
sessionId?: string
): Promise<SemanticSimilarityResult> {
if (!this.isInitialized || !this.embeddingModel) {
throw new Error('SemanticAnalyzer not initialized. Call initialize() first.');
}
try {
// Use fast embedding-based similarity (much faster than NLI)
const [embedding1, embedding2] = await this.getBatchEmbeddings([text1, text2], sessionId);
const similarity = this.cosineSimilarity(embedding1, embedding2);
// Calculate dynamic confidence based on similarity score
// Higher similarity scores get higher confidence, with minimum of 0.5
const confidence = Math.max(0.5, 0.3 + similarity * 0.7);
return {
similarity: Math.max(0, Math.min(1, similarity)),
confidence: Math.max(0, Math.min(1, confidence)),
reasoning: 'Fast embedding-based semantic similarity',
};
} catch (error) {
console.error('Error calculating semantic similarity:', error);
throw error;
}
}
/**
* Get or create a session-specific embedding cache
*/
private getSessionCache(sessionId: string = 'default'): Map<string, number[]> {
if (!this.sessionCaches.has(sessionId)) {
// Evict the oldest session (FIFO) to bound the number of cached sessions
if (this.sessionCaches.size >= this.maxSessions) {
const firstSession = this.sessionCaches.keys().next().value;
if (firstSession) {
this.sessionCaches.delete(firstSession);
}
}
this.sessionCaches.set(sessionId, new Map());
}
return this.sessionCaches.get(sessionId)!;
}
/**
* Clear cache for a specific session
*/
clearSessionCache(sessionId: string): void {
this.sessionCaches.delete(sessionId);
}
/**
* Compute embeddings for multiple texts with a per-session cache
* Uncached texts are embedded one at a time for consistent dimensions; repeated texts are served
* from the cache, which is where the speedup over naive repeated model calls comes from
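* @example
* // A minimal sketch: returns one 384-dimensional vector per input text
* const [a, b] = await semanticAnalyzer.getBatchEmbeddings(['hello world', 'goodbye world'], 'session-1');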
*/
async getBatchEmbeddings(texts: string[], sessionId?: string): Promise<number[][]> {
if (!this.embeddingModel) {
throw new Error('Embedding model not initialized');
}
const sessionCache = this.getSessionCache(sessionId);
const embeddings: number[][] = [];
const uncachedTexts: string[] = [];
const uncachedIndices: number[] = [];
// Check session-specific cache first
for (let i = 0; i < texts.length; i++) {
const cached = sessionCache.get(texts[i]);
if (cached) {
embeddings[i] = cached;
} else {
uncachedTexts.push(texts[i]);
uncachedIndices.push(i);
}
}
// Process uncached texts individually for consistency (performance preserved via caching)
if (uncachedTexts.length > 0) {
const batchEmbeddings: number[][] = [];
// Process texts individually to ensure consistent embedding dimensions
for (const text of uncachedTexts) {
const singleResult = await this.embeddingModel(text, {
pooling: 'mean',
normalize: true,
});
if (singleResult && singleResult.data) {
const embedding = Array.from(singleResult.data) as number[];
// Validate that we're getting the expected 384-dimensional embeddings
if (embedding.length !== 384) {
console.error(
`Unexpected embedding dimension: ${embedding.length} (expected 384) for text: "${text.slice(0, 50)}..."`
);
console.error('Model output shape:', singleResult.dims);
throw new Error(
`Invalid embedding dimension: ${embedding.length}. Expected 384 for all-MiniLM-L6-v2.`
);
}
batchEmbeddings.push(embedding);
} else {
throw new Error(`Unable to process embedding for text: ${text.slice(0, 50)}...`);
}
}
for (let i = 0; i < uncachedTexts.length; i++) {
// Embeddings were already converted to number[] above; place each at its original index
const embedding = batchEmbeddings[i];
const originalIndex = uncachedIndices[i];
embeddings[originalIndex] = embedding;
// Evict the oldest cached entry (FIFO) to keep each session cache bounded
if (sessionCache.size >= this.maxCacheSize) {
const firstKey = sessionCache.keys().next().value;
if (firstKey) {
sessionCache.delete(firstKey);
}
}
sessionCache.set(uncachedTexts[i], embedding);
}
}
return embeddings;
}
/**
* Fast cosine similarity calculation between two vectors
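* cos(a, b) = (a · b) / (|a| * |b|); for unit-length (normalized) vectors this reduces to the dot product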
*/
private cosineSimilarity(vec1: number[], vec2: number[]): number {
if (vec1.length !== vec2.length) {
console.error('Vector dimension mismatch:', {
vec1Length: vec1.length,
vec2Length: vec2.length,
expected: 384, // all-MiniLM-L6-v2 should return 384-dim vectors
});
throw new Error(
`Vector dimension mismatch: ${vec1.length} vs ${vec2.length}. Expected 384-dimensional embeddings.`
);
}
let dotProduct = 0;
let magnitude1 = 0;
let magnitude2 = 0;
for (let i = 0; i < vec1.length; i++) {
dotProduct += vec1[i] * vec2[i];
magnitude1 += vec1[i] * vec1[i];
magnitude2 += vec2[i] * vec2[i];
}
magnitude1 = Math.sqrt(magnitude1);
magnitude2 = Math.sqrt(magnitude2);
if (magnitude1 === 0 || magnitude2 === 0) {
return 0;
}
return dotProduct / (magnitude1 * magnitude2);
}
/**
* Compute the full pairwise similarity matrix for a list of texts
* Embeddings are fetched once (cache-aware) and symmetry is exploited, which is much faster
* than calling calculateSemanticSimilarity for every pair
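* @example
* // A minimal sketch: three texts yield a symmetric 3x3 matrix with 1.0 on the diagonal
* const matrix = await semanticAnalyzer.computeSimilarityMatrix(['alpha', 'beta', 'gamma']);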
*/
async computeSimilarityMatrix(texts: string[], sessionId?: string): Promise<number[][]> {
const embeddings = await this.getBatchEmbeddings(texts, sessionId);
const matrix: number[][] = [];
for (let i = 0; i < texts.length; i++) {
matrix[i] = [];
for (let j = 0; j < texts.length; j++) {
if (i === j) {
matrix[i][j] = 1.0;
} else if (j < i) {
matrix[i][j] = matrix[j][i]; // Use symmetry
} else {
matrix[i][j] = this.cosineSimilarity(embeddings[i], embeddings[j]);
}
}
}
return matrix;
}
/**
* Extract semantic features from text for advanced similarity calculations
*/
async extractSemanticFeatures(
text: string,
customIntents?: string[]
): Promise<{
intents: string[];
sentiment: 'positive' | 'negative' | 'neutral';
confidence: number;
}> {
if (!this.isInitialized || !this.nliClassifier) {
throw new Error('SemanticAnalyzer not initialized. Call initialize() first.');
}
try {
// Use custom intents if provided, otherwise use generic ones
const intents = customIntents || [
'performing action',
'checking status',
'retrieving information',
'processing data',
'handling error',
'completing task',
'initiating process',
'validating result',
'organizing information',
'communicating result',
];
const intentAnalysis = await this.classifyActionIntent(text, intents);
// Sentiment analysis using predefined labels
const sentimentLabels = ['positive outcome', 'negative outcome', 'neutral outcome'];
const sentimentResult = await this.nliClassifier(text, sentimentLabels);
const sentimentOutput = Array.isArray(sentimentResult) ? sentimentResult[0] : sentimentResult;
let sentiment: 'positive' | 'negative' | 'neutral' = 'neutral';
if (sentimentOutput.labels[0] === 'positive outcome') {
sentiment = 'positive';
} else if (sentimentOutput.labels[0] === 'negative outcome') {
sentiment = 'negative';
}
return {
intents: [intentAnalysis.bestMatch],
sentiment,
confidence: Math.min(intentAnalysis.confidence, sentimentOutput.scores[0]),
};
} catch (error) {
console.error('Error extracting semantic features:', error);
return {
intents: [],
sentiment: 'neutral',
confidence: 0,
};
}
}
isReady(): boolean {
return this.isInitialized;
}
}
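/**
* Shared singleton instance of SemanticAnalyzer.
*
* @example
* // A minimal usage sketch; on first run the models are downloaded to
* // HF_HUB_CACHE (or ./.hf_cache) as configured in initialize()
* await semanticAnalyzer.initialize();
* const nli = await semanticAnalyzer.analyzeTextPair(
*   'The user clicked the submit button.',
*   'A form was submitted.'
* );
* const outcome = await semanticAnalyzer.assessActionOutcome(
*   'Saved the report to disk',
*   'The report is persisted'
* );
*/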
export const semanticAnalyzer = new SemanticAnalyzer();