import { spawn, ChildProcess } from 'child_process';
import logger from '../utils/logger';
/**
 * Contract for anything that turns text into numeric embedding vectors.
 */
export interface EmbeddingModel {
/**
 * Embeds a single text.
 *
 * @param text - The text to embed.
 * @returns A promise for the embedding vector of `text`.
 */
embed(text: string): Promise<number[]>;
/**
 * Embeds several texts in one call.
 *
 * @param texts - The texts to embed.
 * @returns A promise for the embedding vectors. In both implementations
 * visible in this file, the i-th vector corresponds to the i-th text.
 */
embedBatch(texts: string[]): Promise<number[][]>;
}
/**
 * Embedding model backed by the Python `sentence_transformers` package.
 *
 * Every `embedBatch` call spawns a fresh `python3` process that loads the
 * model, encodes the texts and prints the vectors as JSON on stdout.
 */
export class SentenceTransformerEmbedding implements EmbeddingModel {
  /**
   * Program executed with `python3 -c`.
   *
   * It is assembled from explicit lines so that the indentation Python
   * requires does not depend on how this TypeScript file is formatted.
   * The model name arrives as `sys.argv[1]` and the texts as JSON on stdin,
   * so no caller-supplied data is ever interpolated into the program text
   * (no code injection through the model name) and the batch size is not
   * bounded by the operating system's argument-length limit.
   */
  private static readonly PYTHON_SCRIPT = [
    'import sys',
    'import json',
    'from sentence_transformers import SentenceTransformer',
    'try:',
    '    model = SentenceTransformer(sys.argv[1])',
    // Read raw bytes: json.loads decodes UTF-8 itself, independent of locale.
    '    texts = json.loads(sys.stdin.buffer.read())',
    '    embeddings = model.encode(texts).tolist()',
    '    print(json.dumps(embeddings))',
    'except Exception as e:',
    '    print(f"ERROR: {str(e)}", file=sys.stderr)',
    '    sys.exit(1)',
  ].join('\n');

  private readonly model: string;

  // Not referenced by any method of this class; kept so the class shape is unchanged.
  private pythonProcess: ChildProcess | null = null;

  /**
   * @param model - Model name handed to `SentenceTransformer(...)` in Python.
   */
  constructor(model: string = 'all-MiniLM-L6-v2') {
    this.model = model;
  }

  /**
   * Embeds a single text by delegating to {@link embedBatch}.
   *
   * @param text - The text to embed.
   * @returns The embedding vector for `text`.
   * @throws Error if the Python process fails or returns no vector.
   */
  async embed(text: string): Promise<number[]> {
    const [vector] = await this.embedBatch([text]);
    if (vector === undefined) {
      throw new Error('Embedding failed: no embedding returned');
    }
    return vector;
  }

  /**
   * Embeds a batch of texts in one Python invocation.
   *
   * @param texts - The texts to embed. An empty array resolves to `[]`
   * without starting Python.
   * @returns One vector per input text, in input order.
   * @throws Error if the process cannot be spawned, exits with a non-zero
   * code, or prints something that is not a JSON array containing one
   * numeric vector per input text.
   */
  async embedBatch(texts: string[]): Promise<number[][]> {
    // Nothing to encode: avoid paying for a process spawn and a model load.
    if (texts.length === 0) {
      return [];
    }

    return new Promise<number[][]>((resolve, reject) => {
      const python = spawn('python3', [
        '-c',
        SentenceTransformerEmbedding.PYTHON_SCRIPT,
        this.model,
      ]);

      let stdout = '';
      let stderr = '';
      // A failed spawn can emit both 'error' and 'close'; report only once.
      let settled = false;

      // Let the streams decode UTF-8 so a multi-byte character split across
      // two chunks is not corrupted.
      python.stdout.setEncoding('utf8');
      python.stderr.setEncoding('utf8');
      python.stdout.on('data', (chunk: string) => {
        stdout += chunk;
      });
      python.stderr.on('data', (chunk: string) => {
        stderr += chunk;
      });

      python.on('error', (error) => {
        if (settled) {
          return;
        }
        settled = true;
        logger.error('Failed to spawn Python process', { error });
        reject(new Error(`Failed to spawn Python process: ${error.message}`));
      });

      python.on('close', (code) => {
        if (settled) {
          return;
        }
        settled = true;

        if (code !== 0) {
          logger.error('Python embedding process failed', { stderr, code });
          reject(new Error(`Embedding failed: ${stderr}`));
          return;
        }

        try {
          const parsed: unknown = JSON.parse(stdout.trim());
          if (!SentenceTransformerEmbedding.isEmbeddingMatrix(parsed, texts.length)) {
            throw new Error('Unexpected embedding output shape');
          }
          resolve(parsed);
        } catch (error) {
          logger.error('Failed to parse embedding results', { stdout, error });
          reject(new Error('Failed to parse embedding results'));
        }
      });

      // If Python exits before consuming its input the write fails (EPIPE).
      // The 'error'/'close' handlers above already report that failure, so
      // this listener only prevents an unhandled stream error.
      python.stdin.on('error', () => undefined);
      python.stdin.end(JSON.stringify(texts));
    });
  }

  /**
   * Checks that `value` is an array of `expectedRows` arrays of finite numbers.
   */
  private static isEmbeddingMatrix(
    value: unknown,
    expectedRows: number,
  ): value is number[][] {
    return (
      Array.isArray(value) &&
      value.length === expectedRows &&
      value.every(
        (row) =>
          Array.isArray(row) &&
          row.every((x) => typeof x === 'number' && Number.isFinite(x)),
      )
    );
  }
}
/**
 * Simple mock embedding for testing without Python dependencies.
 *
 * Vectors are derived purely from a hash of the input text, so the same text
 * always yields the same vector.
 */
export class MockEmbedding implements EmbeddingModel {
  private readonly dimensions: number;

  /**
   * @param dimensions - Length of every generated vector. Defaults to 384
   * (presumably chosen to match the default sentence-transformer model's
   * output size — confirm against the real model).
   * @throws RangeError if `dimensions` is not a positive integer.
   */
  constructor(dimensions: number = 384) {
    if (!Number.isInteger(dimensions) || dimensions < 1) {
      throw new RangeError(
        `dimensions must be a positive integer, got ${dimensions}`,
      );
    }
    this.dimensions = dimensions;
  }

  /**
   * Produces a deterministic vector for `text`.
   *
   * @param text - The text to embed.
   * @returns `dimensions` values, each in the range [0, 1].
   */
  async embed(text: string): Promise<number[]> {
    const seed = this.simpleHash(text);
    // sin() yields [-1, 1]; shift and scale it into [0, 1].
    return Array.from(
      { length: this.dimensions },
      (_, i) => (Math.sin(seed + i) + 1) / 2,
    );
  }

  /**
   * Embeds each text independently with {@link embed}.
   *
   * @param texts - The texts to embed.
   * @returns One vector per input text, in input order.
   */
  async embedBatch(texts: string[]): Promise<number[][]> {
    return Promise.all(texts.map((text) => this.embed(text)));
  }

  /**
   * Hashes a string to a non-negative number by folding `hash * 31 + unit`
   * over its UTF-16 code units with 32-bit wrap-around.
   */
  private simpleHash(str: string): number {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      // `| 0` truncates to a signed 32-bit integer after every step.
      hash = ((hash << 5) - hash + str.charCodeAt(i)) | 0;
    }
    return Math.abs(hash);
  }
}