import * as fs from 'fs/promises';
import * as path from 'path';
import type { SymbolInfo } from '../types/index.js';
import type { SemanticContext } from '../analysis/SemanticAnalyzer.js';
import type { CrossReference } from '../analysis/CrossReferenceAnalyzer.js';
import { SemanticAnalyzer } from '../analysis/SemanticAnalyzer.js';
export interface EmbeddingVector {
values: number[];
dimensions: number;
magnitude: number;
}
export interface SymbolEmbedding {
symbol: string;
file: string;
embedding: EmbeddingVector;
features: SymbolFeatures;
lastUpdated: number;
}
export interface SymbolFeatures {
// Structural features
type: string;
lineCount: number;
complexity: number;
dependencyCount: number;
usageCount: number;
// Semantic features
nameTokens: string[];
commentTokens: string[];
contextTokens: string[];
// Relationship features
isPublicAPI: boolean;
isDeprecated: boolean;
inheritanceDepth: number;
couplingScore: number;
// Usage patterns
crossFileUsage: number;
recentUsage: number;
changeFrequency: number;
testCoverage: number;
}
export interface SimilarityResult {
symbol: string;
file: string;
score: number;
explanation: string;
features: {
structural: number;
semantic: number;
contextual: number;
usage: number;
};
}
export interface RelevanceQuery {
text: string;
context?: {
currentFile?: string;
currentFunction?: string;
recentSymbols?: string[];
projectContext?: string[];
};
filters?: {
symbolTypes?: string[];
files?: string[];
minUsageCount?: number;
maxComplexity?: number;
isPublicAPI?: boolean;
};
options?: {
maxResults?: number;
threshold?: number;
includeExplanation?: boolean;
weightFactors?: {
semantic: number;
structural: number;
contextual: number;
usage: number;
};
};
}
export interface MLModel {
name: string;
version: string;
type: 'embedding' | 'classification' | 'ranking';
parameters: Record<string, any>;
accuracy?: number;
lastTrained?: number;
}
export interface TrainingData {
queries: string[];
relevantSymbols: string[][];
irrelevantSymbols: string[][];
userFeedback: Array<{
query: string;
symbol: string;
relevance: number; // 0-1
timestamp: number;
}>;
}
export interface ModelMetrics {
precision: number;
recall: number;
f1Score: number;
meanReciprocalRank: number;
ndcg: number; // Normalized Discounted Cumulative Gain
userSatisfaction: number;
}
export class RelevanceScorer {
private semanticAnalyzer: SemanticAnalyzer;
private symbolEmbeddings: Map<string, SymbolEmbedding> = new Map();
private models: Map<string, MLModel> = new Map();
private trainingData: TrainingData;
private vocabularyIndex: Map<string, number> = new Map();
private idfScores: Map<string, number> = new Map();
private embeddingCache: Map<string, EmbeddingVector> = new Map();
constructor() {
this.semanticAnalyzer = new SemanticAnalyzer();
this.trainingData = {
queries: [],
relevantSymbols: [],
irrelevantSymbols: [],
userFeedback: []
};
this.initializeModels();
}
/**
* Initialize and load ML models
*/
private initializeModels(): void {
// Initialize embedding model
this.models.set('embedding', {
name: 'symbol-embedding',
version: '1.0.0',
type: 'embedding',
parameters: {
dimensions: 256,
windowSize: 5,
minCount: 2,
learningRate: 0.025
}
});
// Initialize ranking model
this.models.set('ranking', {
name: 'relevance-ranking',
version: '1.0.0',
type: 'ranking',
parameters: {
features: ['semantic', 'structural', 'contextual', 'usage'],
weights: [0.4, 0.2, 0.2, 0.2],
learningRate: 0.01,
regularization: 0.001
}
});
}
/**
* Build embeddings for all symbols in the codebase
*/
async buildEmbeddings(filePaths: string[]): Promise<void> {
console.log('Building symbol embeddings...');
// First pass: extract features and build vocabulary
const allFeatures: SymbolFeatures[] = [];
for (const filePath of filePaths) {
const features = await this.extractFileFeatures(filePath);
allFeatures.push(...features);
}
// Build vocabulary and IDF scores
this.buildVocabulary(allFeatures);
this.calculateIdfScores(allFeatures);
// Second pass: generate embeddings
for (const filePath of filePaths) {
await this.generateFileEmbeddings(filePath);
}
console.log(`Generated embeddings for ${this.symbolEmbeddings.size} symbols`);
}
/**
* Find relevant symbols based on a query
*/
async findRelevantSymbols(query: RelevanceQuery): Promise<SimilarityResult[]> {
const queryEmbedding = await this.generateQueryEmbedding(query.text);
const candidates = this.filterCandidates(query.filters);
const results: SimilarityResult[] = [];
for (const [symbolKey, symbolEmbedding] of candidates) {
const similarity = this.calculateSimilarity(queryEmbedding, symbolEmbedding, query);
if (similarity.score >= (query.options?.threshold || 0.1)) {
results.push(similarity);
}
}
// Sort by relevance score
results.sort((a, b) => b.score - a.score);
// Apply context boosting
this.applyContextBoosting(results, query.context);
// Limit results
const maxResults = query.options?.maxResults || 20;
return results.slice(0, maxResults);
}
/**
* Get similar symbols to a given symbol
*/
async findSimilarSymbols(symbol: string, filePath: string, maxResults: number = 10): Promise<SimilarityResult[]> {
const symbolKey = `${path.resolve(filePath)}:${symbol}`;
const targetEmbedding = this.symbolEmbeddings.get(symbolKey);
if (!targetEmbedding) {
throw new Error(`Symbol ${symbol} not found in ${filePath}`);
}
const results: SimilarityResult[] = [];
for (const [key, embedding] of this.symbolEmbeddings) {
if (key === symbolKey) continue;
const similarity = this.calculateCosineSimilarity(
targetEmbedding.embedding,
embedding.embedding
);
if (similarity > 0.1) {
results.push({
symbol: embedding.symbol,
file: embedding.file,
score: similarity,
explanation: this.generateSimilarityExplanation(targetEmbedding, embedding, similarity),
features: {
structural: this.calculateStructuralSimilarity(targetEmbedding.features, embedding.features),
semantic: this.calculateSemanticSimilarity(targetEmbedding.features, embedding.features),
contextual: this.calculateContextualSimilarity(targetEmbedding.features, embedding.features),
usage: this.calculateUsageSimilarity(targetEmbedding.features, embedding.features)
}
});
}
}
return results.sort((a, b) => b.score - a.score).slice(0, maxResults);
}
/**
* Train the relevance model with user feedback
*/
async trainModel(trainingData: TrainingData): Promise<ModelMetrics> {
console.log('Training relevance model...');
// Merge with existing training data
this.trainingData.queries.push(...trainingData.queries);
this.trainingData.relevantSymbols.push(...trainingData.relevantSymbols);
this.trainingData.irrelevantSymbols.push(...trainingData.irrelevantSymbols);
this.trainingData.userFeedback.push(...trainingData.userFeedback);
// Generate training examples
const trainingExamples = this.generateTrainingExamples();
// Train embedding model
await this.trainEmbeddingModel(trainingExamples);
// Train ranking model
await this.trainRankingModel(trainingExamples);
// Evaluate model performance
const metrics = await this.evaluateModel();
console.log('Model training completed:', metrics);
return metrics;
}
/**
* Add user feedback to improve model
*/
addUserFeedback(query: string, symbol: string, relevance: number): void {
this.trainingData.userFeedback.push({
query,
symbol,
relevance,
timestamp: Date.now()
});
// Trigger incremental learning if enough feedback accumulated
if (this.trainingData.userFeedback.length % 100 === 0) {
this.incrementalLearning();
}
}
/**
* Get model performance metrics
*/
getModelMetrics(): ModelMetrics {
// Calculate metrics based on recent performance
return {
precision: 0.85, // Placeholder - would calculate from actual data
recall: 0.78,
f1Score: 0.81,
meanReciprocalRank: 0.72,
ndcg: 0.79,
userSatisfaction: 0.83
};
}
/**
* Export trained model
*/
async exportModel(modelPath: string): Promise<void> {
const modelData = {
models: Object.fromEntries(this.models),
embeddings: Array.from(this.symbolEmbeddings.entries()),
vocabulary: Object.fromEntries(this.vocabularyIndex),
idfScores: Object.fromEntries(this.idfScores),
trainingData: this.trainingData,
metadata: {
version: '1.0.0',
createdAt: Date.now(),
symbolCount: this.symbolEmbeddings.size
}
};
await fs.writeFile(modelPath, JSON.stringify(modelData, null, 2));
console.log(`Model exported to ${modelPath}`);
}
/**
* Import trained model
*/
async importModel(modelPath: string): Promise<void> {
const modelData = JSON.parse(await fs.readFile(modelPath, 'utf-8'));
this.models = new Map(Object.entries(modelData.models));
this.symbolEmbeddings = new Map(modelData.embeddings);
this.vocabularyIndex = new Map(Object.entries(modelData.vocabulary));
this.idfScores = new Map(Object.entries(modelData.idfScores));
this.trainingData = modelData.trainingData;
console.log(`Model imported from ${modelPath}`);
console.log(`Loaded ${this.symbolEmbeddings.size} symbol embeddings`);
}
private async extractFileFeatures(filePath: string): Promise<SymbolFeatures[]> {
const context = await this.semanticAnalyzer.analyzeFile(filePath);
const features: SymbolFeatures[] = [];
for (const [symbol, symbolInfo] of context.symbols) {
const symbolFeatures = this.extractSymbolFeatures(symbol, symbolInfo, context);
features.push(symbolFeatures);
}
return features;
}
private extractSymbolFeatures(symbol: string, symbolInfo: SymbolInfo, context: SemanticContext): SymbolFeatures {
const nameTokens = this.tokenize(symbol);
const commentTokens = this.extractCommentTokens(symbolInfo);
const contextTokens = this.extractContextTokens(symbol, context);
return {
type: symbolInfo.type,
lineCount: (symbolInfo.endLine || 0) - (symbolInfo.startLine || 0),
complexity: this.calculateComplexity(symbolInfo),
dependencyCount: symbolInfo.dependencies?.length || 0,
usageCount: this.calculateUsageCount(symbol, context),
nameTokens,
commentTokens,
contextTokens,
isPublicAPI: symbolInfo.exports === true,
isDeprecated: this.isDeprecated(symbol),
inheritanceDepth: this.calculateInheritanceDepth(symbol, context),
couplingScore: this.calculateCouplingScore(symbol, context),
crossFileUsage: 0, // Would be calculated from cross-reference analysis
recentUsage: 0, // Would be calculated from usage history
changeFrequency: 0, // Would be calculated from git history
testCoverage: 0 // Would be calculated from test analysis
};
}
private buildVocabulary(allFeatures: SymbolFeatures[]): void {
const tokenCounts = new Map<string, number>();
for (const features of allFeatures) {
const allTokens = [
...features.nameTokens,
...features.commentTokens,
...features.contextTokens
];
for (const token of allTokens) {
tokenCounts.set(token, (tokenCounts.get(token) || 0) + 1);
}
}
// Filter tokens by minimum frequency
let index = 0;
for (const [token, count] of tokenCounts) {
if (count >= 2) { // Minimum frequency threshold
this.vocabularyIndex.set(token, index++);
}
}
console.log(`Built vocabulary with ${this.vocabularyIndex.size} tokens`);
}
private calculateIdfScores(allFeatures: SymbolFeatures[]): void {
const documentFrequency = new Map<string, number>();
const totalDocuments = allFeatures.length;
for (const features of allFeatures) {
const uniqueTokens = new Set([
...features.nameTokens,
...features.commentTokens,
...features.contextTokens
]);
for (const token of uniqueTokens) {
documentFrequency.set(token, (documentFrequency.get(token) || 0) + 1);
}
}
for (const [token, df] of documentFrequency) {
const idf = Math.log(totalDocuments / df);
this.idfScores.set(token, idf);
}
}
private async generateFileEmbeddings(filePath: string): Promise<void> {
const features = await this.extractFileFeatures(filePath);
for (const feature of features) {
const embedding = this.generateEmbedding(feature);
const symbolKey = `${path.resolve(filePath)}:${feature.nameTokens.join('')}`;
this.symbolEmbeddings.set(symbolKey, {
symbol: feature.nameTokens.join(''),
file: path.resolve(filePath),
embedding,
features: feature,
lastUpdated: Date.now()
});
}
}
private generateEmbedding(features: SymbolFeatures): EmbeddingVector {
const dimensions = 256;
const values = new Array(dimensions).fill(0);
// Combine different feature types into embedding
this.addTokenEmbeddings(values, features.nameTokens, 0.4);
this.addTokenEmbeddings(values, features.commentTokens, 0.3);
this.addTokenEmbeddings(values, features.contextTokens, 0.2);
this.addStructuralFeatures(values, features, 0.1);
// Normalize the embedding
const magnitude = Math.sqrt(values.reduce((sum, val) => sum + val * val, 0));
if (magnitude > 0) {
for (let i = 0; i < values.length; i++) {
values[i] /= magnitude;
}
}
return { values, dimensions, magnitude };
}
private addTokenEmbeddings(embedding: number[], tokens: string[], weight: number): void {
for (const token of tokens) {
const index = this.vocabularyIndex.get(token);
if (index !== undefined && index < embedding.length) {
const idf = this.idfScores.get(token) || 1;
embedding[index] += weight * idf;
}
}
}
private addStructuralFeatures(embedding: number[], features: SymbolFeatures, weight: number): void {
// Add structural features to specific dimensions
const baseIndex = embedding.length - 20; // Use last 20 dimensions for structural features
if (baseIndex > 0) {
embedding[baseIndex] += weight * (features.lineCount / 100);
embedding[baseIndex + 1] += weight * (features.complexity / 10);
embedding[baseIndex + 2] += weight * (features.dependencyCount / 20);
embedding[baseIndex + 3] += weight * (features.usageCount / 50);
embedding[baseIndex + 4] += weight * (features.isPublicAPI ? 1 : 0);
embedding[baseIndex + 5] += weight * (features.inheritanceDepth / 5);
embedding[baseIndex + 6] += weight * (features.couplingScore / 10);
}
}
private async generateQueryEmbedding(query: string): Promise<EmbeddingVector> {
const cacheKey = `query:${query}`;
const cached = this.embeddingCache.get(cacheKey);
if (cached) return cached;
const tokens = this.tokenize(query);
const dimensions = 256;
const values = new Array(dimensions).fill(0);
this.addTokenEmbeddings(values, tokens, 1.0);
// Normalize
const magnitude = Math.sqrt(values.reduce((sum, val) => sum + val * val, 0));
if (magnitude > 0) {
for (let i = 0; i < values.length; i++) {
values[i] /= magnitude;
}
}
const embedding = { values, dimensions, magnitude };
this.embeddingCache.set(cacheKey, embedding);
return embedding;
}
private filterCandidates(filters?: RelevanceQuery['filters']): Map<string, SymbolEmbedding> {
if (!filters) return this.symbolEmbeddings;
const filtered = new Map<string, SymbolEmbedding>();
for (const [key, embedding] of this.symbolEmbeddings) {
let include = true;
if (filters.symbolTypes && !filters.symbolTypes.includes(embedding.features.type)) {
include = false;
}
if (filters.files && !filters.files.some(f => embedding.file.includes(f))) {
include = false;
}
if (filters.minUsageCount && embedding.features.usageCount < filters.minUsageCount) {
include = false;
}
if (filters.maxComplexity && embedding.features.complexity > filters.maxComplexity) {
include = false;
}
if (filters.isPublicAPI !== undefined && embedding.features.isPublicAPI !== filters.isPublicAPI) {
include = false;
}
if (include) {
filtered.set(key, embedding);
}
}
return filtered;
}
private calculateSimilarity(
queryEmbedding: EmbeddingVector,
symbolEmbedding: SymbolEmbedding,
query: RelevanceQuery
): SimilarityResult {
const weights = query.options?.weightFactors || {
semantic: 0.4,
structural: 0.2,
contextual: 0.2,
usage: 0.2
};
const semantic = this.calculateCosineSimilarity(queryEmbedding, symbolEmbedding.embedding);
const structural = this.calculateStructuralRelevance(query, symbolEmbedding.features);
const contextual = this.calculateContextualRelevance(query, symbolEmbedding);
const usage = this.calculateUsageRelevance(symbolEmbedding.features);
const score = (
semantic * weights.semantic +
structural * weights.structural +
contextual * weights.contextual +
usage * weights.usage
);
return {
symbol: symbolEmbedding.symbol,
file: symbolEmbedding.file,
score,
explanation: query.options?.includeExplanation ?
this.generateRelevanceExplanation(query, symbolEmbedding, { semantic, structural, contextual, usage }) : '',
features: { semantic, structural, contextual, usage }
};
}
private calculateCosineSimilarity(a: EmbeddingVector, b: EmbeddingVector): number {
if (a.dimensions !== b.dimensions) return 0;
let dotProduct = 0;
for (let i = 0; i < a.dimensions; i++) {
dotProduct += a.values[i] * b.values[i];
}
return dotProduct; // Already normalized vectors
}
private calculateStructuralRelevance(query: RelevanceQuery, features: SymbolFeatures): number {
let score = 0;
// Prefer simpler symbols for general queries
if (features.complexity < 5) score += 0.2;
// Prefer public APIs
if (features.isPublicAPI) score += 0.3;
// Prefer frequently used symbols
score += Math.min(features.usageCount / 100, 0.3);
// Avoid deprecated symbols
if (features.isDeprecated) score -= 0.4;
return Math.max(0, Math.min(1, score));
}
private calculateContextualRelevance(query: RelevanceQuery, embedding: SymbolEmbedding): number {
let score = 0;
if (query.context?.currentFile && embedding.file === query.context.currentFile) {
score += 0.4; // Same file bonus
}
if (query.context?.recentSymbols?.includes(embedding.symbol)) {
score += 0.3; // Recently used bonus
}
return Math.min(1, score);
}
private calculateUsageRelevance(features: SymbolFeatures): number {
let score = 0;
// Usage frequency
score += Math.min(features.usageCount / 50, 0.4);
// Cross-file usage
score += Math.min(features.crossFileUsage / 10, 0.3);
// Recent usage
score += Math.min(features.recentUsage, 0.3);
return Math.min(1, score);
}
private applyContextBoosting(results: SimilarityResult[], context?: RelevanceQuery['context']): void {
if (!context) return;
for (const result of results) {
let boost = 0;
if (context.currentFile && result.file === context.currentFile) {
boost += 0.1;
}
if (context.recentSymbols?.includes(result.symbol)) {
boost += 0.15;
}
result.score = Math.min(1, result.score + boost);
}
// Re-sort after boosting
results.sort((a, b) => b.score - a.score);
}
private generateRelevanceExplanation(
query: RelevanceQuery,
embedding: SymbolEmbedding,
scores: { semantic: number; structural: number; contextual: number; usage: number }
): string {
const explanations: string[] = [];
if (scores.semantic > 0.7) {
explanations.push('High semantic similarity to query');
}
if (scores.structural > 0.6) {
explanations.push('Good structural match (complexity, API status)');
}
if (scores.contextual > 0.5) {
explanations.push('Relevant to current context');
}
if (scores.usage > 0.6) {
explanations.push('Frequently used symbol');
}
return explanations.join('; ');
}
private generateSimilarityExplanation(
target: SymbolEmbedding,
candidate: SymbolEmbedding,
similarity: number
): string {
const explanations: string[] = [];
if (target.features.type === candidate.features.type) {
explanations.push(`Same type (${target.features.type})`);
}
if (Math.abs(target.features.complexity - candidate.features.complexity) < 2) {
explanations.push('Similar complexity');
}
if (target.features.isPublicAPI === candidate.features.isPublicAPI) {
explanations.push('Same API visibility');
}
return explanations.join('; ');
}
// Helper methods for feature extraction
private tokenize(text: string): string[] {
return text
.toLowerCase()
.replace(/[^a-zA-Z0-9]/g, ' ')
.split(/\s+/)
.filter(token => token.length > 1);
}
private extractCommentTokens(symbolInfo: SymbolInfo): string[] {
// Would extract from actual comments in the code
return [];
}
private extractContextTokens(symbol: string, context: SemanticContext): string[] {
const tokens: string[] = [];
// Extract tokens from related symbols
const relationships = context.relationships.get(symbol) || [];
for (const rel of relationships) {
tokens.push(...this.tokenize(rel.target));
}
return tokens;
}
private calculateComplexity(symbolInfo: SymbolInfo): number {
// Simplified complexity calculation
const lineCount = (symbolInfo.endLine || 0) - (symbolInfo.startLine || 0);
const dependencyCount = symbolInfo.dependencies?.length || 0;
return lineCount / 10 + dependencyCount;
}
private calculateUsageCount(symbol: string, context: SemanticContext): number {
// Would calculate from cross-reference analysis
return 0;
}
private isDeprecated(symbol: string): boolean {
return symbol.includes('deprecated') || symbol.includes('legacy');
}
private calculateInheritanceDepth(symbol: string, context: SemanticContext): number {
// Would calculate from inheritance analysis
return 0;
}
private calculateCouplingScore(symbol: string, context: SemanticContext): number {
// Would calculate from dependency analysis
return 0;
}
private calculateStructuralSimilarity(a: SymbolFeatures, b: SymbolFeatures): number {
let score = 0;
if (a.type === b.type) score += 0.3;
const complexityDiff = Math.abs(a.complexity - b.complexity);
score += Math.max(0, 0.2 - complexityDiff / 10);
const lineDiff = Math.abs(a.lineCount - b.lineCount);
score += Math.max(0, 0.2 - lineDiff / 50);
if (a.isPublicAPI === b.isPublicAPI) score += 0.3;
return Math.min(1, score);
}
private calculateSemanticSimilarity(a: SymbolFeatures, b: SymbolFeatures): number {
const aTokens = new Set([...a.nameTokens, ...a.commentTokens]);
const bTokens = new Set([...b.nameTokens, ...b.commentTokens]);
const intersection = new Set([...aTokens].filter(x => bTokens.has(x)));
const union = new Set([...aTokens, ...bTokens]);
return union.size > 0 ? intersection.size / union.size : 0;
}
private calculateContextualSimilarity(a: SymbolFeatures, b: SymbolFeatures): number {
const aTokens = new Set(a.contextTokens);
const bTokens = new Set(b.contextTokens);
const intersection = new Set([...aTokens].filter(x => bTokens.has(x)));
const union = new Set([...aTokens, ...bTokens]);
return union.size > 0 ? intersection.size / union.size : 0;
}
private calculateUsageSimilarity(a: SymbolFeatures, b: SymbolFeatures): number {
const usageDiff = Math.abs(a.usageCount - b.usageCount);
const maxUsage = Math.max(a.usageCount, b.usageCount);
return maxUsage > 0 ? 1 - (usageDiff / maxUsage) : 1;
}
// Training methods (simplified implementations)
private generateTrainingExamples(): any[] {
// Would generate training examples from user feedback
return [];
}
private async trainEmbeddingModel(examples: any[]): Promise<void> {
// Would implement actual embedding model training
console.log('Training embedding model...');
}
private async trainRankingModel(examples: any[]): Promise<void> {
// Would implement actual ranking model training
console.log('Training ranking model...');
}
private async evaluateModel(): Promise<ModelMetrics> {
// Would evaluate model on test set
return this.getModelMetrics();
}
private async incrementalLearning(): Promise<void> {
// Would implement incremental learning from recent feedback
console.log('Performing incremental learning...');
}
/**
* Clear all cached data
*/
clearCache(): void {
this.embeddingCache.clear();
this.semanticAnalyzer.clearContexts();
}
/**
* Get embedding statistics
*/
getEmbeddingStats(): {
totalEmbeddings: number;
vocabularySize: number;
averageEmbeddingMagnitude: number;
memoryUsage: number;
} {
const totalEmbeddings = this.symbolEmbeddings.size;
const vocabularySize = this.vocabularyIndex.size;
let totalMagnitude = 0;
for (const [, embedding] of this.symbolEmbeddings) {
totalMagnitude += embedding.embedding.magnitude;
}
const averageEmbeddingMagnitude = totalEmbeddings > 0 ? totalMagnitude / totalEmbeddings : 0;
const memoryUsage = totalEmbeddings * 256 * 8; // Rough estimate in bytes
return {
totalEmbeddings,
vocabularySize,
averageEmbeddingMagnitude,
memoryUsage
};
}
}