Metal MCP Server
by aldrin-labs
- src
- core
import natural from 'natural';
import { ContentAnalysis, Topic, KeyPoint, Entity, EntityType, EntityMention, Relationship, Citation, ContentQuality, AnalysisOptions } from '../types/analysis.js';
import { ExtractedContent } from '../types/content.js';
/**
 * Heuristic, keyword- and regex-driven analyzer for extracted page text.
 *
 * `analyze` is the only public entry point. It derives topics, key points,
 * entities, relationships, citations, a sentiment estimate and quality scores
 * from an `ExtractedContent`. Every score produced here is a rule-of-thumb
 * value clamped to a fixed range; none of them is a calibrated probability.
 */
export class ContentAnalyzer {
  /** Standard identifiers such as "FIPS 203", "SP 800-53" or "RFC 8446". Only used with match/replace. */
  private static readonly STANDARD_REFERENCE = /(?:FIPS|SP|RFC)\s+\d+(?:-\d+)?/g;

  /** Algorithm names recognised as entities, with an optional numeric suffix (e.g. "ML-KEM-768"). */
  private static readonly ALGORITHM_NAME =
    /(?:ML-KEM|ML-DSA|SLH-DSA|CRYSTALS-Kyber|CRYSTALS-Dilithium|SPHINCS\+|FALCON)(?:-\d+)?/g;

  /** Fenced (triple-backtick) code blocks. Only used with match/replace. */
  private static readonly FENCED_CODE = /```[\s\S]*?```/g;

  /** Inline (single-backtick) code spans. Only used with match/replace. */
  private static readonly INLINE_CODE = /`[^`]+`/g;

  /** Term pairs treated as related when one side appears in each of two names. */
  private static readonly RELATED_TERM_PAIRS: ReadonlyArray<readonly [string, string]> = [
    ['api', 'wrapper'],
    ['wrapper', 'implementation'],
    ['pattern', 'practice'],
    ['method', 'interface'],
    ['class', 'object'],
    ['error', 'handling'],
    ['authentication', 'security']
  ];

  /** Substrings whose presence marks a section as technical. */
  private static readonly TECHNICAL_INDICATORS: readonly string[] = [
    'example',
    'implementation',
    'usage',
    'api',
    'method',
    'function',
    'parameter',
    'return',
    'class',
    'interface',
    'object',
    'pattern'
  ];

  /** Phrase patterns whose first capture group names a candidate topic, with the weight each match adds. */
  private static readonly TOPIC_INDICATORS: ReadonlyArray<{ pattern: RegExp; weight: number }> = [
    // General technical patterns
    { pattern: /(?:using|implementing|creating)\s+(\w+(?:\s+\w+){0,2})\s+(?:pattern|approach|method)/i, weight: 1.2 },
    { pattern: /(?:best\s+practice|recommended)\s+(?:is|for)\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.1 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+implementation/i, weight: 1.0 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:wrapper|api|interface)/i, weight: 1.0 },
    // Domain-specific patterns
    { pattern: /(?:quantum)\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.3 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:qubit|qubits)/i, weight: 1.3 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:algorithm|computation)/i, weight: 1.2 },
    { pattern: /(?:advances?|developments?|breakthroughs?)\s+in\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.2 }
  ];

  /** A sentence matching any of these reads like a best-practice statement. */
  private static readonly BEST_PRACTICE_INDICATORS: readonly RegExp[] = [
    /\b(?:should|must|recommend|best|practice|important|key|essential|avoid|ensure)\b/i,
    /\b(?:pattern|approach|strategy|technique|principle)\b/i,
    /\b(?:better|improve|optimize|enhance)\b/i,
    /\b(?:common|typical|standard|conventional)\b/i
  ];

  /** A sentence matching any of these reads like implementation guidance. */
  private static readonly IMPLEMENTATION_INDICATORS: readonly RegExp[] = [
    /\b(?:implement|create|build|develop|use|initialize|configure)\b/i,
    /\b(?:method|function|class|interface|object)\b/i,
    /\b(?:parameter|argument|return|value|type)\b/i,
    /\b(?:example|sample|demo|code)\b/i
  ];

  /** Extra weight (value minus 1) added when a technical token contains the given term. */
  private static readonly TERM_WEIGHTS: ReadonlyArray<readonly [string, number]> = [
    ['implementation', 1.2], // Implementation details
    ['pattern', 1.2], // Design patterns
    ['practice', 1.2], // Best practices
    ['test', 1.1], // Testing related
    ['error', 1.1], // Error handling
    ['api', 1.3], // API specific
    ['wrapper', 1.3], // Wrapper specific
    ['method', 1.1], // Method related
    ['class', 1.1] // Class related
  ];

  /** Lower-case substrings that earn a sentence the "guidance" importance bonus. */
  private static readonly GUIDANCE_HINTS: readonly string[] = [
    'should',
    'best practice',
    'recommend',
    'pattern',
    'example'
  ];

  /** Host labels that count as an authoritative top-level (or second-level, e.g. gov.uk) domain. */
  private static readonly AUTHORITATIVE_TLD_LABELS: ReadonlySet<string> = new Set(['gov', 'edu']);

  /** Host labels that count as authoritative wherever they appear in the hostname. */
  private static readonly AUTHORITATIVE_HOST_LABELS: ReadonlySet<string> = new Set(['csrc', 'nist']);

  private static readonly MS_PER_DAY = 1000 * 60 * 60 * 24;

  private tokenizer: natural.WordTokenizer;
  private tfidf: natural.TfIdf;
  private stemmer: typeof natural.PorterStemmer;
  private technicalTerms: Set<string>;
  private boilerplatePatterns: RegExp[];

  /** Lazily built lookup over `natural.stopwords`; see `isStopWord`. */
  private stopWordSet?: Set<string>;

  /** Sentence index of the most recently scanned document text; see `findSupportingEvidence`. */
  private evidenceIndex?: {
    source: string;
    sentences: Array<{ text: string; tokens: Set<string>; technical: boolean }>;
  };

  constructor() {
    this.tokenizer = new natural.WordTokenizer();
    this.tfidf = new natural.TfIdf();
    // The analyzed vocabulary (and the AFINN lexicon used for sentiment) is English,
    // so the English Porter stemmer is used rather than the French variant.
    this.stemmer = natural.PorterStemmer;
    // Initialize technical terms focused on API wrappers and programming
    this.technicalTerms = new Set([
      // API and Design Patterns
      'api', 'wrapper', 'client', 'sdk', 'library', 'interface',
      'endpoint', 'request', 'response', 'http', 'rest', 'soap',
      'facade', 'adapter', 'proxy', 'decorator', 'factory',
      // Implementation Concepts
      'implementation', 'method', 'function', 'class', 'object',
      'parameter', 'argument', 'return', 'async', 'await', 'promise',
      'callback', 'error', 'exception', 'handler', 'middleware',
      // Best Practices
      'pattern', 'practice', 'standard', 'convention', 'principle',
      'solid', 'dry', 'separation', 'concern', 'abstraction',
      'encapsulation', 'inheritance', 'polymorphism',
      // Testing and Quality
      'test', 'mock', 'stub', 'assertion', 'coverage', 'unit',
      'integration', 'validation', 'verification', 'documentation',
      // Common Features
      'authentication', 'authorization', 'security', 'cache',
      'rate', 'limit', 'throttle', 'retry', 'timeout', 'logging'
    ]);
    // Initialize boilerplate patterns
    this.boilerplatePatterns = [
      /copyright/i,
      /all rights reserved/i,
      /terms of service/i,
      /privacy policy/i,
      /cookie policy/i,
      /contact us/i,
      /about us/i,
      /follow us/i,
      /subscribe/i,
      /sign up/i,
      /log in/i,
      /register/i
    ];
  }

  /**
   * Runs the full analysis pipeline over one piece of extracted content.
   *
   * @param content Extracted text plus its URL and optional metadata.
   * @param options Optional limits and thresholds (`maxTopics`, `minConfidence`,
   *   `minImportance`, `maxKeyPoints`).
   * @returns Topics (after merging), key points, entities, relationships,
   *   citations, sentiment, quality scores and an overall relevance score.
   */
  public async analyze(content: ExtractedContent, options: AnalysisOptions = {}): Promise<ContentAnalysis> {
    this.debug('Starting content analysis for URL:', content.url);
    this.debug('Content length:', content.content.length);

    // Start from a fresh index so document 0 is always the content being analyzed,
    // even when the same analyzer instance is reused for several pages.
    this.tfidf = new natural.TfIdf();
    const tokens = this.tokenizeContent(content.content);
    this.tfidf.addDocument(tokens);
    this.debug('Tokenized content length:', tokens.length);

    this.debug('Extracting topics...');
    const topics = await this.extractTopics(content, options);
    this.debug('Found topics:', topics.length, topics.map(t => t.name));

    this.debug('Extracting key points...');
    // Already ranked, deduplicated and capped by extractKeyPoints.
    const keyPoints = this.extractKeyPoints(content, topics, options);
    this.debug('Found key points:', keyPoints.length);

    this.debug('Extracting entities...');
    const entities = this.extractEntities(content);
    this.debug('Found entities:', entities.length);

    const relationships = this.findRelationships(entities, content);
    const sentiment = this.analyzeSentiment(content.content);
    const quality = this.assessQuality(content);

    this.debug('Merging similar topics...');
    const mergedTopics = this.mergeSimilarTopics(topics);
    this.debug('After merging:', mergedTopics.length, mergedTopics.map(t => t.name));

    const result = {
      // Reuse the quality scores computed above instead of assessing the content twice.
      relevanceScore: this.calculateRelevanceScore(content, mergedTopics, quality),
      topics: mergedTopics,
      keyPoints,
      entities,
      sentiment,
      relationships,
      citations: this.extractCitations(content),
      quality
    };
    this.debug('Analysis complete. Topics:', result.topics.length);
    this.debug('Key points:', result.keyPoints.length);
    this.debug('Relevance score:', result.relevanceScore);
    return result;
  }

  /**
   * Emits a diagnostic message on stderr.
   *
   * stderr is used so diagnostics never interleave with data a host process may
   * read from stdout (for example a stdio-based protocol transport).
   */
  private debug(...details: unknown[]): void {
    console.error(...details);
  }

  /** Lower-cases `text` and splits it into word tokens; never returns null. */
  private tokenizeContent(text: string): string[] {
    return this.tokenizer.tokenize(text.toLowerCase()) || [];
  }

  /** True when `text` contains a technical indicator word, a code fence or an inline code span. */
  private isTechnicalContent(text: string): boolean {
    const lowered = text.toLowerCase();
    if (ContentAnalyzer.TECHNICAL_INDICATORS.some(indicator => lowered.includes(indicator))) {
      return true;
    }
    return text.includes('```') || /`[^`]+`/.test(text);
  }

  /** Same selection as `extractKeywords`; kept as a separate name for its call site in topic extraction. */
  private extractTechnicalTermsFromText(text: string): string[] {
    return this.extractKeywords(text);
  }

  /**
   * Returns every word of `text` (one entry per occurrence, lower-cased) that is a
   * known technical term, longer than three characters and not a stop word.
   *
   * NOTE(review): the length filter drops short vocabulary entries such as "api",
   * "sdk" and "dry"; left as-is because topic scores depend on it — confirm intent.
   */
  private extractKeywords(text: string): string[] {
    return text
      .toLowerCase()
      .split(/\W+/)
      .filter(word => word.length > 3 && this.technicalTerms.has(word) && !this.isStopWord(word));
  }

  /**
   * Collects candidate topics section by section (sections are separated by blank
   * lines) and scores them.
   *
   * Evidence comes from three sources: phrase patterns (pattern-specific weight),
   * technical vocabulary in technical sections (0.7 per occurrence) and
   * identifiers found in code (0.8 each). Confidence is `count / 3` capped at 1,
   * boosted when the topic was seen in more than one distinct section or carries
   * more than two keywords, and capped at 1 again.
   *
   * @returns At most `options.maxTopics` (default 8) topics whose confidence is at
   *   least `options.minConfidence` (default 0.15), highest confidence first.
   */
  private async extractTopics(content: ExtractedContent, options: AnalysisOptions): Promise<Topic[]> {
    this.debug('Extracting topics from content...');
    // A zero or missing count falls back to the default; a threshold of 0 is honoured.
    const maxTopics = options.maxTopics || 8;
    const minConfidence = options.minConfidence ?? 0.15;

    const sections = content.content.split(/\n\n+/);
    this.debug(`Found ${sections.length} sections to analyze`);

    const topicMentions = new Map<string, { count: number; sections: Set<number>; keywords: Set<string> }>();
    const recordMention = (name: string, weight: number, sectionIndex: number) => {
      let entry = topicMentions.get(name);
      if (!entry) {
        entry = { count: 0, sections: new Set<number>(), keywords: new Set<string>() };
        topicMentions.set(name, entry);
      }
      entry.count += weight;
      // Distinct section indices, so repeated hits inside one section do not
      // masquerade as "multiple contexts".
      entry.sections.add(sectionIndex);
      return entry;
    };

    sections.forEach((section, index) => {
      this.debug(`Analyzing section ${index + 1}...`);
      const sectionLower = section.toLowerCase();

      // Phrase patterns: the first capture group is the topic name.
      for (const { pattern, weight } of ContentAnalyzer.TOPIC_INDICATORS) {
        const found = sectionLower.match(pattern);
        if (!found || !found[1]) continue;
        const topic = found[1].trim();
        const entry = recordMention(topic, weight, index);
        this.extractKeywords(section).forEach(keyword => entry.keywords.add(keyword));
        this.debug(`Found topic: ${topic} (weight: ${weight})`);
      }

      // Technical vocabulary in sections that look technical.
      if (this.isTechnicalContent(section)) {
        for (const term of this.extractTechnicalTermsFromText(section)) {
          recordMention(term, 0.7, index);
        }
      }

      // Identifiers declared or imported in code (a code fence also contains a backtick).
      if (section.includes('`')) {
        for (const keyword of this.extractCodeKeywords(section)) {
          recordMention(keyword, 0.8, index);
          this.debug(`Found code keyword: ${keyword}`);
        }
      }
    });
    this.debug(`Found ${topicMentions.size} potential topics`);

    const topics: Topic[] = Array.from(topicMentions.entries())
      .map(([name, data]) => {
        let confidence = Math.min(1, data.count / 3);
        if (data.sections.size > 1) {
          confidence *= 1.2;
        }
        if (data.keywords.size > 2) {
          confidence *= 1.1;
        }
        return {
          name,
          confidence: Math.min(1, confidence),
          keywords: Array.from(data.keywords)
        };
      })
      .filter(topic => {
        const meetsThreshold = topic.confidence >= minConfidence;
        this.debug(`Topic ${topic.name}: confidence ${topic.confidence} ${meetsThreshold ? 'accepted' : 'rejected'}`);
        return meetsThreshold;
      })
      .sort((a, b) => b.confidence - a.confidence)
      .slice(0, maxTopics);
    this.debug(`Extracted ${topics.length} topics above confidence threshold`);
    return topics;
  }

  /**
   * Extracts lower-cased identifiers that follow `class`, `function`, `method`,
   * `interface` or `import`, plus module names passed to `require`.
   *
   * @returns Unique identifiers in order of first discovery (pattern by pattern).
   */
  private extractCodeKeywords(text: string): string[] {
    // Built per call so the global regexes always start scanning at index 0.
    const codePatterns = [
      /class\s+(\w+)/g,
      /function\s+(\w+)/g,
      /method\s+(\w+)/g,
      /interface\s+(\w+)/g,
      /import\s+(\w+)/g,
      // Accepts both `require('x')` and the paren-less `require 'x'` form.
      /require\s*\(?\s*['"](.+?)['"]/g
    ];
    const keywords = new Set<string>();
    for (const pattern of codePatterns) {
      let found: RegExpExecArray | null;
      while ((found = pattern.exec(text)) !== null) {
        if (found[1]) {
          keywords.add(found[1].toLowerCase());
        }
      }
    }
    return Array.from(keywords);
  }

  /**
   * Ranks the terms of the document registered by the current `analyze` call
   * (TF-IDF document 0), boosting technical terms by 1.5x.
   *
   * NOTE(review): `text` is not consulted; the ranking always reflects the
   * document indexed by `analyze`.
   */
  private getImportantTerms(text: string): Array<{term: string; score: number}> {
    const ranked: Array<{term: string; score: number}> = [];
    this.tfidf.listTerms(0).forEach(item => {
      const term = item.term;
      if (term.length > 2 && !this.isStopWord(term)) {
        const score = this.technicalTerms.has(term) ? item.tfidf * 1.5 : item.tfidf;
        ranked.push({ term, score });
      }
    });
    return ranked.sort((a, b) => b.score - a.score);
  }

  /**
   * Greedily groups topics that are similar or related to an earlier topic and
   * collapses each group into one topic (best name, highest confidence, union of
   * keywords). Input order determines which topic seeds each group.
   */
  private mergeSimilarTopics(topics: Topic[]): Topic[] {
    const merged: Topic[] = [];
    const processed = new Set<string>();
    for (const topic of topics) {
      if (processed.has(topic.name)) continue;
      // The seed topic always belongs to its own group.
      const group = topics.filter(candidate =>
        !processed.has(candidate.name) &&
        (candidate === topic ||
          this.areTopicsSimilar(topic, candidate) ||
          this.areTopicsRelated(topic, candidate))
      );
      merged.push({
        name: this.selectBestTopicName(group.map(t => t.name)),
        confidence: Math.max(...group.map(t => t.confidence)),
        keywords: Array.from(new Set(group.flatMap(t => t.keywords)))
      });
      group.forEach(t => processed.add(t.name));
    }
    return merged;
  }

  /**
   * Two topics are similar when their names share a stem, or when more than half
   * of the smaller keyword set also appears in the other one.
   */
  private areTopicsSimilar(topic1: Topic, topic2: Topic): boolean {
    if (this.stemmer.stem(topic1.name) === this.stemmer.stem(topic2.name)) return true;
    const keywords1 = new Set(topic1.keywords);
    const keywords2 = new Set(topic2.keywords);
    const smaller = Math.min(keywords1.size, keywords2.size);
    // No keywords on one side means no overlap evidence (and avoids 0/0).
    if (smaller === 0) return false;
    let overlap = 0;
    keywords1.forEach(keyword => {
      if (keywords2.has(keyword)) overlap++;
    });
    return overlap / smaller > 0.5;
  }

  /** True when the two topic names (case-insensitively) contain opposite sides of a related-term pair. */
  private areTopicsRelated(topic1: Topic, topic2: Topic): boolean {
    return this.containsRelatedPair(topic1.name.toLowerCase(), topic2.name.toLowerCase());
  }

  /** Prefers the first name that is itself a technical term; otherwise the longest name. */
  private selectBestTopicName(names: string[]): string {
    const technical = names.find(name => this.technicalTerms.has(name.toLowerCase()));
    if (technical !== undefined) {
      return technical;
    }
    // Sort a copy so the caller's array is left untouched.
    return [...names].sort((a, b) => b.length - a.length)[0];
  }

  /** True when the terms share a stem or contain opposite sides of a related-term pair (case-sensitive). */
  private areTermsRelated(term1: string, term2: string): boolean {
    if (this.stemmer.stem(term1) === this.stemmer.stem(term2)) return true;
    return this.containsRelatedPair(term1, term2);
  }

  /** Checks both orientations of every related-term pair against the two strings. */
  private containsRelatedPair(first: string, second: string): boolean {
    return ContentAnalyzer.RELATED_TERM_PAIRS.some(([left, right]) =>
      (first.includes(left) && second.includes(right)) ||
      (first.includes(right) && second.includes(left))
    );
  }

  /** Capitalizes the first technical term among `mainTerm` and `relatedTerms`, falling back to `mainTerm`. */
  private selectTopicName(mainTerm: string, relatedTerms: string[]): string {
    const chosen = [mainTerm, ...relatedTerms].find(term => this.technicalTerms.has(term)) ?? mainTerm;
    return chosen.charAt(0).toUpperCase() + chosen.slice(1);
  }

  /**
   * Extracts the most important sentences of the content.
   *
   * Paragraphs are classified as best-practice and/or implementation sections;
   * qualifying sentences get a 1.3x / 1.2x importance boost respectively, and
   * implementation key points additionally carry the section's code examples as
   * evidence. Sentences of all other paragraphs must pass `isInsightful`.
   *
   * @returns Up to `options.maxKeyPoints` (default 15) distinct key points whose
   *   importance is at least `options.minImportance` (default 0.25), most
   *   important first.
   */
  private extractKeyPoints(content: ExtractedContent, topics: Topic[], options: AnalysisOptions): KeyPoint[] {
    const paragraphs = content.content.split(/\n\n+/);
    // A threshold of 0 is honoured; a zero or missing count falls back to the default.
    const minImportance = options.minImportance ?? 0.25;
    const maxKeyPoints = options.maxKeyPoints || 15;

    const toSentences = (block: string): string[] =>
      block.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 20);

    // Collected separately and concatenated below so that equally important points
    // keep the order: best practices, implementation guidance, everything else.
    const bestPracticePoints: KeyPoint[] = [];
    const implementationPoints: KeyPoint[] = [];
    const otherPoints: KeyPoint[] = [];

    for (const paragraph of paragraphs) {
      const isBestPracticeSection = /best\s+practices?|recommended|should|must|guidelines?/i.test(paragraph);
      const isImplementationSection =
        /implementation|example|usage|how\s+to|approach/i.test(paragraph) ||
        paragraph.includes('```') ||
        /\b(function|class|method|interface)\b/.test(paragraph);
      const sentences = toSentences(paragraph);

      if (isBestPracticeSection) {
        for (const sentence of sentences) {
          if (!this.isBestPracticeStatement(sentence)) continue;
          const importance = this.calculateSentenceImportance(sentence, topics) * 1.3;
          if (importance < minImportance) continue;
          bestPracticePoints.push({
            text: sentence,
            importance,
            topics: this.findRelatedTopics(sentence, topics),
            supportingEvidence: this.findSupportingEvidence(sentence, content)
          });
        }
      }

      if (isImplementationSection) {
        for (const sentence of sentences) {
          if (!this.isImplementationGuidance(sentence)) continue;
          const importance = this.calculateSentenceImportance(sentence, topics) * 1.2;
          if (importance < minImportance) continue;
          implementationPoints.push({
            text: sentence,
            importance,
            topics: this.findRelatedTopics(sentence, topics),
            supportingEvidence: [
              ...this.findSupportingEvidence(sentence, content),
              ...this.extractCodeExamples(paragraph)
            ]
          });
        }
      }

      if (!isBestPracticeSection && !isImplementationSection) {
        for (const sentence of sentences) {
          const importance = this.calculateSentenceImportance(sentence, topics);
          if (importance < minImportance || !this.isInsightful(sentence)) continue;
          otherPoints.push({
            text: sentence,
            importance,
            topics: this.findRelatedTopics(sentence, topics),
            supportingEvidence: this.findSupportingEvidence(sentence, content)
          });
        }
      }
    }

    const ranked = [...bestPracticePoints, ...implementationPoints, ...otherPoints]
      .sort((a, b) => b.importance - a.importance);
    // Deduplicate while filling the limit: a paragraph that is both a best-practice
    // and an implementation section yields each sentence twice, and those
    // duplicates must not use up result slots.
    return this.deduplicateKeyPoints(ranked, maxKeyPoints);
  }

  /** True when the sentence matches a best-practice indicator and is not boilerplate. */
  private isBestPracticeStatement(sentence: string): boolean {
    return ContentAnalyzer.BEST_PRACTICE_INDICATORS.some(pattern => pattern.test(sentence)) &&
      !this.isBoilerplate(sentence);
  }

  /** True when the sentence matches an implementation indicator and is not boilerplate. */
  private isImplementationGuidance(sentence: string): boolean {
    return ContentAnalyzer.IMPLEMENTATION_INDICATORS.some(pattern => pattern.test(sentence)) &&
      !this.isBoilerplate(sentence);
  }

  /**
   * True for sentences longer than 30 characters that contain at least two
   * technical-term tokens, at least one letter, and no boilerplate phrase.
   */
  private isInsightful(sentence: string): boolean {
    if (sentence.length <= 30 || this.isBoilerplate(sentence)) return false;
    // Reject strings made only of non-letters.
    if (/^\s*[^a-zA-Z]*\s*$/.test(sentence)) return false;
    const technicalTermCount = this.tokenizeContent(sentence)
      .filter(token => this.technicalTerms.has(token)).length;
    return technicalTermCount >= 2;
  }

  /**
   * Returns all fenced code blocks of `text` followed by all inline code spans
   * found outside those blocks (each including its backticks).
   */
  private extractCodeExamples(text: string): string[] {
    const fenced = text.match(ContentAnalyzer.FENCED_CODE) ?? [];
    // Remove the fences first; otherwise their backticks pair up with neighbouring
    // ones and produce bogus inline spans.
    const outsideFences = text.replace(ContentAnalyzer.FENCED_CODE, ' ');
    const inline = outsideFences.match(ContentAnalyzer.INLINE_CODE) ?? [];
    return [...fenced, ...inline];
  }

  /**
   * Keeps the first occurrence of each distinct key point, in input order.
   *
   * Two points are duplicates when their normalized texts are equal or their
   * word-set Jaccard similarity exceeds 0.8.
   *
   * @param keyPoints Candidate points, typically sorted by importance.
   * @param limit Stop once this many distinct points were collected (default: no limit).
   */
  private deduplicateKeyPoints(keyPoints: KeyPoint[], limit: number = Infinity): KeyPoint[] {
    const unique: KeyPoint[] = [];
    const seen = new Set<string>();
    for (const point of keyPoints) {
      if (unique.length >= limit) break;
      const normalized = this.normalizeText(point.text);
      if (seen.has(normalized) || this.hasVerySimilarPoint(normalized, seen)) continue;
      unique.push(point);
      seen.add(normalized);
    }
    return unique;
  }

  /** Lower-cases, drops punctuation, then collapses whitespace runs to single spaces. */
  private normalizeText(text: string): string {
    // Punctuation goes first so that removing it cannot leave double spaces behind.
    return text.toLowerCase()
      .replace(/[^\w\s]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /** True when `text` has a Jaccard similarity above 0.8 with any entry of `seen`. */
  private hasVerySimilarPoint(text: string, seen: Set<string>): boolean {
    for (const existing of seen) {
      if (this.calculateTextSimilarity(text, existing) > 0.8) return true;
    }
    return false;
  }

  /** Jaccard similarity of the space-separated word sets of the two texts. */
  private calculateTextSimilarity(text1: string, text2: string): number {
    const words1 = new Set(text1.split(' '));
    const words2 = new Set(text2.split(' '));
    let shared = 0;
    words1.forEach(word => {
      if (words2.has(word)) shared++;
    });
    // |A ∪ B| = |A| + |B| − |A ∩ B|; never 0 because split() yields at least one entry.
    return shared / (words1.size + words2.size - shared);
  }

  /**
   * Scores a sentence between 0 and 1 from weighted technical terms, topic keyword
   * hits (0.8 x topic confidence each), technical-term density, code-like content
   * (+0.3) and guidance wording (+0.2).
   */
  private calculateSentenceImportance(sentence: string, topics: Topic[]): number {
    const tokens = this.tokenizeContent(sentence);
    const tokenSet = new Set(tokens);
    let importance = 0;
    let technicalTermCount = 0;

    for (const token of tokens) {
      if (!this.technicalTerms.has(token)) continue;
      technicalTermCount++;
      for (const [term, weight] of ContentAnalyzer.TERM_WEIGHTS) {
        if (token.includes(term)) {
          importance += weight - 1; // only the part above the neutral weight of 1
        }
      }
    }

    for (const topic of topics) {
      for (const keyword of topic.keywords) {
        if (tokenSet.has(keyword.toLowerCase())) {
          importance += topic.confidence * 0.8;
        }
      }
    }

    // A sentence without word tokens has no density (and must not yield NaN).
    if (tokens.length > 0) {
      importance += (technicalTermCount / tokens.length) * 0.5;
    }

    const looksLikeCode = sentence.includes('`') ||
      /\b(function|class|const|let|var|import|export)\b/.test(sentence);
    if (looksLikeCode) {
      importance += 0.3;
    }

    const lowered = sentence.toLowerCase();
    if (ContentAnalyzer.GUIDANCE_HINTS.some(hint => lowered.includes(hint))) {
      importance += 0.2;
    }
    return Math.min(importance, 1);
  }

  /** Names of the topics that have at least one keyword among the sentence's tokens. */
  private findRelatedTopics(sentence: string, topics: Topic[]): string[] {
    const tokenSet = new Set(this.tokenizeContent(sentence));
    return topics
      .filter(topic => topic.keywords.some(keyword => tokenSet.has(keyword.toLowerCase())))
      .map(topic => topic.name);
  }

  /**
   * Returns the other sentences of the document that contain a technical term and
   * share at least two tokens with `sentence` (a token repeated in `sentence`
   * counts once per repetition).
   */
  private findSupportingEvidence(sentence: string, content: ExtractedContent): string[] {
    const tokens = this.tokenizeContent(sentence);
    const evidence: string[] = [];
    for (const candidate of this.indexSentences(content.content)) {
      if (candidate.text === sentence || !candidate.technical) continue;
      let sharedTerms = 0;
      for (const token of tokens) {
        if (candidate.tokens.has(token)) sharedTerms++;
      }
      if (sharedTerms >= 2) {
        evidence.push(candidate.text);
      }
    }
    return evidence;
  }

  /**
   * Splits `text` into sentences and tokenizes each one once. The result is cached
   * for the most recently seen text because `findSupportingEvidence` is called
   * once per key-point candidate against the same document.
   */
  private indexSentences(text: string): Array<{ text: string; tokens: Set<string>; technical: boolean }> {
    if (this.evidenceIndex && this.evidenceIndex.source === text) {
      return this.evidenceIndex.sentences;
    }
    const sentences = text
      .split(/[.!?]+/)
      .map(s => s.trim())
      .filter(s => s.length > 0)
      .map(s => {
        const sentenceTokens = this.tokenizeContent(s);
        return {
          text: s,
          tokens: new Set(sentenceTokens),
          technical: sentenceTokens.some(t => this.technicalTerms.has(t))
        };
      });
    this.evidenceIndex = { source: text, sentences };
    return sentences;
  }

  /**
   * Finds standards and algorithm names in the content.
   *
   * @returns One entity per distinct name (standards first, then algorithms, each
   *   in order of first appearance) carrying all of its mentions.
   */
  private extractEntities(content: ExtractedContent): Entity[] {
    const entities: Entity[] = [];
    const text = content.content;
    const collect = (pattern: RegExp, type: EntityType): void => {
      // A Set keeps first-appearance order while dropping repeated names, so a name
      // mentioned several times still yields exactly one entity.
      const names = new Set<string>(text.match(pattern) ?? []);
      names.forEach(name => {
        entities.push({ name, type, mentions: this.findMentions(text, name) });
      });
    };
    collect(ContentAnalyzer.STANDARD_REFERENCE, 'standard' as EntityType);
    collect(ContentAnalyzer.ALGORITHM_NAME, 'algorithm' as EntityType);
    return entities;
  }

  /** Every occurrence of `term` in `text` with its position and up to 50 characters of context on each side. */
  private findMentions(text: string, term: string): EntityMention[] {
    return this.findOccurrences(text, term).map(pos => ({
      text: term,
      position: {
        start: pos,
        end: pos + term.length
      },
      context: text.substring(Math.max(0, pos - 50), Math.min(text.length, pos + term.length + 50))
    }));
  }

  /** Ascending start offsets of all (possibly overlapping) occurrences of `term`; empty for an empty term. */
  private findOccurrences(text: string, term: string): number[] {
    const positions: number[] = [];
    // indexOf('') matches everywhere and would never terminate the scan below.
    if (term.length === 0) return positions;
    for (let pos = text.indexOf(term); pos !== -1; pos = text.indexOf(term, pos + 1)) {
      positions.push(pos);
    }
    return positions;
  }

  /**
   * Links each standard to every algorithm mentioned within 100 characters of it
   * (start offset to start offset). Confidence falls linearly from 1 at distance 0.
   */
  private findRelationships(entities: Entity[], content: ExtractedContent): Relationship[] {
    const relationships: Relationship[] = [];
    const text = content.content;
    const standards = entities.filter(entity => entity.type === 'standard');
    const algorithms = entities.filter(entity => entity.type === 'algorithm');
    for (const standard of standards) {
      for (const algorithm of algorithms) {
        const distance = this.findMinDistance(text, standard.name, algorithm.name);
        if (distance < 100) {
          relationships.push({
            source: standard.name,
            target: algorithm.name,
            type: 'specifies',
            confidence: 1 - (distance / 100)
          });
        }
      }
    }
    return relationships;
  }

  /**
   * Smallest distance between the start offsets of any occurrence of `term1` and
   * any occurrence of `term2`; `Infinity` when either term does not occur.
   */
  private findMinDistance(text: string, term1: string, term2: string): number {
    const first = this.findOccurrences(text, term1);
    const second = this.findOccurrences(text, term2);
    let minDistance = Infinity;
    // Both lists are ascending, so advancing the smaller offset visits the closest pair.
    let i = 0;
    let j = 0;
    while (i < first.length && j < second.length) {
      const gap = first[i] - second[j];
      minDistance = Math.min(minDistance, Math.abs(gap));
      if (gap === 0) break;
      if (gap < 0) {
        i++;
      } else {
        j++;
      }
    }
    return minDistance;
  }

  /**
   * Estimates sentiment with natural's AFINN-based analyzer.
   *
   * @returns `score` clamped to [-1, 1], `confidence` as |raw score| / 5 capped at
   *   1, and an empty `aspects` list. Text without tokens scores 0 / 0.
   */
  private analyzeSentiment(text: string) {
    const tokens = this.tokenizeContent(text);
    if (tokens.length === 0) {
      // Nothing to score; avoids a 0/0 average inside the analyzer.
      return { score: 0, confidence: 0, aspects: [] };
    }
    const analyzer = new natural.SentimentAnalyzer(
      'English',
      natural.PorterStemmer,
      'afinn'
    );
    const score = analyzer.getSentiment(tokens);
    return {
      score: Math.max(-1, Math.min(1, score)), // Normalize to [-1, 1]
      confidence: Math.min(1, Math.abs(score) / 5), // Simple confidence calculation
      aspects: [] // Could be enhanced with aspect-based sentiment analysis
    };
  }

  /** Bundles the individual quality heuristics for the content. */
  private assessQuality(content: ExtractedContent): ContentQuality {
    return {
      readability: this.calculateReadabilityScore(content.content),
      informationDensity: this.calculateInformationDensity(content),
      technicalDepth: this.calculateTechnicalDepth(content),
      credibilityScore: this.calculateCredibilityScore(content),
      freshness: this.calculateFreshnessScore(content)
    };
  }

  /**
   * Maps the Flesch-Kincaid grade level of `text` to a 0-1 score
   * (`1 - grade / 20`, clamped). Text without any words scores 0.
   */
  private calculateReadabilityScore(text: string): number {
    const wordCount = text.split(/\s+/).filter(word => word.length > 0).length;
    if (wordCount === 0) return 0;
    // Ignore the empty fragments split() leaves around terminators; text without
    // any terminator still counts as one sentence.
    const sentenceCount = Math.max(
      1,
      text.split(/[.!?]+/).filter(fragment => fragment.trim().length > 0).length
    );
    const syllables = this.countSyllables(text);
    // Flesch-Kincaid Grade Level
    const grade = 0.39 * (wordCount / sentenceCount) + 11.8 * (syllables / wordCount) - 15.59;
    return Math.max(0, Math.min(1, 1 - (grade / 20)));
  }

  /** Sum of the estimated syllables of every whitespace-separated word in `text`. */
  private countSyllables(text: string): number {
    return text
      .split(/\s+/)
      .filter(word => word.length > 0)
      .reduce((total, word) => total + this.countWordSyllables(word), 0);
  }

  /**
   * Estimates the syllables of one word by counting vowel groups after removing
   * common silent endings. Always returns at least 1.
   */
  private countWordSyllables(word: string): number {
    // Drop punctuation and digits so trailing characters cannot hide a silent ending.
    let letters = word.toLowerCase().replace(/[^a-z]/g, '');
    if (letters.length <= 3) return 1;
    letters = letters.replace(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '');
    letters = letters.replace(/^y/, '');
    const vowelGroups = letters.match(/[aeiouy]{1,2}/g);
    return vowelGroups ? vowelGroups.length : 1;
  }

  /** Share of technical tokens, scaled so that 20% technical tokens yields 1. Empty content scores 0. */
  private calculateInformationDensity(content: ExtractedContent): number {
    const tokens = this.tokenizeContent(content.content);
    if (tokens.length === 0) return 0;
    const technicalCount = tokens.filter(t => this.technicalTerms.has(t)).length;
    return Math.min(1, technicalCount / (tokens.length * 0.2));
  }

  /** Number of distinct technical terms used, scaled so that 20 distinct terms yields 1. */
  private calculateTechnicalDepth(content: ExtractedContent): number {
    const distinct = new Set(
      this.tokenizeContent(content.content).filter(t => this.technicalTerms.has(t))
    );
    return Math.min(1, distinct.size / 20);
  }

  /**
   * Starts at 0.5 and adds 0.2 for an authoritative host, 0.1 when the content has
   * citations, and up to 0.2 in proportion to the share of technical tokens.
   */
  private calculateCredibilityScore(content: ExtractedContent): number {
    let score = 0.5; // Base score
    if (this.isAuthoritativeSource(content.url)) {
      score += 0.2;
    }
    if (this.extractCitations(content).length > 0) {
      score += 0.1;
    }
    const tokens = this.tokenizeContent(content.content);
    if (tokens.length > 0) {
      const technicalTermRatio = tokens.filter(t => this.technicalTerms.has(t)).length / tokens.length;
      score += technicalTermRatio * 0.2;
    }
    return Math.min(1, score);
  }

  /**
   * True when the URL's host is under .gov/.edu (including country variants such
   * as .gov.uk) or contains a `csrc` or `nist` label. Only the hostname is
   * inspected, so paths and query strings cannot earn the bonus.
   */
  private isAuthoritativeSource(url: string): boolean {
    const hostname = this.parseHostname(url);
    if (hostname === undefined) return false;
    const labels = hostname.split('.').filter(label => label.length > 0);
    const last = labels[labels.length - 1];
    const secondLast = labels[labels.length - 2];
    const underAuthoritativeTld =
      (last !== undefined && ContentAnalyzer.AUTHORITATIVE_TLD_LABELS.has(last)) ||
      (secondLast !== undefined && last !== undefined && last.length === 2 &&
        ContentAnalyzer.AUTHORITATIVE_TLD_LABELS.has(secondLast));
    return underAuthoritativeTld ||
      labels.some(label => ContentAnalyzer.AUTHORITATIVE_HOST_LABELS.has(label));
  }

  /** Lower-cased hostname of `url`, tolerating a missing scheme; `undefined` when it cannot be parsed. */
  private parseHostname(url: string): string | undefined {
    for (const candidate of [url, `https://${url}`]) {
      try {
        return new URL(candidate).hostname.toLowerCase();
      } catch {
        // Not parseable in this form; try the next candidate.
      }
    }
    return undefined;
  }

  /**
   * Linear freshness: 1 for content published now (or in the future), 0 at one
   * year or older. Missing or unparseable dates score a neutral 0.5.
   */
  private calculateFreshnessScore(content: ExtractedContent): number {
    const datePublished = content.metadata?.datePublished;
    if (!datePublished) return 0.5;
    const publishedAt = new Date(datePublished).getTime();
    if (Number.isNaN(publishedAt)) return 0.5;
    const ageInDays = (Date.now() - publishedAt) / ContentAnalyzer.MS_PER_DAY;
    return Math.max(0, Math.min(1, 1 - (ageInDays / 365)));
  }

  /** Every standard reference and every http(s) URL in the content, one citation per occurrence. */
  private extractCitations(content: ExtractedContent): Citation[] {
    const text = content.content;
    const citations: Citation[] = [];
    for (const ref of text.match(ContentAnalyzer.STANDARD_REFERENCE) ?? []) {
      citations.push({
        text: ref,
        type: 'standard'
      });
    }
    for (const url of text.match(/https?:\/\/[^\s)]+/g) ?? []) {
      citations.push({
        text: url,
        type: 'url',
        source: url
      });
    }
    return citations;
  }

  /** Case-insensitive membership test against natural's stop-word list. */
  private isStopWord(word: string): boolean {
    if (!this.stopWordSet) {
      // Built on first use: one hash lookup per word instead of a linear array scan.
      this.stopWordSet = new Set(natural.stopwords);
    }
    return this.stopWordSet.has(word.toLowerCase());
  }

  /**
   * Overall relevance: 60% mean topic confidence, 20% technical depth and 20%
   * information density, capped at 1.
   *
   * @param quality Precomputed quality scores; assessed from `content` when omitted.
   */
  private calculateRelevanceScore(
    content: ExtractedContent,
    topics: Topic[],
    quality: ContentQuality = this.assessQuality(content)
  ): number {
    const topicScore = topics.reduce((sum, topic) => sum + topic.confidence, 0) / (topics.length || 1);
    return Math.min(
      1,
      (topicScore * 0.6) +
      (quality.technicalDepth * 0.2) +
      (quality.informationDensity * 0.2)
    );
  }

  /** True when `text` matches any boilerplate pattern (legal notices, sign-up prompts, ...). */
  private isBoilerplate(text: string): boolean {
    return this.boilerplatePatterns.some(pattern => pattern.test(text));
  }
}