Skip to main content
Glama
token-estimator.ts•9.3 kB
/**
 * Token Estimation Utility for Context Curator
 *
 * Provides heuristic token count estimation for text content and files, plus
 * token budget validation. The estimates are approximations (no real tokenizer
 * is invoked) and deliberately round up so budgets are not under-counted.
 *
 * Features:
 * - Basic text token estimation using a character-to-token ratio
 * - Word-count based estimation as a cross-check
 * - File-specific token estimation including path overhead
 * - Token budget validation with utilization metrics
 * - Per-content-type formatting overhead (xml, json, markdown, code, plain)
 */

/** Result of {@link TokenEstimator.estimateTokensAdvanced}. */
export interface TokenEstimationResult {
  estimatedTokens: number;
  confidence: 'high' | 'medium' | 'low';
  method: 'character_ratio' | 'word_count' | 'hybrid';
  breakdown?: {
    contentTokens: number;
    pathTokens?: number;
    metadataTokens?: number;
    formattingTokens?: number;
  };
}

/** Result of {@link TokenEstimator.validateTokenBudget}. */
export interface TokenBudgetValidation {
  isValid: boolean;
  utilizationPercentage: number;
  remainingTokens: number;
  recommendedAction: 'proceed' | 'optimize' | 'reduce_scope';
  warningLevel: 'none' | 'low' | 'medium' | 'high' | 'critical';
}

/** Result of {@link TokenEstimator.estimateFileTokens}. */
export interface FileTokenEstimate {
  filePath: string;
  contentTokens: number;
  pathTokens: number;
  totalTokens: number;
  confidence: 'high' | 'medium' | 'low';
  estimationMethod: string;
}

/**
 * Token Estimation Utility Class
 *
 * A collection of static helpers that estimate token counts with two simple
 * heuristics (characters per token and words per token) and combine them into
 * a hybrid estimate with a confidence rating.
 */
export class TokenEstimator {
  /** Heuristic number of characters that make up one token. */
  private static readonly CHARS_PER_TOKEN = 4;

  /** Heuristic number of words per token (i.e. one word is ~1.33 tokens). */
  private static readonly WORDS_PER_TOKEN = 0.75;

  /** Multiplicative token overhead applied per content type. */
  private static readonly OVERHEAD_RATIOS = {
    xml: 1.15, // XML formatting adds ~15% overhead
    json: 1.10, // JSON formatting adds ~10% overhead
    markdown: 1.05, // Markdown formatting adds ~5% overhead
    code: 1.20, // Code content adds ~20% overhead due to syntax
    plain: 1.0 // Plain text has no overhead
  };

  /**
   * Estimate tokens for text using the character-based heuristic.
   *
   * Runs of whitespace are collapsed to a single space and the text is trimmed
   * before counting, so indentation and blank lines do not inflate the result.
   *
   * @param text - Text to estimate.
   * @returns Estimated token count, rounded up; 0 for empty input.
   */
  static estimateTokens(text: string): number {
    if (!text || text.length === 0) return 0;

    const normalizedText = text.replace(/\s+/g, ' ').trim();
    return Math.ceil(normalizedText.length / this.CHARS_PER_TOKEN);
  }

  /**
   * Estimate tokens for text using the word-based heuristic.
   *
   * @param text - Text to estimate.
   * @returns Estimated token count, rounded up; 0 for empty or whitespace-only input.
   */
  static estimateTokensByWords(text: string): number {
    if (!text || text.length === 0) return 0;

    const trimmed = text.trim();
    if (trimmed.length === 0) return 0;

    const words = trimmed.split(/\s+/).length;
    return Math.ceil(words / this.WORDS_PER_TOKEN);
  }

  /**
   * Hybrid token estimation with a breakdown and a confidence rating.
   *
   * The base estimate is the rounded-up average of the character-based and
   * word-based estimates; the content-type overhead is then applied on top.
   *
   * @param text - Text to estimate.
   * @param contentType - Content type selecting the overhead ratio (default `'plain'`).
   * @returns The estimate, its confidence, and the content/formatting split.
   */
  static estimateTokensAdvanced(
    text: string,
    contentType: keyof typeof TokenEstimator.OVERHEAD_RATIOS = 'plain'
  ): TokenEstimationResult {
    if (!text || text.length === 0) {
      return {
        estimatedTokens: 0,
        confidence: 'high',
        method: 'character_ratio',
        breakdown: { contentTokens: 0 }
      };
    }

    const charBasedTokens = this.estimateTokens(text);
    const wordBasedTokens = this.estimateTokensByWords(text);
    const baseTokens = Math.ceil((charBasedTokens + wordBasedTokens) / 2);

    // Apply the overhead as an integer percentage. Multiplying by the raw ratio
    // suffers from binary floating-point error (10 * 1.1 === 11.000000000000002),
    // which Math.ceil would turn into an off-by-one overestimate.
    const overheadPercent = Math.round(this.OVERHEAD_RATIOS[contentType] * 100);
    const finalTokens = Math.ceil((baseTokens * overheadPercent) / 100);

    const confidence = this.determineConfidence(text, charBasedTokens, wordBasedTokens);

    return {
      estimatedTokens: finalTokens,
      confidence,
      method: 'hybrid',
      breakdown: {
        contentTokens: baseTokens,
        formattingTokens: finalTokens - baseTokens
      }
    };
  }

  /**
   * Estimate tokens for a file: its content plus the tokens of its path.
   *
   * @param filePath - Path of the file; also used to detect the content type.
   * @param content - File content.
   * @param contentType - Explicit content type; detected from the file extension when omitted.
   * @returns Content, path and total token estimates for the file.
   */
  static estimateFileTokens(
    filePath: string,
    content: string,
    contentType?: keyof typeof TokenEstimator.OVERHEAD_RATIOS
  ): FileTokenEstimate {
    const pathTokens = this.estimateTokens(filePath);
    const detectedType = contentType ?? this.detectContentType(filePath);
    const contentEstimation = this.estimateTokensAdvanced(content, detectedType);

    return {
      filePath,
      contentTokens: contentEstimation.estimatedTokens,
      pathTokens,
      totalTokens: contentEstimation.estimatedTokens + pathTokens,
      confidence: contentEstimation.confidence,
      estimationMethod: `${contentEstimation.method}_with_path`
    };
  }

  /**
   * Validate token usage against a budget.
   *
   * Warning levels by utilization: >= 100% critical / reduce_scope,
   * >= 90% high / optimize, >= 75% medium / optimize, >= 60% low / proceed,
   * otherwise none / proceed.
   *
   * A non-positive `maxBudget` has no meaningful ratio, so utilization is
   * reported as 100 when the estimate exceeds the budget and 0 otherwise,
   * instead of the `NaN`/`Infinity` a plain division would produce.
   *
   * @param estimatedTokens - Tokens the operation is expected to use.
   * @param maxBudget - Maximum tokens allowed.
   * @returns Validity, utilization (2 decimals), remaining tokens and recommendation.
   */
  static validateTokenBudget(
    estimatedTokens: number,
    maxBudget: number
  ): TokenBudgetValidation {
    const isValid = estimatedTokens <= maxBudget;
    const remainingTokens = maxBudget - estimatedTokens;

    let utilizationPercentage: number;
    if (maxBudget > 0) {
      utilizationPercentage = (estimatedTokens / maxBudget) * 100;
    } else {
      utilizationPercentage = isValid ? 0 : 100;
    }

    let warningLevel: TokenBudgetValidation['warningLevel'] = 'none';
    let recommendedAction: TokenBudgetValidation['recommendedAction'] = 'proceed';

    if (utilizationPercentage >= 100) {
      warningLevel = 'critical';
      recommendedAction = 'reduce_scope';
    } else if (utilizationPercentage >= 90) {
      warningLevel = 'high';
      recommendedAction = 'optimize';
    } else if (utilizationPercentage >= 75) {
      warningLevel = 'medium';
      recommendedAction = 'optimize';
    } else if (utilizationPercentage >= 60) {
      warningLevel = 'low';
      recommendedAction = 'proceed';
    }

    return {
      isValid,
      utilizationPercentage: this.roundToTwoDecimals(utilizationPercentage),
      remainingTokens,
      recommendedAction,
      warningLevel
    };
  }

  /**
   * Estimate tokens for several files and aggregate the result.
   *
   * @param files - Files to estimate, each with a path and its content.
   * @returns Total tokens, per-file estimates, and a budget recommendation
   *   (`requires_large_budget` above 100000 tokens, `requires_medium_budget`
   *   above 50000, otherwise `suitable_for_standard_budget`).
   */
  static estimateMultipleFiles(files: Array<{ path: string; content: string }>): {
    totalTokens: number;
    fileEstimates: FileTokenEstimate[];
    budgetRecommendation: string;
  } {
    const fileEstimates = files.map(file =>
      this.estimateFileTokens(file.path, file.content)
    );

    const totalTokens = fileEstimates.reduce(
      (sum, estimate) => sum + estimate.totalTokens,
      0
    );

    let budgetRecommendation = 'suitable_for_standard_budget';
    if (totalTokens > 100000) {
      budgetRecommendation = 'requires_large_budget';
    } else if (totalTokens > 50000) {
      budgetRecommendation = 'requires_medium_budget';
    }

    return { totalTokens, fileEstimates, budgetRecommendation };
  }

  /**
   * Rate confidence by how closely the two estimation methods agree.
   *
   * The relative difference is measured against the larger estimate:
   * below 10% is `'high'`, below 30% is `'medium'`, anything else `'low'`.
   */
  private static determineConfidence(
    text: string,
    charTokens: number,
    wordTokens: number
  ): 'high' | 'medium' | 'low' {
    const largerEstimate = Math.max(charTokens, wordTokens);

    // Both estimates are zero (e.g. whitespace-only text): they agree exactly,
    // and dividing would yield NaN and wrongly fall through to 'low'.
    if (largerEstimate === 0) return 'high';

    const variance = Math.abs(charTokens - wordTokens) / largerEstimate;

    if (variance < 0.1) return 'high';
    if (variance < 0.3) return 'medium';
    return 'low';
  }

  /**
   * Detect the content type from the file extension.
   *
   * Only the final path segment is inspected, and only when it contains a dot,
   * so an extensionless file that happens to be named like an extension
   * (e.g. `json`, `go`) is treated as plain text.
   */
  private static detectContentType(filePath: string): keyof typeof TokenEstimator.OVERHEAD_RATIOS {
    const baseName = filePath.split(/[\\/]/).pop() ?? '';
    const dotIndex = baseName.lastIndexOf('.');
    if (dotIndex === -1) return 'plain';

    const extension = baseName.slice(dotIndex + 1).toLowerCase();

    switch (extension) {
      case 'xml':
      case 'html':
      case 'xhtml':
        return 'xml';
      case 'json':
      case 'jsonl':
        return 'json';
      case 'md':
      case 'markdown':
      case 'rst':
        return 'markdown';
      case 'js':
      case 'ts':
      case 'jsx':
      case 'tsx':
      case 'py':
      case 'java':
      case 'cpp':
      case 'c':
      case 'cs':
      case 'php':
      case 'rb':
      case 'go':
      case 'rs':
      case 'swift':
      case 'kt':
        return 'code';
      default:
        return 'plain';
    }
  }

  /** Round a number to two decimal places. */
  private static roundToTwoDecimals(value: number): number {
    return Math.round(value * 100) / 100;
  }

  /**
   * Collect raw statistics about a text for debugging and tuning.
   *
   * Empty or whitespace-only text reports zero words and zero averages.
   * `lineCount` is the number of `'\n'`-separated segments, so it is at
   * least 1 even for an empty string.
   *
   * @param text - Text to analyse.
   * @returns Character, word and line counts, both token estimates, and
   *   per-line / per-word averages rounded to two decimals.
   */
  static getEstimationStats(text: string): {
    characterCount: number;
    wordCount: number;
    lineCount: number;
    charBasedTokens: number;
    wordBasedTokens: number;
    averageWordsPerLine: number;
    averageCharsPerWord: number;
  } {
    const lineCount = text.split('\n').length;

    // ''.split(/\s+/) yields [''], which would count one phantom word.
    const trimmed = text.trim();
    const wordCount = trimmed.length === 0 ? 0 : trimmed.split(/\s+/).length;

    return {
      characterCount: text.length,
      wordCount,
      lineCount,
      charBasedTokens: this.estimateTokens(text),
      wordBasedTokens: this.estimateTokensByWords(text),
      averageWordsPerLine: this.roundToTwoDecimals(wordCount / lineCount),
      averageCharsPerWord:
        wordCount === 0 ? 0 : this.roundToTwoDecimals(text.length / wordCount)
    };
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/freshtechbro/vibe-coder-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.