TokenEstimator.ts
/**
 * @file Token Estimator Service
 * @description Estimates token counts for messages to help with context budget management.
 *
 * Supports multiple tokenization strategies:
 * 1. tiktoken (OpenAI's tokenizer) - Most accurate for GPT models
 * 2. Heuristic estimation - Fast fallback based on character/word count
 *
 * @example
 * ```typescript
 * const estimator = new TokenEstimator();
 * const tokens = await estimator.estimate("Hello, world!");
 * // Returns ~3 tokens
 *
 * const multiTokens = await estimator.estimateMessages([
 *   { role: 'user', content: 'Hello' },
 *   { role: 'assistant', content: 'Hi there!' }
 * ]);
 * // Returns estimated tokens for entire message array
 * ```
 */

import { logger } from '../../logging/logger.js';

/**
 * Message structure for token estimation
 */
export interface TokenMessage {
  role: string;
  content: string;
  name?: string;
}

/**
 * Token estimation result with breakdown
 */
export interface TokenEstimationResult {
  totalTokens: number;
  breakdown?: {
    content: number;
    overhead: number; // Role markers, formatting, etc.
  };
  method: 'tiktoken' | 'heuristic';
}

/**
 * Configuration for token estimation
 */
export interface TokenEstimatorConfig {
  /**
   * Model name for tiktoken encoding selection
   * @default 'gpt-4'
   */
  model?: string;

  /**
   * Whether to use heuristic as primary method (faster but less accurate)
   * @default false
   */
  preferHeuristic?: boolean;

  /**
   * Characters per token ratio for heuristic estimation
   * @default 4 (English average is ~4 chars/token)
   */
  charsPerToken?: number;
}

/**
 * Token overhead per message (approximate)
 * - Each message has ~4 tokens overhead for role/formatting
 * - System messages have slightly more overhead
 */
const MESSAGE_OVERHEAD = {
  system: 5,
  user: 4,
  assistant: 4,
  tool: 5,
  function: 5,
};

/**
 * TokenEstimator class
 * Provides token estimation for text and chat messages
 */
export class TokenEstimator {
  private config: Required<TokenEstimatorConfig>;
  private tiktokenEncoder: { encode: (text: string) => number[]; free?: () => void } | null = null;
  private tiktokenAvailable = false;
  private initialized = false;

  constructor(config: TokenEstimatorConfig = {}) {
    this.config = {
      model: config.model ?? 'gpt-4',
      preferHeuristic: config.preferHeuristic ?? false,
      charsPerToken: config.charsPerToken ?? 4,
    };
  }

  /**
   * Initialize tiktoken encoder (lazy loading)
   */
  private async initTiktoken(): Promise<boolean> {
    if (this.initialized) {
      return this.tiktokenAvailable;
    }
    this.initialized = true;

    if (this.config.preferHeuristic) {
      logger.debug('TokenEstimator: Using heuristic method (preferred)');
      return false;
    }

    try {
      // Dynamic import of tiktoken (optional dependency)
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const tiktoken = await import('tiktoken') as any;

      // Get encoding for the model
      try {
        this.tiktokenEncoder = tiktoken.encoding_for_model(this.config.model);
      } catch {
        // Fallback to cl100k_base encoding (used by GPT-4, GPT-3.5-turbo)
        this.tiktokenEncoder = tiktoken.get_encoding('cl100k_base');
      }

      this.tiktokenAvailable = true;
      logger.debug('TokenEstimator: tiktoken initialized', { model: this.config.model });
      return true;
    } catch (error) {
      // tiktoken not installed or failed to load
      logger.debug('TokenEstimator: tiktoken not available, using heuristic', {
        error: error instanceof Error ? error.message : 'Unknown',
      });
      return false;
    }
  }
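  // Note on the optional dependency above: tiktoken is loaded with a dynamic
  // import, so this module also works when the package is absent. For exact
  // counts on GPT-family models, install it in the consuming project
  // (e.g. `npm install tiktoken`); otherwise every call falls back to the
  // heuristic estimator below.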
  /**
   * Estimate token count for a single text string
   *
   * @param text - The text to estimate tokens for
   * @returns Token count estimation
   */
  async estimate(text: string): Promise<number> {
    if (!text || text.length === 0) {
      return 0;
    }

    const hasTiktoken = await this.initTiktoken();

    if (hasTiktoken && this.tiktokenEncoder) {
      try {
        const tokens = this.tiktokenEncoder.encode(text);
        return tokens.length;
      } catch (error) {
        logger.warn('TokenEstimator: tiktoken encoding failed, falling back to heuristic', {
          error: error instanceof Error ? error.message : 'Unknown',
        });
      }
    }

    // Heuristic estimation
    return this.estimateHeuristic(text);
  }

  /**
   * Heuristic-based token estimation
   * Uses character count and word boundaries for estimation
   *
   * @param text - Text to estimate
   * @returns Estimated token count
   */
  private estimateHeuristic(text: string): number {
    if (!text || text.length === 0) {
      return 0;
    }

    // Base estimation: characters / chars_per_token
    const charBasedEstimate = Math.ceil(text.length / this.config.charsPerToken);

    // Word-based adjustment (some words are multiple tokens)
    const words = text.split(/\s+/).filter(w => w.length > 0);
    const wordCount = words.length;

    // Count words that are likely multi-token (long words, non-ASCII, punctuation-heavy)
    // eslint-disable-next-line no-control-regex
    const nonAsciiRegex = /[^\u0000-\u007F]/;
    const punctuationRegex = /[^a-zA-Z0-9\s]/;
    const complexWords = words.filter(w =>
      w.length > 10 ||
      nonAsciiRegex.test(w) || // Non-ASCII
      punctuationRegex.test(w) // Punctuation
    ).length;

    // Adjust estimate: average of char-based and word-based with complexity factor
    const wordBasedEstimate = wordCount + Math.ceil(complexWords * 0.5);

    // Take weighted average, favoring char-based for short text and word-based for long text
    const textLength = text.length;
    const weight = Math.min(textLength / 500, 1); // 0-1 based on text length
    const estimate = Math.ceil(
      charBasedEstimate * (1 - weight * 0.3) + wordBasedEstimate * (weight * 0.3)
    );

    return Math.max(1, estimate);
  }
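  /*
   * Illustrative worked example of the heuristic above (values computed from the
   * defaults, not taken from the original file): for text = "Hello, world!"
   * (13 characters) with charsPerToken = 4:
   *   charBasedEstimate = ceil(13 / 4)                = 4
   *   words             = ["Hello,", "world!"]        -> wordCount = 2
   *   complexWords      = 2 (both contain punctuation)
   *   wordBasedEstimate = 2 + ceil(2 * 0.5)           = 3
   *   weight            = min(13 / 500, 1)            = 0.026
   *   estimate          = ceil(4 * 0.992 + 3 * 0.008) = 4
   * Short inputs therefore lean almost entirely on the character-based count;
   * only texts approaching 500+ characters shift weight toward the word count.
   */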
  /**
   * Estimate token count for a chat message
   * Includes overhead for role markers and formatting
   *
   * @param message - Chat message to estimate
   * @returns Token estimation with breakdown
   */
  async estimateMessage(message: TokenMessage): Promise<TokenEstimationResult> {
    const contentTokens = await this.estimate(message.content);
    const overhead = MESSAGE_OVERHEAD[message.role as keyof typeof MESSAGE_OVERHEAD] || 4;

    // Add name overhead if present
    const nameOverhead = message.name
      ? Math.ceil(message.name.length / this.config.charsPerToken) + 1
      : 0;

    return {
      totalTokens: contentTokens + overhead + nameOverhead,
      breakdown: {
        content: contentTokens,
        overhead: overhead + nameOverhead,
      },
      method: this.tiktokenAvailable ? 'tiktoken' : 'heuristic',
    };
  }

  /**
   * Estimate total tokens for an array of chat messages
   *
   * @param messages - Array of chat messages
   * @returns Total token count
   */
  async estimateMessages(messages: TokenMessage[]): Promise<number> {
    if (!messages || messages.length === 0) {
      return 0;
    }

    let totalTokens = 0;

    // Base overhead for the message array (conversation structure)
    const conversationOverhead = 3;
    totalTokens += conversationOverhead;

    for (const message of messages) {
      const result = await this.estimateMessage(message);
      totalTokens += result.totalTokens;
    }

    return totalTokens;
  }

  /**
   * Estimate tokens for a messages array and return detailed breakdown
   *
   * @param messages - Array of chat messages
   * @returns Detailed token estimation
   */
  async estimateMessagesDetailed(messages: TokenMessage[]): Promise<{
    total: number;
    perMessage: Array<{ role: string; tokens: number }>;
    method: 'tiktoken' | 'heuristic';
  }> {
    const perMessage: Array<{ role: string; tokens: number }> = [];
    let total = 3; // Conversation overhead

    for (const message of messages) {
      const result = await this.estimateMessage(message);
      perMessage.push({
        role: message.role,
        tokens: result.totalTokens,
      });
      total += result.totalTokens;
    }

    return {
      total,
      perMessage,
      method: this.tiktokenAvailable ? 'tiktoken' : 'heuristic',
    };
  }

  /**
   * Check if a message array fits within a token budget
   *
   * @param messages - Messages to check
   * @param budget - Maximum token budget
   * @returns Whether messages fit and how many tokens remaining
   */
  async fitsInBudget(messages: TokenMessage[], budget: number): Promise<{
    fits: boolean;
    totalTokens: number;
    remaining: number;
  }> {
    const totalTokens = await this.estimateMessages(messages);

    return {
      fits: totalTokens <= budget,
      totalTokens,
      remaining: budget - totalTokens,
    };
  }

  /**
   * Truncate messages to fit within a token budget
   * Removes oldest messages first (keeps most recent)
   *
   * @param messages - Messages to truncate
   * @param budget - Maximum token budget
   * @param keepFirst - Number of messages to always keep from start (e.g., system prompt)
   * @returns Truncated message array
   */
  async truncateToFit(
    messages: TokenMessage[],
    budget: number,
    keepFirst: number = 1
  ): Promise<TokenMessage[]> {
    if (messages.length === 0) {
      return [];
    }

    const { fits, totalTokens } = await this.fitsInBudget(messages, budget);
    if (fits) {
      return messages;
    }

    // Keep first N messages (usually system prompt) and last message
    const protectedStart = messages.slice(0, keepFirst);
    const lastMessage = messages[messages.length - 1];
    const middleMessages = messages.slice(keepFirst, -1);

    let result = [...protectedStart];
    let currentTokens = await this.estimateMessages(protectedStart);
    const lastMessageTokens = await this.estimateMessages([lastMessage]);

    // Reserve space for last message
    const availableBudget = budget - lastMessageTokens;

    // Add messages from the end of middle section (most recent first)
    for (let i = middleMessages.length - 1; i >= 0; i--) {
      const messageTokens = (await this.estimateMessage(middleMessages[i])).totalTokens;
      if (currentTokens + messageTokens <= availableBudget) {
        result.splice(keepFirst, 0, middleMessages[i]);
        currentTokens += messageTokens;
      }
    }

    // Add last message
    result.push(lastMessage);

    logger.debug('TokenEstimator: Truncated messages to fit budget', {
      original: messages.length,
      truncated: result.length,
      originalTokens: totalTokens,
      finalTokens: await this.estimateMessages(result),
      budget,
    });

    return result;
  }
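  /*
   * Sketch of the truncation policy above, using a hypothetical conversation:
   * given [system, u1, a1, u2, a2, u3] with keepFirst = 1 and a tight budget,
   * the method always keeps `system` (the protected start) and `u3` (the last
   * message), then re-adds a2, u2, a1, u1 from newest to oldest while they
   * still fit, so the oldest middle messages are the first ones dropped.
   */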
  /**
   * Free tiktoken encoder resources
   */
  dispose(): void {
    if (this.tiktokenEncoder && typeof this.tiktokenEncoder.free === 'function') {
      this.tiktokenEncoder.free();
      this.tiktokenEncoder = null;
    }
  }
}

/**
 * Singleton instance with default configuration
 */
export const tokenEstimator = new TokenEstimator();

/**
 * Quick estimation helper (uses singleton)
 */
export async function estimateTokens(text: string): Promise<number> {
  return tokenEstimator.estimate(text);
}

/**
 * Synchronous quick estimation helper using heuristic method
 * Useful when async is not practical (e.g., in reduce callbacks)
 *
 * @param text - Text to estimate tokens for
 * @param _modelId - Model ID (optional, reserved for future use)
 * @returns Estimated token count
 */
export function estimateTokensSync(text: string, _modelId?: string): number {
  if (!text || text.length === 0) {
    return 0;
  }

  // Use simple heuristic: ~4 characters per token (average for English)
  const charsPerToken = 4;
  return Math.max(1, Math.ceil(text.length / charsPerToken));
}

/**
 * Quick message estimation helper (uses singleton)
 */
export async function estimateMessageTokens(messages: TokenMessage[]): Promise<number> {
  return tokenEstimator.estimateMessages(messages);
}
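A minimal usage sketch for budget checks and truncation. The import path is illustrative and depends on where the file lives in the consuming project, the 4000-token budget is an arbitrary example, and an ESM context with top-level await is assumed:

import { tokenEstimator, estimateTokensSync, type TokenMessage } from './TokenEstimator.js';

const messages: TokenMessage[] = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'user', content: 'Summarize the project README.' },
];

// Check whether the conversation fits the budget before sending it.
const { fits, totalTokens, remaining } = await tokenEstimator.fitsInBudget(messages, 4000);

if (!fits) {
  // Drop the oldest middle messages while protecting the system prompt and the last turn.
  const trimmed = await tokenEstimator.truncateToFit(messages, 4000, 1);
  console.log(`Trimmed from ${messages.length} to ${trimmed.length} messages`);
}

// Fast synchronous estimate for places where async is awkward (e.g. reduce callbacks).
const roughTokens = estimateTokensSync('Hello, world!'); // 4 with the ~4 chars/token rule

console.log({ totalTokens, remaining, roughTokens });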
