tokenCounter.ts
import { encoding_for_model, TiktokenModel } from '@dqbd/tiktoken';

/**
 * Estimates the number of tokens in a string using the specified model's tokenizer.
 *
 * @param text - The text to count tokens for
 * @param modelName - The model to use for tokenization ('gpt-4', 'gpt-3.5-turbo', etc.)
 * @returns The estimated number of tokens
 */
export function countTokens(text: string, modelName: TiktokenModel = 'gpt-4'): number {
  try {
    // Get the encoding for the specified model
    const enc = encoding_for_model(modelName);

    // Encode the text to tokens
    const tokens = enc.encode(text);

    // Clean up
    enc.free();

    // Return the token count
    return tokens.length;
  } catch (error) {
    console.error('Error counting tokens:', error);
    // Fall back to an approximate token count if the tokenizer fails
    return approximateTokenCount(text);
  }
}

/**
 * Provides a rough approximation of token count based on character count.
 * This is used as a fallback if the tokenizer fails.
 *
 * @param text - The text to estimate tokens for
 * @returns The approximate token count
 */
function approximateTokenCount(text: string): number {
  // A very rough approximation: ~4 characters per token for English text.
  // This will be less accurate for non-English text.
  return Math.ceil(text.length / 4);
}

/**
 * Represents a document extracted from XML
 */
export interface XmlDocument {
  path: string;
  content: string;
  tokenCount: number;
}

/**
 * Analyzes and estimates token count for an XML document with a specific structure.
 *
 * @param xmlContent - The XML content in Anthropic's format
 * @param modelName - The model to use for tokenization
 * @returns Token count and additional statistics
 */
export function analyzeXmlTokens(
  xmlContent: string,
  modelName: TiktokenModel = 'gpt-4'
): {
  totalTokens: number;
  documentCount: number;
  averageTokensPerDocument: number;
  documents: XmlDocument[];
} {
  // Count total tokens
  const totalTokens = countTokens(xmlContent, modelName);

  // Extract documents
  const documents: XmlDocument[] = [];
  const regex = /<document(?:\s+index="\d+"|\s+path="([^"]*)")+[^>]*>([\s\S]*?)<\/document>/g;
  let match;

  while ((match = regex.exec(xmlContent)) !== null) {
    const path = match[1] || '';
    const content = match[2];
    const tokenCount = countTokens(content, modelName);
    documents.push({ path, content, tokenCount });
  }

  const documentCount = documents.length;

  // Calculate average tokens per document
  const averageTokensPerDocument =
    documentCount > 0 ? Math.round(totalTokens / documentCount) : 0;

  return { totalTokens, documentCount, averageTokensPerDocument, documents };
}

/**
 * Splits XML content into multiple batches to avoid hitting token rate limits.
 *
 * @param xmlContent - The XML content to split
 * @param tokenLimit - Maximum tokens per batch
 * @param modelName - The model used for tokenization
 * @returns Array of XML content batches
 */
export function splitXmlIntoBatches(
  xmlContent: string,
  tokenLimit: number = 25000, // Conservative limit to stay under TPM
  modelName: TiktokenModel = 'gpt-4'
): string[] {
  const analysis = analyzeXmlTokens(xmlContent, modelName);

  // If under the limit, return as a single batch
  if (analysis.totalTokens <= tokenLimit) {
    return [xmlContent];
  }

  // We need to split by documents
  const batches: string[] = [];
  let currentBatch = '<documents>\n';
  let currentBatchTokens = countTokens(currentBatch, modelName);
  const closingTagTokens = countTokens('</documents>', modelName);
  const batchHeaderTokens = countTokens(
    '<documents>\n<!-- CONTINUED FROM PREVIOUS BATCH -->\n',
    modelName
  );

  // For each document in the XML
  for (const doc of analysis.documents) {
    // Build the document's XML representation
    const docXml = `<document path="${doc.path}">\n${doc.content}\n</document>\n`;
    const docTokens = countTokens(docXml, modelName);

    // If adding this document would exceed the batch limit, finalize the current batch
    if (currentBatchTokens + docTokens + closingTagTokens > tokenLimit) {
      currentBatch += '</documents>';
      batches.push(currentBatch);

      // Start a new batch
      currentBatch = '<documents>\n<!-- CONTINUED FROM PREVIOUS BATCH -->\n';
      currentBatchTokens = batchHeaderTokens;
    }

    // Add the document to the current batch
    currentBatch += docXml;
    currentBatchTokens += docTokens;
  }

  // Add the final batch if it is not empty
  if (currentBatch !== '<documents>\n') {
    currentBatch += '</documents>';
    batches.push(currentBatch);
  }

  return batches;
}
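For reference, a minimal usage sketch of the three exported functions. The './tokenCounter' import path and the sample <documents> XML are illustrative assumptions, and the tiny token limit is chosen only to force a visible split:

import { countTokens, analyzeXmlTokens, splitXmlIntoBatches } from './tokenCounter';

// Sample input in the <documents> wrapper format the extraction regex expects
const xml = [
  '<documents>',
  '<document index="1" path="src/a.ts">export const a = 1;</document>',
  '<document index="2" path="src/b.ts">export const b = 2;</document>',
  '</documents>',
].join('\n');

console.log(countTokens('hello world')); // token count under the default gpt-4 encoding

const stats = analyzeXmlTokens(xml);
console.log(stats.documentCount);            // 2
console.log(stats.averageTokensPerDocument); // total tokens divided by document count

// A deliberately small limit, so the content is split across multiple batches
const batches = splitXmlIntoBatches(xml, 50);
console.log(batches.length);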

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jalehman/mcp-sage'
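
The same lookup can be done from TypeScript; this is a sketch that logs the response body as-is, since its exact shape isn't documented on this page:

// Fetch this server's entry from the MCP directory API (Node 18+ or any fetch-capable runtime)
const res = await fetch('https://glama.ai/api/mcp/v1/servers/jalehman/mcp-sage');
if (!res.ok) throw new Error('Request failed: ' + res.status);
console.log(await res.json());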

If you have feedback or need assistance with the MCP directory API, please join our Discord server.