/**
* TokenEstimator - Realistic token counting for LLM content
*
 * Approximates tokenizer behavior with heuristics modeled on GPT-style
 * byte-pair encoding. Closer than a flat characters-per-token divide,
 * but still an estimate: use a real tokenizer when exact counts matter.
*/
export class TokenEstimator {
/**
* Estimate token count using realistic tokenization patterns
* Based on GPT-style tokenization rules and observed patterns
*
* @param {string} text - Text to analyze
* @returns {number} Estimated token count
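   * @example
   * // Heuristic estimate only; a GPT-style tokenizer typically gives 4 here
   * TokenEstimator.estimateTokens('Hello, world!'); // => 5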
*/
static estimateTokens(text) {
if (!text) return 0;
let tokens = 0;
    // Split on whitespace only so that mixed segments (URLs, emails,
    // punctuated words) reach the type checks below intact; the capture
    // group keeps whitespace runs in the result
    const words = text.split(/(\s+)/);
for (const word of words) {
if (!word) continue;
// Whitespace is often merged with adjacent tokens
if (/^\s+$/.test(word)) {
continue; // Don't count pure whitespace as tokens
}
// Single punctuation marks are usually 1 token
if (/^[^\w\s]$/.test(word)) {
tokens += 1;
continue;
}
// Handle different word types
if (/^\w+$/.test(word)) {
// Regular words: estimate based on length and common patterns
if (word.length <= 3) {
tokens += 1; // Short words: 1 token
} else if (word.length <= 6) {
tokens += 1; // Medium words: usually 1 token
} else if (word.length <= 10) {
tokens += Math.ceil(word.length / 5); // Longer words: ~5 chars per token
} else {
// Very long words (often technical terms): ~4 chars per token
tokens += Math.ceil(word.length / 4);
}
} else {
// Mixed content (URLs, emails, code, etc.)
// These are often tokenized more aggressively
if (word.includes('://') || word.includes('@')) {
// URLs and emails: roughly 3-4 chars per token
tokens += Math.ceil(word.length / 3.5);
        } else if (/[A-Z]{2,}/.test(word) || /\d+/.test(word)) {
          // Segments containing acronyms or digits (e.g. "GPT-4", "v2.0"):
          // often 2-3 chars per token
          tokens += Math.ceil(word.length / 2.5);
} else {
// Other mixed content: 4 chars per token
tokens += Math.ceil(word.length / 4);
}
}
}
// Account for special sequences that are tokenized differently
// Code blocks, markdown, JSON, etc. tend to have more tokens
const specialPatterns = [
/```[\s\S]*?```/g, // Code blocks
/`[^`]+`/g, // Inline code
/\[[^\]]*\]\([^)]*\)/g, // Markdown links
/\*\*[^*]+\*\*/g, // Bold text
/\*[^*]+\*/g, // Italic text
/{[^}]*}/g, // JSON-like structures
/\([^)]*\)/g, // Parenthetical content
];
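    // Note: patterns can overlap (e.g. bold text also matches the italic
    // pattern), which biases specialTokens upward; acceptable here because
    // specialTokens only gates which estimate is used below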
let specialTokens = 0;
for (const pattern of specialPatterns) {
const matches = text.match(pattern);
if (matches) {
for (const match of matches) {
// Special content has higher token density
specialTokens += Math.ceil(match.length / 3);
}
}
}
    // Decide between the two estimates. specialTokens relative to text
    // length is a rough density signal: when special patterns cover a
    // meaningful share of the text, fall back to a character-based estimate
    // (~3.2 chars per token, typical of dense code and markup) and keep
    // whichever count is higher
    const wordBasedEstimate = tokens;
    const specialContentRatio = specialTokens / Math.max(1, text.length);
    if (specialContentRatio > 0.1) {
      // Heavily formatted content: character-density estimate, floored by
      // the word-based count
      tokens = Math.max(wordBasedEstimate, Math.ceil(text.length / 3.2));
    } else {
      // Mostly plain text: the word-based estimate is more reliable
      tokens = wordBasedEstimate;
    }
    // Add a flat ~8% buffer for control tokens, message framing, and other
    // formatting overhead
    tokens = Math.ceil(tokens * 1.08);
return tokens;
}
/**
   * Get the average characters per token for a specific text.
   * Useful for chunking operations.
   *
   * @param {string} text - Text to analyze
   * @returns {number} Average characters per token (falls back to 4)
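   * @example
   * // Plain prose trends toward ~4 chars/token; dense code runs lower
   * TokenEstimator.getAvgCharsPerToken('The quick brown fox'); // => 3.8 (19 chars / 5 estimated tokens)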
*/
static getAvgCharsPerToken(text) {
    if (!text) return 4; // Fallback: ~4 chars/token is a typical English average
const tokens = this.estimateTokens(text);
return tokens > 0 ? text.length / tokens : 4;
}
/**
   * Check whether text exceeds a token limit.
   *
   * @param {string} text - Text to check
   * @param {number} maxTokens - Token budget to compare against
   * @returns {boolean} True if the estimated token count exceeds maxTokens
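   * @example
   * // Estimates are approximate; leave headroom near hard model limits
   * TokenEstimator.exceedsLimit('short prompt', 10); // => false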
*/
static exceedsLimit(text, maxTokens) {
return this.estimateTokens(text) > maxTokens;
}
/**
   * Estimate how many characters would fit within a token budget
   * for a given text style.
   *
   * @param {string} sampleText - Representative sample of the target text
   * @param {number} targetTokens - Token budget
   * @returns {number} Approximate character count that fits the budget
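   * @example
   * // Size a chunk from a representative sample; sampleDoc is a placeholder
   * const chunkSize = TokenEstimator.estimateCharsForTokens(sampleDoc, 500);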
*/
static estimateCharsForTokens(sampleText, targetTokens) {
const avgCharsPerToken = this.getAvgCharsPerToken(sampleText);
return Math.floor(targetTokens * avgCharsPerToken);
}
}
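
// Usage sketch (illustrative): trim input to a token budget before sending.
// Estimates are heuristic, so leave headroom against hard model limits.
// The 8000-token budget and the buildPrompt helper are assumptions, not
// part of this module.
//
//   let prompt = buildPrompt(userInput);
//   if (TokenEstimator.exceedsLimit(prompt, 8000)) {
//     const budget = TokenEstimator.estimateCharsForTokens(prompt, 8000);
//     prompt = prompt.slice(0, budget); // naive cut; prefer sentence boundaries
//   }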