HydraMCP

compress.ts•6.79 KiB

/** * compress — Distill a model response to fit a token budget. * * Uses a cheap/fast model to compress a verbose response while * preserving all actionable details (file paths, code, errors, * function names, commands). This is distillation, not summarization. * * Designed as a shared utility so other tools can reuse it. */ import { Provider, QueryResponse } from "../providers/provider.js"; import { logger } from "./logger.js"; // --------------------------------------------------------------------------- // Types // --------------------------------------------------------------------------- export interface CompressionResult { content: string; compressed: boolean; compressorModel?: string; compressorLatency?: number; originalTokens?: number; compressedTokens?: number; /** Original response content before compression (for escape hatch / quality comparison). */ rawContent?: string; } // --------------------------------------------------------------------------- // Model selection — pick cheapest/fastest available // --------------------------------------------------------------------------- const PREFERRED_COMPRESSOR_MODELS = [ "gemini-2.5-flash-lite", // Cheapest, fastest "gemini-2.5-flash", // Fast, cheap, 1M context "gemini-3-flash", // Flash-class "claude-3-5-haiku", // Fast Claude "claude-haiku-4-5", // Newer Haiku "gpt-5-codex-mini", // Small, fast GPT "gpt-5.1-codex-mini", // Small, fast GPT variant ]; export async function pickCompressorModel( provider: Provider, excludeModel?: string ): Promise<string | null> { try { const available = await provider.listModels(); if (available.length === 0) return null; // Build exclusion set (don't compress with the same model that generated) const excludeLower = excludeModel?.toLowerCase(); const excludeBase = excludeModel?.split("/").pop()?.toLowerCase(); function isExcluded(id: string): boolean { if (!excludeLower) return false; const lower = id.toLowerCase(); return lower === excludeLower || lower === excludeBase; } // Try preferred models in priority order for (const preferred of PREFERRED_COMPRESSOR_MODELS) { const match = available.find( (m) => m.id.toLowerCase().includes(preferred) && !isExcluded(m.id) ); if (match) return match.id; } // Fallback: any model that isn't the worker const fallback = available.find((m) => !isExcluded(m.id)); if (fallback) return fallback.id; // Last resort: even the same model return available[0].id; } catch { return null; } } // --------------------------------------------------------------------------- // Token estimation & skip logic // --------------------------------------------------------------------------- /** * Determine if compression is needed. Skips if the response is within * 20% of the budget — not worth the latency for marginal savings. */ export function shouldCompress( response: QueryResponse, maxResponseTokens: number ): boolean { const threshold = maxResponseTokens * 1.2; // Use actual token count if available (more accurate) if (response.usage?.completion_tokens) { return response.usage.completion_tokens > threshold; } // Fallback: estimate from character count (industry standard ~4 chars/token) const estimatedTokens = Math.ceil(response.content.length / 4); return estimatedTokens > threshold; } // --------------------------------------------------------------------------- // Compression prompt // --------------------------------------------------------------------------- const COMPRESSION_SYSTEM_PROMPT = "You are a response distiller. You compress AI model responses to fit a token budget " + "while preserving ALL actionable content. You never lose specifics."; function buildCompressionPrompt( content: string, budgetTokens: number ): string { return `Distill the following AI response into approximately ${budgetTokens} tokens. RULES: - PRESERVE: file paths, function/class/variable names, error messages, code blocks, URLs, commands, version numbers, configuration values, numeric data, step-by-step instructions - PRESERVE: the response's conclusions, recommendations, and direct answers - REMOVE: filler phrases ("Sure, I'd be happy to help", "Let me explain"), hedging ("it might be", "generally speaking"), unnecessary repetition, verbose transitions - FORMAT: Keep markdown structure (headers, lists, code blocks). Use dense bullet points instead of paragraphs where possible - If the response is mostly code, keep the code intact and compress only the surrounding explanation - Do NOT add your own commentary, preamble, or meta-statements about the compression - Output ONLY the distilled content Response to distill: ${content}`; } // --------------------------------------------------------------------------- // Main compression function // --------------------------------------------------------------------------- export async function compressResponse( provider: Provider, response: QueryResponse, maxResponseTokens: number ): Promise<CompressionResult> { // Check if compression is needed if (!shouldCompress(response, maxResponseTokens)) { return { content: response.content, compressed: false }; } // Pick a compressor model (cheap/fast, not the worker model) const compressorModel = await pickCompressorModel( provider, response.model ); if (!compressorModel) { logger.warn( "compress: no compressor model available, returning raw response" ); return { content: response.content, compressed: false }; } const originalTokens = response.usage?.completion_tokens ?? Math.ceil(response.content.length / 4); logger.info( `compress: distilling ~${originalTokens} tokens to ~${maxResponseTokens} using ${compressorModel}` ); try { const compressResult = await provider.query( compressorModel, buildCompressionPrompt(response.content, maxResponseTokens), { system_prompt: COMPRESSION_SYSTEM_PROMPT, temperature: 0, max_tokens: maxResponseTokens, } ); const compressedTokens = compressResult.usage?.completion_tokens ?? Math.ceil(compressResult.content.length / 4); logger.info( `compress: ${originalTokens} → ${compressedTokens} tokens (${Math.round((1 - compressedTokens / originalTokens) * 100)}% reduction)` ); return { content: compressResult.content, compressed: true, compressorModel, compressorLatency: compressResult.latency_ms, originalTokens, compressedTokens, rawContent: response.content, }; } catch (err) { logger.warn( `compress: compression failed, returning raw response: ${err instanceof Error ? err.message : String(err)}` ); return { content: response.content, compressed: false }; } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Pickle-Pixel/HydraMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

compress.ts•6.79 KiB