quantization.ts (7.38 kB)
/**
 * @fileOverview: Vector quantization utilities for efficient embedding storage
 * @module: Quantization
 * @keyFunctions:
 *   - quantizeFloat32ToInt8(): Convert float32 embeddings to int8 for storage
 *   - dequantizeInt8ToFloat32(): Convert int8 embeddings back to float32 for similarity search
 *   - calculateQuantizationError(): Calculate quantization error for quality assessment
 * @context: Provides 4-8x storage reduction while maintaining similarity search accuracy
 */

import { logger } from '../utils/logger';

/**
 * Quantization parameters for int8 conversion
 */
export interface QuantizationParams {
  min: number;
  max: number;
  scale: number;
  offset: number;
}

/**
 * Quantized embedding with metadata for dequantization
 */
export interface QuantizedEmbedding {
  data: Int8Array;
  params: QuantizationParams;
  originalDimensions: number;
}

/**
 * Convert float32 embedding vector to int8 for efficient storage
 * Uses symmetric quantization around zero for optimal similarity preservation
 */
export function quantizeFloat32ToInt8(embedding: number[]): QuantizedEmbedding {
  if (!embedding || embedding.length === 0) {
    throw new Error('Cannot quantize empty embedding');
  }

  // Find the absolute maximum value for symmetric quantization
  let absMax = 0;
  for (const value of embedding) {
    const abs = Math.abs(value);
    if (abs > absMax) {
      absMax = abs;
    }
  }

  // Handle edge case of all zeros
  if (absMax === 0) {
    return {
      data: new Int8Array(embedding.length),
      params: { min: 0, max: 0, scale: 1, offset: 0 },
      originalDimensions: embedding.length,
    };
  }

  // Calculate quantization parameters
  // Use symmetric range [-absMax, absMax] for better similarity preservation
  const scale = absMax / 127; // 127 to leave room for rounding

  const quantized = new Int8Array(embedding.length);
  for (let i = 0; i < embedding.length; i++) {
    // Quantize to int8 range [-128, 127]
    const quantizedValue = Math.round(embedding[i] / scale);
    // Clamp to int8 range
    quantized[i] = Math.max(-128, Math.min(127, quantizedValue));
  }

  const params: QuantizationParams = {
    min: -absMax,
    max: absMax,
    scale,
    offset: 0, // No offset needed for symmetric quantization
  };

  logger.debug('🔢 Quantized embedding', {
    originalSize: embedding.length * 4, // 4 bytes per float32
    quantizedSize: quantized.length, // 1 byte per int8
    compressionRatio: ((embedding.length * 4) / quantized.length).toFixed(1),
    absMax,
    scale,
  });

  return {
    data: quantized,
    params,
    originalDimensions: embedding.length,
  };
}

/**
 * Convert quantized int8 embedding back to float32 for similarity calculations
 */
export function dequantizeInt8ToFloat32(quantized: QuantizedEmbedding): number[] {
  const dequantized = new Array(quantized.originalDimensions);
  for (let i = 0; i < quantized.originalDimensions; i++) {
    dequantized[i] = quantized.data[i] * quantized.params.scale;
  }
  return dequantized;
}

/**
 * Calculate quantization error to assess quality impact
 */
export function calculateQuantizationError(
  original: number[],
  quantized: QuantizedEmbedding
): {
  meanAbsoluteError: number;
  maxAbsoluteError: number;
  rootMeanSquareError: number;
  similarityPreservation: number;
} {
  if (original.length !== quantized.originalDimensions) {
    throw new Error('Dimension mismatch between original and quantized embedding');
  }

  const dequantized = dequantizeInt8ToFloat32(quantized);
  const errors: number[] = [];
  let sumSquaredError = 0;
  let maxError = 0;
  let sumAbsoluteError = 0;

  for (let i = 0; i < original.length; i++) {
    const error = Math.abs(original[i] - dequantized[i]);
    errors.push(error);
    sumAbsoluteError += error;
    sumSquaredError += error * error;
    if (error > maxError) {
      maxError = error;
    }
  }

  const meanAbsoluteError = sumAbsoluteError / original.length;
  const rootMeanSquareError = Math.sqrt(sumSquaredError / original.length);

  // Estimate similarity preservation using cosine similarity
  const originalNorm = Math.sqrt(original.reduce((sum, val) => sum + val * val, 0));
  const dequantizedNorm = Math.sqrt(dequantized.reduce((sum, val) => sum + val * val, 0));

  let dotProduct = 0;
  for (let i = 0; i < original.length; i++) {
    dotProduct += original[i] * dequantized[i];
  }

  const similarityPreservation =
    originalNorm === 0 || dequantizedNorm === 0
      ? 1.0
      : dotProduct / (originalNorm * dequantizedNorm);

  return {
    meanAbsoluteError,
    maxAbsoluteError: maxError,
    rootMeanSquareError,
    similarityPreservation,
  };
}

/**
 * Check if an embedding is quantized (int8) or raw (float32)
 */
export function isQuantized(
  embedding: number[] | QuantizedEmbedding
): embedding is QuantizedEmbedding {
  return 'data' in embedding && 'params' in embedding;
}

/**
 * Serialize quantized embedding for storage
 */
export function serializeQuantizedEmbedding(quantized: QuantizedEmbedding): Buffer {
  const jsonString = JSON.stringify({
    data: Array.from(quantized.data), // Convert Int8Array to regular array for JSON
    params: quantized.params,
    originalDimensions: quantized.originalDimensions,
  });
  return Buffer.from(jsonString, 'utf8');
}

/**
 * Deserialize quantized embedding from storage
 */
export function deserializeQuantizedEmbedding(buffer: Buffer): QuantizedEmbedding {
  const jsonString = buffer.toString('utf8');
  const parsed = JSON.parse(jsonString);
  return {
    data: new Int8Array(parsed.data),
    params: parsed.params,
    originalDimensions: parsed.originalDimensions,
  };
}

/**
 * Convert any embedding (quantized or float32) to float32 for similarity search
 */
export function normalizeToFloat32(embedding: number[] | QuantizedEmbedding): number[] {
  if (isQuantized(embedding)) {
    return dequantizeInt8ToFloat32(embedding);
  }
  return embedding;
}

/**
 * Get storage size in bytes for an embedding
 */
export function getEmbeddingStorageSize(embedding: number[] | QuantizedEmbedding): number {
  if (isQuantized(embedding)) {
    // JSON overhead + int8 data + metadata
    return JSON.stringify(embedding).length;
  } else {
    // JSON overhead + float32 data (4 bytes per float)
    return JSON.stringify(embedding).length;
  }
}

/**
 * Estimate storage savings from quantization
 */
export function estimateQuantizationSavings(
  float32Embeddings: number[][],
  quantizedEmbeddings: QuantizedEmbedding[]
): {
  originalSize: number;
  quantizedSize: number;
  compressionRatio: number;
  totalSavings: number;
  percentageSaved: number;
} {
  if (float32Embeddings.length !== quantizedEmbeddings.length) {
    throw new Error('Embedding count mismatch');
  }

  let originalSize = 0;
  let quantizedSize = 0;

  for (let i = 0; i < float32Embeddings.length; i++) {
    const original = float32Embeddings[i];
    const quantized = quantizedEmbeddings[i];
    originalSize += JSON.stringify(original).length;
    quantizedSize += JSON.stringify(quantized).length;
  }

  const compressionRatio = originalSize / quantizedSize;
  const totalSavings = originalSize - quantizedSize;
  const percentageSaved = (totalSavings / originalSize) * 100;

  return {
    originalSize,
    quantizedSize,
    compressionRatio,
    totalSavings,
    percentageSaved,
  };
}
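For reference, a minimal usage sketch of the exports above. The import path, the embedding dimension, and the random test vector are illustrative assumptions, not part of the module itself:

// Sketch only: import path and 1536-dimension random vector are assumed for illustration.
import {
  quantizeFloat32ToInt8,
  dequantizeInt8ToFloat32,
  calculateQuantizationError,
  serializeQuantizedEmbedding,
  deserializeQuantizedEmbedding,
} from './quantization';

// Hypothetical embedding vector (e.g. 1536 dimensions from an embedding model)
const embedding: number[] = Array.from({ length: 1536 }, () => Math.random() * 2 - 1);

// Quantize float32 -> int8 (roughly 4x smaller in raw bytes)
const quantized = quantizeFloat32ToInt8(embedding);

// Assess the accuracy cost of the round trip
const error = calculateQuantizationError(embedding, quantized);
console.log(error.similarityPreservation); // expected to stay close to 1.0

// Persist and restore via the Buffer-based JSON serialization helpers
const buffer = serializeQuantizedEmbedding(quantized);
const restored = deserializeQuantizedEmbedding(buffer);

// Dequantize back to float32 before running similarity search
const vector = dequantizeInt8ToFloat32(restored);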


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sbarron/AmbianceMCP'
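
The same request from TypeScript, as a minimal sketch: it assumes a runtime with a global fetch (e.g. Node 18+) and that the endpoint returns JSON; the response shape is not documented here.

// Sketch: fetch this server's entry from the MCP directory API (JSON response assumed)
const url = 'https://glama.ai/api/mcp/v1/servers/sbarron/AmbianceMCP';

const response = await fetch(url);
if (!response.ok) {
  throw new Error(`Request failed: ${response.status}`);
}
const server = await response.json();
console.log(server);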

If you have feedback or need assistance with the MCP directory API, please join our Discord server.