Skip to main content
Glama
batch-embedding.test.js8.59 kB
/**
 * Unit tests for batch embedding functionality.
 * Tests: embedBatch, embed, embedding pipeline behavior.
 *
 * All embedder calls are mocked via createEmbeddingMock(); the mock returns a
 * flat typed array of texts.length * EMBEDDING_DIM values, which tests slice
 * into per-text vectors exactly as the production embedBatch does.
 */
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'
import { createEmbeddingMock, EMBEDDING_DIM, BATCH_SIZE } from '../helpers/indexing-mocks.js'
import { generateSearchTexts } from '../helpers/test-data-generators.js'
import { measureTime, cosineSimilarity } from '../helpers/performance-utils.js'

describe('embedBatch', () => {
  let mockEmbedder

  beforeEach(() => {
    vi.clearAllMocks()
    // Only the embedder function itself is needed by these suites; the
    // previously-kept full mock object was never read.
    mockEmbedder = createEmbeddingMock().mockEmbedder
  })

  afterEach(() => {
    vi.restoreAllMocks()
  })

  /**
   * Local simulation of the production embedBatch contract so the
   * empty-input tests exercise a real code path instead of a tautology:
   * empty input short-circuits to [] without calling the embedder;
   * otherwise the flat result buffer is sliced into one
   * EMBEDDING_DIM-length vector per input text.
   *
   * @param {string[]} texts - texts to embed
   * @returns {Promise<number[][]>} one vector per input text
   */
  const embedBatch = async (texts) => {
    if (texts.length === 0) return []
    const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
    const embeddings = []
    for (let i = 0; i < texts.length; i++) {
      const start = i * EMBEDDING_DIM
      embeddings.push(Array.from(result.data.slice(start, start + EMBEDDING_DIM)))
    }
    return embeddings
  }

  describe('empty input handling', () => {
    it('should return empty array for empty input', async () => {
      // The previous version compared a literal [] to itself and could
      // never fail; this actually runs the empty-input path.
      const result = await embedBatch([])
      expect(result).toEqual([])
    })

    it('should not call embedder for empty input', async () => {
      await embedBatch([])
      expect(mockEmbedder).not.toHaveBeenCalled()
    })
  })

  describe('single text optimization', () => {
    it('should handle single text input', async () => {
      const texts = ['single text input']
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should return array with one vector for single text', async () => {
      const texts = ['test']
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      // Extract single vector from the flat buffer.
      const embeddings = []
      for (let i = 0; i < texts.length; i++) {
        const start = i * EMBEDDING_DIM
        const end = start + EMBEDDING_DIM
        embeddings.push(Array.from(result.data.slice(start, end)))
      }
      expect(embeddings.length).toBe(1)
      expect(embeddings[0].length).toBe(EMBEDDING_DIM)
    })
  })

  describe('vector dimensions', () => {
    it('should produce 384-dim vectors', async () => {
      const texts = ['test text']
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
      expect(EMBEDDING_DIM).toBe(384)
    })

    it('should maintain consistent dimension for all texts', async () => {
      const texts = ['short', 'medium length text', 'a'.repeat(1000)]
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(texts.length * EMBEDDING_DIM)
      // Extract each vector and verify dimension regardless of text length.
      for (let i = 0; i < texts.length; i++) {
        const start = i * EMBEDDING_DIM
        const end = start + EMBEDDING_DIM
        const vector = result.data.slice(start, end)
        expect(vector.length).toBe(EMBEDDING_DIM)
      }
    })
  })

  describe('batch processing', () => {
    it('should process multiple texts in single call', async () => {
      const texts = generateSearchTexts(10)
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      expect(mockEmbedder).toHaveBeenCalledTimes(1)
      expect(result.data.length).toBe(texts.length * EMBEDDING_DIM)
    })

    it('should correctly slice result data into individual vectors', async () => {
      const texts = ['text one', 'text two', 'text three']
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      const embeddings = []
      for (let i = 0; i < texts.length; i++) {
        const start = i * EMBEDDING_DIM
        const end = start + EMBEDDING_DIM
        embeddings.push(Array.from(result.data.slice(start, end)))
      }
      expect(embeddings.length).toBe(3)
      embeddings.forEach(vec => {
        expect(vec.length).toBe(EMBEDDING_DIM)
      })
    })

    it('should handle batch size of 32 (BATCH_SIZE)', async () => {
      const texts = generateSearchTexts(BATCH_SIZE)
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(BATCH_SIZE * EMBEDDING_DIM)
    })
  })

  describe('vector quality', () => {
    it('should produce non-zero vectors', async () => {
      const texts = ['meaningful text content']
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      const vector = Array.from(result.data.slice(0, EMBEDDING_DIM))
      const hasNonZero = vector.some(v => v !== 0)
      expect(hasNonZero).toBe(true)
    })

    it('should produce vectors with values in reasonable range', async () => {
      const texts = ['test text for embedding']
      const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
      const vector = Array.from(result.data.slice(0, EMBEDDING_DIM))
      // Normalized vectors should have values roughly in [-1, 1];
      // [-2, 2] allows slack for the mock's value generation.
      const allInRange = vector.every(v => v >= -2 && v <= 2)
      expect(allInRange).toBe(true)
    })
  })

  describe('deterministic output', () => {
    it('should produce identical vectors for identical text', async () => {
      const text = 'identical text input'
      const result1 = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      const result2 = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      const vec1 = Array.from(result1.data.slice(0, EMBEDDING_DIM))
      const vec2 = Array.from(result2.data.slice(0, EMBEDDING_DIM))
      expect(vec1).toEqual(vec2)
    })
  })
})

describe('embedding pipeline initialization', () => {
  it('should be lazy loaded (not initialized until first use)', () => {
    // NOTE(review): this only verifies the mock exposes a pipeline factory
    // function. True laziness (embeddingPipeline starting as null until the
    // first getEmbedder() call) must be asserted against the real module.
    const pipelineFunction = createEmbeddingMock().pipeline
    expect(pipelineFunction).toBeDefined()
    expect(typeof pipelineFunction).toBe('function')
  })

  it('should reuse pipeline instance after initialization', async () => {
    const { pipeline } = createEmbeddingMock()
    // Call pipeline multiple times; every call is recorded by the spy.
    await pipeline()
    await pipeline()
    await pipeline()
    expect(pipeline).toHaveBeenCalledTimes(3)
  })
})

describe('embedding performance characteristics', () => {
  it('should complete batch embedding within reasonable time', async () => {
    const { mockEmbedder } = createEmbeddingMock()
    const texts = generateSearchTexts(32)
    const { duration } = await measureTime(async () => {
      await mockEmbedder(texts, { pooling: 'mean', normalize: true })
    })
    // Mock should be very fast (< 100ms).
    expect(duration).toBeLessThan(100)
  })

  it('should scale linearly with batch size', async () => {
    const { mockEmbedder } = createEmbeddingMock()
    const { duration: duration16 } = await measureTime(async () => {
      await mockEmbedder(generateSearchTexts(16), { pooling: 'mean', normalize: true })
    })
    const { duration: duration32 } = await measureTime(async () => {
      await mockEmbedder(generateSearchTexts(32), { pooling: 'mean', normalize: true })
    })
    // Duration should roughly scale with batch size (allowing for overhead).
    // For mocked tests, both should be very fast.
    expect(duration16).toBeLessThan(50)
    expect(duration32).toBeLessThan(100)
  })
})

describe('vector similarity', () => {
  it('should support cosine similarity calculation', () => {
    const vec1 = new Array(EMBEDDING_DIM).fill(0.1)
    const vec2 = new Array(EMBEDDING_DIM).fill(0.1)
    const similarity = cosineSimilarity(vec1, vec2)
    // Identical vectors should have similarity = 1.
    expect(similarity).toBeCloseTo(1, 5)
  })

  it('should handle orthogonal vectors', () => {
    const vec1 = new Array(EMBEDDING_DIM).fill(0).map((_, i) => i % 2 === 0 ? 1 : 0)
    const vec2 = new Array(EMBEDDING_DIM).fill(0).map((_, i) => i % 2 === 1 ? 1 : 0)
    const similarity = cosineSimilarity(vec1, vec2)
    // Orthogonal vectors should have similarity = 0.
    expect(similarity).toBeCloseTo(0, 5)
  })

  it('should throw for mismatched dimensions', () => {
    const vec1 = new Array(384).fill(0.1)
    const vec2 = new Array(256).fill(0.1)
    expect(() => cosineSimilarity(vec1, vec2)).toThrow('Vectors must have the same length')
  })
})

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sfls1397/Apple-Tools-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.