/**
* Unit tests for batch embedding functionality
* Tests: embedBatch, embed, embedding pipeline behavior
*/
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'
import {
createEmbeddingMock,
EMBEDDING_DIM,
BATCH_SIZE
} from '../helpers/indexing-mocks.js'
import { generateSearchTexts } from '../helpers/test-data-generators.js'
import { measureTime, cosineSimilarity } from '../helpers/performance-utils.js'
describe('embedBatch', () => {
let embeddingMock
let mockEmbedder
beforeEach(() => {
vi.clearAllMocks()
const mock = createEmbeddingMock()
embeddingMock = mock
mockEmbedder = mock.mockEmbedder
})
afterEach(() => {
vi.restoreAllMocks()
})
describe('empty input handling', () => {
it('should return empty array for empty input', async () => {
const texts = []
// Simulate embedBatch behavior
if (texts.length === 0) {
const result = []
expect(result).toEqual([])
}
})
it('should not call embedder for empty input', async () => {
const texts = []
// When texts.length === 0, embedder should not be called
if (texts.length > 0) {
await mockEmbedder(texts, { pooling: 'mean', normalize: true })
}
expect(mockEmbedder).not.toHaveBeenCalled()
})
})
describe('single text optimization', () => {
it('should handle single text input', async () => {
const texts = ['single text input']
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
expect(result.data.length).toBe(EMBEDDING_DIM)
})
it('should return array with one vector for single text', async () => {
const texts = ['test']
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
// Extract single vector
const embeddings = []
for (let i = 0; i < texts.length; i++) {
const start = i * EMBEDDING_DIM
const end = start + EMBEDDING_DIM
embeddings.push(Array.from(result.data.slice(start, end)))
}
expect(embeddings.length).toBe(1)
expect(embeddings[0].length).toBe(EMBEDDING_DIM)
})
})
describe('vector dimensions', () => {
it('should produce 384-dim vectors', async () => {
const texts = ['test text']
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
expect(result.data.length).toBe(EMBEDDING_DIM)
expect(EMBEDDING_DIM).toBe(384)
})
it('should maintain consistent dimension for all texts', async () => {
const texts = ['short', 'medium length text', 'a'.repeat(1000)]
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
expect(result.data.length).toBe(texts.length * EMBEDDING_DIM)
// Extract each vector and verify dimension
for (let i = 0; i < texts.length; i++) {
const start = i * EMBEDDING_DIM
const end = start + EMBEDDING_DIM
const vector = result.data.slice(start, end)
expect(vector.length).toBe(EMBEDDING_DIM)
}
})
})
describe('batch processing', () => {
it('should process multiple texts in single call', async () => {
const texts = generateSearchTexts(10)
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
expect(mockEmbedder).toHaveBeenCalledTimes(1)
expect(result.data.length).toBe(texts.length * EMBEDDING_DIM)
})
it('should correctly slice result data into individual vectors', async () => {
const texts = ['text one', 'text two', 'text three']
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
const embeddings = []
for (let i = 0; i < texts.length; i++) {
const start = i * EMBEDDING_DIM
const end = start + EMBEDDING_DIM
embeddings.push(Array.from(result.data.slice(start, end)))
}
expect(embeddings.length).toBe(3)
embeddings.forEach(vec => {
expect(vec.length).toBe(EMBEDDING_DIM)
})
})
it('should handle batch size of 32 (BATCH_SIZE)', async () => {
const texts = generateSearchTexts(BATCH_SIZE)
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
expect(result.data.length).toBe(BATCH_SIZE * EMBEDDING_DIM)
})
})
describe('vector quality', () => {
it('should produce non-zero vectors', async () => {
const texts = ['meaningful text content']
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
const vector = Array.from(result.data.slice(0, EMBEDDING_DIM))
const hasNonZero = vector.some(v => v !== 0)
expect(hasNonZero).toBe(true)
})
it('should produce vectors with values in reasonable range', async () => {
const texts = ['test text for embedding']
const result = await mockEmbedder(texts, { pooling: 'mean', normalize: true })
const vector = Array.from(result.data.slice(0, EMBEDDING_DIM))
// Normalized vectors should have values roughly in [-1, 1]
const allInRange = vector.every(v => v >= -2 && v <= 2)
expect(allInRange).toBe(true)
})
})
describe('deterministic output', () => {
it('should produce identical vectors for identical text', async () => {
const text = 'identical text input'
const result1 = await mockEmbedder([text], { pooling: 'mean', normalize: true })
const result2 = await mockEmbedder([text], { pooling: 'mean', normalize: true })
const vec1 = Array.from(result1.data.slice(0, EMBEDDING_DIM))
const vec2 = Array.from(result2.data.slice(0, EMBEDDING_DIM))
expect(vec1).toEqual(vec2)
})
})
})
describe('embedding pipeline initialization', () => {
it('should be lazy loaded (not initialized until first use)', () => {
// The pipeline is initialized on first getEmbedder() call
// This is tested by checking that embeddingPipeline is null initially
const pipelineFunction = createEmbeddingMock().pipeline
expect(pipelineFunction).toBeDefined()
expect(typeof pipelineFunction).toBe('function')
})
it('should reuse pipeline instance after initialization', async () => {
const { pipeline, mockEmbedder } = createEmbeddingMock()
// Call pipeline multiple times
await pipeline()
await pipeline()
await pipeline()
// Pipeline function should be called, but embedder should be reused
expect(pipeline).toHaveBeenCalledTimes(3)
})
})
describe('embedding performance characteristics', () => {
it('should complete batch embedding within reasonable time', async () => {
const { mockEmbedder } = createEmbeddingMock()
const texts = generateSearchTexts(32)
const { duration } = await measureTime(async () => {
await mockEmbedder(texts, { pooling: 'mean', normalize: true })
})
// Mock should be very fast (< 100ms)
expect(duration).toBeLessThan(100)
})
it('should scale linearly with batch size', async () => {
const { mockEmbedder } = createEmbeddingMock()
const { duration: duration16 } = await measureTime(async () => {
await mockEmbedder(generateSearchTexts(16), { pooling: 'mean', normalize: true })
})
const { duration: duration32 } = await measureTime(async () => {
await mockEmbedder(generateSearchTexts(32), { pooling: 'mean', normalize: true })
})
// Duration should roughly scale with batch size (allowing for overhead)
// For mocked tests, both should be very fast
expect(duration16).toBeLessThan(50)
expect(duration32).toBeLessThan(100)
})
})
describe('vector similarity', () => {
it('should support cosine similarity calculation', () => {
const vec1 = new Array(EMBEDDING_DIM).fill(0.1)
const vec2 = new Array(EMBEDDING_DIM).fill(0.1)
const similarity = cosineSimilarity(vec1, vec2)
// Identical vectors should have similarity = 1
expect(similarity).toBeCloseTo(1, 5)
})
it('should handle orthogonal vectors', () => {
const vec1 = new Array(EMBEDDING_DIM).fill(0).map((_, i) => i % 2 === 0 ? 1 : 0)
const vec2 = new Array(EMBEDDING_DIM).fill(0).map((_, i) => i % 2 === 1 ? 1 : 0)
const similarity = cosineSimilarity(vec1, vec2)
// Orthogonal vectors should have similarity = 0
expect(similarity).toBeCloseTo(0, 5)
})
it('should throw for mismatched dimensions', () => {
const vec1 = new Array(384).fill(0.1)
const vec2 = new Array(256).fill(0.1)
expect(() => cosineSimilarity(vec1, vec2)).toThrow('Vectors must have the same length')
})
})