/**
* Performance tests for scaling behavior
* Tests linear O(n) complexity and throughput at varying sizes
*/
import { describe, it, expect, beforeEach, vi } from 'vitest'
import {
createEmbeddingMock,
createLanceDBMock,
BATCH_SIZE,
EMBEDDING_DIM
} from '../helpers/indexing-mocks.js'
import {
generateTestEmails,
generateTestMessages,
generateCalendarEvents
} from '../helpers/test-data-generators.js'
import {
measureTime,
calculateThroughput,
ThroughputTracker
} from '../helpers/performance-utils.js'
// Scaling-behavior suite: exercises the batched embedding path with mocks
// to verify roughly linear cost and sustained throughput as corpus size grows.
describe('Scaling Behavior', () => {
  // Mocked embedding function from createEmbeddingMock(); no real model load.
  let mockEmbedder
  // Mocked LanceDB handle; not referenced by the tests in this chunk —
  // presumably kept for parity with sibling suites. TODO confirm it is needed.
  let mockDb
  beforeEach(() => {
    // Reset mock call state so each test's timing/throughput is isolated.
    vi.clearAllMocks()
    const embedding = createEmbeddingMock()
    mockEmbedder = embedding.mockEmbedder
    mockDb = createLanceDBMock()
  })
describe('linear O(n) time complexity', () => {
it('should scale linearly with email count', async () => {
const sizes = [100, 200, 400]
const results = []
for (const size of sizes) {
const emails = generateTestEmails(size)
const texts = emails.map(e => e.content)
const { duration } = await measureTime(async () => {
// Simulate batched embedding
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
}
})
results.push({
size,
duration,
timePerItem: duration / size
})
}
console.table(results)
// Time per item should be relatively constant (within 3x tolerance)
const avgTimePerItem = results.reduce((s, r) => s + r.timePerItem, 0) / results.length
for (const result of results) {
expect(result.timePerItem).toBeLessThan(avgTimePerItem * 3)
}
})
it('should scale linearly with message count', async () => {
const sizes = [200, 400, 800]
const results = []
for (const size of sizes) {
const messages = generateTestMessages(size)
const texts = messages.map(m => m.text)
const { duration } = await measureTime(async () => {
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
}
})
results.push({
size,
duration,
timePerItem: duration / size
})
}
console.table(results)
// Verify linear scaling
const avgTimePerItem = results.reduce((s, r) => s + r.timePerItem, 0) / results.length
for (const result of results) {
expect(result.timePerItem).toBeLessThan(avgTimePerItem * 3)
}
})
it('should scale linearly with calendar event count', async () => {
const sizes = [150, 300, 600]
const results = []
for (const size of sizes) {
const events = generateCalendarEvents(size)
const texts = events.map(e => `${e.title} ${e.location} ${e.notes}`)
const { duration } = await measureTime(async () => {
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
}
})
results.push({
size,
duration,
timePerItem: duration / size
})
}
console.table(results)
const avgTimePerItem = results.reduce((s, r) => s + r.timePerItem, 0) / results.length
for (const result of results) {
expect(result.timePerItem).toBeLessThan(avgTimePerItem * 3)
}
})
})
describe('throughput at varying sizes', () => {
it('should maintain throughput - Small (350 items)', async () => {
const count = 350
const emails = generateTestEmails(count)
const texts = emails.map(e => e.content)
const tracker = new ThroughputTracker().start()
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
tracker.recordBatch(batch.length)
}
const summary = tracker.getSummary()
console.log(`Small (${count}): ${summary.overallThroughput.toFixed(0)} items/sec`)
// With mocked embeddings, should achieve good throughput
expect(summary.overallThroughput).toBeGreaterThan(50)
})
it('should maintain throughput - Medium (1700 items)', async () => {
const count = 1700
const emails = generateTestEmails(count)
const texts = emails.map(e => e.content)
const tracker = new ThroughputTracker().start()
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
tracker.recordBatch(batch.length)
}
const summary = tracker.getSummary()
console.log(`Medium (${count}): ${summary.overallThroughput.toFixed(0)} items/sec`)
expect(summary.overallThroughput).toBeGreaterThan(50)
})
it('should maintain throughput - Large (7500 items)', async () => {
const count = 7500
const emails = generateTestEmails(count)
const texts = emails.map(e => e.content)
const tracker = new ThroughputTracker().start()
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
tracker.recordBatch(batch.length)
}
const summary = tracker.getSummary()
console.log(`Large (${count}): ${summary.overallThroughput.toFixed(0)} items/sec`)
// Throughput should stay reasonable even at scale
expect(summary.overallThroughput).toBeGreaterThan(50)
})
})
describe('throughput degradation', () => {
it('should not degrade more than 50% as size increases 10x', async () => {
const smallCount = 100
const largeCount = 1000
// Small batch
const smallTexts = generateTestEmails(smallCount).map(e => e.content)
const { duration: smallDuration } = await measureTime(async () => {
for (let i = 0; i < smallTexts.length; i += BATCH_SIZE) {
const batch = smallTexts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
}
})
const smallThroughput = calculateThroughput(smallCount, smallDuration)
// Large batch
const largeTexts = generateTestEmails(largeCount).map(e => e.content)
const { duration: largeDuration } = await measureTime(async () => {
for (let i = 0; i < largeTexts.length; i += BATCH_SIZE) {
const batch = largeTexts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
}
})
const largeThroughput = calculateThroughput(largeCount, largeDuration)
console.log(`Small (${smallCount}): ${smallThroughput.toFixed(0)} items/sec`)
console.log(`Large (${largeCount}): ${largeThroughput.toFixed(0)} items/sec`)
console.log(`Degradation: ${((1 - largeThroughput / smallThroughput) * 100).toFixed(1)}%`)
// Throughput should not degrade by more than 50%
expect(largeThroughput).toBeGreaterThan(smallThroughput * 0.5)
})
})
describe('batch count scaling', () => {
it('should use correct number of batches for any size', () => {
const sizes = [1, 31, 32, 33, 64, 100, 1000]
for (const size of sizes) {
const expectedBatches = Math.ceil(size / BATCH_SIZE)
let actualBatches = 0
for (let i = 0; i < size; i += BATCH_SIZE) {
actualBatches++
}
expect(actualBatches).toBe(expectedBatches)
}
})
it('should handle edge case batch boundaries', () => {
// Exactly one batch
expect(Math.ceil(32 / BATCH_SIZE)).toBe(1)
// One more than batch size
expect(Math.ceil(33 / BATCH_SIZE)).toBe(2)
// Multiple full batches
expect(Math.ceil(96 / BATCH_SIZE)).toBe(3)
// Multiple with remainder
expect(Math.ceil(100 / BATCH_SIZE)).toBe(4)
})
})
describe('mixed source scaling', () => {
it('should handle combined email + message + calendar indexing', async () => {
const emailCount = 200
const messageCount = 300
const eventCount = 100
const totalCount = emailCount + messageCount + eventCount
const emails = generateTestEmails(emailCount)
const messages = generateTestMessages(messageCount)
const events = generateCalendarEvents(eventCount)
const allTexts = [
...emails.map(e => e.content),
...messages.map(m => m.text),
...events.map(e => `${e.title} ${e.location}`)
]
const tracker = new ThroughputTracker().start()
const { duration } = await measureTime(async () => {
for (let i = 0; i < allTexts.length; i += BATCH_SIZE) {
const batch = allTexts.slice(i, i + BATCH_SIZE)
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
tracker.recordBatch(batch.length)
}
})
const throughput = calculateThroughput(totalCount, duration)
console.log(`Mixed sources (${totalCount} total): ${throughput.toFixed(0)} items/sec`)
expect(throughput).toBeGreaterThan(50)
expect(tracker.getTotalItems()).toBe(totalCount)
})
})
describe('consistent batch throughput', () => {
it('should have consistent per-batch throughput', async () => {
const count = 320 // 10 batches
const texts = generateTestEmails(count).map(e => e.content)
const batchDurations = []
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE)
const { duration } = await measureTime(async () => {
await mockEmbedder(batch, { pooling: 'mean', normalize: true })
})
batchDurations.push(duration)
}
// Calculate variance in batch durations
const avgDuration = batchDurations.reduce((a, b) => a + b, 0) / batchDurations.length
const variance = batchDurations.reduce((sum, d) => sum + Math.pow(d - avgDuration, 2), 0) / batchDurations.length
const stdDev = Math.sqrt(variance)
const cv = stdDev / avgDuration // Coefficient of variation
console.log(`Avg batch duration: ${avgDuration.toFixed(2)}ms`)
console.log(`Std dev: ${stdDev.toFixed(2)}ms`)
console.log(`CV: ${(cv * 100).toFixed(1)}%`)
// Coefficient of variation should be reasonable (< 100%)
// Higher tolerance for mocked operations which can have timing jitter
expect(cv).toBeLessThan(2)
})
})
})