/**
* Edge case tests for Unicode handling
* Tests CJK, emoji, RTL, and multi-byte characters
*/
import { describe, it, expect, beforeEach, vi } from 'vitest'
import {
createEmbeddingMock,
EMBEDDING_DIM
} from '../helpers/indexing-mocks.js'
describe('Unicode Handling', () => {
// Embedder mock shared by the tests in this suite; reassigned before each test.
let mockEmbedder
beforeEach(() => {
  // Reset vitest mock state first, then build a fresh embedder mock.
  vi.clearAllMocks()
  const { mockEmbedder: freshEmbedder } = createEmbeddingMock()
  mockEmbedder = freshEmbedder
})
// CJK input: embedder smoke tests plus basic JS string invariants.
describe('CJK characters', () => {
  // [test-name suffix, sample text] pairs fed to the mock embedder.
  const embeddingSamples = [
    ['Japanese text', '会議の予定について'],
    ['Chinese text', '关于预算的讨论'],
    ['Korean text', '프로젝트 회의 안건'],
    ['mixed CJK and English', 'Meeting about 日本 project with 한국 team']
  ]
  for (const [label, sample] of embeddingSamples) {
    it(`should handle ${label}`, async () => {
      const { data } = await mockEmbedder([sample], { pooling: 'mean', normalize: true })
      expect(data.length).toBe(EMBEDDING_DIM)
    })
  }
  it('should preserve CJK in subject', () => {
    const subjectLine = 'Re: 会議の件'
    expect(subjectLine.includes('会議')).toBe(true)
    // "Re: " contributes 4 code units and "会議の件" another 4.
    expect(subjectLine.length).toBe(8)
  })
  it('should count CJK characters correctly', () => {
    const sample = '日本語テスト'
    // For this sample the UTF-16 length and the code-point count agree.
    expect(sample.length).toBe(6)
    expect(Array.from(sample).length).toBe(6)
  })
})
// Emoji input: single emoji, ZWJ sequences, flags, and skin-tone modifiers.
describe('emoji handling', () => {
  // Runs the mock embedder on one string and checks that result.data has
  // EMBEDDING_DIM entries.
  const expectEmbedding = async (text) => {
    const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
    expect(result.data.length).toBe(EMBEDDING_DIM)
  }
  it('should handle basic emoji', async () => {
    await expectEmbedding('Great meeting! 😀')
  })
  it('should handle multiple emoji', async () => {
    await expectEmbedding('🎉 Celebration 🎂 Birthday 🎁 Gift')
  })
  it('should handle emoji sequences (ZWJ)', async () => {
    // Family emoji: man + ZWJ + woman + ZWJ + girl.
    // The joiners are written as \u200D escapes because a literal ZWJ is
    // invisible in source and easy to lose in editors or tooling, which would
    // silently turn this input into three unrelated emoji.
    const text = 'Family event 👨\u200D👩\u200D👧'
    expect(text.includes('\u200D')).toBe(true)
    await expectEmbedding(text)
  })
  it('should handle flag emoji', async () => {
    await expectEmbedding('International meeting 🇺🇸 🇯🇵 🇬🇧')
  })
  it('should preserve emoji in subject', () => {
    const subject = '✅ Task Complete'
    expect(subject.includes('✅')).toBe(true)
  })
  it('should handle skin tone modifiers', async () => {
    await expectEmbedding('Meeting 👋🏻 👋🏽 👋🏿')
  })
})
// Right-to-left scripts: embedder smoke tests plus a character-order check.
describe('RTL text (Hebrew, Arabic)', () => {
  // Test-name suffix -> sample text passed to the mock embedder.
  const embeddingSamples = {
    'Hebrew text': 'פגישה על הפרויקט',
    'Arabic text': 'اجتماع حول المشروع',
    'mixed RTL and LTR': 'Meeting with שלום about project'
  }
  Object.entries(embeddingSamples).forEach(([label, sample]) => {
    it(`should handle ${label}`, async () => {
      const { data } = await mockEmbedder([sample], { pooling: 'mean', normalize: true })
      expect(data.length).toBe(EMBEDDING_DIM)
    })
  })
  it('should preserve RTL characters', () => {
    const greeting = 'שלום'
    expect(greeting.length).toBe(4)
    // Index 0 is the first character in logical (storage) order.
    expect(greeting.charAt(0)).toBe('ש')
  })
})
// Characters of different encoded widths, surrogate pairs, and combining marks.
describe('multi-byte characters', () => {
  // Runs the mock embedder on one string and checks that result.data has
  // EMBEDDING_DIM entries.
  const expectEmbedding = async (text) => {
    const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
    expect(result.data.length).toBe(EMBEDDING_DIM)
  }
  it('should handle 2-byte UTF-8 (Latin Extended)', async () => {
    await expectEmbedding('Café résumé naïve')
  })
  it('should handle 3-byte UTF-8 (CJK)', async () => {
    await expectEmbedding('中文测试')
  })
  it('should handle 4-byte UTF-8 (emoji, rare chars)', async () => {
    await expectEmbedding('𝕳𝖊𝖑𝖑𝖔 🎵')
  })
  it('should correctly count surrogate pairs', () => {
    // 𝕳 is stored as a surrogate pair (2 UTF-16 code units)
    const text = '𝕳'
    // String#length counts UTF-16 code units
    expect(text.length).toBe(2)
    // String iteration (spread) yields code points, not graphemes; the two
    // counts only coincide here because this is a single-code-point character.
    expect([...text].length).toBe(1)
  })
  it('should handle combining characters', () => {
    // Both forms are written with escapes so the test does not depend on the
    // normalization form this source file happens to be saved in.
    const composed = '\u00E9' // é as a single precomposed code point
    const decomposed = 'e\u0301' // e + combining acute accent
    expect(composed.length).toBe(1)
    expect(decomposed.length).toBe(2)
    expect(composed.normalize('NFC')).toBe(composed)
    expect(decomposed.normalize('NFC')).toBe(composed)
  })
})
// Inputs that combine several scripts in one string or one email record.
describe('mixed scripts', () => {
  it('should handle English + Japanese', async () => {
    const sample = 'Project meeting 会議 with team メンバー'
    const { data } = await mockEmbedder([sample], { pooling: 'mean', normalize: true })
    expect(data.length).toBe(EMBEDDING_DIM)
  })
  it('should handle English + Chinese + Emoji', async () => {
    const sample = 'Budget 预算 approved ✅'
    const { data } = await mockEmbedder([sample], { pooling: 'mean', normalize: true })
    expect(data.length).toBe(EMBEDDING_DIM)
  })
  it('should handle multiple scripts in email', () => {
    const message = {
      from: 'José García <jose@example.com>',
      to: '田中太郎 <tanaka@example.jp>',
      subject: 'Re: 会议 Meeting ✅',
      body: 'Discussion about 项目 project with שלום'
    }
    // Every field must still be a plain string.
    for (const field of ['from', 'to', 'subject', 'body']) {
      expect(typeof message[field]).toBe('string')
    }
  })
})
// Invisible code points: ZWSP, ZWJ, ZWNJ, and the byte order mark.
describe('zero-width characters', () => {
  it('should handle zero-width space (U+200B)', () => {
    const withZwsp = 'Hello\u200BWorld'
    expect(withZwsp.includes('\u200B')).toBe(true)
    // 5 + 5 visible letters plus the one invisible code unit.
    expect(withZwsp.length).toBe(11)
  })
  it('should handle zero-width joiner (U+200D)', () => {
    // The joiner glues the three people into one family emoji sequence.
    const family = '👨\u200D👩\u200D👧'
    expect(family.includes('\u200D')).toBe(true)
  })
  it('should handle zero-width non-joiner (U+200C)', () => {
    const withZwnj = 'Test\u200Cword'
    expect(withZwnj.includes('\u200C')).toBe(true)
  })
  it('should optionally strip zero-width chars', () => {
    const cleaned = 'Hello\u200B\u200C\u200DWorld'.replace(/[\u200B\u200C\u200D]/g, '')
    expect(cleaned).toBe('HelloWorld')
  })
  it('should handle BOM (Byte Order Mark)', () => {
    // Only a BOM at the very start of the string is removed.
    const withoutBom = '\uFEFFHello'.replace(/^\uFEFF/, '')
    expect(withoutBom).toBe('Hello')
  })
})
// Unicode normalization (NFC) behavior used for consistent text comparison.
// Accented letters are written as escapes so that the expected values are
// precomposed regardless of the normalization form this file is saved in.
describe('normalization', () => {
  it('should normalize to NFC form', () => {
    const decomposed = 'e\u0301' // e + combining acute accent
    const normalized = decomposed.normalize('NFC')
    expect(normalized).toBe('\u00E9') // precomposed é
    expect(normalized.length).toBe(1)
  })
  it('should handle already normalized text', () => {
    const text = 'caf\u00E9' // already NFC
    const normalized = text.normalize('NFC')
    expect(normalized).toBe(text)
  })
  it('should normalize for consistent comparison', () => {
    const precomposed = 'caf\u00E9'
    const combining = 'cafe\u0301'
    // The raw strings differ, so the equality below is due to normalization.
    expect(precomposed).not.toBe(combining)
    expect(precomposed.normalize('NFC')).toBe(combining.normalize('NFC'))
  })
})
// Truncating strings without splitting surrogate pairs or grapheme clusters.
describe('truncation with unicode', () => {
  it('should not break surrogate pairs when truncating', () => {
    const text = '😀'.repeat(100) // Each emoji is 2 UTF-16 code units
    const maxLength = 50
    // Safe truncation: spread splits on code points, never inside a pair
    const chars = [...text]
    const truncated = chars.slice(0, maxLength).join('')
    // Should not end with half a surrogate pair
    expect(truncated).not.toMatch(/[\uD800-\uDBFF]$/)
    // Exactly maxLength whole emoji survive, each still 2 code units wide
    expect([...truncated].length).toBe(maxLength)
    expect(truncated.length).toBe(maxLength * 2)
  })
  it('should handle CJK truncation correctly', () => {
    const text = '会議会議会議会議会議' // 10 CJK chars
    const maxLength = 5
    const chars = [...text]
    const truncated = chars.slice(0, maxLength).join('')
    expect([...truncated].length).toBe(5)
  })
  it('should preserve complete graphemes', () => {
    // Family emoji is a single grapheme made of 5 code points:
    // man + ZWJ + woman + ZWJ + girl. The joiners are explicit \u200D escapes
    // because without them this is three separate graphemes per repeat.
    const family = '👨\u200D👩\u200D👧'
    const text = family.repeat(3)
    // Use Intl.Segmenter if available (Node 16+)
    if (typeof Intl.Segmenter !== 'undefined') {
      const segmenter = new Intl.Segmenter('en', { granularity: 'grapheme' })
      const graphemes = [...segmenter.segment(text)]
      expect(graphemes.length).toBe(3)
    } else {
      // Fallback: no grapheme segmentation available, so verify the
      // code-point structure instead (3 repeats x 5 code points).
      expect(text.includes('👨')).toBe(true)
      expect([...text].length).toBe(15)
    }
  })
})
// Building search text and indexed records from fields containing Unicode.
describe('search text with unicode', () => {
  it('should generate searchable text with unicode', async () => {
    const message = {
      subject: '会議 Meeting 📅',
      from: 'José García',
      body: 'Discussion about 项目'
    }
    // Subject, sender and body joined by single spaces.
    const combined = [message.subject, message.from, message.body].join(' ')
    const { data } = await mockEmbedder([combined], { pooling: 'mean', normalize: true })
    expect(data.length).toBe(EMBEDDING_DIM)
  })
  it('should maintain unicode in indexed fields', () => {
    const indexed = {
      subject: '日本語メール',
      fromEmail: 'tanaka@example.jp',
      body: 'テスト本文'
    }
    // The Japanese substrings must still be found in the stored fields.
    const { subject, body } = indexed
    expect(subject.includes('日本語')).toBe(true)
    expect(body.includes('テスト')).toBe(true)
  })
})
// Symbols, box-drawing characters, and C0 control characters.
describe('special unicode categories', () => {
  // [test-name suffix, sample text] pairs fed to the mock embedder.
  const embeddingSamples = [
    ['mathematical symbols', 'Formula: ∑∏∫∂√∞'],
    ['currency symbols', 'Budget: $100 €50 £30 ¥1000 ₹500']
  ]
  for (const [label, sample] of embeddingSamples) {
    it(`should handle ${label}`, async () => {
      const { data } = await mockEmbedder([sample], { pooling: 'mean', normalize: true })
      expect(data.length).toBe(EMBEDDING_DIM)
    })
  }
  it('should handle box drawing characters', () => {
    const box = '┌──────┐\n│ Box │\n└──────┘'
    expect(box.includes('┌')).toBe(true)
  })
  it('should handle control characters gracefully', () => {
    const raw = 'Text\x00with\x01control\x02chars'
    // Dropping every code unit in U+0000..U+001F leaves only the visible text.
    const visibleOnly = raw.replace(/[\x00-\x1F]/g, '')
    expect(visibleOnly).toBe('Textwithcontrolchars')
  })
})
})