Skip to main content
Glama
unicode-handling.test.js (12 kB)
/**
 * Edge case tests for Unicode handling
 * Tests CJK, emoji, RTL, and multi-byte characters
 *
 * Conventions used in this file:
 * - Characters whose exact encoding matters to an assertion (precomposed
 *   accents, zero-width joiners) are written as \u escapes so the tests do
 *   not depend on how an editor or tool normalizes the source file.
 * - `str.length` counts UTF-16 code units, `[...str].length` counts code
 *   points, and `Intl.Segmenter` counts user-perceived graphemes.
 */

import { describe, it, expect, beforeEach, vi } from 'vitest'
import { createEmbeddingMock, EMBEDDING_DIM } from '../helpers/indexing-mocks.js'

describe('Unicode Handling', () => {
  let mockEmbedder

  // Reset all mocks and build a fresh embedding mock before every test.
  beforeEach(() => {
    vi.clearAllMocks()
    const mock = createEmbeddingMock()
    mockEmbedder = mock.mockEmbedder
  })

  describe('CJK characters', () => {
    it('should handle Japanese text', async () => {
      const text = '会議の予定について'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle Chinese text', async () => {
      const text = '关于预算的讨论'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle Korean text', async () => {
      const text = '프로젝트 회의 안건'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle mixed CJK and English', async () => {
      const text = 'Meeting about 日本 project with 한국 team'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should preserve CJK in subject', () => {
      const subject = 'Re: 会議の件'
      expect(subject.includes('会議')).toBe(true)
      expect(subject.length).toBe(8) // "Re: " (4) + "会議の件" (4) = 8
    })

    it('should count CJK characters correctly', () => {
      const text = '日本語テスト'
      // These characters are all in the BMP: one UTF-16 code unit and one
      // code point each, so both counts agree.
      expect(text.length).toBe(6)
      expect([...text].length).toBe(6)
    })
  })

  describe('emoji handling', () => {
    it('should handle basic emoji', async () => {
      const text = 'Great meeting! 😀'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle multiple emoji', async () => {
      const text = '🎉 Celebration 🎂 Birthday 🎁 Gift'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle emoji sequences (ZWJ)', async () => {
      // Family emoji: man + ZWJ + woman + ZWJ + girl.
      // The joiners are written as \u200D so they stay visible in the source.
      const text = 'Family event 👨\u200D👩\u200D👧'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle flag emoji', async () => {
      const text = 'International meeting 🇺🇸 🇯🇵 🇬🇧'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should preserve emoji in subject', () => {
      const subject = '✅ Task Complete'
      expect(subject.includes('✅')).toBe(true)
    })

    it('should handle skin tone modifiers', async () => {
      const text = 'Meeting 👋🏻 👋🏽 👋🏿'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })
  })

  describe('RTL text (Hebrew, Arabic)', () => {
    it('should handle Hebrew text', async () => {
      const text = 'פגישה על הפרויקט'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle Arabic text', async () => {
      const text = 'اجتماع حول المشروع'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle mixed RTL and LTR', async () => {
      const text = 'Meeting with שלום about project'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should preserve RTL characters', () => {
      const text = 'שלום'
      expect(text.length).toBe(4)
      // Index 0 is the first character in logical (storage) order.
      expect(text[0]).toBe('ש')
    })
  })

  describe('multi-byte characters', () => {
    it('should handle 2-byte UTF-8 (Latin Extended)', async () => {
      const text = 'Café résumé naïve'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle 3-byte UTF-8 (CJK)', async () => {
      const text = '中文测试'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle 4-byte UTF-8 (emoji, rare chars)', async () => {
      const text = '𝕳𝖊𝖑𝖑𝖔 🎵'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should correctly count surrogate pairs', () => {
      // 𝕳 is a surrogate pair (2 UTF-16 code units)
      const text = '𝕳'
      // JS string length counts UTF-16 code units
      expect(text.length).toBe(2)
      // Spread iterates by code point (not grapheme), so the pair counts once
      expect([...text].length).toBe(1)
    })

    it('should handle combining characters', () => {
      // é can be a single code point or e + combining acute accent.
      const composed = '\u00E9' // Single precomposed code point
      const decomposed = 'e\u0301' // e + combining accent
      expect(composed.normalize('NFC')).toBe(composed)
      expect(decomposed.normalize('NFC')).toBe(composed)
    })
  })

  describe('mixed scripts', () => {
    it('should handle English + Japanese', async () => {
      const text = 'Project meeting 会議 with team メンバー'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle English + Chinese + Emoji', async () => {
      const text = 'Budget 预算 approved ✅'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle multiple scripts in email', () => {
      const email = {
        from: 'José García <jose@example.com>',
        to: '田中太郎 <tanaka@example.jp>',
        subject: 'Re: 会议 Meeting ✅',
        body: 'Discussion about 项目 project with שלום'
      }
      // All fields should be valid strings
      expect(typeof email.from).toBe('string')
      expect(typeof email.to).toBe('string')
      expect(typeof email.subject).toBe('string')
      expect(typeof email.body).toBe('string')
    })
  })

  describe('zero-width characters', () => {
    it('should handle zero-width space (U+200B)', () => {
      const text = 'Hello\u200BWorld'
      expect(text.includes('\u200B')).toBe(true)
      expect(text.length).toBe(11) // Includes invisible char
    })

    it('should handle zero-width joiner (U+200D)', () => {
      // Used in emoji sequences
      const emoji = '👨\u200D👩\u200D👧'
      expect(emoji.includes('\u200D')).toBe(true)
    })

    it('should handle zero-width non-joiner (U+200C)', () => {
      const text = 'Test\u200Cword'
      expect(text.includes('\u200C')).toBe(true)
    })

    it('should optionally strip zero-width chars', () => {
      const text = 'Hello\u200B\u200C\u200DWorld'
      const stripped = text.replace(/[\u200B\u200C\u200D]/g, '')
      expect(stripped).toBe('HelloWorld')
    })

    it('should handle BOM (Byte Order Mark)', () => {
      const textWithBOM = '\uFEFFHello'
      // Anchored at the start: only a leading BOM is removed.
      const stripped = textWithBOM.replace(/^\uFEFF/, '')
      expect(stripped).toBe('Hello')
    })
  })

  describe('normalization', () => {
    it('should normalize to NFC form', () => {
      const decomposed = 'e\u0301' // e + combining accent
      const normalized = decomposed.normalize('NFC')
      expect(normalized).toBe('\u00E9')
    })

    it('should handle already normalized text', () => {
      // Escape keeps the literal precomposed regardless of file encoding form.
      const text = 'caf\u00E9'
      const normalized = text.normalize('NFC')
      expect(normalized).toBe(text)
    })

    it('should normalize for consistent comparison', () => {
      const precomposed = 'caf\u00E9'
      const decomposed = 'cafe\u0301'
      // The raw strings differ; they only compare equal after normalization.
      expect(precomposed).not.toBe(decomposed)
      const text1 = precomposed.normalize('NFC')
      const text2 = decomposed.normalize('NFC')
      expect(text1).toBe(text2)
    })
  })

  describe('truncation with unicode', () => {
    it('should not break surrogate pairs when truncating', () => {
      const text = '😀'.repeat(100) // Each emoji is 2 UTF-16 code units
      const maxLength = 50
      // Safe truncation using spread (splits by code point, not code unit)
      const chars = [...text]
      const truncated = chars.slice(0, maxLength).join('')
      // Should not end with half a surrogate pair
      expect(truncated).not.toMatch(/[\uD800-\uDBFF]$/)
      // Exactly maxLength whole emoji survive, i.e. 2 code units each.
      expect([...truncated].length).toBe(maxLength)
      expect(truncated.length).toBe(maxLength * 2)
    })

    it('should handle CJK truncation correctly', () => {
      const text = '会議会議会議会議会議' // 10 CJK chars
      const maxLength = 5
      const chars = [...text]
      const truncated = chars.slice(0, maxLength).join('')
      expect([...truncated].length).toBe(5)
    })

    it('should preserve complete graphemes', () => {
      // Family emoji is single grapheme but multiple code points.
      // ZWJ written as \u200D so the joiners cannot be lost invisibly.
      const text = '👨\u200D👩\u200D👧'.repeat(3)
      // Use Intl.Segmenter if available (Node 16+)
      if (typeof Intl.Segmenter !== 'undefined') {
        const segmenter = new Intl.Segmenter('en', { granularity: 'grapheme' })
        const graphemes = [...segmenter.segment(text)]
        expect(graphemes.length).toBe(3)
      } else {
        // Fallback: just verify the string contains expected content
        expect(text.includes('👨')).toBe(true)
      }
    })
  })

  describe('search text with unicode', () => {
    it('should generate searchable text with unicode', async () => {
      const email = {
        subject: '会議 Meeting 📅',
        from: 'José García',
        body: 'Discussion about 项目'
      }
      const searchText = `${email.subject} ${email.from} ${email.body}`
      const result = await mockEmbedder([searchText], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should maintain unicode in indexed fields', () => {
      const record = {
        subject: '日本語メール',
        fromEmail: 'tanaka@example.jp',
        body: 'テスト本文'
      }
      // Verify unicode is preserved
      expect(record.subject.includes('日本語')).toBe(true)
      expect(record.body.includes('テスト')).toBe(true)
    })
  })

  describe('special unicode categories', () => {
    it('should handle mathematical symbols', async () => {
      const text = 'Formula: ∑∏∫∂√∞'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle currency symbols', async () => {
      const text = 'Budget: $100 €50 £30 ¥1000 ₹500'
      const result = await mockEmbedder([text], { pooling: 'mean', normalize: true })
      expect(result.data.length).toBe(EMBEDDING_DIM)
    })

    it('should handle box drawing characters', () => {
      const text = '┌──────┐\n│ Box │\n└──────┘'
      expect(text.includes('┌')).toBe(true)
    })

    it('should handle control characters gracefully', () => {
      const text = 'Text\x00with\x01control\x02chars'
      // Should be able to strip or handle control chars.
      // The class covers the C0 range U+0000-U+001F only.
      const cleaned = text.replace(/[\x00-\x1F]/g, '')
      expect(cleaned).toBe('Textwithcontrolchars')
    })
  })
})

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sfls1397/Apple-Tools-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.