Skip to main content
Glama
orneryd

M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

by orneryd
file-indexer-binary-detection.test.ts13.4 kB
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; import { FileIndexer } from '../src/indexing/FileIndexer.js'; import type { Driver } from 'neo4j-driver'; /** * Unit tests for FileIndexer binary content detection * * Tests the isTextContent method's ability to correctly identify: * - Text files with various Unicode content (emojis, CJK, etc.) * - Binary files (null bytes, control characters) * - Edge cases with mixed content * * The industry standard approach: * - Check for null bytes (definitive binary indicator) * - Count problematic control characters (0x00-0x08, 0x0E-0x1F) * - Allow valid Unicode including emojis, CJK, etc. * - Allow common whitespace (tab, newline, CR, form feed) * - Threshold: <10% control characters = text */ describe('FileIndexer - Binary Content Detection', () => { let fileIndexer: FileIndexer; let mockDriver: Driver; beforeEach(() => { // Create mock driver mockDriver = { session: vi.fn().mockReturnValue({ run: vi.fn(), close: vi.fn() }) } as unknown as Driver; fileIndexer = new FileIndexer(mockDriver); }); // Access private method for testing const isTextContent = (content: string): boolean => { return (fileIndexer as any).isTextContent(content); }; describe('Text Files - Should Be Detected as Text', () => { describe('Plain ASCII', () => { it('should detect plain ASCII text as text', () => { const content = 'Hello, World! This is a test file.\nWith multiple lines.\n'; expect(isTextContent(content)).toBe(true); }); it('should detect source code as text', () => { const content = ` function hello() { console.log("Hello, World!"); return 42; } export default hello; `; expect(isTextContent(content)).toBe(true); }); it('should detect JSON as text', () => { const content = JSON.stringify({ hello: 'world', num: 42, arr: [1, 2, 3] }, null, 2); expect(isTextContent(content)).toBe(true); }); it('should detect HTML as text', () => { const content = `<!DOCTYPE html> <html> <head><title>Test</title></head> <body><h1>Hello</h1></body> </html>`; expect(isTextContent(content)).toBe(true); }); }); describe('Unicode Content', () => { it('should detect text with emojis as text', () => { const content = `# 🔧 Configuration Guide This guide covers: - 📄 File setup - 🚀 Deployment - ⚙️ Settings ## Features ✅ Automatic detection ✅ Fast processing `; expect(isTextContent(content)).toBe(true); }); it('should detect Chinese text as text', () => { const content = `# 配置指南 这是一个测试文档。 ## 功能说明 - 自动检测 - 快速处理 - 高效编码 `; expect(isTextContent(content)).toBe(true); }); it('should detect Japanese text as text', () => { const content = `# 設定ガイド これはテストドキュメントです。 ## 機能 - こんにちは世界 - カタカナテスト - 漢字テスト `; expect(isTextContent(content)).toBe(true); }); it('should detect Korean text as text', () => { const content = `# 설정 가이드 이것은 테스트 문서입니다. ## 기능 - 안녕하세요 - 테스트 `; expect(isTextContent(content)).toBe(true); }); it('should detect Arabic text as text', () => { const content = `# دليل الإعدادات هذا مستند اختبار. ## الميزات - مرحبا بالعالم `; expect(isTextContent(content)).toBe(true); }); it('should detect mixed CJK and emoji text as text', () => { const content = `# 🌍 国際化テスト ## 中文 Chinese 你好世界 🇨🇳 ## 日本語 Japanese こんにちは 🇯🇵 ## 한국어 Korean 안녕하세요 🇰🇷 `; expect(isTextContent(content)).toBe(true); }); it('should detect extended Latin characters as text', () => { const content = `# Café Menu - Résumé review - Naïve implementation - Façade pattern - Señor developer - Über configuration `; expect(isTextContent(content)).toBe(true); }); it('should detect mathematical symbols as text', () => { const content = `# Mathematical Notation ∑ (summation) ∏ (product) ∫ (integral) ∂ (partial derivative) √ (square root) ∞ (infinity) ≤ ≥ ≠ ≈ `; expect(isTextContent(content)).toBe(true); }); it('should detect currency symbols as text', () => { const content = `# Pricing - USD: $100 - EUR: €85 - GBP: £75 - JPY: ¥15000 - INR: ₹8500 `; expect(isTextContent(content)).toBe(true); }); }); describe('Whitespace and Formatting', () => { it('should detect text with tabs as text', () => { const content = 'Column1\tColumn2\tColumn3\nValue1\tValue2\tValue3'; expect(isTextContent(content)).toBe(true); }); it('should detect text with Windows line endings as text', () => { const content = 'Line 1\r\nLine 2\r\nLine 3'; expect(isTextContent(content)).toBe(true); }); it('should detect text with form feeds as text', () => { const content = 'Page 1 content\fPage 2 content\fPage 3 content'; expect(isTextContent(content)).toBe(true); }); it('should detect text with mixed whitespace as text', () => { const content = 'Mixed\t\n\r\n \t whitespace\n\ntest'; expect(isTextContent(content)).toBe(true); }); }); }); describe('Binary Files - Should Be Detected as Binary', () => { it('should detect content with null bytes as binary', () => { const content = 'Some text\0with null\0bytes'; expect(isTextContent(content)).toBe(false); }); it('should detect content with many control characters as binary', () => { // Create content with >10% control characters const controlChars = '\x01\x02\x03\x04\x05\x06\x07\x08'; const content = controlChars.repeat(20) + 'Some text here'; expect(isTextContent(content)).toBe(false); }); it('should detect simulated binary file as binary', () => { // Simulate binary file header (like PNG) const binaryHeader = '\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR'; expect(isTextContent(binaryHeader)).toBe(false); }); it('should detect content with lone surrogates as potentially binary', () => { // Many lone surrogates indicate corrupted/binary content const lonesurrogates = '\uD800\uD801\uD802\uDC00\uDC01\uDC02'.repeat(50); // This should be caught by the surrogate detection expect(isTextContent(lonesurrogates)).toBe(false); }); it('should detect content with excessive control chars as binary', () => { // SOH, STX, ETX, etc. are strong binary indicators let content = ''; for (let i = 0; i < 100; i++) { content += String.fromCharCode(1) + String.fromCharCode(2); } content += 'small text'; expect(isTextContent(content)).toBe(false); }); }); describe('Edge Cases', () => { it('should handle empty string', () => { expect(isTextContent('')).toBe(true); // Empty is technically text }); it('should handle single character', () => { expect(isTextContent('a')).toBe(true); }); it('should handle single null byte', () => { expect(isTextContent('\0')).toBe(false); }); it('should handle single emoji', () => { expect(isTextContent('🚀')).toBe(true); }); it('should handle very long text file', () => { const content = 'This is a test. '.repeat(10000); // ~170KB expect(isTextContent(content)).toBe(true); }); it('should handle mostly text with few control chars (under threshold)', () => { // Less than 10% control characters should be treated as text const text = 'Normal text content '.repeat(100); const controlChars = '\x01\x02\x03'; // Just 3 control chars const content = text + controlChars; expect(isTextContent(content)).toBe(true); }); it('should handle text with valid surrogate pairs (emojis)', () => { // All valid emojis - should not be detected as binary const content = '🔧📄🚀⚙️✅❌🎉💡🔥⭐'; expect(isTextContent(content)).toBe(true); }); it('should handle markdown documentation with everything', () => { const content = `# 🔧 Complete Documentation ## Overview 概要 概要 This file contains: - Emojis: 🚀 ⚙️ 📄 ✅ - Chinese: 你好世界 - Japanese: こんにちは - Korean: 안녕하세요 - Math: ∑ ∫ √ ∞ - Currency: $ € £ ¥ - Extended Latin: café résumé ## Code Example \`\`\`typescript function greet(name: string): string { return \`Hello, \${name}! 👋\`; } \`\`\` ## Table | Feature | Status | |---------|--------| | Emoji | ✅ | | Unicode | ✅ | | Binary | ❌ | `; expect(isTextContent(content)).toBe(true); }); }); describe('Threshold Behavior', () => { it('should accept 5% control characters as text', () => { // 95 normal chars + 5 control chars = 5% control const normalText = 'A'.repeat(95); const controlChars = '\x01'.repeat(5); expect(isTextContent(normalText + controlChars)).toBe(true); }); it('should accept 9% control characters as text', () => { // 91 normal chars + 9 control chars = 9% control const normalText = 'A'.repeat(91); const controlChars = '\x01'.repeat(9); expect(isTextContent(normalText + controlChars)).toBe(true); }); it('should reject 15% control characters as binary', () => { // 85 normal chars + 15 control chars = 15% control const normalText = 'A'.repeat(85); const controlChars = '\x01'.repeat(15); expect(isTextContent(normalText + controlChars)).toBe(false); }); it('should reject 20% control characters as binary', () => { // 80 normal chars + 20 control chars = 20% control const normalText = 'A'.repeat(80); const controlChars = '\x01'.repeat(20); expect(isTextContent(normalText + controlChars)).toBe(false); }); }); describe('Sample Size Behavior', () => { it('should sample first 8KB for large files', () => { // Content with control characters at the start should be detected const binaryStart = '\x01\x02\x03\x04\x05'.repeat(2000); // 10KB of control chars const textEnd = 'Normal text '.repeat(10000); // More text after const content = binaryStart + textEnd; expect(isTextContent(content)).toBe(false); }); it('should detect text file even if control chars appear after 8KB', () => { // Normal text in first 8KB, control chars after (not null bytes) const textStart = 'Normal text content. '.repeat(500); // ~10.5KB of text const controlEnd = '\x01\x02\x03'.repeat(100); // Control chars after sample const content = textStart + controlEnd; // Should be detected as text since first 8KB is clean expect(isTextContent(content)).toBe(true); }); }); }); describe('FileIndexer - shouldSkipFile', () => { let fileIndexer: FileIndexer; let mockDriver: Driver; beforeEach(() => { mockDriver = { session: vi.fn().mockReturnValue({ run: vi.fn(), close: vi.fn() }) } as unknown as Driver; fileIndexer = new FileIndexer(mockDriver); }); const shouldSkipFile = (filePath: string, extension: string): boolean => { return (fileIndexer as any).shouldSkipFile(filePath, extension); }; describe('Image Files', () => { it('should skip PNG files', () => { expect(shouldSkipFile('/path/image.png', '.png')).toBe(true); }); it('should skip JPG files', () => { expect(shouldSkipFile('/path/photo.jpg', '.jpg')).toBe(true); }); it('should skip GIF files', () => { expect(shouldSkipFile('/path/animation.gif', '.gif')).toBe(true); }); }); describe('Archive Files', () => { it('should skip ZIP files', () => { expect(shouldSkipFile('/path/archive.zip', '.zip')).toBe(true); }); it('should skip TAR.GZ files', () => { expect(shouldSkipFile('/path/archive.tar.gz', '.gz')).toBe(true); }); }); describe('Sensitive Files', () => { it('should skip private key files', () => { expect(shouldSkipFile('/path/server.key', '.key')).toBe(true); }); it('should skip PEM certificate files', () => { expect(shouldSkipFile('/path/cert.pem', '.pem')).toBe(true); }); }); describe('Text Files - Should NOT Skip', () => { it('should not skip TypeScript files', () => { expect(shouldSkipFile('/path/app.ts', '.ts')).toBe(false); }); it('should not skip JavaScript files', () => { expect(shouldSkipFile('/path/app.js', '.js')).toBe(false); }); it('should not skip Markdown files', () => { expect(shouldSkipFile('/path/README.md', '.md')).toBe(false); }); it('should not skip JSON files', () => { expect(shouldSkipFile('/path/package.json', '.json')).toBe(false); }); it('should not skip YAML files', () => { expect(shouldSkipFile('/path/config.yaml', '.yaml')).toBe(false); }); }); });

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/orneryd/Mimir'

If you have feedback or need assistance with the MCP directory API, please join our Discord server