Sys8 MCP Server

language-analysis.ts•14 KiB

/**
 * Language and character analysis utilities.
 *
 * Classifies every Unicode code point of a string into exactly one language
 * bucket (by script / language-specific letters) or one character category,
 * and reports counts and percentages.
 */

/** Input for {@link analyzeLanguage}. */
export interface AnalyzeLanguageArgs {
  text: string;
}

/**
 * Output of {@link analyzeLanguage}.
 *
 * `total_characters` is the number of Unicode code points in the input, so the
 * counts of all language and category buckets add up to it.
 */
export interface AnalyzeLanguageResult {
  total_characters: number;
  encoding?: string;
  languages: {
    english: { count: number; percentage: number };
    chinese: { count: number; percentage: number };
    russian: { count: number; percentage: number };
    ukrainian: { count: number; percentage: number };
    vietnamese: { count: number; percentage: number };
    japanese: { count: number; percentage: number };
    turkish: { count: number; percentage: number };
    spanish: { count: number; percentage: number };
  };
  categories: {
    digits: { count: number; percentage: number };
    punctuation: { count: number; percentage: number };
    symbols: { count: number; percentage: number };
    whitespace: { count: number; percentage: number };
    other: { count: number; percentage: number };
  };
}

type LanguageKey = keyof AnalyzeLanguageResult['languages'];
type CategoryKey = keyof AnalyzeLanguageResult['categories'];

// Unicode ranges for different languages.
// Only `chinese`, `japanese` and `russian` are consulted by the classifier
// below; the remaining entries are kept as reference data.
const UNICODE_RANGES = {
  // English (Basic Latin)
  english: [
    { start: 0x0041, end: 0x005A }, // A-Z
    { start: 0x0061, end: 0x007A }, // a-z
  ],
  // Chinese (CJK Unified Ideographs)
  chinese: [
    { start: 0x4E00, end: 0x9FFF }, // CJK Unified Ideographs
    { start: 0x3400, end: 0x4DBF }, // Extension A
    { start: 0xF900, end: 0xFAFF }, // Compatibility Ideographs
    { start: 0x2E80, end: 0x2EFF }, // CJK Radicals Supplement
    { start: 0x2F00, end: 0x2FDF }, // Kangxi Radicals
    { start: 0x3000, end: 0x303F }, // CJK Symbols and Punctuation
    { start: 0x3100, end: 0x312F }, // Bopomofo
    { start: 0x31A0, end: 0x31BF }, // Bopomofo Extended
  ],
  // Japanese
  japanese: [
    { start: 0x3040, end: 0x309F }, // Hiragana
    { start: 0x30A0, end: 0x30FF }, // Katakana
    { start: 0x31F0, end: 0x31FF }, // Katakana Phonetic Extensions
    { start: 0xFF00, end: 0xFFEF }, // Halfwidth and Fullwidth Forms
    // Also includes CJK ranges (shared with Chinese). Because Chinese is
    // checked first, ideographs in this range are always counted as Chinese.
    { start: 0x4E00, end: 0x9FFF }, // CJK Unified Ideographs
  ],
  // Russian and Ukrainian (Cyrillic)
  russian: [
    { start: 0x0400, end: 0x04FF }, // Cyrillic
    { start: 0x0500, end: 0x052F }, // Cyrillic Supplement
    { start: 0x2DE0, end: 0x2DFF }, // Cyrillic Extended-A
    { start: 0xA640, end: 0xA69F }, // Cyrillic Extended-B
    { start: 0x1C80, end: 0x1C8F }, // Cyrillic Extended-C
  ],
  ukrainian: [
    { start: 0x0400, end: 0x04FF }, // Cyrillic (shared with Russian)
    { start: 0x0500, end: 0x052F }, // Cyrillic Supplement
    { start: 0x2DE0, end: 0x2DFF }, // Cyrillic Extended-A
    { start: 0xA640, end: 0xA69F }, // Cyrillic Extended-B
    { start: 0x1C80, end: 0x1C8F }, // Cyrillic Extended-C
  ],
  // Vietnamese (Latin with diacritics)
  vietnamese: [
    { start: 0x0041, end: 0x005A }, // A-Z
    { start: 0x0061, end: 0x007A }, // a-z
    { start: 0x0080, end: 0x00FF }, // Latin-1 Supplement
    { start: 0x0100, end: 0x017F }, // Latin Extended-A
    { start: 0x0180, end: 0x024F }, // Latin Extended-B
    { start: 0x1E00, end: 0x1EFF }, // Latin Extended Additional
    { start: 0x0300, end: 0x036F }, // Combining Diacritical Marks
  ],
  // Turkish (Latin with specific characters)
  turkish: [
    { start: 0x0041, end: 0x005A }, // A-Z
    { start: 0x0061, end: 0x007A }, // a-z
    { start: 0x0080, end: 0x00FF }, // Latin-1 Supplement (includes Ç, ç, Ö, ö, Ü, ü)
    { start: 0x0100, end: 0x017F }, // Latin Extended-A (includes İ, ı, Ş, ş, Ğ, ğ)
  ],
  // Spanish (Latin)
  spanish: [
    { start: 0x0041, end: 0x005A }, // A-Z
    { start: 0x0061, end: 0x007A }, // a-z
    { start: 0x0080, end: 0x00FF }, // Latin-1 Supplement (includes á, é, í, ó, ú, ñ, ü)
  ],
};

// Letters that exist in the Ukrainian alphabet but not in the Russian one.
const UKRAINIAN_SPECIFIC = new Set<number>([
  0x0456, 0x0457, 0x0454, 0x0491, // і, ї, є, ґ
  0x0406, 0x0407, 0x0404, 0x0490, // І, Ї, Є, Ґ
]);

// Letters treated as Turkish. Checked before Spanish, so ü/Ü count as Turkish.
const TURKISH_SPECIFIC = new Set<number>([
  0x0130, 0x0131, // İ, ı
  0x015E, 0x015F, // Ş, ş
  0x011E, 0x011F, // Ğ, ğ
  0x00C7, 0x00E7, // Ç, ç
  0x00D6, 0x00F6, // Ö, ö
  0x00DC, 0x00FC, // Ü, ü
]);

// Letters treated as Spanish. ü (U+00FC) is intentionally absent: the Turkish
// check runs first and always claims it, so listing it here would be dead.
const SPANISH_SPECIFIC = new Set<number>([
  0x00E1, 0x00E9, 0x00ED, // á, é, í
  0x00F3, 0x00FA, // ó, ú
  0x00F1, // ñ
  0x00C1, 0x00C9, 0x00CD, // Á, É, Í
  0x00D3, 0x00DA, // Ó, Ú
  0x00D1, // Ñ
]);

const PUNCTUATION_RANGES = [
  { start: 0x2000, end: 0x206F }, // General Punctuation
  { start: 0x2E00, end: 0x2E7F }, // Supplemental Punctuation
];

const SYMBOL_RANGES = [
  { start: 0x20A0, end: 0x20CF }, // Currency Symbols
  { start: 0x2100, end: 0x214F }, // Letterlike Symbols
  { start: 0x2190, end: 0x21FF }, // Arrows
  { start: 0x2200, end: 0x22FF }, // Mathematical Operators
  { start: 0x2300, end: 0x23FF }, // Miscellaneous Technical
  { start: 0x2600, end: 0x26FF }, // Miscellaneous Symbols
  { start: 0x2700, end: 0x27BF }, // Dingbats
];

// Hoisted so the per-character loop does not rebuild them. Neither uses the
// `g`/`y` flag, so `.test()` carries no state between calls.
const WHITESPACE_RE = /\s/;
const COMMON_PUNCTUATION_RE = /[.,!?;:()\[\]{}\-"'`]/;

/**
 * Check whether a code point falls inside any of the given inclusive ranges.
 *
 * @param code - Unicode code point to test.
 * @param ranges - Inclusive `start`/`end` ranges.
 * @returns `true` if at least one range contains `code`.
 */
function isInRange(
  code: number,
  ranges: ReadonlyArray<{ start: number; end: number }>,
): boolean {
  return ranges.some(range => code >= range.start && code <= range.end);
}

/**
 * Guess an encoding label for the text (simple heuristic).
 *
 * JavaScript strings are always UTF-16 internally, so this only inspects the
 * decoded code units for tell-tale leftovers: a leading U+FEFF, the code-unit
 * pairs 0xFE/0xFF or 0xFF/0xFE at the start, U+FFFD replacement characters,
 * and surrogates.
 *
 * @param text - Text to inspect.
 * @returns A human-readable label. Every path currently returns a string; the
 *   `undefined` in the return type is kept for interface compatibility.
 */
function detectEncoding(text: string): string | undefined {
  // A leading U+FEFF is a byte order mark that survived decoding.
  if (text.charCodeAt(0) === 0xFEFF) {
    return 'UTF-8 (BOM)';
  }

  // UTF-16 BOM bytes that were decoded one byte per code unit.
  if (text.length >= 2) {
    const first = text.charCodeAt(0);
    const second = text.charCodeAt(1);
    if (first === 0xFE && second === 0xFF) {
      return 'UTF-16 BE';
    }
    if (first === 0xFF && second === 0xFE) {
      return 'UTF-16 LE';
    }
  }

  let hasHighSurrogate = false;
  for (let i = 0; i < text.length; i++) {
    const code = text.charCodeAt(i);

    if (code >= 0xD800 && code <= 0xDBFF) {
      hasHighSurrogate = true;
      if (i + 1 < text.length) {
        const next = text.charCodeAt(i + 1);
        if (next >= 0xDC00 && next <= 0xDFFF) {
          i++; // Well-formed pair: skip the low surrogate.
          continue;
        }
      }
    }

    // U+FFFD means some bytes could not be decoded upstream.
    if (code === 0xFFFD) {
      return 'UTF-8 (with replacement characters)';
    }
  }

  return hasHighSurrogate ? 'UTF-16' : 'UTF-16 (JavaScript default)';
}

/**
 * Percentage of `count` in `total`, rounded to 2 decimal places.
 *
 * @returns `0` when `total` is `0`.
 */
function calculatePercentage(count: number, total: number): number {
  if (total === 0) return 0;
  return Math.round((count / total) * 10000) / 100;
}

/**
 * Map a code point to a language bucket, or `undefined` if it is not a
 * language character. Order matters because several ranges overlap.
 */
function classifyLanguage(code: number): LanguageKey | undefined {
  // Chinese first: it owns the shared CJK ideograph range.
  if (isInRange(code, UNICODE_RANGES.chinese)) return 'chinese';
  // Japanese: kana and half/full-width forms (ideographs were claimed above).
  if (isInRange(code, UNICODE_RANGES.japanese)) return 'japanese';
  // Cyrillic: Ukrainian only for letters Russian does not have.
  if (isInRange(code, UNICODE_RANGES.russian)) {
    return UKRAINIAN_SPECIFIC.has(code) ? 'ukrainian' : 'russian';
  }
  // Vietnamese: Latin Extended Additional block only.
  if (code >= 0x1E00 && code <= 0x1EFF) return 'vietnamese';
  if (TURKISH_SPECIFIC.has(code)) return 'turkish';
  if (SPANISH_SPECIFIC.has(code)) return 'spanish';
  // English: plain A-Z / a-z.
  if ((code >= 0x0041 && code <= 0x005A) || (code >= 0x0061 && code <= 0x007A)) {
    return 'english';
  }
  return undefined;
}

/**
 * Map a non-language character to a category bucket.
 * Whitespace is tested before punctuation, so spaces inside the General
 * Punctuation block (e.g. U+2003) count as whitespace.
 */
function classifyCategory(code: number, char: string): CategoryKey {
  if (code >= 0x0030 && code <= 0x0039) return 'digits';
  if (WHITESPACE_RE.test(char)) return 'whitespace';
  if (COMMON_PUNCTUATION_RE.test(char) || isInRange(code, PUNCTUATION_RANGES)) {
    return 'punctuation';
  }
  if (isInRange(code, SYMBOL_RANGES)) return 'symbols';
  return 'other';
}

/**
 * Analyze text for language and character distribution.
 *
 * Each Unicode code point is counted exactly once, in either a language
 * bucket or a category bucket. Characters outside the BMP (e.g. emoji) count
 * as one character, and unpaired surrogates count as one character each, so
 * the bucket counts always sum to `total_characters` and the percentages are
 * relative to that same total.
 *
 * @param args - Object holding the `text` to analyze.
 * @returns Counts and percentages per language and category. `encoding` is
 *   omitted for empty input.
 * @throws Error if `args.text` is not a string.
 */
export function analyzeLanguage(args: AnalyzeLanguageArgs): AnalyzeLanguageResult {
  const { text } = args;

  if (typeof text !== 'string') {
    throw new Error('Input must be a string');
  }

  const languageCounts: Record<LanguageKey, number> = {
    english: 0,
    chinese: 0,
    russian: 0,
    ukrainian: 0,
    vietnamese: 0,
    japanese: 0,
    turkish: 0,
    spanish: 0,
  };
  const categoryCounts: Record<CategoryKey, number> = {
    digits: 0,
    punctuation: 0,
    symbols: 0,
    whitespace: 0,
    other: 0,
  };

  // String iteration yields whole code points (surrogate pairs are joined,
  // lone surrogates are yielded on their own).
  let totalChars = 0;
  for (const char of text) {
    totalChars++;
    // `char` is never empty, so codePointAt(0) is always defined.
    const code = char.codePointAt(0) ?? 0;

    const language = classifyLanguage(code);
    if (language !== undefined) {
      languageCounts[language]++;
    } else {
      categoryCounts[classifyCategory(code, char)]++;
    }
  }

  const stat = (count: number): { count: number; percentage: number } => ({
    count,
    percentage: calculatePercentage(count, totalChars),
  });

  const languages: AnalyzeLanguageResult['languages'] = {
    english: stat(languageCounts.english),
    chinese: stat(languageCounts.chinese),
    russian: stat(languageCounts.russian),
    ukrainian: stat(languageCounts.ukrainian),
    vietnamese: stat(languageCounts.vietnamese),
    japanese: stat(languageCounts.japanese),
    turkish: stat(languageCounts.turkish),
    spanish: stat(languageCounts.spanish),
  };
  const categories: AnalyzeLanguageResult['categories'] = {
    digits: stat(categoryCounts.digits),
    punctuation: stat(categoryCounts.punctuation),
    symbols: stat(categoryCounts.symbols),
    whitespace: stat(categoryCounts.whitespace),
    other: stat(categoryCounts.other),
  };

  // Empty input: no encoding guess is reported.
  if (totalChars === 0) {
    return { total_characters: 0, languages, categories };
  }

  return {
    total_characters: totalChars,
    encoding: detectEncoding(text),
    languages,
    categories,
  };
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Angry-Robot-Deals/mcp-sys8'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

language-analysis.ts•14 KiB