ConceptNet MCP Server

test_text_utils.py•29.8 KiB

"""
Comprehensive unit tests for the text_utils module.

This test suite verifies all text processing functions including normalization,
URI construction/parsing, validation, and security features.
"""

import unicodedata
from unittest.mock import patch

import pytest

from src.conceptnet_mcp.utils.text_utils import (
    normalize_concept_text,
    construct_concept_uri,
    parse_concept_uri,
    validate_concept_uri,
    validate_concept_text,
    validate_language_code,
    normalize_language_code,
    clean_text_for_uri,
    sanitize_search_query,
    sanitize_text_for_uri,
    clean_whitespace,
    is_valid_concept_text,
    is_valid_concept_format,
    normalize_unicode,
    validate_text_length,
    normalize_text_for_display,
    normalize_uri_to_text,
    normalize_relation_text,
    find_similar_languages,
    extract_language_from_uri,
    estimate_text_language,
    get_text_language_hints,
    calculate_text_similarity,
    fuzzy_match_concepts,
    split_compound_terms,
    truncate_text_safely,
    clear_text_caches,
    get_cache_info,
    SUPPORTED_LANGUAGES,
    LANGUAGE_ALIASES,
    RELATION_PATTERNS,
    MAX_CONCEPT_LENGTH,
    MAX_URI_LENGTH,
    MAX_TEXT_LENGTH,
)
from src.conceptnet_mcp.utils.exceptions import (
    InvalidConceptURIError,
    InvalidLanguageError,
    ValidationError,
    TextValidationError,
    URIValidationError,
)


class TestNormalizeConceptText:
    """Test the normalize_concept_text function."""

    def test_basic_normalization(self):
        """Test basic text normalization."""
        # Simple text
        assert normalize_concept_text("hello world") == "hello_world"
        assert normalize_concept_text("Hello World") == "hello_world"

        # Text with special characters
        assert normalize_concept_text("café") == "café"
        assert normalize_concept_text("naïve") == "naïve"

    def test_whitespace_handling(self):
        """Test whitespace handling in normalization."""
        # Multiple spaces (a run of spaces must collapse to one separator)
        assert normalize_concept_text("hello    world") == "hello_world"

        # Leading/trailing whitespace
        assert normalize_concept_text(" hello world ") == "hello_world"

        # Mixed whitespace types
        assert normalize_concept_text("hello\t\nworld") == "hello_world"

        # Empty and whitespace-only strings
        assert normalize_concept_text("") == ""
        assert normalize_concept_text(" ") == ""

    def test_special_character_handling(self):
        """Test handling of special characters."""
        # Punctuation removal
        assert normalize_concept_text("hello, world!") == "hello_world"
        assert normalize_concept_text("don't worry") == "dont_worry"
        assert normalize_concept_text("user@domain.com") == "userdomaincom"

        # Hyphen to underscore conversion
        assert normalize_concept_text("multi-word-term") == "multi_word_term"

        # Mixed special characters
        assert normalize_concept_text("hello... world???") == "hello_world"

    def test_unicode_normalization(self):
        """Test Unicode normalization."""
        # Composed vs decomposed characters. Escapes are used so the test does
        # not depend on how an editor happens to save the accented letter.
        text1 = "caf\u00e9"  # é as single character
        text2 = "cafe\u0301"  # e + combining acute accent

        result1 = normalize_concept_text(text1)
        result2 = normalize_concept_text(text2)
        assert result1 == result2

        # Various Unicode characters
        assert normalize_concept_text("Москва") == "москва"
        assert normalize_concept_text("東京") == "東京"
        assert normalize_concept_text("العربية") == "العربية"

    def test_length_limits(self):
        """Test length limiting."""
        long_text = "a" * 300
        result = normalize_concept_text(long_text, max_length=100)
        assert len(result) <= 100

        # With default max_length
        very_long_text = "word " * 100
        result = normalize_concept_text(very_long_text)
        assert len(result) <= 255

    def test_preserve_underscores_option(self):
        """Test preserve_underscores option."""
        text = "hello_world test"

        # Default behavior (preserve underscores)
        assert normalize_concept_text(text) == "hello_world_test"

        # With preserve_underscores=False
        assert (
            normalize_concept_text(text, preserve_underscores=False)
            == "hello_world_test"
        )

        # Multiple underscores should be normalized
        text_multi = "hello__world___test"
        result = normalize_concept_text(text_multi)
        assert result == "hello_world_test"

    def test_strip_diacritics_option(self):
        """Test strip_diacritics option."""
        text = "café naïve résumé"

        # Default behavior (preserve diacritics)
        assert normalize_concept_text(text) == "café_naïve_résumé"

        # With strip_diacritics=True
        result = normalize_concept_text(text, strip_diacritics=True)
        assert result == "cafe_naive_resume"

    def test_edge_cases(self):
        """Test edge cases and error conditions."""
        # None input
        with pytest.raises(AttributeError):
            normalize_concept_text(None)

        # Non-string input (should convert to string)
        assert normalize_concept_text(123) == "123"
        assert normalize_concept_text(True) == "true"

        # Empty string
        assert normalize_concept_text("") == ""

        # Only special characters
        assert normalize_concept_text("!@#$%^&*()") == ""

        # Very short max_length
        result = normalize_concept_text("hello world", max_length=3)
        assert len(result) <= 3


class TestConstructConceptURI:
    """Test the construct_concept_uri function."""

    def test_basic_construction(self):
        """Test basic URI construction."""
        uri = construct_concept_uri("hello", "en")
        assert uri == "/c/en/hello"

        uri = construct_concept_uri("world", "es")
        assert uri == "/c/es/world"

    def test_text_normalization(self):
        """Test that text is normalized during construction."""
        uri = construct_concept_uri("Hello World", "en")
        assert uri == "/c/en/hello_world"

        uri = construct_concept_uri("café", "fr")
        assert uri == "/c/fr/café"

    def test_language_validation(self):
        """Test language code validation."""
        # Valid language codes
        assert construct_concept_uri("test", "en") == "/c/en/test"
        assert construct_concept_uri("test", "zh-cn") == "/c/zh-cn/test"

        # Invalid language codes
        with pytest.raises(InvalidLanguageError):
            construct_concept_uri("test", "invalid")

        with pytest.raises(InvalidLanguageError):
            construct_concept_uri("test", "")

        with pytest.raises(InvalidLanguageError):
            construct_concept_uri("test", "x")  # Too short

    def test_text_validation(self):
        """Test text validation during construction."""
        # Empty text
        with pytest.raises(ValidationError):
            construct_concept_uri("", "en")

        # Whitespace-only text
        with pytest.raises(ValidationError):
            construct_concept_uri(" ", "en")

        # Text that becomes empty after normalization
        with pytest.raises(ValidationError):
            construct_concept_uri("!@#$%", "en")

    def test_auto_normalize_option(self):
        """Test auto_normalize option."""
        # With auto_normalize=True (default)
        uri = construct_concept_uri("Hello World", "en")
        assert uri == "/c/en/hello_world"

        # With auto_normalize=False
        uri = construct_concept_uri("hello_world", "en", auto_normalize=False)
        assert uri == "/c/en/hello_world"

        # Invalid characters with auto_normalize=False should raise error
        with pytest.raises(ValidationError):
            construct_concept_uri("hello world", "en", auto_normalize=False)

    def test_validate_option(self):
        """Test validate option."""
        # With validate=True (default)
        with pytest.raises(InvalidLanguageError):
            construct_concept_uri("test", "invalid")

        # With validate=False (bypasses validation)
        uri = construct_concept_uri("test", "invalid", validate=False)
        assert uri == "/c/invalid/test"

    def test_special_characters_handling(self):
        """Test handling of special characters in URI construction."""
        # Characters that should be URL-safe
        uri = construct_concept_uri("hello-world", "en")
        assert uri == "/c/en/hello_world"

        # Unicode characters should be preserved
        uri = construct_concept_uri("東京", "ja")
        assert uri == "/c/ja/東京"


class TestParseConceptURI:
    """Test the parse_concept_uri function."""

    def test_basic_parsing(self):
        """Test basic URI parsing."""
        result = parse_concept_uri("/c/en/hello")
        expected = {
            "language": "en",
            "term": "hello",
            "original_uri": "/c/en/hello",
            "normalized_term": "hello",
        }
        assert result == expected

    def test_complex_terms(self):
        """Test parsing with complex terms."""
        # Multi-word terms
        result = parse_concept_uri("/c/en/hello_world")
        assert result["term"] == "hello_world"
        assert result["language"] == "en"

        # Unicode terms
        result = parse_concept_uri("/c/ja/東京")
        assert result["term"] == "東京"
        assert result["language"] == "ja"

        # Terms with special characters
        result = parse_concept_uri("/c/fr/café")
        assert result["term"] == "café"
        assert result["language"] == "fr"

    def test_language_codes(self):
        """Test parsing different language codes."""
        # Standard language codes
        result = parse_concept_uri("/c/en/test")
        assert result["language"] == "en"

        result = parse_concept_uri("/c/zh-cn/test")
        assert result["language"] == "zh-cn"

        result = parse_concept_uri("/c/pt-br/test")
        assert result["language"] == "pt-br"

    def test_normalize_term_option(self):
        """Test normalize_term option."""
        # With normalize_term=True (default)
        result = parse_concept_uri("/c/en/Hello_World")
        assert result["normalized_term"] == "hello_world"

        # With normalize_term=False
        result = parse_concept_uri("/c/en/Hello_World", normalize_term=False)
        assert result["normalized_term"] == "Hello_World"

    def test_validate_option(self):
        """Test validate option."""
        # With validate=True (default)
        with pytest.raises(InvalidConceptURIError):
            parse_concept_uri("/invalid/format")

        # With validate=False
        result = parse_concept_uri("/invalid/format", validate=False)
        assert result is not None  # Should not raise error

    # Parametrized so that every malformed URI is checked and reported on its
    # own; a loop would stop at the first failure and hide the rest.
    @pytest.mark.parametrize(
        "uri",
        [
            "/c/en",  # Missing term
            "/c",  # Missing language and term
            "/x/en/test",  # Wrong prefix
            "c/en/test",  # Missing leading slash
            "/c//test",  # Empty language
            "/c/en/",  # Empty term
            "",  # Empty string
            "not-a-uri",  # Not a URI format
        ],
    )
    def test_invalid_uri_formats(self, uri):
        """Test handling of invalid URI formats."""
        with pytest.raises(InvalidConceptURIError):
            parse_concept_uri(uri)

    def test_edge_cases(self):
        """Test edge cases in URI parsing."""
        # Very long terms
        long_term = "a" * 200
        uri = f"/c/en/{long_term}"
        result = parse_concept_uri(uri)
        assert result["term"] == long_term

        # Special characters in terms
        uri = "/c/en/hello%20world"  # URL encoded space
        result = parse_concept_uri(uri)
        assert "hello" in result["term"]  # Should handle URL encoding


class TestValidateConceptText:
    """Test the validate_concept_text function."""

    def test_valid_text(self):
        """Test validation of valid text."""
        # Should not raise exceptions
        validate_concept_text("hello")
        validate_concept_text("hello_world")
        validate_concept_text("café")
        validate_concept_text("東京")
        validate_concept_text("hello123")

    def test_invalid_text(self):
        """Test validation of invalid text."""
        # Empty text
        with pytest.raises(TextValidationError):
            validate_concept_text("")

        # Whitespace-only text
        with pytest.raises(TextValidationError):
            validate_concept_text(" ")

        # Text with invalid characters
        with pytest.raises(TextValidationError):
            validate_concept_text("hello world")  # Space not allowed

        with pytest.raises(TextValidationError):
            validate_concept_text("hello/world")  # Slash not allowed

        with pytest.raises(TextValidationError):
            validate_concept_text("hello?world")  # Question mark not allowed

    def test_length_limits(self):
        """Test length validation."""
        # Text too long
        long_text = "a" * 300
        with pytest.raises(TextValidationError):
            validate_concept_text(long_text, max_length=100)

        # Text too short
        with pytest.raises(TextValidationError):
            validate_concept_text("a", min_length=5)

    def test_custom_allowed_chars(self):
        """Test custom allowed characters."""
        # Allow spaces
        validate_concept_text("hello world", allowed_chars=" ")

        # Disallow underscores
        with pytest.raises(TextValidationError):
            validate_concept_text("hello_world", allowed_chars="")


class TestValidateLanguageCode:
    """Test the validate_language_code function."""

    @pytest.mark.parametrize(
        "code",
        [
            "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh",
            "ar", "hi", "nl", "sv", "no", "da", "fi", "pl", "cs", "hu",
            "en-us", "en-gb", "zh-cn", "zh-tw", "pt-br", "es-es",
        ],
    )
    def test_valid_language_codes(self, code):
        """Test validation of valid language codes."""
        validate_language_code(code)  # Should not raise exception

    @pytest.mark.parametrize(
        "code",
        [
            "", "x", "xyz", "english", "ENGLISH", "EN", "123",
            "en_us", "en/us", "en us", "toolong",
        ],
    )
    def test_invalid_language_codes(self, code):
        """Test validation of invalid language codes."""
        with pytest.raises(InvalidLanguageError):
            validate_language_code(code)

    def test_case_sensitivity(self):
        """Test case sensitivity in language codes."""
        # Lowercase should be valid
        validate_language_code("en")
        validate_language_code("zh-cn")

        # Uppercase should be invalid
        with pytest.raises(InvalidLanguageError):
            validate_language_code("EN")

        with pytest.raises(InvalidLanguageError):
            validate_language_code("ZH-CN")


class TestSanitizeTextForURI:
    """Test the sanitize_text_for_uri function."""

    def test_basic_sanitization(self):
        """Test basic text sanitization."""
        assert sanitize_text_for_uri("hello world") == "hello_world"
        assert sanitize_text_for_uri("hello/world") == "hello_world"
        assert sanitize_text_for_uri("hello?world") == "hello_world"

    def test_multiple_separators(self):
        """Test handling of multiple separators."""
        # Each input contains a run of separators that must collapse to one.
        assert sanitize_text_for_uri("hello   world") == "hello_world"
        assert sanitize_text_for_uri("hello///world") == "hello_world"
        assert sanitize_text_for_uri("hello___world") == "hello_world"

    def test_leading_trailing_separators(self):
        """Test removal of leading/trailing separators."""
        assert sanitize_text_for_uri("_hello_world_") == "hello_world"
        assert sanitize_text_for_uri("/hello/world/") == "hello_world"
        assert sanitize_text_for_uri(" hello world ") == "hello_world"

    def test_special_characters(self):
        """Test handling of special characters."""
        text = "hello@world.com"
        result = sanitize_text_for_uri(text)
        assert result == "hello_world_com"

        text = "hello(world)!"
        result = sanitize_text_for_uri(text)
        assert result == "hello_world"

    def test_unicode_preservation(self):
        """Test preservation of Unicode characters."""
        assert sanitize_text_for_uri("café world") == "café_world"
        # NOTE(review): the input below has no separator, yet "_" is expected
        # in the output. Confirm the intended literal (possibly "東京 駅").
        assert sanitize_text_for_uri("東京駅") == "東京_駅"
        assert sanitize_text_for_uri("Москва город") == "москва_город"

    def test_length_limits(self):
        """Test length limiting in sanitization."""
        long_text = "word " * 100
        result = sanitize_text_for_uri(long_text, max_length=50)
        assert len(result) <= 50
        assert not result.endswith("_")  # Should not end with separator

    def test_empty_result_handling(self):
        """Test handling when sanitization results in empty string."""
        # Only special characters
        result = sanitize_text_for_uri("!@#$%^&*()")
        assert result == ""

        # Only whitespace
        result = sanitize_text_for_uri(" ")
        assert result == ""


class TestCleanWhitespace:
    """Test the clean_whitespace function."""

    def test_basic_cleaning(self):
        """Test basic whitespace cleaning."""
        # A run of spaces collapses to a single space.
        assert clean_whitespace("hello   world") == "hello world"
        assert clean_whitespace(" hello world ") == "hello world"
        assert clean_whitespace("hello\t\nworld") == "hello world"

    def test_different_whitespace_types(self):
        """Test cleaning different types of whitespace."""
        text = "hello\t\n\r\v\f world"
        result = clean_whitespace(text)
        assert result == "hello world"

    def test_unicode_whitespace(self):
        """Test cleaning Unicode whitespace characters."""
        # Non-breaking space and other Unicode spaces
        text = "hello\u00A0\u2000\u2001world"
        result = clean_whitespace(text)
        assert result == "hello world"

    def test_preserve_single_spaces(self):
        """Test that single spaces are preserved."""
        assert clean_whitespace("hello world test") == "hello world test"

    def test_empty_and_whitespace_only(self):
        """Test handling of empty and whitespace-only strings."""
        assert clean_whitespace("") == ""
        assert clean_whitespace(" ") == ""
        assert clean_whitespace("\t\n\r") == ""


class TestIsValidConceptFormat:
    """Test the is_valid_concept_format function."""

    @pytest.mark.parametrize(
        "text",
        [
            "hello",
            "hello_world",
            "café",
            "東京",
            "test123",
            "a",
            "very_long_concept_name_that_is_still_valid",
        ],
    )
    def test_valid_formats(self, text):
        """Test validation of valid concept formats."""
        assert is_valid_concept_format(text) is True

    @pytest.mark.parametrize(
        "text",
        [
            "",
            " ",
            "hello world",  # Space
            "hello/world",  # Slash
            "hello?world",  # Question mark
            "hello  world",  # Consecutive spaces
            "hello\nworld",  # Newline
            "hello\tworld",  # Tab
        ],
    )
    def test_invalid_formats(self, text):
        """Test validation of invalid concept formats."""
        assert is_valid_concept_format(text) is False

    def test_edge_cases(self):
        """Test edge cases in format validation."""
        # Very long text
        long_text = "a" * 300
        assert is_valid_concept_format(long_text) is False

        # Unicode characters
        assert is_valid_concept_format("Москва") is True
        assert is_valid_concept_format("العربية") is True


class TestGetTextLanguageHints:
    """Test the get_text_language_hints function."""

    def test_obvious_language_hints(self):
        """Test detection of obvious language hints."""
        # English text
        hints = get_text_language_hints("hello world")
        assert "en" in hints

        # Spanish text
        hints = get_text_language_hints("hola mundo")
        assert "es" in hints

        # German text
        hints = get_text_language_hints("guten tag")
        assert "de" in hints

    def test_script_based_detection(self):
        """Test script-based language detection."""
        # Japanese (Hiragana/Katakana/Kanji)
        hints = get_text_language_hints("東京")
        assert "ja" in hints or "zh" in hints  # Could be Japanese or Chinese

        # Arabic script
        hints = get_text_language_hints("العربية")
        assert "ar" in hints

        # Cyrillic script
        hints = get_text_language_hints("Москва")
        assert "ru" in hints

    def test_mixed_scripts(self):
        """Test handling of mixed scripts."""
        hints = get_text_language_hints("hello 東京")
        assert len(hints) >= 1  # Should detect at least one language

    def test_ambiguous_text(self):
        """Test handling of ambiguous text."""
        # Numbers and symbols
        hints = get_text_language_hints("123456")
        assert isinstance(hints, list)  # Should return a list even if empty

        # Short text
        hints = get_text_language_hints("a")
        assert isinstance(hints, list)

    def test_confidence_ordering(self):
        """Test that a multi-word query yields a list of hints.

        NOTE(review): the ordering itself is not verified here, because the
        function exposes no confidence scores to this test. Strengthen this
        once the expected ranking contract is confirmed.
        """
        hints = get_text_language_hints("hello world how are you")
        assert isinstance(hints, list)


class TestNormalizeUnicode:
    """Test the normalize_unicode function."""

    def test_nfc_normalization(self):
        """Test NFC normalization (default)."""
        # Decomposed to composed
        text = "cafe\u0301"  # e + combining acute accent
        result = normalize_unicode(text)
        expected = "caf\u00e9"  # é as single character
        assert result == expected
        assert unicodedata.is_normalized('NFC', result)

    def test_nfd_normalization(self):
        """Test NFD normalization."""
        # Composed to decomposed
        text = "caf\u00e9"  # é as single character
        result = normalize_unicode(text, form='NFD')
        expected = "cafe\u0301"  # e + combining acute accent
        assert result == expected
        assert unicodedata.is_normalized('NFD', result)

    def test_case_folding(self):
        """Test case folding option."""
        text = "Hello WORLD"
        result = normalize_unicode(text, case_fold=True)
        assert result == "hello world"

        # Unicode case folding
        text = "İSTANBUL"  # Turkish capital I with dot
        result = normalize_unicode(text, case_fold=True)
        assert result.lower() == result

    def test_strip_accents(self):
        """Test accent stripping."""
        text = "café naïve résumé"
        result = normalize_unicode(text, strip_accents=True)
        assert result == "cafe naive resume"

        # Mixed text with accents
        text = "Hëllö Wörld"
        result = normalize_unicode(text, strip_accents=True)
        assert result == "Hello World"

    def test_combined_options(self):
        """Test combination of normalization options."""
        text = "CAFÉ"
        result = normalize_unicode(text, case_fold=True, strip_accents=True)
        assert result == "cafe"

    def test_different_scripts(self):
        """Test normalization of different scripts."""
        # Should not affect non-Latin scripts
        text = "東京"
        result = normalize_unicode(text, strip_accents=True)
        assert result == text  # Should be unchanged

        text = "Москва"
        result = normalize_unicode(text, case_fold=True)
        assert result == "москва"


class TestTruncateTextSafely:
    """Test the truncate_text_safely function."""

    def test_basic_truncation(self):
        """Test basic text truncation."""
        text = "hello world test"
        result = truncate_text_safely(text, 10)
        assert len(result) <= 10
        assert not result.endswith(" ")  # Should not end with space

    def test_word_boundary_preservation(self):
        """Test preservation of word boundaries."""
        text = "hello world test"

        result = truncate_text_safely(text, 12)  # Should fit "hello world"
        assert result == "hello world"

        result = truncate_text_safely(text, 8)  # Should only fit "hello"
        assert result == "hello"

    def test_truncation_indicator(self):
        """Test truncation indicator."""
        text = "hello world test"
        result = truncate_text_safely(text, 10, truncate_indicator="...")
        assert result.endswith("...")
        assert len(result) <= 10

    def test_no_truncation_needed(self):
        """Test when no truncation is needed."""
        text = "hello"
        result = truncate_text_safely(text, 10)
        assert result == text

    def test_edge_cases(self):
        """Test edge cases in truncation."""
        # Very short max_length
        text = "hello world"
        result = truncate_text_safely(text, 3)
        assert len(result) <= 3

        # Empty text
        result = truncate_text_safely("", 10)
        assert result == ""

        # Single word longer than max_length
        text = "supercalifragilisticexpialidocious"
        result = truncate_text_safely(text, 10)
        assert len(result) <= 10


class TestExceptionClasses:
    """Test custom exception classes."""

    def test_text_validation_error(self):
        """Test TextValidationError."""
        error = TextValidationError(
            "Invalid text", "hello world", "alphanumeric only"
        )
        assert "Invalid text" in str(error)
        assert error.text == "hello world"
        assert error.reason == "alphanumeric only"

    def test_uri_validation_error(self):
        """Test URIValidationError."""
        error = URIValidationError(
            "Invalid URI", "/invalid/uri", "missing component"
        )
        assert "Invalid URI" in str(error)
        assert error.uri == "/invalid/uri"
        assert error.reason == "missing component"


class TestCachingBehavior:
    """Test caching behavior of functions."""

    def test_normalize_concept_text_caching(self):
        """Test that normalize_concept_text uses caching."""
        text = "hello world test"

        # First call
        result1 = normalize_concept_text(text)

        # Second call should use cache
        result2 = normalize_concept_text(text)

        assert result1 == result2
        assert result1 is result2  # Should be same object due to caching

    def test_validate_language_code_caching(self):
        """Test that validate_language_code uses caching."""
        # This should not raise exception and should be cached
        validate_language_code("en")
        validate_language_code("en")  # Second call uses cache

    def test_cache_statistics(self):
        """Test that real cache statistics are exposed and updated.

        The statistics are read from the function itself rather than from a
        mock, so the test fails if the cache stops recording hits. It assumes
        cache_info() returns an object with a ``hits`` attribute.
        """
        text = "cache statistics probe"
        before = normalize_concept_text.cache_info()

        normalize_concept_text(text)
        normalize_concept_text(text)  # Identical call must be served from cache

        after = normalize_concept_text.cache_info()
        assert hasattr(after, 'hits')
        assert hasattr(after, 'misses')
        assert after.hits >= before.hits + 1


class TestSecurityConsiderations:
    """Test security-related aspects of text processing."""

    @pytest.mark.parametrize(
        "text",
        [
            # SQL injection attempts
            "'; DROP TABLE users; --",
            "admin'--",
            "1' OR '1'='1",
            # Script injection and path traversal attempts
            "<script>alert('xss')</script>",
            "../../../etc/passwd",
            "javascript:alert('xss')",
        ],
    )
    def test_injection_prevention(self, text):
        """Test prevention of injection attacks through text processing."""
        # Should not crash and should sanitize the text
        result = normalize_concept_text(text)
        assert isinstance(result, str)

        # Should not contain dangerous characters
        assert "'" not in result
        assert '"' not in result
        assert "<" not in result
        assert ">" not in result
        assert "/" not in result

    @pytest.mark.parametrize(
        "text",
        [
            "\uFEFF",  # Byte order mark
            "\u200B",  # Zero width space
            "\u202E",  # Right-to-left override
            "𝒂𝒅𝒎𝒊𝒏",  # Mathematical script letters that look like "admin"
        ],
    )
    def test_unicode_security(self, text):
        """Test security considerations with Unicode text."""
        # Should handle gracefully without crashing
        result = normalize_concept_text(text)
        assert isinstance(result, str)

    def test_length_bomb_prevention(self):
        """Test prevention of algorithmic complexity attacks."""
        # Very long input that could cause performance issues
        very_long_text = "a" * 10000

        # The output must be truncated to the default length limit
        result = normalize_concept_text(very_long_text)
        assert len(result) <= 255


if __name__ == "__main__":
    pytest.main([__file__])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/infinitnet/conceptnet-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_text_utils.py•29.8 KiB