Code-Index-MCP

test_malformed_documents.py•7.45 KiB

"""Test cases for handling malformed and corrupted documents.""" import tempfile from pathlib import Path import pytest from mcp_server.plugins.markdown_plugin.plugin import MarkdownPlugin from mcp_server.plugins.plaintext_plugin.plugin import PlaintextPlugin from mcp_server.storage.sqlite_store import SQLiteStore from tests.test_utils import create_malformed_content class TestMalformedDocuments: """Test suite for handling various types of malformed documents.""" @pytest.fixture def temp_db(self): """Create a temporary database for testing.""" with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: db_path = f.name store = SQLiteStore(db_path) yield store Path(db_path).unlink(missing_ok=True) @pytest.fixture def markdown_plugin(self, temp_db): """Create a markdown plugin instance.""" return MarkdownPlugin(sqlite_store=temp_db) @pytest.fixture def plaintext_plugin(self, temp_db): """Create a plaintext plugin instance.""" return PlaintextPlugin(sqlite_store=temp_db) def test_invalid_yaml_frontmatter(self, markdown_plugin, tmp_path): """Test handling of documents with invalid YAML frontmatter.""" # Create file with malformed YAML content = create_malformed_content("invalid_yaml") test_file = tmp_path / "invalid_yaml.md" test_file.write_text(content, encoding="utf-8") # Index the file - should handle gracefully result = markdown_plugin.indexFile(str(test_file)) # Should still extract content despite invalid frontmatter assert result is not None assert "chunks" in result assert len(result["chunks"]) > 0 assert result["chunks"][0]["content"].strip() == "# Content" # Should have error metadata assert "metadata" in result assert "parse_errors" in result["metadata"] assert "invalid_yaml" in str(result["metadata"]["parse_errors"]) def test_incomplete_code_blocks(self, markdown_plugin, tmp_path): """Test handling of documents with unclosed code blocks.""" content = create_malformed_content("incomplete_code") test_file = tmp_path / "incomplete_code.md" 
test_file.write_text(content, encoding="utf-8") result = markdown_plugin.indexFile(str(test_file)) # Should handle incomplete code blocks assert result is not None assert "chunks" in result # Should capture the content up to the incomplete block assert any("# Document" in chunk["content"] for chunk in result["chunks"]) # Should note the parsing issue assert "metadata" in result assert result["metadata"].get("has_incomplete_blocks", False) def test_binary_content_in_text(self, plaintext_plugin, tmp_path): """Test handling of files with embedded binary content.""" content = create_malformed_content("binary_content") test_file = tmp_path / "binary_content.txt" test_file.write_bytes(content.encode("utf-8", errors="surrogateescape")) result = plaintext_plugin.indexFile(str(test_file)) # Should handle binary content gracefully assert result is not None assert "chunks" in result # Should skip or sanitize binary portions text_content = " ".join(chunk["content"] for chunk in result["chunks"]) assert "Document" in text_content assert "\x00" not in text_content # Binary should be filtered def test_corrupted_file_encoding(self, plaintext_plugin, tmp_path): """Test handling of files with encoding issues.""" # Create file with mixed encodings test_file = tmp_path / "corrupted_encoding.txt" # Write content with different encodings with open(test_file, "wb") as f: f.write("UTF-8 content: Hello\n".encode("utf-8")) f.write("Latin-1 content: café\n".encode("latin-1")) f.write("Invalid UTF-8: \xff\xfe\n".encode("latin-1")) result = plaintext_plugin.indexFile(str(test_file)) # Should handle encoding errors assert result is not None assert "chunks" in result assert len(result["chunks"]) > 0 # Should have encoding error metadata assert "metadata" in result assert result["metadata"].get("encoding_errors", 0) > 0 def test_excessively_nested_structure(self, markdown_plugin, tmp_path): """Test handling of documents with excessive nesting.""" # Create deeply nested markdown content = "# Level 
1\n" for i in range(2, 20): # Create 19 levels of nesting content += f"{'#' * i} Level {i}\n" content += f"Content at level {i}\n\n" test_file = tmp_path / "deeply_nested.md" test_file.write_text(content, encoding="utf-8") result = markdown_plugin.indexFile(str(test_file)) # Should handle deep nesting without crashing assert result is not None assert "chunks" in result assert len(result["chunks"]) > 0 # Should cap nesting at reasonable level assert "metadata" in result max_depth = result["metadata"].get("max_heading_depth", 0) assert max_depth <= 10 # Should cap at reasonable depth def test_invalid_file_permissions(self, plaintext_plugin, tmp_path): """Test handling of files with permission issues.""" test_file = tmp_path / "no_read_permission.txt" test_file.write_text("Content that can't be read", encoding="utf-8") # Change permissions to write-only test_file.chmod(0o200) try: result = plaintext_plugin.indexFile(str(test_file)) # Should handle permission errors gracefully assert result is not None assert result.get("error") is not None assert "permission" in result["error"].lower() finally: # Restore permissions for cleanup test_file.chmod(0o644) def test_circular_references(self, markdown_plugin, tmp_path): """Test handling of documents with circular references.""" content = create_malformed_content("circular_reference") test_file = tmp_path / "circular_refs.md" test_file.write_text(content, encoding="utf-8") result = markdown_plugin.indexFile(str(test_file)) # Should handle circular references without infinite loops assert result is not None assert "chunks" in result # Should detect and note circular references assert "metadata" in result assert result["metadata"].get("has_circular_refs", False) def test_truncated_file(self, markdown_plugin, tmp_path): """Test handling of truncated/incomplete files.""" # Create a file that appears truncated content = """# Document Title This is the beginning of a document that gets cut off... 
## Section 1 Some content here ## Section 2 This section is incomple""" # Truncated mid-word test_file = tmp_path / "truncated.md" test_file.write_text(content, encoding="utf-8") result = markdown_plugin.indexFile(str(test_file)) # Should handle truncated files assert result is not None assert "chunks" in result assert len(result["chunks"]) > 0 # Should process what's available content_joined = " ".join(chunk["content"] for chunk in result["chunks"]) assert "Document Title" in content_joined assert "Section 1" in content_joined

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ViperJuice/Code-Index-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_malformed_documents.py•7.45 KiB