Skip to main content
Glama
test_markdown_chunker.py (10.4 kB)
"""
Unit tests for MarkdownChunker.

@REQ: REQ-graphiti-chunk-mcp
@BP: BP-graphiti-chunk-mcp
@TASK: TASK-003-MarkdownChunker
"""

import pytest

from rbt_mcp_server.chunking.markdown_chunker import MarkdownChunker
from rbt_mcp_server.chunking.models import ChunkMetadata

# NOTE(review): the line breaks inside the Markdown fixtures below were
# reconstructed from a whitespace-collapsed copy of this file. The tests only
# rely on each "###" heading starting its own line; confirm the exact blank-line
# layout against the upstream file if it matters.


@pytest.fixture
def chunker():
    """
    Create a MarkdownChunker instance for testing.

    @REQ: REQ-graphiti-chunk-mcp
    @BP: BP-graphiti-chunk-mcp
    @TASK: TASK-003-MarkdownChunker
    """
    return MarkdownChunker()


class TestMarkdownChunker:
    """
    Test suite for MarkdownChunker.

    @REQ: REQ-graphiti-chunk-mcp
    @BP: BP-graphiti-chunk-mcp
    @TASK: TASK-003-MarkdownChunker
    """

    def test_chunk_with_multiple_h3_headings(self, chunker):
        """
        Test Case 1: Parse Markdown document with multiple h3 headings.

        @REQ: REQ-graphiti-chunk-mcp
        @BP: BP-graphiti-chunk-mcp
        @TASK: TASK-003-MarkdownChunker

        Given: Markdown document contains 3 ### headings
        When: Call chunk()
        Then: Return 3 chunks, each corresponding to one h3 section
        """
        # Arrange
        document_content = """# Main Title

Some introduction text.

### Introduction

This is the introduction section with some content.

### Implementation

This is the implementation section with code examples.

### Testing

This is the testing section with test cases.
"""
        project_id = "knowledge-smith"
        feature_id = "test-feature"
        doc_type = "Guide"
        file_path = "docs/guide.md"

        # Act
        chunks = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)

        # Assert
        assert len(chunks) == 3

        # Check first chunk
        assert chunks[0].metadata["section_title"] == "Introduction"
        assert chunks[0].metadata["section_id"] == "introduction"
        assert chunks[0].metadata["chunk_id"] == "knowledge-smith+test-feature+introduction"
        assert chunks[0].metadata["parent_document_id"] == "knowledge-smith+test-feature+Guide"
        assert "introduction section" in chunks[0].content.lower()

        # Check second chunk
        assert chunks[1].metadata["section_title"] == "Implementation"
        assert chunks[1].metadata["section_id"] == "implementation"
        assert chunks[1].metadata["chunk_id"] == "knowledge-smith+test-feature+implementation"
        assert "implementation section" in chunks[1].content.lower()

        # Check third chunk
        assert chunks[2].metadata["section_title"] == "Testing"
        assert chunks[2].metadata["section_id"] == "testing"
        assert chunks[2].metadata["chunk_id"] == "knowledge-smith+test-feature+testing"
        assert "testing section" in chunks[2].content.lower()

    def test_chunk_without_h3_headings(self, chunker):
        """
        Test Case 2: Handle Markdown document without h3 headings.

        @REQ: REQ-graphiti-chunk-mcp
        @BP: BP-graphiti-chunk-mcp
        @TASK: TASK-003-MarkdownChunker

        Given: Markdown document has no h3 headings
        When: Call chunk()
        Then: Return 1 chunk containing the entire document
        """
        # Arrange
        document_content = """# Main Title

This is a document without any h3 headings.

## Section 1

Some content here.

## Section 2

More content here.
"""
        project_id = "knowledge-smith"
        feature_id = None
        doc_type = "Guide"
        file_path = "docs/simple.md"

        # Act
        chunks = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)

        # Assert
        assert len(chunks) == 1
        assert chunks[0].metadata["chunk_id"] == "knowledge-smith+general+document"
        assert chunks[0].metadata["parent_document_id"] == "knowledge-smith+general+Guide"
        assert chunks[0].metadata["section_id"] is None
        assert chunks[0].metadata["section_title"] is None
        assert chunks[0].content == document_content

    # Parametrized so that every heading format is reported as its own test
    # case; a single loop would stop at the first failing heading.
    @pytest.mark.parametrize(
        ("heading", "expected_slug"),
        [
            ("Hello World!", "hello-world"),
            ("Implementation & Testing", "implementation-testing"),
            ("1. 任務目標與前置", "1-任務目標與前置"),
            ("API Reference: v2.0", "api-reference-v20"),
            # NOTE(review): the expected slug suggests this heading was meant
            # to contain a run of consecutive spaces that may have been
            # collapsed in this copy -- verify against the upstream file.
            ("Multiple Spaces", "multiple-spaces"),
            ("Trailing-Hyphen--", "trailing-hyphen"),
            ("UPPERCASE TEXT", "uppercase-text"),
            ("under_score_test", "under-score-test"),
        ],
    )
    def test_heading_slug_generation(self, chunker, heading, expected_slug):
        """
        Test Case 3: Verify heading_slug generation correctness.

        @REQ: REQ-graphiti-chunk-mcp
        @BP: BP-graphiti-chunk-mcp
        @TASK: TASK-003-MarkdownChunker

        Given: h3 heading contains special characters, spaces, and mixed case
        When: Generate slug
        Then: Slug is lowercase, hyphen-separated, with special characters removed
        """
        # Act
        slug = chunker._generate_slug(heading)

        # Assert
        assert slug == expected_slug, f"Failed for heading: {heading}"

    def test_chunk_id_stability(self, chunker):
        """
        Test Case 4: Verify chunk_id stability across multiple runs.

        @REQ: REQ-graphiti-chunk-mcp
        @BP: BP-graphiti-chunk-mcp
        @TASK: TASK-003-MarkdownChunker

        Given: Same Markdown document chunked multiple times
        When: Call chunk() multiple times
        Then: chunk_id for same heading remains unchanged
        """
        # Arrange
        document_content = """### First Section

Content for first section.

### Second Section

Content for second section.
"""
        project_id = "knowledge-smith"
        feature_id = "stable-test"
        doc_type = "Guide"
        file_path = "docs/stability.md"

        # Act - chunk the same document 3 times
        chunks_run1 = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)
        chunks_run2 = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)
        chunks_run3 = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)

        # Assert - all runs should produce identical chunk_ids
        assert len(chunks_run1) == 2
        assert len(chunks_run2) == 2
        assert len(chunks_run3) == 2

        # Check first chunk
        assert (
            chunks_run1[0].metadata["chunk_id"]
            == chunks_run2[0].metadata["chunk_id"]
            == chunks_run3[0].metadata["chunk_id"]
        )
        assert chunks_run1[0].metadata["chunk_id"] == "knowledge-smith+stable-test+first-section"

        # Check second chunk
        assert (
            chunks_run1[1].metadata["chunk_id"]
            == chunks_run2[1].metadata["chunk_id"]
            == chunks_run3[1].metadata["chunk_id"]
        )
        assert chunks_run1[1].metadata["chunk_id"] == "knowledge-smith+stable-test+second-section"

    def test_chunk_with_no_feature_id(self, chunker):
        """
        Test chunking general documents without feature_id.

        @REQ: REQ-graphiti-chunk-mcp
        @BP: BP-graphiti-chunk-mcp
        @TASK: TASK-003-MarkdownChunker

        Given: Document has no feature_id (general document)
        When: Call chunk()
        Then: chunk_id and parent_document_id use "general" as feature part
        """
        # Arrange
        document_content = """### Overview

This is a general document.
"""
        project_id = "knowledge-smith"
        feature_id = None
        doc_type = "Architecture"
        file_path = "docs/architecture/overview.md"

        # Act
        chunks = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)

        # Assert
        assert len(chunks) == 1
        assert chunks[0].metadata["chunk_id"] == "knowledge-smith+general+overview"
        assert chunks[0].metadata["parent_document_id"] == "knowledge-smith+general+Architecture"
        assert chunks[0].metadata["feature_id"] is None

    def test_chunk_preserves_content_boundaries(self, chunker):
        """
        Test that content boundaries are preserved correctly.

        @REQ: REQ-graphiti-chunk-mcp
        @BP: BP-graphiti-chunk-mcp
        @TASK: TASK-003-MarkdownChunker

        Given: Document with multiple h3 sections with distinct content
        When: Call chunk()
        Then: Each chunk contains only its section's content
        """
        # Arrange
        document_content = """### Section A

Content for A.
More A content.

### Section B

Content for B.
More B content.

### Section C

Content for C.
"""
        project_id = "test-project"
        feature_id = "test-feature"
        doc_type = "Doc"
        file_path = "test.md"

        # Act
        chunks = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)

        # Assert
        assert len(chunks) == 3

        # Section A should not contain Section B or C content
        assert "Content for A" in chunks[0].content
        assert "More A content" in chunks[0].content
        assert "Content for B" not in chunks[0].content
        assert "Content for C" not in chunks[0].content

        # Section B should not contain Section A or C content
        assert "Content for B" in chunks[1].content
        assert "More B content" in chunks[1].content
        assert "Content for A" not in chunks[1].content
        assert "Content for C" not in chunks[1].content

        # Section C should not contain Section A or B content
        assert "Content for C" in chunks[2].content
        assert "Content for A" not in chunks[2].content
        assert "Content for B" not in chunks[2].content

    def test_empty_document(self, chunker):
        """
        Test handling of empty document.

        @REQ: REQ-graphiti-chunk-mcp
        @BP: BP-graphiti-chunk-mcp
        @TASK: TASK-003-MarkdownChunker

        Given: Document is an empty string
        When: Call chunk()
        Then: Return 1 chunk whose content is the empty string
        """
        # Arrange
        document_content = ""
        project_id = "test-project"
        feature_id = "test-feature"
        doc_type = "Doc"
        file_path = "empty.md"

        # Act
        chunks = chunker.chunk(document_content, project_id, feature_id, doc_type, file_path)

        # Assert
        assert len(chunks) == 1
        assert chunks[0].content == ""

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/leo7nel23/KnowkedgeSmith-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.