Markdown RAG Documentation

Overview Schema Related Servers Score Discussions

test_markdown_parser.py•9.57 KiB

"""Tests for MarkdownParser: frontmatter, encodings, wikilinks, transclusions, tags."""

from datetime import datetime

import pytest

from src.models import Document
from src.parsers.markdown import MarkdownParser


def test_parse_simple_markdown(tmp_path):
    """
    Verify parser returns a Document for a plain markdown file.
    Checks id (file stem), content, file path and modified time.
    """
    md_file = tmp_path / "test.md"
    md_file.write_text("# Hello World\n\nThis is a test.")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert isinstance(doc, Document)
    assert doc.id == "test"
    assert "Hello World" in doc.content
    assert "This is a test" in doc.content
    assert doc.file_path == str(md_file)
    assert isinstance(doc.modified_time, datetime)


def test_parse_with_yaml_frontmatter(tmp_path):
    """
    Verify parser splits YAML frontmatter from the body.
    Frontmatter keys land in metadata/tags and the delimiters are stripped
    from the content.
    """
    md_file = tmp_path / "note.md"
    md_file.write_text("""---
title: My Note
author: Test User
aliases:
  - alias1
  - alias2
tags:
  - tag1
  - tag2
---

# Content

Body text here.
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "note"
    assert "# Content" in doc.content
    assert "Body text here" in doc.content
    assert "---" not in doc.content
    assert doc.metadata.get("title") == "My Note"
    assert doc.metadata.get("author") == "Test User"
    assert doc.tags == ["tag1", "tag2"]


def test_parse_aliases_from_frontmatter(tmp_path):
    """
    Verify parser accepts a frontmatter alias list without error.
    Only the resulting document id is checked.
    """
    md_file = tmp_path / "doc.md"
    md_file.write_text("""---
aliases:
  - first-alias
  - second-alias
---

Content here.
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "doc"


def test_parse_malformed_yaml_frontmatter(tmp_path):
    """
    Verify parser tolerates invalid YAML in the frontmatter.
    Metadata is expected to be empty while the body content is kept.
    """
    md_file = tmp_path / "bad.md"
    # The unclosed "[" makes the frontmatter invalid YAML.
    md_file.write_text("""---
title: Unclosed String
bad_yaml: [this is not valid
---

Content here.
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "bad"
    assert doc.metadata == {}
    assert "Content here" in doc.content


def test_parse_missing_file():
    """
    Verify parser raises FileNotFoundError for a nonexistent path.
    The error message must contain "File not found".
    """
    parser = MarkdownParser()

    with pytest.raises(FileNotFoundError, match="File not found"):
        parser.parse("/nonexistent/file.md")


def test_parse_no_frontmatter(tmp_path):
    """
    Verify parser handles files without any frontmatter.
    Metadata is empty and the full text is available as content.
    """
    md_file = tmp_path / "plain.md"
    md_file.write_text("Just plain markdown content.\n\nNo frontmatter here.")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "plain"
    assert doc.metadata == {}
    assert "Just plain markdown content" in doc.content
    assert "No frontmatter here" in doc.content


def test_parse_empty_frontmatter(tmp_path):
    """
    Verify parser handles a frontmatter block with no keys.
    Metadata is empty and the body after the delimiters is kept.
    """
    md_file = tmp_path / "empty.md"
    md_file.write_text("""---
---

Content after empty frontmatter.
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "empty"
    assert doc.metadata == {}
    assert "Content after empty frontmatter" in doc.content


def test_parse_single_alias_as_string(tmp_path):
    """
    Verify parser accepts `aliases` given as a scalar string instead of a list.
    Only the resulting document id is checked.
    """
    md_file = tmp_path / "single.md"
    md_file.write_text("""---
aliases: single-alias
---

Content.
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "single"


def test_parse_single_tag_as_string(tmp_path):
    """
    Verify parser normalizes a scalar `tags` value into a one-element list.
    """
    md_file = tmp_path / "tag.md"
    md_file.write_text("""---
tags: single-tag
---

Content.
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "tag"
    assert doc.tags == ["single-tag"]


def test_modified_time_reflects_file_stat(tmp_path):
    """
    Verify doc.modified_time matches the file's st_mtime to within one second.
    """
    md_file = tmp_path / "time.md"
    md_file.write_text("Content")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    file_mtime = datetime.fromtimestamp(md_file.stat().st_mtime)
    assert abs((doc.modified_time - file_mtime).total_seconds()) < 1


def test_parse_empty_file(tmp_path):
    """
    Verify parser handles empty (0-byte) markdown files gracefully.
    Prevents crashes when indexing directories with placeholder files.
    """
    md_file = tmp_path / "empty.md"
    md_file.write_text("")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "empty"
    assert doc.content == ""
    assert doc.metadata == {}
    assert doc.tags == []


def test_parse_unicode_content(tmp_path):
    """
    Verify parser correctly handles unicode characters (emoji, CJK, accents).
    Ensures international content and modern markdown (emoji) are preserved.
    """
    md_file = tmp_path / "unicode.md"
    md_file.write_text("""---
title: Résumé 📝
author: 中文用户
---

# Hello 世界 🌍

Content with émojis 🎉 and àccénts.
""", encoding="utf-8")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "unicode"
    assert "世界" in doc.content
    assert "🌍" in doc.content
    assert "🎉" in doc.content
    assert "àccénts" in doc.content
    assert doc.metadata["title"] == "Résumé 📝"
    assert doc.metadata["author"] == "中文用户"


def test_parse_latin1_encoding(tmp_path):
    """
    Verify parser handles files with latin-1 encoding gracefully.
    Falls back to alternative encodings when UTF-8 fails.
    """
    md_file = tmp_path / "latin1.md"
    # Write content with latin-1 specific bytes that aren't valid UTF-8
    content = "# Test Document\n\nCafé with extended chars: \xe9\xe0\xfc"
    md_file.write_bytes(content.encode("latin-1"))

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "latin1"
    assert "Test Document" in doc.content
    # Content should be decoded successfully with latin-1 fallback
    assert "Café" in doc.content or "Caf" in doc.content


def test_extract_wikilinks_both_forms(tmp_path):
    """
    Verify parser extracts wikilinks in both [[Note]] and [[Note|Display]] forms.
    Ensures graph edges are created for all link types while excluding transclusions.
    """
    md_file = tmp_path / "links.md"
    md_file.write_text("""# Document with Links

Standard link: [[Target Note]]
Link with display text: [[Another Note|Custom Display]]
Transclusion should be ignored: ![[Embedded Note]]
Multiple references: [[Target Note]] and [[Third Note]]
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "links"
    assert set(doc.links) == {"Target Note", "Another Note", "Third Note"}
    assert "Embedded Note" not in doc.links


def test_extract_transclusions(tmp_path):
    """
    Verify parser extracts transclusions (![[Note]]) as separate metadata.
    Ensures embedded content relationships are tracked in graph without duplication.
    """
    md_file = tmp_path / "transclusions.md"
    md_file.write_text("""# Document with Transclusions

Embed this: ![[Template Note]]
Regular link: [[Reference Note]]
Another embed: ![[Snippet]]
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "transclusions"
    transclusions = doc.metadata.get("transclusions", [])
    assert isinstance(transclusions, list)
    assert set(transclusions) == {"Template Note", "Snippet"}
    assert "Template Note" not in doc.links
    assert "Reference Note" in doc.links


def test_extract_inline_tags(tmp_path):
    """
    Verify parser extracts inline tags (#tag) from markdown content.
    Ensures content-based categorization works alongside frontmatter tags.
    """
    md_file = tmp_path / "inline_tags.md"
    md_file.write_text("""# Document with Tags

This is about #programming and #python specifically.
Also covers #data-science topics.

#machine-learning is mentioned at line start.
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "inline_tags"
    assert set(doc.tags) == {"data-science", "machine-learning", "programming", "python"}


def test_code_block_exclusion_for_links_and_tags(tmp_path):
    """
    Verify parser excludes links and tags inside code blocks from extraction.
    Prevents false positives from code examples containing markdown-like syntax.
    """
    md_file = tmp_path / "code_blocks.md"
    md_file.write_text("""# Code Examples

This has a real link: [[Actual Note]]

```markdown
This is an example: [[Fake Link]]
Use #fake-tag in examples.
```

Real inline tag: #real-tag

Inline code: `[[Not A Link]]` and `#not-a-tag`
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "code_blocks"
    assert doc.links == ["Actual Note"]
    assert "Fake Link" not in doc.links
    assert "Not A Link" not in doc.links
    assert doc.tags == ["real-tag"]
    assert "fake-tag" not in doc.tags
    assert "not-a-tag" not in doc.tags


def test_frontmatter_and_inline_tags_combination(tmp_path):
    """
    Verify parser merges frontmatter and inline tags into unified sorted list.
    Ensures complete tag coverage from both metadata and content sources.
    """
    md_file = tmp_path / "combined_tags.md"
    md_file.write_text("""---
tags:
  - yaml-tag
  - metadata-tag
---

# Content

This mentions #content-tag and #inline-tag.
Reference to #yaml-tag again (should not duplicate).
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "combined_tags"
    assert doc.tags == ["content-tag", "inline-tag", "metadata-tag", "yaml-tag"]
    assert len(doc.tags) == 4


def test_malformed_wikilinks_edge_cases(tmp_path):
    """
    Verify parser handles malformed wikilinks gracefully without crashes.
    Ensures robustness against user typos and incomplete markdown syntax.
    """
    md_file = tmp_path / "malformed.md"
    md_file.write_text("""# Malformed Links

Unclosed: [[Unclosed Link
Empty: [[]]
Nested start: [[[Note]]]
Valid: [[Proper Link]]
Missing second bracket: [[Missing]
""")

    parser = MarkdownParser()
    doc = parser.parse(str(md_file))

    assert doc.id == "malformed"
    assert "Proper Link" in doc.links
    assert len([link for link in doc.links if link.strip()]) >= 1

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andnp/ragdocs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_markdown_parser.py•9.57 KiB