Skip to main content
Glama
juanqui
by juanqui
test_markdown_parser.py7.54 kB
"""Tests for the Markdown parser implementation.""" import tempfile from pathlib import Path import pytest from src.pdfkb.parsers.parser_markdown import MarkdownParser class TestMarkdownParser: """Test suite for the Markdown parser.""" @pytest.fixture def parser(self): """Create a markdown parser instance.""" return MarkdownParser() @pytest.fixture def sample_markdown(self): """Sample markdown content for testing.""" return """--- title: Test Document author: Test Author date: 2024-01-01 tags: [test, markdown, parser] --- # Introduction This is a test document for the markdown parser. ## Section 1 Some content in section 1 with **bold** and *italic* text. ### Subsection 1.1 - List item 1 - List item 2 - List item 3 ## Section 2 ```python def hello_world(): print("Hello, World!") ``` A [link](https://example.com) to somewhere. """ @pytest.fixture def sample_markdown_no_frontmatter(self): """Sample markdown without frontmatter.""" return """# My Document Title This is a document without frontmatter. ## Content Section Some regular content here. """ @pytest.mark.asyncio async def test_parse_with_frontmatter(self, parser, sample_markdown): """Test parsing markdown with YAML frontmatter.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write(sample_markdown) temp_path = Path(f.name) try: result = await parser.parse(temp_path) # Check content is returned assert len(result.pages) > 0 assert result.pages[0].markdown_content is not None assert "# Introduction" in result.pages[0].markdown_content assert "This is a test document" in result.pages[0].markdown_content # Check frontmatter was parsed assert result.metadata["title"] == "Test Document" assert result.metadata["author"] == "Test Author" # YAML may parse date as date object or string assert str(result.metadata["date"]) == "2024-01-01" assert result.metadata["tags"] == ["test", "markdown", "parser"] # Check file metadata assert result.metadata["document_type"] == "markdown" assert result.metadata["source_filename"] == temp_path.name # Check statistics assert result.metadata["line_count"] > 0 assert result.metadata["word_count"] > 0 assert result.metadata["heading_count"] == 4 # h1, h2, h3 headers assert result.metadata["code_block_count"] == 1 assert result.metadata["link_count"] == 1 finally: temp_path.unlink() @pytest.mark.asyncio async def test_parse_without_frontmatter(self, sample_markdown_no_frontmatter): """Test parsing markdown without frontmatter.""" parser = MarkdownParser(config={"parse_frontmatter": True, "extract_title": True}) with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write(sample_markdown_no_frontmatter) temp_path = Path(f.name) try: result = await parser.parse(temp_path) # Check content assert "# My Document Title" in result.pages[0].markdown_content assert "This is a document without frontmatter" in result.pages[0].markdown_content # Check title extraction from H1 assert result.metadata.get("title") == "My Document Title" # Check no frontmatter metadata assert "author" not in result.metadata assert "date" not in result.metadata finally: temp_path.unlink() @pytest.mark.asyncio async def test_parse_with_disabled_frontmatter(self, sample_markdown): """Test parsing with frontmatter parsing disabled.""" parser = MarkdownParser(config={"parse_frontmatter": False}) with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write(sample_markdown) temp_path = Path(f.name) try: result = await parser.parse(temp_path) # Check frontmatter is included in content when parsing is disabled assert "---" in result.pages[0].markdown_content assert "title: Test Document" in result.pages[0].markdown_content # Check no frontmatter in metadata assert "title" not in result.metadata or result.metadata["title"] != "Test Document" assert "author" not in result.metadata finally: temp_path.unlink() @pytest.mark.asyncio async def test_parse_with_toml_frontmatter(self): """Test parsing markdown with TOML frontmatter.""" content = """+++ title = "TOML Document" author = "TOML Author" +++ # Content This uses TOML frontmatter. """ parser = MarkdownParser() with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write(content) temp_path = Path(f.name) try: result = await parser.parse(temp_path) # TOML parsing requires toml library, might not work without it # But content should still be returned assert "# Content" in result.pages[0].markdown_content assert "This uses TOML frontmatter" in result.pages[0].markdown_content finally: temp_path.unlink() @pytest.mark.asyncio async def test_parse_empty_file(self, parser): """Test parsing an empty markdown file.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write("") temp_path = Path(f.name) try: result = await parser.parse(temp_path) assert result.pages[0].markdown_content == "" assert result.metadata["line_count"] == 1 # Empty string splits to [''] assert result.metadata["word_count"] == 0 assert result.metadata["char_count"] == 0 finally: temp_path.unlink() @pytest.mark.asyncio async def test_parse_nonexistent_file(self, parser): """Test parsing a nonexistent file raises an error.""" nonexistent_path = Path("/tmp/nonexistent_file_12345.md") with pytest.raises(Exception): await parser.parse(nonexistent_path) def test_extract_title_from_content(self, parser): """Test title extraction from various content formats.""" # Test H1 extraction content1 = "# Main Title\nSome content" title1 = parser._extract_title_from_content(content1) assert title1 == "Main Title" # Test with formatting in title content2 = "# Title with **bold** and *italic*" title2 = parser._extract_title_from_content(content2) assert title2 == "Title with bold and italic" # Test with link in title content3 = "# Title with [link](url)" title3 = parser._extract_title_from_content(content3) assert title3 == "Title with link" # Test no H1 header content4 = "## Only H2\nSome content" title4 = parser._extract_title_from_content(content4) # Should return None when no H1 is present assert title4 is None # Test empty content content5 = "" title5 = parser._extract_title_from_content(content5) assert title5 is None

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server