Law Scrapper MCP

test_content_processor.py•10.1 KiB

"""Tests for ContentProcessor service.""" from __future__ import annotations from unittest.mock import MagicMock, patch from law_scrapper_mcp.services.content_processor import ContentProcessor, Section class TestHtmlToMarkdown: """Tests for HTML to Markdown conversion.""" def test_html_to_markdown_basic(self, content_processor: ContentProcessor): """Test basic HTML to Markdown conversion.""" html = "<h1>Title</h1><p>Paragraph text.</p>" md = content_processor.html_to_markdown(html) assert "# Title" in md assert "Paragraph text." in md def test_html_to_markdown_with_sample( self, content_processor: ContentProcessor, sample_act_html: str ): """Test HTML to Markdown with sample act HTML.""" md = content_processor.html_to_markdown(sample_act_html) assert "USTAWA z dnia 1 stycznia 2024 r." in md assert "Rozdział 1" in md assert "Art. 1." in md assert "Art. 2." in md assert "Art. 3." in md def test_html_to_markdown_strips_scripts(self, content_processor: ContentProcessor): """Test that script tags are stripped from output.""" html = "<h1>Title</h1><script>alert('test');</script><p>Text</p>" md = content_processor.html_to_markdown(html) # markdownify strips <script> tags but may leave text content assert "# Title" in md assert "Text" in md assert "<script>" not in md def test_html_to_markdown_normalizes_whitespace( self, content_processor: ContentProcessor ): """Test that excessive blank lines are removed.""" html = "<h1>Title</h1>\n\n\n\n<p>Text</p>" md = content_processor.html_to_markdown(html) # Should not have more than 2 consecutive newlines assert "\n\n\n" not in md def test_html_to_markdown_empty_input(self, content_processor: ContentProcessor): """Test HTML to Markdown with empty input.""" md = content_processor.html_to_markdown("") assert md == "" def test_html_to_markdown_with_lists(self, content_processor: ContentProcessor): """Test HTML to Markdown with ordered lists.""" html = "<ol><li>First item</li><li>Second item</li></ol>" md = 
content_processor.html_to_markdown(html) assert "First item" in md assert "Second item" in md class TestPdfToText: """Tests for PDF to text extraction.""" def test_pdf_to_text_empty_input(self, content_processor: ContentProcessor): """Test PDF to text with empty input.""" text = content_processor.pdf_to_text(b"") assert text == "" @patch("pdfplumber.open") def test_pdf_to_text_with_mock(self, mock_pdfplumber, content_processor: ContentProcessor): """Test PDF to text extraction with mocked pdfplumber.""" # Create mock PDF with pages mock_page1 = MagicMock() mock_page1.extract_text.return_value = "Page 1 text" mock_page2 = MagicMock() mock_page2.extract_text.return_value = "Page 2 text" mock_pdf = MagicMock() mock_pdf.__enter__.return_value.pages = [mock_page1, mock_page2] mock_pdfplumber.return_value = mock_pdf text = content_processor.pdf_to_text(b"fake pdf bytes") assert "Page 1 text" in text assert "Page 2 text" in text assert "\n\n" in text # Pages should be separated @patch("pdfplumber.open") def test_pdf_to_text_extraction_failure( self, mock_pdfplumber, content_processor: ContentProcessor ): """Test that PDF extraction failures are handled gracefully.""" mock_pdfplumber.side_effect = Exception("PDF parsing error") text = content_processor.pdf_to_text(b"invalid pdf") # Should return empty string on error, not raise assert text == "" @patch("pdfplumber.open") def test_pdf_to_text_no_text_on_page( self, mock_pdfplumber, content_processor: ContentProcessor ): """Test PDF with pages that have no text.""" mock_page = MagicMock() mock_page.extract_text.return_value = None mock_pdf = MagicMock() mock_pdf.__enter__.return_value.pages = [mock_page] mock_pdfplumber.return_value = mock_pdf text = content_processor.pdf_to_text(b"fake pdf") assert text == "" class TestIndexSections: """Tests for section indexing.""" def test_index_sections_with_markdown_headings( self, content_processor: ContentProcessor ): """Test indexing sections with standard Markdown headings.""" 
markdown = """# Chapter 1 Content of chapter 1 ## Section 1.1 Content of section 1.1 ## Section 1.2 Content of section 1.2""" sections = content_processor.index_sections(markdown) assert len(sections) >= 2 assert any(s.title == "Chapter 1" for s in sections) assert any("Section 1.1" in s.title for s in sections) def test_index_sections_with_art_pattern(self, content_processor: ContentProcessor): """Test indexing sections with Art. pattern.""" markdown = """Art. 1. First article content. Art. 2. Second article content. Art. 3. Third article content.""" sections = content_processor.index_sections(markdown) assert len(sections) >= 3 art_sections = [s for s in sections if s.title.startswith("Art.")] assert len(art_sections) >= 3 def test_index_sections_with_rozdzial_pattern( self, content_processor: ContentProcessor ): """Test indexing sections with Rozdział pattern.""" markdown = """Rozdział 1 General provisions Some content here. Rozdział 2 Special provisions More content here.""" sections = content_processor.index_sections(markdown) rozdzial_sections = [s for s in sections if "Rozdział" in s.title] assert len(rozdzial_sections) >= 2 def test_index_sections_with_sample_act( self, content_processor: ContentProcessor, sample_act_html: str ): """Test indexing sections with sample act HTML.""" markdown = content_processor.html_to_markdown(sample_act_html) sections = content_processor.index_sections(markdown) assert len(sections) > 0 # Should find Art. sections art_sections = [s for s in sections if "Art." 
in s.title] assert len(art_sections) >= 3 # Check section properties for section in sections: assert section.id assert section.title assert section.level > 0 assert section.start_pos >= 0 assert section.end_pos is None or section.end_pos > section.start_pos def test_index_sections_empty_markdown(self, content_processor: ContentProcessor): """Test indexing with empty markdown.""" sections = content_processor.index_sections("") assert sections == [] def test_index_sections_no_headings(self, content_processor: ContentProcessor): """Test indexing markdown with no headings.""" markdown = "Just plain text without any headings." sections = content_processor.index_sections(markdown) assert sections == [] def test_section_content_extraction(self, content_processor: ContentProcessor): """Test that section content is properly extracted.""" markdown = """# Section 1 Content for section 1. # Section 2 Content for section 2.""" sections = content_processor.index_sections(markdown) assert len(sections) >= 2 assert "Content for section 1" in sections[0].content if len(sections) > 1: assert "Content for section 2" in sections[1].content def test_section_id_generation(self, content_processor: ContentProcessor): """Test that section IDs are properly generated.""" markdown = """# Test Section With Spaces Content here. Art. 123. Article with number.""" sections = content_processor.index_sections(markdown) for section in sections: # IDs should have underscores instead of spaces assert " " not in section.id # IDs should be limited in length assert len(section.id) <= 50 def test_section_levels(self, content_processor: ContentProcessor): """Test that section levels are correctly assigned.""" markdown = """# Level 1 ## Level 2 ### Level 3 Art. 1. Article (should be level 2) Rozdział 1 Chapter (should be level 1)""" sections = content_processor.index_sections(markdown) # Find specific sections and check their levels art_sections = [s for s in sections if "Art." 
in s.title] if art_sections: assert art_sections[0].level == 2 rozdzial_sections = [s for s in sections if "Rozdział" in s.title] if rozdzial_sections: assert rozdzial_sections[0].level == 1 def test_section_with_dział_pattern(self, content_processor: ContentProcessor): """Test indexing sections with DZIAŁ pattern.""" markdown = """DZIAŁ I General Part Content of part 1. DZIAŁ II Special Part Content of part 2.""" sections = content_processor.index_sections(markdown) dział_sections = [s for s in sections if "DZIAŁ" in s.title] assert len(dział_sections) >= 2 assert dział_sections[0].level == 1 class TestSection: """Tests for Section dataclass.""" def test_section_creation(self): """Test creating a Section instance.""" section = Section( id="art_1", title="Art. 1.", level=2, start_pos=100, end_pos=200, content="Content here", ) assert section.id == "art_1" assert section.title == "Art. 1." assert section.level == 2 assert section.start_pos == 100 assert section.end_pos == 200 assert section.content == "Content here" def test_section_optional_fields(self): """Test Section with optional fields.""" section = Section( id="test", title="Test", level=1, start_pos=0, ) assert section.end_pos is None assert section.content == ""

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/numikel/law-scrapper-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_content_processor.py•10.1 KiB