OpenZIM MCP Server

Overview Schema Related Servers Score Discussions

openzim-mcp
tests

test_content_processor.py•12.3 KiB

""" Tests for content processor module. """ from openzim_mcp.content_processor import ContentProcessor class TestContentProcessor: """Test ContentProcessor class.""" def test_html_to_plain_text( self, content_processor: ContentProcessor, sample_html: str ): """Test HTML to plain text conversion.""" result = content_processor.html_to_plain_text(sample_html) # Should contain main content assert "Main Title" in result assert "first paragraph" in result assert "bold text" in result # Should not contain unwanted elements assert "alert('test')" not in result assert "Edit section" not in result assert "Footer content" not in result def test_html_to_plain_text_empty(self, content_processor: ContentProcessor): """Test HTML to plain text with empty input.""" result = content_processor.html_to_plain_text("") assert result == "" def test_html_to_plain_text_invalid_html(self, content_processor: ContentProcessor): """Test HTML to plain text with invalid HTML.""" result = content_processor.html_to_plain_text("<invalid>test</invalid>") assert "test" in result def test_create_snippet_short_content(self, content_processor: ContentProcessor): """Test creating snippet from short content.""" content = "This is a short piece of content." result = content_processor.create_snippet(content) assert result == content def test_create_snippet_long_content(self, content_processor: ContentProcessor): """Test creating snippet from long content.""" content = "a" * 200 # Longer than snippet_length (100) result = content_processor.create_snippet(content) assert len(result) <= 103 # 100 + "..." assert result.endswith("...") def test_create_snippet_multiple_paragraphs( self, content_processor: ContentProcessor ): """Test creating snippet from multiple paragraphs.""" content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph." 
result = content_processor.create_snippet(content, max_paragraphs=2) assert "First paragraph" in result assert "Second paragraph" in result assert "Third paragraph" not in result def test_truncate_content_short(self, content_processor: ContentProcessor): """Test truncating short content.""" content = "Short content" result = content_processor.truncate_content(content, 100) assert result == content def test_truncate_content_long(self, content_processor: ContentProcessor): """Test truncating long content.""" content = "a" * 200 result = content_processor.truncate_content(content, 100) assert len(result) > 100 # Includes truncation message assert "Content truncated" in result assert "200 characters" in result def test_process_mime_content_html(self, content_processor: ContentProcessor): """Test processing HTML MIME content.""" html_bytes = b"<html><body><h1>Test</h1></body></html>" result = content_processor.process_mime_content(html_bytes, "text/html") assert "Test" in result assert "<html>" not in result def test_process_mime_content_plain_text(self, content_processor: ContentProcessor): """Test processing plain text MIME content.""" text_bytes = b"Plain text content" result = content_processor.process_mime_content(text_bytes, "text/plain") assert result == "Plain text content" def test_process_mime_content_image(self, content_processor: ContentProcessor): """Test processing image MIME content.""" image_bytes = b"fake image data" result = content_processor.process_mime_content(image_bytes, "image/png") assert "Image content - Cannot display directly" in result def test_process_mime_content_unsupported( self, content_processor: ContentProcessor ): """Test processing unsupported MIME content.""" data_bytes = b"binary data" result = content_processor.process_mime_content( data_bytes, "application/octet-stream" ) assert "Unsupported content type" in result def test_html_to_plain_text_exception_handling( self, content_processor: ContentProcessor ): """Test 
html_to_plain_text exception handling.""" # Test with malformed HTML that might cause parsing issues malformed_html = "<html><body><div><p>Unclosed tags" # This should not raise an exception, but handle it gracefully result = content_processor.html_to_plain_text(malformed_html) assert "Unclosed tags" in result def test_process_mime_content_exception_handling( self, content_processor: ContentProcessor ): """Test process_mime_content exception handling.""" from unittest.mock import patch # Mock the html_to_plain_text method to raise an exception with patch.object( content_processor, "html_to_plain_text", side_effect=Exception("Test error") ): result = content_processor.process_mime_content( b"<html>test</html>", "text/html" ) assert "Error processing content" in result def test_create_snippet_exception_handling( self, content_processor: ContentProcessor ): """Test create_snippet exception handling.""" from unittest.mock import patch # Mock re.sub to raise an exception with patch( "openzim_mcp.content_processor.re.sub", side_effect=Exception("Test error") ): result = content_processor.create_snippet("test content") # Should return original content when exception occurs (line 92) assert result == "test content" def test_extract_html_structure_exception_handling( self, content_processor: ContentProcessor ): """Test extract_html_structure exception handling.""" from unittest.mock import patch # Test with content that causes an exception during processing with patch( "openzim_mcp.content_processor.BeautifulSoup", side_effect=Exception("Parse error"), ): result = content_processor.extract_html_structure( "<html><body>test</body></html>" ) # Should return basic structure when exception occurs assert "headings" in result assert "sections" in result def test_extract_html_links_exception_handling( self, content_processor: ContentProcessor ): """Test extract_html_links exception handling.""" from unittest.mock import patch # Test with content that causes an exception during 
link extraction with patch( "openzim_mcp.content_processor.BeautifulSoup", side_effect=Exception("Parse error"), ): result = content_processor.extract_html_links( "<html><body><a href='test'>link</a></body></html>" ) # Should return empty structure when exception occurs assert "internal_links" in result assert "external_links" in result def test_extract_html_structure(self, content_processor: ContentProcessor): """Test HTML structure extraction.""" html_content = """ <html> <head> <title>Test Article</title> <meta name="description" content="Test description"> </head> <body> <h1 id="intro">Introduction</h1> <p>This is the introduction paragraph.</p> <h2>Section 1</h2> <p>Content of section 1 with multiple words.</p> <h3>Subsection 1.1</h3> <p>Subsection content here.</p> <h2>Section 2</h2> <p>Content of section 2.</p> </body> </html> """ structure = content_processor.extract_html_structure(html_content) # Check basic structure assert "headings" in structure assert "sections" in structure assert "metadata" in structure assert "word_count" in structure # Check headings headings = structure["headings"] assert len(headings) == 4 assert headings[0]["level"] == 1 assert headings[0]["text"] == "Introduction" assert headings[0]["id"] == "intro" assert headings[1]["level"] == 2 assert headings[1]["text"] == "Section 1" # Check sections sections = structure["sections"] assert len(sections) > 0 assert any("Introduction" in section["title"] for section in sections) # Check metadata metadata = structure["metadata"] assert "description" in metadata assert metadata["description"] == "Test description" # Check word count assert structure["word_count"] > 0 def test_extract_html_structure_empty(self, content_processor: ContentProcessor): """Test HTML structure extraction with empty content.""" structure = content_processor.extract_html_structure("") assert "headings" in structure assert "sections" in structure assert "metadata" in structure assert "word_count" in structure assert 
structure["word_count"] == 0 def test_extract_html_links(self, content_processor: ContentProcessor): """Test HTML link extraction.""" html_content = """ <html> <body> <p>Internal link: <a href="C/Other_Article" title="Other Article"> Link to other article</a></p> <p>External link: <a href="https://example.com">Example website</a></p> <p>Anchor link: <a href="#section1">Go to section 1</a></p> <img src="I/image.jpg" alt="Test image" title="Image title"> <video src="M/video.mp4">Video content</video> <audio src="M/audio.mp3">Audio content</audio> </body> </html> """ links_data = content_processor.extract_html_links(html_content) # Check basic structure assert "internal_links" in links_data assert "external_links" in links_data assert "media_links" in links_data # Check internal links internal_links = links_data["internal_links"] assert ( len(internal_links) >= 2 ) # Should have the internal article link and anchor link # Find the internal article link article_link = next( (link for link in internal_links if "Other_Article" in link["url"]), None ) assert article_link is not None assert article_link["text"] == "Link to other article" assert article_link["title"] == "Other Article" assert article_link["type"] == "internal" # Find the anchor link anchor_link = next( (link for link in internal_links if link["url"].startswith("#")), None ) assert anchor_link is not None assert anchor_link["type"] == "anchor" # Check external links external_links = links_data["external_links"] assert len(external_links) >= 1 example_link = next( (link for link in external_links if "example.com" in link["url"]), None ) assert example_link is not None assert example_link["domain"] == "example.com" # Check media links media_links = links_data["media_links"] assert len(media_links) >= 3 # image, video, audio # Check for image image_link = next( (link for link in media_links if link["type"] == "image"), None ) assert image_link is not None assert "image.jpg" in image_link["url"] assert 
image_link["alt"] == "Test image" assert image_link["title"] == "Image title" def test_extract_html_links_empty(self, content_processor: ContentProcessor): """Test HTML link extraction with empty content.""" links_data = content_processor.extract_html_links("") assert "internal_links" in links_data assert "external_links" in links_data assert "media_links" in links_data assert len(links_data["internal_links"]) == 0 assert len(links_data["external_links"]) == 0 assert len(links_data["media_links"]) == 0

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cameronrye/openzim-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_content_processor.py•12.3 KiB