Skip to main content
Glama
test_content_processor.py (12.6 kB)
"""Tests for content processor module."""

from unittest.mock import patch

from openzim_mcp.content_processor import ContentProcessor


class TestContentProcessor:
    """Test ContentProcessor class.

    The ``content_processor`` and ``sample_html`` arguments are pytest
    fixtures defined outside this module (presumably in ``conftest.py`` --
    confirm there for the configured snippet length and sample markup).
    """

    # ------------------------------------------------------------------
    # html_to_plain_text
    # ------------------------------------------------------------------

    def test_html_to_plain_text(
        self, content_processor: ContentProcessor, sample_html: str
    ):
        """Test HTML to plain text conversion."""
        result = content_processor.html_to_plain_text(sample_html)

        # Should contain main content
        assert "Main Title" in result
        assert "first paragraph" in result
        assert "bold text" in result

        # Should not contain unwanted elements
        assert "alert('test')" not in result
        assert "Edit section" not in result
        assert "Footer content" not in result

    def test_html_to_plain_text_empty(self, content_processor: ContentProcessor):
        """Test HTML to plain text with empty input."""
        result = content_processor.html_to_plain_text("")
        assert result == ""

    def test_html_to_plain_text_invalid_html(
        self, content_processor: ContentProcessor
    ):
        """Test HTML to plain text with invalid HTML."""
        result = content_processor.html_to_plain_text("<invalid>test</invalid>")
        assert "test" in result

    # ------------------------------------------------------------------
    # create_snippet
    # ------------------------------------------------------------------

    def test_create_snippet_short_content(self, content_processor: ContentProcessor):
        """Test creating snippet from short content."""
        content = "This is a short piece of content."
        result = content_processor.create_snippet(content)
        assert result == content

    def test_create_snippet_long_content(self, content_processor: ContentProcessor):
        """Test creating snippet from long content."""
        content = "a" * 200  # Longer than snippet_length (100)
        result = content_processor.create_snippet(content)
        assert len(result) <= 103  # 100 + "..."
        assert result.endswith("...")

    def test_create_snippet_multiple_paragraphs(
        self, content_processor: ContentProcessor
    ):
        """Test creating snippet from multiple paragraphs."""
        content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        result = content_processor.create_snippet(content, max_paragraphs=2)

        assert "First paragraph" in result
        assert "Second paragraph" in result
        assert "Third paragraph" not in result

    # ------------------------------------------------------------------
    # truncate_content
    # ------------------------------------------------------------------

    def test_truncate_content_short(self, content_processor: ContentProcessor):
        """Test truncating short content."""
        content = "Short content"
        result = content_processor.truncate_content(content, 100)
        assert result == content

    def test_truncate_content_long(self, content_processor: ContentProcessor):
        """Test truncating long content."""
        content = "a" * 200
        result = content_processor.truncate_content(content, 100)

        assert len(result) > 100  # Includes truncation message
        assert "Content truncated" in result
        assert "200 characters" in result

    # ------------------------------------------------------------------
    # process_mime_content
    # ------------------------------------------------------------------

    def test_process_mime_content_html(self, content_processor: ContentProcessor):
        """Test processing HTML MIME content."""
        html_bytes = b"<html><body><h1>Test</h1></body></html>"
        result = content_processor.process_mime_content(html_bytes, "text/html")

        assert "Test" in result
        assert "<html>" not in result

    def test_process_mime_content_plain_text(
        self, content_processor: ContentProcessor
    ):
        """Test processing plain text MIME content."""
        text_bytes = b"Plain text content"
        result = content_processor.process_mime_content(text_bytes, "text/plain")
        assert result == "Plain text content"

    def test_process_mime_content_image(self, content_processor: ContentProcessor):
        """Test processing image MIME content."""
        image_bytes = b"fake image data"
        result = content_processor.process_mime_content(image_bytes, "image/png")
        assert "Image content - Cannot display directly" in result

    def test_process_mime_content_unsupported(
        self, content_processor: ContentProcessor
    ):
        """Test processing unsupported MIME content."""
        data_bytes = b"binary data"
        result = content_processor.process_mime_content(
            data_bytes, "application/octet-stream"
        )
        assert "Unsupported content type" in result

    # ------------------------------------------------------------------
    # Exception handling
    # ------------------------------------------------------------------

    def test_html_to_plain_text_exception_handling(
        self, content_processor: ContentProcessor
    ):
        """Test html_to_plain_text exception handling."""
        # Test with malformed HTML that might cause parsing issues
        malformed_html = "<html><body><div><p>Unclosed tags"

        # This should not raise an exception, but handle it gracefully
        result = content_processor.html_to_plain_text(malformed_html)
        assert "Unclosed tags" in result

    def test_process_mime_content_exception_handling(
        self, content_processor: ContentProcessor
    ):
        """Test process_mime_content exception handling."""
        # Mock the html_to_plain_text method to raise an exception
        with patch.object(
            content_processor,
            "html_to_plain_text",
            side_effect=Exception("Test error"),
        ):
            result = content_processor.process_mime_content(
                b"<html>test</html>", "text/html"
            )

        assert "Error processing content" in result

    def test_create_snippet_exception_handling(
        self, content_processor: ContentProcessor
    ):
        """Test create_snippet exception handling."""
        # Mock re.sub to raise an exception. The target resolves to the `re`
        # object referenced by content_processor (presumably the shared
        # stdlib module), so the patch is kept as narrow as possible and the
        # assertion runs only after it has been undone.
        with patch(
            "openzim_mcp.content_processor.re.sub",
            side_effect=Exception("Test error"),
        ):
            result = content_processor.create_snippet("test content")

        # Should return original content when exception occurs
        assert result == "test content"

    def test_extract_html_structure_exception_handling(
        self, content_processor: ContentProcessor
    ):
        """Test extract_html_structure exception handling."""
        # Test with content that causes an exception during processing
        with patch(
            "openzim_mcp.content_processor.BeautifulSoup",
            side_effect=Exception("Parse error"),
        ):
            result = content_processor.extract_html_structure(
                "<html><body>test</body></html>"
            )

        # Should return basic structure when exception occurs
        assert "headings" in result
        assert "sections" in result

    def test_extract_html_links_exception_handling(
        self, content_processor: ContentProcessor
    ):
        """Test extract_html_links exception handling."""
        # Test with content that causes an exception during link extraction
        with patch(
            "openzim_mcp.content_processor.BeautifulSoup",
            side_effect=Exception("Parse error"),
        ):
            result = content_processor.extract_html_links(
                "<html><body><a href='test'>link</a></body></html>"
            )

        # Should return empty structure when exception occurs
        assert "internal_links" in result
        assert "external_links" in result

    # ------------------------------------------------------------------
    # extract_html_structure
    # ------------------------------------------------------------------

    def test_extract_html_structure(self, content_processor: ContentProcessor):
        """Test HTML structure extraction."""
        html_content = """
        <html>
        <head>
            <title>Test Article</title>
            <meta name="description" content="Test description">
        </head>
        <body>
            <h1 id="intro">Introduction</h1>
            <p>This is the introduction paragraph.</p>
            <h2>Section 1</h2>
            <p>Content of section 1 with multiple words.</p>
            <h3>Subsection 1.1</h3>
            <p>Subsection content here.</p>
            <h2>Section 2</h2>
            <p>Content of section 2.</p>
        </body>
        </html>
        """

        structure = content_processor.extract_html_structure(html_content)

        # Check basic structure
        assert "headings" in structure
        assert "sections" in structure
        assert "metadata" in structure
        assert "word_count" in structure

        # Check headings
        headings = structure["headings"]
        assert len(headings) == 4
        assert headings[0]["level"] == 1
        assert headings[0]["text"] == "Introduction"
        assert headings[0]["id"] == "intro"
        assert headings[1]["level"] == 2
        assert headings[1]["text"] == "Section 1"

        # Check sections
        sections = structure["sections"]
        assert len(sections) > 0
        assert any("Introduction" in section["title"] for section in sections)

        # Check metadata
        metadata = structure["metadata"]
        assert "description" in metadata
        assert metadata["description"] == "Test description"

        # Check word count
        assert structure["word_count"] > 0

    def test_extract_html_structure_empty(self, content_processor: ContentProcessor):
        """Test HTML structure extraction with empty content."""
        structure = content_processor.extract_html_structure("")

        assert "headings" in structure
        assert "sections" in structure
        assert "metadata" in structure
        assert "word_count" in structure
        assert structure["word_count"] == 0

    # ------------------------------------------------------------------
    # extract_html_links
    # ------------------------------------------------------------------

    def test_extract_html_links(self, content_processor: ContentProcessor):
        """Test HTML link extraction."""
        html_content = """
        <html>
        <body>
            <p>Internal link: <a href="C/Other_Article" title="Other Article">
                Link to other article</a></p>
            <p>External link: <a href="https://example.com">Example website</a></p>
            <p>Anchor link: <a href="#section1">Go to section 1</a></p>
            <img src="I/image.jpg" alt="Test image" title="Image title">
            <video src="M/video.mp4">Video content</video>
            <audio src="M/audio.mp3">Audio content</audio>
        </body>
        </html>
        """

        links_data = content_processor.extract_html_links(html_content)

        # Check basic structure
        assert "internal_links" in links_data
        assert "external_links" in links_data
        assert "media_links" in links_data

        # Check internal links
        internal_links = links_data["internal_links"]
        # Should have the internal article link and anchor link
        assert len(internal_links) >= 2

        # Find the internal article link
        article_link = next(
            (link for link in internal_links if "Other_Article" in link["url"]),
            None,
        )
        assert article_link is not None
        assert article_link["text"] == "Link to other article"
        assert article_link["title"] == "Other Article"
        assert article_link["type"] == "internal"

        # Find the anchor link
        anchor_link = next(
            (link for link in internal_links if link["url"].startswith("#")),
            None,
        )
        assert anchor_link is not None
        assert anchor_link["type"] == "anchor"

        # Check external links
        external_links = links_data["external_links"]
        assert len(external_links) >= 1

        example_link = next(
            (link for link in external_links if "example.com" in link["url"]),
            None,
        )
        assert example_link is not None
        assert example_link["domain"] == "example.com"

        # Check media links
        media_links = links_data["media_links"]
        assert len(media_links) >= 3  # image, video, audio

        # Check for image
        image_link = next(
            (link for link in media_links if link["type"] == "image"),
            None,
        )
        assert image_link is not None
        assert "image.jpg" in image_link["url"]
        assert image_link["alt"] == "Test image"
        assert image_link["title"] == "Image title"

    def test_extract_html_links_empty(self, content_processor: ContentProcessor):
        """Test HTML link extraction with empty content."""
        links_data = content_processor.extract_html_links("")

        assert "internal_links" in links_data
        assert "external_links" in links_data
        assert "media_links" in links_data
        assert len(links_data["internal_links"]) == 0
        assert len(links_data["external_links"]) == 0
        assert len(links_data["media_links"]) == 0

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cameronrye/openzim-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.