Skip to main content
Glama

Scrapy MCP Server

by ThreeFish-AI
test_markdown_converter.py (25 kB)
""" 单元测试:Markdown转换模块 测试 extractor.markdown_converter 模块的网页转Markdown功能 """ import pytest from unittest.mock import Mock, patch, MagicMock from bs4 import BeautifulSoup from extractor.markdown_converter import MarkdownConverter class TestMarkdownConverter: """测试Markdown转换器主要功能""" def setup_method(self): """测试前准备""" try: self.converter = MarkdownConverter() except ImportError: pytest.skip("MarkItDown not available, skipping tests") def test_converter_initialization(self): """测试转换器初始化""" try: assert self.converter is not None assert hasattr(self.converter, "html_to_markdown") assert hasattr(self.converter, "convert_webpage_to_markdown") assert hasattr(self.converter, "batch_convert_to_markdown") assert hasattr(self.converter, "convert_pdf_to_markdown") assert isinstance(self.converter.default_options, dict) assert isinstance(self.converter.formatting_options, dict) # Check if markitdown is properly initialized assert hasattr(self.converter, "markitdown") except ImportError: pytest.skip("MarkItDown not available, skipping initialization test") def test_default_options(self): """测试默认选项配置""" options = self.converter.default_options # Updated options for MarkItDown assert options["extract_main_content"] is True assert options["preserve_structure"] is True assert options["clean_output"] is True assert options["include_links"] is True assert options["include_images"] is True def test_formatting_options(self): """测试格式化选项配置""" options = self.converter.formatting_options assert options["format_tables"] is True assert options["enhance_images"] is True assert options["optimize_links"] is True assert options["format_lists"] is True assert options["format_headings"] is True assert options["apply_typography"] is True assert options["smart_quotes"] is True assert options["em_dashes"] is True assert options["fix_spacing"] is True def test_basic_html_conversion(self): """测试基本HTML转换为Markdown""" html_content = """ <html> <head><title>Test Page</title></head> <body> <h1>Main Title</h1> 
<p>This is a paragraph with <strong>bold</strong> text.</p> <ul> <li>Item 1</li> <li>Item 2</li> </ul> </body> </html> """ result = self.converter.html_to_markdown(html_content) assert isinstance(result, str) assert "# Main Title" in result assert "**bold**" in result assert "- Item 1" in result or "* Item 1" in result def test_link_conversion(self): """测试链接转换""" html_content = """ <p>Check out <a href="https://example.com">this link</a> for more info.</p> """ result = self.converter.html_to_markdown(html_content) assert "[this link](https://example.com)" in result def test_image_conversion(self): """测试图片转换""" html_content = """ <img src="/images/test.jpg" alt="Test Image" /> """ result = self.converter.html_to_markdown(html_content) assert "![Test Image](/images/test.jpg)" in result def test_table_conversion(self): """测试表格转换""" html_content = """ <table> <thead> <tr><th>Name</th><th>Age</th></tr> </thead> <tbody> <tr><td>John</td><td>25</td></tr> <tr><td>Jane</td><td>30</td></tr> </tbody> </table> """ result = self.converter.html_to_markdown(html_content) assert "Name" in result assert "Age" in result assert "John" in result assert "Jane" in result def test_code_block_conversion(self): """测试代码块转换""" html_content = """ <pre><code>def hello(): print("Hello, World!")</code></pre> """ result = self.converter.html_to_markdown(html_content) assert "```" in result or "`def hello():`" in result def test_nested_elements_conversion(self): """测试嵌套元素转换""" html_content = """ <div> <h2>Section Title</h2> <p>A paragraph with <em>italic</em> and <strong>bold</strong> text.</p> <blockquote> <p>This is a quote with <a href="http://example.com">a link</a>.</p> </blockquote> </div> """ result = self.converter.html_to_markdown(html_content) assert "## Section Title" in result assert "*italic*" in result assert "**bold**" in result assert ">" in result # blockquote class TestPreprocessHTML: """测试HTML预处理功能""" def setup_method(self): """测试前准备""" self.converter = MarkdownConverter() def 
test_script_and_style_removal(self): """测试脚本和样式标签移除""" html_content = """ <html> <head> <style>body { color: red; }</style> <script>console.log('test');</script> </head> <body> <h1>Title</h1> <p>Content</p> <script>alert('popup');</script> </body> </html> """ result = self.converter.preprocess_html(html_content) assert "Title" in result assert "Content" in result assert "console.log" not in result assert "alert" not in result assert "color: red" not in result def test_unwanted_elements_removal(self): """测试不需要元素的移除""" html_content = """ <html> <body> <nav>Navigation menu</nav> <header>Header content</header> <main> <h1>Main Content</h1> <p>Important content</p> </main> <aside>Sidebar content</aside> <footer>Footer content</footer> </body> </html> """ result = self.converter.preprocess_html(html_content) assert "Main Content" in result assert "Important content" in result assert "Navigation menu" not in result assert "Header content" not in result assert "Sidebar content" not in result assert "Footer content" not in result def test_relative_url_conversion(self): """测试相对URL转换""" html_content = """ <div> <a href="/page1">Internal Link</a> <img src="/images/logo.png" alt="Logo" /> </div> """ base_url = "https://example.com" result = self.converter.preprocess_html(html_content, base_url) assert "https://example.com/page1" in result assert "https://example.com/images/logo.png" in result def test_comment_removal(self): """测试HTML注释移除""" html_content = """ <div> <!-- This is a comment --> <p>Visible content</p> <!-- Another comment --> </div> """ result = self.converter.preprocess_html(html_content) assert "Visible content" in result assert "This is a comment" not in result assert "Another comment" not in result def test_empty_elements_cleanup(self): """测试空元素清理""" html_content = """ <div> <p>Content paragraph</p> <p></p> <div></div> <p>Another paragraph</p> </div> """ result = self.converter.preprocess_html(html_content) soup = BeautifulSoup(result, "html.parser") # 
应该保留有内容的段落 content_paras = [p for p in soup.find_all("p") if p.get_text(strip=True)] assert len(content_paras) >= 2 class TestPostprocessMarkdown: """测试Markdown后处理功能""" def setup_method(self): """测试前准备""" self.converter = MarkdownConverter() def test_table_formatting(self): """测试表格格式化""" markdown_content = """ |Name|Age|City| |---|---|---| |John|25|NYC| |Jane|30|LA| """ result = self.converter._format_tables(markdown_content) assert "| Name | Age | City |" in result assert "| John | 25 | NYC |" in result def test_code_block_language_detection(self): """测试代码块语言检测""" markdown_content = """ ``` def hello(): print("Hello, World!") ``` ``` function greet() { console.log("Hello!"); } ``` """ result = self.converter._format_code_blocks(markdown_content) assert "```python" in result assert "```javascript" in result def test_quote_formatting(self): """测试引用格式化""" markdown_content = """ >This is a quote > Another quote line """ result = self.converter._format_quotes(markdown_content) assert "> This is a quote" in result assert "> Another quote line" in result def test_image_alt_text_improvement(self): """测试图片alt文本改进""" markdown_content = """ ![](test-image.jpg) ![img](profile-photo.png) """ result = self.converter._format_images(markdown_content) assert "![Test Image](test-image.jpg)" in result assert "![Profile Photo](profile-photo.png)" in result def test_link_formatting(self): """测试链接格式化""" markdown_content = """ [Link text] (https://example.com) [Another link] (https://test.com) """ result = self.converter._format_links(markdown_content) assert "[Link text](https://example.com)" in result assert "[Another link](https://test.com)" in result def test_list_formatting(self): """测试列表格式化""" markdown_content = """ -Item 1 * Item 2 + Item 3 1.First item 2) Second item """ result = self.converter._format_lists(markdown_content) assert "- Item 1" in result assert "- Item 2" in result assert "- Item 3" in result assert "1. First item" in result assert "2. 
Second item" in result def test_heading_formatting(self): """测试标题格式化""" markdown_content = """# Title Some content here ## Subtitle More content """ result = self.converter._format_headings(markdown_content) lines = result.split("\n") # 检查标题前后有适当的空行 title_idx = next(i for i, line in enumerate(lines) if line.strip() == "# Title") subtitle_idx = next( i for i, line in enumerate(lines) if line.strip() == "## Subtitle" ) assert title_idx >= 0 assert subtitle_idx >= 0 def test_typography_fixes(self): """测试排版修复""" markdown_content = """ Text with -- double hyphens. "Quote text" and 'another quote'. Multiple spaces here. """ result = self.converter._apply_typography_fixes(markdown_content) assert "—" in result # em dash assert " " not in result # multiple spaces removed # 注意:智能引号转换可能在某些情况下被跳过 class TestContentExtraction: """测试内容提取功能""" def setup_method(self): """测试前准备""" self.converter = MarkdownConverter() def test_main_content_extraction(self): """测试主要内容提取""" html_content = """ <html> <body> <nav>Navigation</nav> <main> <h1>Main Title</h1> <p>Main content paragraph</p> </main> <footer>Footer</footer> </body> </html> """ result = self.converter.extract_content_area(html_content) assert "Main Title" in result assert "Main content paragraph" in result assert "Navigation" not in result assert "Footer" not in result def test_content_selectors_priority(self): """测试内容选择器优先级""" html_content = """ <html> <body> <div class="sidebar">Sidebar content</div> <article> <h1>Article Title</h1> <p>Article content with substantial text to meet minimum length requirements.</p> </article> <div class="content"> <h2>Content Area</h2> <p>Content area text</p> </div> </body> </html> """ result = self.converter.extract_content_area(html_content) # article标签应该有优先级 assert "Article Title" in result assert "Article content" in result def test_text_paragraph_splitting(self): """测试文本段落分割""" text_content = """First paragraph with some content. Second paragraph starts here. 
Third paragraph after double newlines. Fourth paragraph continues the text flow.""" paragraphs = self.converter._split_text_into_paragraphs(text_content) assert len(paragraphs) > 1 assert any("First paragraph" in p for p in paragraphs) assert any("Third paragraph" in p for p in paragraphs) def test_long_text_splitting(self): """测试长文本分割""" long_text = "This is a very long text. " * 20 # 创建很长的文本 paragraphs = self.converter._split_long_text(long_text, max_length=100) assert len(paragraphs) > 1 for para in paragraphs: assert len(para) <= 150 # 允许一些弹性 class TestWebpageConversion: """测试网页转换功能""" def setup_method(self): """测试前准备""" self.converter = MarkdownConverter() def test_successful_webpage_conversion(self): """测试成功的网页转换""" scrape_result = { "url": "https://example.com", "title": "Test Page", "content": { "html": "<html><body><h1>Title</h1><p>Content</p></body></html>" }, } result = self.converter.convert_webpage_to_markdown(scrape_result) assert result["success"] is True assert "Title" in result["markdown"] assert "Content" in result["markdown"] assert result["url"] == "https://example.com" def test_webpage_conversion_with_metadata(self): """测试带元数据的网页转换""" scrape_result = { "url": "https://example.com", "title": "Test Page", "meta_description": "Test description", "content": { "html": "<html><body><h1>Title</h1><p>Content paragraph</p></body></html>", "links": [{"url": "https://example.com/link1", "text": "Link 1"}], "images": [{"src": "/image1.jpg", "alt": "Image 1"}], }, } result = self.converter.convert_webpage_to_markdown( scrape_result, include_metadata=True ) assert result["success"] is True assert "metadata" in result assert result["metadata"]["title"] == "Test Page" assert result["metadata"]["meta_description"] == "Test description" assert result["metadata"]["word_count"] > 0 assert result["metadata"]["links_count"] == 1 assert result["metadata"]["images_count"] == 1 def test_webpage_conversion_with_text_only(self): """测试仅文本内容的网页转换""" scrape_result = { 
"url": "https://example.com", "title": "Test Page", "content": { "text": "First paragraph content. Second paragraph content here.", "links": [{"url": "https://example.com/link1", "text": "Link 1"}], "images": [{"src": "/image1.jpg", "alt": "Image 1"}], }, } result = self.converter.convert_webpage_to_markdown(scrape_result) assert result["success"] is True assert "First paragraph" in result["markdown"] assert "Second paragraph" in result["markdown"] def test_webpage_conversion_error_handling(self): """测试网页转换错误处理""" scrape_result = {"error": "Failed to scrape", "url": "https://example.com"} result = self.converter.convert_webpage_to_markdown(scrape_result) assert result["success"] is False assert result["error"] == "Failed to scrape" assert result["url"] == "https://example.com" def test_batch_webpage_conversion(self): """测试批量网页转换""" scrape_results = [ { "url": "https://example1.com", "title": "Page 1", "content": {"html": "<html><body><h1>Title 1</h1></body></html>"}, }, { "url": "https://example2.com", "title": "Page 2", "content": {"html": "<html><body><h1>Title 2</h1></body></html>"}, }, {"error": "Failed to scrape", "url": "https://example3.com"}, ] result = self.converter.batch_convert_to_markdown(scrape_results) assert result["success"] is True assert len(result["results"]) == 3 assert result["summary"]["total"] == 3 assert result["summary"]["successful"] == 2 assert result["summary"]["failed"] == 1 assert result["summary"]["success_rate"] == 2 / 3 class TestImageEmbedding: """测试图片嵌入功能""" def setup_method(self): """测试前准备""" self.converter = MarkdownConverter() @patch("requests.get") def test_image_embedding_success(self, mock_get): """测试成功的图片嵌入""" # 模拟成功的HTTP响应 mock_response = Mock() mock_response.headers = {"Content-Type": "image/jpeg"} mock_response.content = b"fake image data" mock_response.raise_for_status = Mock() mock_get.return_value = mock_response markdown_content = "![Alt text](https://example.com/image.jpg)" result = 
self.converter._embed_images_in_markdown(markdown_content) assert result["stats"]["attempted"] == 1 assert result["stats"]["embedded"] == 1 assert "data:image/jpeg;base64," in result["markdown"] @patch("requests.get") def test_image_embedding_size_limit(self, mock_get): """测试图片大小限制""" # 模拟大文件响应 mock_response = Mock() mock_response.headers = { "Content-Type": "image/jpeg", "Content-Length": "5000000", } mock_get.return_value = mock_response markdown_content = "![Alt text](https://example.com/large-image.jpg)" result = self.converter._embed_images_in_markdown( markdown_content, max_bytes_per_image=1000000 ) assert result["stats"]["attempted"] == 1 assert result["stats"]["embedded"] == 0 assert result["stats"]["skipped_large"] == 1 @patch("requests.get") def test_image_embedding_error_handling(self, mock_get): """测试图片嵌入错误处理""" # 模拟HTTP错误 mock_get.side_effect = Exception("Network error") markdown_content = "![Alt text](https://example.com/image.jpg)" result = self.converter._embed_images_in_markdown(markdown_content) assert result["stats"]["attempted"] == 1 assert result["stats"]["embedded"] == 0 assert result["stats"]["skipped_errors"] == 1 # 原始链接应该保留 assert "https://example.com/image.jpg" in result["markdown"] class TestErrorHandling: """测试错误处理和边界情况""" def setup_method(self): """测试前准备""" self.converter = MarkdownConverter() def test_empty_html_input(self): """测试空HTML输入""" result = self.converter.html_to_markdown("") assert isinstance(result, str) def test_invalid_html_input(self): """测试无效HTML输入""" invalid_html = "<html><body><div>Unclosed div<p>Paragraph</body></html>" result = self.converter.html_to_markdown(invalid_html) assert isinstance(result, str) assert len(result) >= 0 def test_none_input(self): """测试None输入处理""" # html_to_markdown应该能处理None输入而不崩溃 try: result = self.converter.html_to_markdown(None) # 如果没有抛出异常,结果应该是字符串 assert isinstance(result, str) except (TypeError, AttributeError): # 如果抛出异常,这也是可接受的行为 pass def test_special_characters_handling(self): 
"""测试特殊字符处理""" html_content = """ <p>Special chars: &amp; &lt; &gt; &quot; &#39; &copy; &reg;</p> <p>Unicode: 中文 éñ ñoël</p> """ result = self.converter.html_to_markdown(html_content) assert isinstance(result, str) assert len(result) > 0 def test_malformed_markup(self): """测试格式错误的标记""" malformed_html = """ <div> <p>Normal paragraph</p> <strong>Unclosed strong tag <p>Another paragraph <em>Nested <strong>tags</em> wrong order</strong> </div> """ result = self.converter.html_to_markdown(malformed_html) assert isinstance(result, str) assert "Normal paragraph" in result assert "Another paragraph" in result class TestPerformanceAndLimits: """测试性能相关功能""" def setup_method(self): """测试前准备""" self.converter = MarkdownConverter() def test_large_html_conversion(self): """测试大型HTML内容转换""" # 生成大型HTML内容 large_html = "<html><body>" for i in range(100): large_html += f"<p>Paragraph {i} with some content text here.</p>" large_html += "</body></html>" result = self.converter.html_to_markdown(large_html) assert isinstance(result, str) assert len(result) > 0 assert "Paragraph 0" in result assert "Paragraph 99" in result def test_conversion_speed_benchmark(self): """测试转换速度基准""" import time html_content = "<html><body>" for i in range(50): html_content += f""" <div> <h2>Section {i}</h2> <p>This is paragraph {i} with <strong>bold</strong> text.</p> <ul> <li>Item 1</li> <li>Item 2</li> </ul> </div> """ html_content += "</body></html>" start_time = time.time() result = self.converter.html_to_markdown(html_content) end_time = time.time() conversion_time = end_time - start_time assert isinstance(result, str) assert len(result) > 0 # 转换应该在合理时间内完成(5秒) assert conversion_time < 5.0 def test_max_images_limit(self): """测试图片数量限制""" markdown_content = "" for i in range(60): markdown_content += f"![Image {i}](https://example.com/image{i}.jpg)\n" with patch("requests.get") as mock_get: mock_response = Mock() mock_response.headers = {"Content-Type": "image/jpeg"} mock_response.content = b"fake image 
data" mock_response.raise_for_status = Mock() mock_get.return_value = mock_response result = self.converter._embed_images_in_markdown( markdown_content, max_images=10 ) assert result["stats"]["attempted"] >= 10 assert result["stats"]["embedded"] <= 10

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.