# test_markdown_converter.py (25 kB)
"""
单元测试:Markdown转换模块
测试 extractor.markdown_converter 模块的网页转Markdown功能
"""
import pytest
from unittest.mock import Mock, patch, MagicMock
from bs4 import BeautifulSoup
from extractor.markdown_converter import MarkdownConverter
class TestMarkdownConverter:
    """Exercise the main HTML-to-Markdown entry points of MarkdownConverter."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing."""
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    def test_converter_initialization(self):
        """The converter exposes its public methods and option dictionaries."""
        expected_attributes = (
            "html_to_markdown",
            "convert_webpage_to_markdown",
            "batch_convert_to_markdown",
            "convert_pdf_to_markdown",
            # Check if markitdown is properly initialized
            "markitdown",
        )
        try:
            assert self.converter is not None
            for attribute in expected_attributes:
                assert hasattr(self.converter, attribute)
            assert isinstance(self.converter.default_options, dict)
            assert isinstance(self.converter.formatting_options, dict)
        except ImportError:
            pytest.skip("MarkItDown not available, skipping initialization test")

    def test_default_options(self):
        """Every default conversion option is switched on."""
        options = self.converter.default_options
        # Updated options for MarkItDown
        for key in (
            "extract_main_content",
            "preserve_structure",
            "clean_output",
            "include_links",
            "include_images",
        ):
            assert options[key] is True

    def test_formatting_options(self):
        """Every formatting option is switched on."""
        options = self.converter.formatting_options
        for key in (
            "format_tables",
            "enhance_images",
            "optimize_links",
            "format_lists",
            "format_headings",
            "apply_typography",
            "smart_quotes",
            "em_dashes",
            "fix_spacing",
        ):
            assert options[key] is True

    def test_basic_html_conversion(self):
        """Headings, bold text and list items survive a basic conversion."""
        html_content = """
<html>
<head><title>Test Page</title></head>
<body>
<h1>Main Title</h1>
<p>This is a paragraph with <strong>bold</strong> text.</p>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
</body>
</html>
"""
        result = self.converter.html_to_markdown(html_content)
        assert isinstance(result, str)
        assert "# Main Title" in result
        assert "**bold**" in result
        # Either bullet marker is accepted.
        assert "- Item 1" in result or "* Item 1" in result

    def test_link_conversion(self):
        """An anchor tag becomes an inline Markdown link."""
        html_content = """
<p>Check out <a href="https://example.com">this link</a> for more info.</p>
"""
        result = self.converter.html_to_markdown(html_content)
        assert "[this link](https://example.com)" in result

    def test_image_conversion(self):
        """An <img> tag becomes a Markdown image carrying its alt text and source."""
        html_content = """
<img src="/images/test.jpg" alt="Test Image" />
"""
        result = self.converter.html_to_markdown(html_content)
        # The previous expectation was `"" in result`, which passes for any
        # string and therefore verified nothing.
        # NOTE(review): the exact image syntax emitted by the converter is
        # assumed here (standard Markdown image) - confirm against the converter.
        assert "![Test Image](" in result
        assert "/images/test.jpg" in result

    def test_table_conversion(self):
        """All header and body cell texts of a table appear in the output."""
        html_content = """
<table>
<thead>
<tr><th>Name</th><th>Age</th></tr>
</thead>
<tbody>
<tr><td>John</td><td>25</td></tr>
<tr><td>Jane</td><td>30</td></tr>
</tbody>
</table>
"""
        result = self.converter.html_to_markdown(html_content)
        for cell_text in ("Name", "Age", "John", "Jane"):
            assert cell_text in result

    def test_code_block_conversion(self):
        """A <pre><code> element becomes a fenced block or inline code."""
        html_content = """
<pre><code>def hello():
print("Hello, World!")</code></pre>
"""
        result = self.converter.html_to_markdown(html_content)
        assert "```" in result or "`def hello():`" in result

    def test_nested_elements_conversion(self):
        """Nested headings, emphasis and blockquotes are all converted."""
        html_content = """
<div>
<h2>Section Title</h2>
<p>A paragraph with <em>italic</em> and <strong>bold</strong> text.</p>
<blockquote>
<p>This is a quote with <a href="http://example.com">a link</a>.</p>
</blockquote>
</div>
"""
        result = self.converter.html_to_markdown(html_content)
        assert "## Section Title" in result
        assert "*italic*" in result
        assert "**bold**" in result
        assert ">" in result  # blockquote
class TestPreprocessHTML:
    """Exercise MarkdownConverter.preprocess_html."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing.

        Mirrors TestMarkdownConverter.setup_method so a missing optional
        dependency produces a skip instead of a setup error.
        """
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    def test_script_and_style_removal(self):
        """<script> and <style> contents are dropped; body text is kept."""
        html_content = """
<html>
<head>
<style>body { color: red; }</style>
<script>console.log('test');</script>
</head>
<body>
<h1>Title</h1>
<p>Content</p>
<script>alert('popup');</script>
</body>
</html>
"""
        result = self.converter.preprocess_html(html_content)
        for kept in ("Title", "Content"):
            assert kept in result
        for removed in ("console.log", "alert", "color: red"):
            assert removed not in result

    def test_unwanted_elements_removal(self):
        """nav/header/aside/footer text is dropped; <main> text is kept."""
        html_content = """
<html>
<body>
<nav>Navigation menu</nav>
<header>Header content</header>
<main>
<h1>Main Content</h1>
<p>Important content</p>
</main>
<aside>Sidebar content</aside>
<footer>Footer content</footer>
</body>
</html>
"""
        result = self.converter.preprocess_html(html_content)
        for kept in ("Main Content", "Important content"):
            assert kept in result
        for removed in (
            "Navigation menu",
            "Header content",
            "Sidebar content",
            "Footer content",
        ):
            assert removed not in result

    def test_relative_url_conversion(self):
        """Relative href/src values are resolved against the given base URL."""
        html_content = """
<div>
<a href="/page1">Internal Link</a>
<img src="/images/logo.png" alt="Logo" />
</div>
"""
        base_url = "https://example.com"
        result = self.converter.preprocess_html(html_content, base_url)
        assert "https://example.com/page1" in result
        assert "https://example.com/images/logo.png" in result

    def test_comment_removal(self):
        """HTML comments are stripped while visible text is kept."""
        html_content = """
<div>
<!-- This is a comment -->
<p>Visible content</p>
<!-- Another comment -->
</div>
"""
        result = self.converter.preprocess_html(html_content)
        assert "Visible content" in result
        assert "This is a comment" not in result
        assert "Another comment" not in result

    def test_empty_elements_cleanup(self):
        """Paragraphs that carry text survive preprocessing."""
        html_content = """
<div>
<p>Content paragraph</p>
<p></p>
<div></div>
<p>Another paragraph</p>
</div>
"""
        result = self.converter.preprocess_html(html_content)
        soup = BeautifulSoup(result, "html.parser")
        # Paragraphs with content must be preserved.
        content_paras = [p for p in soup.find_all("p") if p.get_text(strip=True)]
        assert len(content_paras) >= 2
class TestPostprocessMarkdown:
    """Exercise the private Markdown post-processing helpers."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing.

        Mirrors TestMarkdownConverter.setup_method so a missing optional
        dependency produces a skip instead of a setup error.
        """
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    def test_table_formatting(self):
        """Table cells are padded with a space on each side of the pipes."""
        markdown_content = """
|Name|Age|City|
|---|---|---|
|John|25|NYC|
|Jane|30|LA|
"""
        result = self.converter._format_tables(markdown_content)
        assert "| Name | Age | City |" in result
        assert "| John | 25 | NYC |" in result

    def test_code_block_language_detection(self):
        """Unlabelled fences get a language tag inferred from their content."""
        markdown_content = """
```
def hello():
print("Hello, World!")
```
```
function greet() {
console.log("Hello!");
}
```
"""
        result = self.converter._format_code_blocks(markdown_content)
        assert "```python" in result
        assert "```javascript" in result

    def test_quote_formatting(self):
        """Blockquote markers are followed by exactly one space."""
        markdown_content = """
>This is a quote
> Another quote line
"""
        result = self.converter._format_quotes(markdown_content)
        assert "> This is a quote" in result
        assert "> Another quote line" in result

    def test_image_alt_text_improvement(self):
        """Images with empty alt text come back with a non-empty alt text."""
        # The previous fixture contained no image syntax at all and the
        # assertions were `"" in result`, which pass for any string.
        # NOTE(review): fixture and expectations are reconstructed from the
        # test's stated purpose (alt-text improvement) - confirm against
        # _format_images before relying on them.
        markdown_content = """
![](https://example.com/image.jpg)
![](/path/to/my-photo.png)
"""
        result = self.converter._format_images(markdown_content)
        # The image targets must be preserved ...
        assert "(https://example.com/image.jpg)" in result
        assert "(/path/to/my-photo.png)" in result
        # ... and no image may be left with an empty alt text.
        assert "![](" not in result

    def test_link_formatting(self):
        """Whitespace or a line break between [text] and (url) is removed."""
        markdown_content = """
[Link text] (https://example.com)
[Another link]
(https://test.com)
"""
        result = self.converter._format_links(markdown_content)
        assert "[Link text](https://example.com)" in result
        assert "[Another link](https://test.com)" in result

    def test_list_formatting(self):
        """Bullet and ordered-list markers are normalised."""
        markdown_content = """
-Item 1
* Item 2
+ Item 3
1.First item
2) Second item
"""
        result = self.converter._format_lists(markdown_content)
        for expected in (
            "- Item 1",
            "- Item 2",
            "- Item 3",
            "1. First item",
            "2. Second item",
        ):
            assert expected in result

    def test_heading_formatting(self):
        """Both headings are still present as their own lines after formatting."""
        markdown_content = """# Title
Some content here
## Subtitle
More content
"""
        result = self.converter._format_headings(markdown_content)
        stripped_lines = [line.strip() for line in result.split("\n")]
        # Only the presence of each heading line is checked; the blank-line
        # spacing around headings is not asserted. Direct membership checks
        # replace the former `index >= 0` assertions, which could never fail.
        assert "# Title" in stripped_lines
        assert "## Subtitle" in stripped_lines

    def test_typography_fixes(self):
        """Double hyphens become an em dash and runs of spaces are collapsed."""
        # The fixture needs real runs of spaces, and the check must look for
        # a double space: a single-space check fails for any sentence.
        markdown_content = """
Text with -- double hyphens.
"Quote text" and 'another quote'.
Multiple    spaces   here.
"""
        result = self.converter._apply_typography_fixes(markdown_content)
        assert "—" in result  # em dash
        assert "  " not in result  # multiple spaces removed
        # Note: smart-quote conversion may be skipped in some situations,
        # so it is intentionally not asserted here.
class TestContentExtraction:
    """Exercise main-content extraction and text-splitting helpers."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing.

        Mirrors TestMarkdownConverter.setup_method so a missing optional
        dependency produces a skip instead of a setup error.
        """
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    def test_main_content_extraction(self):
        """Only the <main> content is returned; nav and footer are dropped."""
        html_content = """
<html>
<body>
<nav>Navigation</nav>
<main>
<h1>Main Title</h1>
<p>Main content paragraph</p>
</main>
<footer>Footer</footer>
</body>
</html>
"""
        result = self.converter.extract_content_area(html_content)
        assert "Main Title" in result
        assert "Main content paragraph" in result
        assert "Navigation" not in result
        assert "Footer" not in result

    def test_content_selectors_priority(self):
        """The <article> element's content is present in the extracted area."""
        html_content = """
<html>
<body>
<div class="sidebar">Sidebar content</div>
<article>
<h1>Article Title</h1>
<p>Article content with substantial text to meet minimum length requirements.</p>
</article>
<div class="content">
<h2>Content Area</h2>
<p>Content area text</p>
</div>
</body>
</html>
"""
        result = self.converter.extract_content_area(html_content)
        # The <article> tag is expected to take priority.
        assert "Article Title" in result
        assert "Article content" in result

    def test_text_paragraph_splitting(self):
        """Plain text is split into more than one paragraph."""
        # The fixture talks about "double newlines", so the two halves are
        # joined with an explicit blank line rather than a single newline.
        text_content = (
            "First paragraph with some content. Second paragraph starts here."
            "\n\n"
            "Third paragraph after double newlines. "
            "Fourth paragraph continues the text flow."
        )
        paragraphs = self.converter._split_text_into_paragraphs(text_content)
        assert len(paragraphs) > 1
        assert any("First paragraph" in p for p in paragraphs)
        assert any("Third paragraph" in p for p in paragraphs)

    def test_long_text_splitting(self):
        """Long text is cut into several chunks near the requested length."""
        long_text = "This is a very long text. " * 20  # build a long input
        paragraphs = self.converter._split_long_text(long_text, max_length=100)
        assert len(paragraphs) > 1
        # Allow some slack above max_length.
        assert all(len(para) <= 150 for para in paragraphs)
class TestWebpageConversion:
    """Exercise convert_webpage_to_markdown and batch_convert_to_markdown."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing.

        Mirrors TestMarkdownConverter.setup_method so a missing optional
        dependency produces a skip instead of a setup error.
        """
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    def test_successful_webpage_conversion(self):
        """A scrape result with HTML converts successfully and keeps its URL."""
        scrape_result = {
            "url": "https://example.com",
            "title": "Test Page",
            "content": {
                "html": "<html><body><h1>Title</h1><p>Content</p></body></html>"
            },
        }
        result = self.converter.convert_webpage_to_markdown(scrape_result)
        assert result["success"] is True
        assert "Title" in result["markdown"]
        assert "Content" in result["markdown"]
        assert result["url"] == "https://example.com"

    def test_webpage_conversion_with_metadata(self):
        """With include_metadata=True the result carries a metadata section."""
        scrape_result = {
            "url": "https://example.com",
            "title": "Test Page",
            "meta_description": "Test description",
            "content": {
                "html": "<html><body><h1>Title</h1><p>Content paragraph</p></body></html>",
                "links": [{"url": "https://example.com/link1", "text": "Link 1"}],
                "images": [{"src": "/image1.jpg", "alt": "Image 1"}],
            },
        }
        result = self.converter.convert_webpage_to_markdown(
            scrape_result, include_metadata=True
        )
        assert result["success"] is True
        assert "metadata" in result
        metadata = result["metadata"]
        assert metadata["title"] == "Test Page"
        assert metadata["meta_description"] == "Test description"
        assert metadata["word_count"] > 0
        assert metadata["links_count"] == 1
        assert metadata["images_count"] == 1

    def test_webpage_conversion_with_text_only(self):
        """A scrape result that has text but no HTML still converts."""
        scrape_result = {
            "url": "https://example.com",
            "title": "Test Page",
            "content": {
                "text": "First paragraph content. Second paragraph content here.",
                "links": [{"url": "https://example.com/link1", "text": "Link 1"}],
                "images": [{"src": "/image1.jpg", "alt": "Image 1"}],
            },
        }
        result = self.converter.convert_webpage_to_markdown(scrape_result)
        assert result["success"] is True
        assert "First paragraph" in result["markdown"]
        assert "Second paragraph" in result["markdown"]

    def test_webpage_conversion_error_handling(self):
        """A scrape result carrying an error is reported as a failure."""
        scrape_result = {"error": "Failed to scrape", "url": "https://example.com"}
        result = self.converter.convert_webpage_to_markdown(scrape_result)
        assert result["success"] is False
        assert result["error"] == "Failed to scrape"
        assert result["url"] == "https://example.com"

    def test_batch_webpage_conversion(self):
        """Batch conversion reports per-item results and an aggregate summary."""
        scrape_results = [
            {
                "url": "https://example1.com",
                "title": "Page 1",
                "content": {"html": "<html><body><h1>Title 1</h1></body></html>"},
            },
            {
                "url": "https://example2.com",
                "title": "Page 2",
                "content": {"html": "<html><body><h1>Title 2</h1></body></html>"},
            },
            {"error": "Failed to scrape", "url": "https://example3.com"},
        ]
        result = self.converter.batch_convert_to_markdown(scrape_results)
        assert result["success"] is True
        assert len(result["results"]) == 3
        summary = result["summary"]
        assert summary["total"] == 3
        assert summary["successful"] == 2
        assert summary["failed"] == 1
        # Compare the ratio with a tolerance instead of exact float equality.
        assert summary["success_rate"] == pytest.approx(2 / 3)
class TestImageEmbedding:
    """Exercise MarkdownConverter._embed_images_in_markdown with mocked HTTP."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing.

        Mirrors TestMarkdownConverter.setup_method so a missing optional
        dependency produces a skip instead of a setup error.
        """
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    @patch("requests.get")
    def test_image_embedding_success(self, mock_get):
        """A downloadable image is replaced by a base64 data URI."""
        # Simulate a successful HTTP response.
        mock_response = Mock()
        mock_response.headers = {"Content-Type": "image/jpeg"}
        mock_response.content = b"fake image data"
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response
        # The fixture must contain exactly one image for `attempted == 1`
        # to be meaningful; an empty string holds nothing to embed.
        markdown_content = "![Test Image](https://example.com/image.jpg)"
        result = self.converter._embed_images_in_markdown(markdown_content)
        assert result["stats"]["attempted"] == 1
        assert result["stats"]["embedded"] == 1
        assert "data:image/jpeg;base64," in result["markdown"]

    @patch("requests.get")
    def test_image_embedding_size_limit(self, mock_get):
        """An image larger than max_bytes_per_image is skipped, not embedded."""
        # Simulate a response that advertises a large payload.
        mock_response = Mock()
        mock_response.headers = {
            "Content-Type": "image/jpeg",
            "Content-Length": "5000000",
        }
        mock_get.return_value = mock_response
        markdown_content = "![Large Image](https://example.com/large.jpg)"
        result = self.converter._embed_images_in_markdown(
            markdown_content, max_bytes_per_image=1000000
        )
        assert result["stats"]["attempted"] == 1
        assert result["stats"]["embedded"] == 0
        assert result["stats"]["skipped_large"] == 1

    @patch("requests.get")
    def test_image_embedding_error_handling(self, mock_get):
        """A failing download is counted as an error and the link is kept."""
        # Simulate an HTTP failure.
        mock_get.side_effect = Exception("Network error")
        markdown_content = "![Test Image](https://example.com/image.jpg)"
        result = self.converter._embed_images_in_markdown(markdown_content)
        assert result["stats"]["attempted"] == 1
        assert result["stats"]["embedded"] == 0
        assert result["stats"]["skipped_errors"] == 1
        # The original link must be left in place.
        assert "https://example.com/image.jpg" in result["markdown"]
class TestErrorHandling:
    """Exercise error handling and edge-case inputs of html_to_markdown."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing.

        Mirrors TestMarkdownConverter.setup_method so a missing optional
        dependency produces a skip instead of a setup error.
        """
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    def test_empty_html_input(self):
        """An empty string converts to a string."""
        result = self.converter.html_to_markdown("")
        assert isinstance(result, str)

    def test_invalid_html_input(self):
        """HTML with unclosed tags still converts to a string."""
        invalid_html = "<html><body><div>Unclosed div<p>Paragraph</body></html>"
        result = self.converter.html_to_markdown(invalid_html)
        # A `len(result) >= 0` check was dropped here: it can never fail.
        assert isinstance(result, str)

    def test_none_input(self):
        """None either converts to a string or is rejected with a type error."""
        try:
            result = self.converter.html_to_markdown(None)
        except (TypeError, AttributeError):
            # Rejecting None with one of these exceptions is acceptable.
            return
        # If no exception was raised, the result must be a string.
        assert isinstance(result, str)

    def test_special_characters_handling(self):
        """Markup-significant and non-ASCII characters yield non-empty output."""
        html_content = """
<p>Special chars: & < > " ' © ®</p>
<p>Unicode: 中文 éñ ñoël</p>
"""
        result = self.converter.html_to_markdown(html_content)
        assert isinstance(result, str)
        assert len(result) > 0

    def test_malformed_markup(self):
        """Badly nested and unclosed tags do not lose the paragraph text."""
        malformed_html = """
<div>
<p>Normal paragraph</p>
<strong>Unclosed strong tag
<p>Another paragraph
<em>Nested <strong>tags</em> wrong order</strong>
</div>
"""
        result = self.converter.html_to_markdown(malformed_html)
        assert isinstance(result, str)
        assert "Normal paragraph" in result
        assert "Another paragraph" in result
class TestPerformanceAndLimits:
    """Exercise conversion of large inputs and the image-count limit."""

    def setup_method(self):
        """Create a fresh converter; skip the test if MarkItDown is missing.

        Mirrors TestMarkdownConverter.setup_method so a missing optional
        dependency produces a skip instead of a setup error.
        """
        try:
            self.converter = MarkdownConverter()
        except ImportError:
            pytest.skip("MarkItDown not available, skipping tests")

    def test_large_html_conversion(self):
        """A 100-paragraph document converts with first and last paragraph intact."""
        # Build a large HTML document.
        paragraphs = "".join(
            f"<p>Paragraph {i} with some content text here.</p>" for i in range(100)
        )
        large_html = f"<html><body>{paragraphs}</body></html>"
        result = self.converter.html_to_markdown(large_html)
        assert isinstance(result, str)
        assert len(result) > 0
        assert "Paragraph 0" in result
        assert "Paragraph 99" in result

    def test_conversion_speed_benchmark(self):
        """Converting 50 sections finishes within five seconds."""
        import time

        sections = "".join(
            f"""
<div>
<h2>Section {i}</h2>
<p>This is paragraph {i} with <strong>bold</strong> text.</p>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
</div>
"""
            for i in range(50)
        )
        html_content = f"<html><body>{sections}</body></html>"
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = self.converter.html_to_markdown(html_content)
        conversion_time = time.perf_counter() - start_time
        assert isinstance(result, str)
        assert len(result) > 0
        # Conversion should finish in a reasonable time (5 seconds).
        assert conversion_time < 5.0

    def test_max_images_limit(self):
        """No more than max_images images are embedded."""
        # Sixty distinct images; the loop index must appear in each entry,
        # otherwise the fixture contains no images at all.
        markdown_content = "".join(
            f"![Image {i}](https://example.com/image{i}.jpg)\n" for i in range(60)
        )
        with patch("requests.get") as mock_get:
            mock_response = Mock()
            mock_response.headers = {"Content-Type": "image/jpeg"}
            mock_response.content = b"fake image data"
            mock_response.raise_for_status = Mock()
            mock_get.return_value = mock_response
            result = self.converter._embed_images_in_markdown(
                markdown_content, max_images=10
            )
        assert result["stats"]["attempted"] >= 10
        assert result["stats"]["embedded"] <= 10