Skip to main content
Glama

Scrapy MCP Server

by ThreeFish-AI
test_pdf_processor.py28 kB
""" 单元测试:PDF 处理模块 测试 extractor.pdf_processor 模块的PDF文档处理和转换功能 """ import pytest from unittest.mock import Mock, patch, AsyncMock from pathlib import Path import tempfile import os from extractor.pdf_processor import PDFProcessor class TestPDFProcessor: """测试PDF处理器主要功能""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() def test_processor_initialization(self): """测试处理器初始化""" assert self.processor is not None assert hasattr(self.processor, "process_pdf") assert hasattr(self.processor, "batch_process_pdfs") assert self.processor.supported_methods == ["pymupdf", "pypdf", "auto"] assert os.path.exists(self.processor.temp_dir) def test_supported_methods(self): """测试支持的方法列表""" expected_methods = ["pymupdf", "pypdf", "auto"] assert self.processor.supported_methods == expected_methods def test_url_detection(self): """测试URL检测功能""" # 有效的URL assert self.processor._is_url("https://example.com/document.pdf") is True assert self.processor._is_url("http://example.com/document.pdf") is True # 无效的URL assert self.processor._is_url("/local/path/document.pdf") is False assert self.processor._is_url("document.pdf") is False assert self.processor._is_url("ftp://example.com/document.pdf") is False assert self.processor._is_url("") is False @pytest.mark.asyncio async def test_invalid_method_validation(self): """测试无效方法验证""" result = await self.processor.process_pdf("test.pdf", method="invalid_method") assert result["success"] is False assert "Method must be one of" in result["error"] assert result["source"] == "test.pdf" @pytest.mark.asyncio async def test_nonexistent_file_handling(self): """测试不存在文件的处理""" result = await self.processor.process_pdf("nonexistent.pdf") assert result["success"] is False assert result["error"] == "PDF file does not exist" assert result["source"] == "nonexistent.pdf" @pytest.mark.asyncio @patch("aiohttp.ClientSession.get") async def test_pdf_download_success(self, mock_get): """测试PDF下载成功""" # 模拟成功的HTTP响应 mock_response = AsyncMock() mock_response.status = 200 mock_response.read = AsyncMock(return_value=b"fake PDF content") mock_get.return_value.__aenter__.return_value = mock_response result_path = await self.processor._download_pdf("https://example.com/test.pdf") assert result_path is not None assert isinstance(result_path, Path) assert result_path.suffix == ".pdf" assert str(result_path).startswith(self.processor.temp_dir) @pytest.mark.asyncio @patch("aiohttp.ClientSession.get") async def test_pdf_download_failure(self, mock_get): """测试PDF下载失败""" # 模拟HTTP错误响应 mock_response = AsyncMock() mock_response.status = 404 mock_get.return_value.__aenter__.return_value = mock_response result_path = await self.processor._download_pdf( "https://example.com/nonexistent.pdf" ) assert result_path is None @pytest.mark.asyncio @patch("aiohttp.ClientSession.get") async def test_pdf_download_network_error(self, mock_get): """测试PDF下载网络错误""" # 模拟网络异常 mock_get.side_effect = Exception("Network error") result_path = await self.processor._download_pdf("https://example.com/test.pdf") assert result_path is None class TestPyMuPDFExtraction: """测试PyMuPDF提取功能""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() @pytest.mark.asyncio @patch("extractor.pdf_processor._import_fitz") async def test_pymupdf_extraction_success(self, mock_import_fitz): """测试PyMuPDF提取成功""" # 模拟fitz模块 mock_fitz = Mock() mock_import_fitz.return_value = mock_fitz # 模拟PDF文档 mock_doc = Mock() mock_doc.page_count = 2 mock_doc.metadata = { "title": "Test Document", "author": "Test Author", "creationDate": "2023-01-01", } mock_fitz.open.return_value = mock_doc # 模拟页面 mock_page1 = Mock() mock_page1.get_text.return_value = "Page 1 content" mock_page2 = Mock() mock_page2.get_text.return_value = "Page 2 content" mock_doc.load_page.side_effect = [mock_page1, mock_page2] # 创建临时PDF文件 with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._extract_with_pymupdf( tmp_path, include_metadata=True ) assert result["success"] is True assert "Page 1 content" in result["text"] assert "Page 2 content" in result["text"] assert result["pages_processed"] == 2 assert result["total_pages"] == 2 assert result["metadata"]["title"] == "Test Document" assert result["metadata"]["author"] == "Test Author" mock_doc.close.assert_called_once() finally: # 清理临时文件 if tmp_path.exists(): tmp_path.unlink() @pytest.mark.asyncio @patch("extractor.pdf_processor._import_fitz") async def test_pymupdf_with_page_range(self, mock_import_fitz): """测试PyMuPDF页面范围提取""" # 模拟fitz模块 mock_fitz = Mock() mock_import_fitz.return_value = mock_fitz # 模拟PDF文档 mock_doc = Mock() mock_doc.page_count = 5 mock_doc.metadata = {} mock_fitz.open.return_value = mock_doc # 模拟页面 mock_pages = [] for i in range(5): mock_page = Mock() mock_page.get_text.return_value = f"Page {i + 1} content" mock_pages.append(mock_page) # 正确设置 load_page 的模拟行为 def mock_load_page(page_num): return mock_pages[page_num] mock_doc.load_page.side_effect = mock_load_page # 创建临时PDF文件 with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._extract_with_pymupdf( tmp_path, page_range=(1, 3), include_metadata=False ) assert result["success"] is True assert "Page 2 content" in result["text"] assert "Page 3 content" in result["text"] assert "Page 1 content" not in result["text"] assert "Page 4 content" not in result["text"] assert result["pages_processed"] == 2 # pages 1-2 (0-indexed) assert "metadata" not in result finally: if tmp_path.exists(): tmp_path.unlink() @pytest.mark.asyncio @patch("extractor.pdf_processor._import_fitz") async def test_pymupdf_extraction_error(self, mock_import_fitz): """测试PyMuPDF提取错误""" # 模拟导入错误 mock_import_fitz.side_effect = ImportError("PyMuPDF not available") with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._extract_with_pymupdf(tmp_path) assert result["success"] is False assert "PyMuPDF extraction failed" in result["error"] finally: if tmp_path.exists(): tmp_path.unlink() class TestPyPDFExtraction: """测试pypdf提取功能""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() @pytest.mark.asyncio @patch("extractor.pdf_processor._import_pypdf") @patch("builtins.open", create=True) async def test_pypdf_extraction_success(self, mock_open, mock_import_pypdf): """测试pypdf提取成功""" # 模拟pypdf模块 mock_pypdf = Mock() mock_import_pypdf.return_value = mock_pypdf # 模拟PDF阅读器 mock_reader = Mock() mock_reader.pages = [Mock(), Mock()] # 2页 mock_reader.metadata = { "/Title": "Test Document", "/Author": "Test Author", "/CreationDate": "D:20230101000000Z", } mock_pypdf.PdfReader.return_value = mock_reader # 模拟页面 mock_reader.pages[0].extract_text.return_value = "Page 1 content" mock_reader.pages[1].extract_text.return_value = "Page 2 content" # 模拟文件操作 mock_file = Mock() mock_open.return_value.__enter__.return_value = mock_file # 创建临时PDF文件 with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._extract_with_pypdf( tmp_path, include_metadata=True ) assert result["success"] is True assert "Page 1 content" in result["text"] assert "Page 2 content" in result["text"] assert result["pages_processed"] == 2 assert result["total_pages"] == 2 assert result["metadata"]["title"] == "Test Document" assert result["metadata"]["author"] == "Test Author" finally: if tmp_path.exists(): tmp_path.unlink() @pytest.mark.asyncio @patch("extractor.pdf_processor._import_pypdf") @patch("builtins.open", create=True) async def test_pypdf_with_page_range(self, mock_open, mock_import_pypdf): """测试pypdf页面范围提取""" # 模拟pypdf模块 mock_pypdf = Mock() mock_import_pypdf.return_value = mock_pypdf # 模拟PDF阅读器(5页) mock_reader = Mock() mock_reader.pages = [Mock() for _ in range(5)] mock_reader.metadata = None mock_pypdf.PdfReader.return_value = mock_reader # 模拟页面内容 for i, page in enumerate(mock_reader.pages): page.extract_text.return_value = f"Page {i + 1} content" # 模拟文件操作 mock_file = Mock() mock_open.return_value.__enter__.return_value = mock_file with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._extract_with_pypdf( tmp_path, page_range=(2, 4), include_metadata=False ) assert result["success"] is True assert "Page 3 content" in result["text"] assert "Page 4 content" in result["text"] assert "Page 1 content" not in result["text"] assert "Page 5 content" not in result["text"] assert result["pages_processed"] == 2 assert "metadata" not in result finally: if tmp_path.exists(): tmp_path.unlink() @pytest.mark.asyncio @patch("extractor.pdf_processor._import_pypdf") async def test_pypdf_extraction_error(self, mock_import_pypdf): """测试pypdf提取错误""" mock_import_pypdf.side_effect = ImportError("pypdf not available") with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._extract_with_pypdf(tmp_path) assert result["success"] is False assert "pypdf extraction failed" in result["error"] finally: if tmp_path.exists(): tmp_path.unlink() class TestAutoExtraction: """测试自动提取功能""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() @pytest.mark.asyncio async def test_auto_extraction_pymupdf_success(self): """测试自动提取PyMuPDF成功""" with patch.object(self.processor, "_extract_with_pymupdf") as mock_pymupdf: mock_pymupdf.return_value = { "success": True, "text": "Extracted text", "pages_processed": 1, } with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._auto_extract(tmp_path) assert result["success"] is True assert result["method_used"] == "pymupdf" assert result["text"] == "Extracted text" mock_pymupdf.assert_called_once() finally: if tmp_path.exists(): tmp_path.unlink() @pytest.mark.asyncio async def test_auto_extraction_fallback_to_pypdf(self): """测试自动提取回退到pypdf""" with ( patch.object(self.processor, "_extract_with_pymupdf") as mock_pymupdf, patch.object(self.processor, "_extract_with_pypdf") as mock_pypdf, ): # PyMuPDF失败 mock_pymupdf.side_effect = Exception("PyMuPDF failed") # pypdf成功 mock_pypdf.return_value = { "success": True, "text": "Extracted with pypdf", "pages_processed": 1, } with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._auto_extract(tmp_path) assert result["success"] is True assert result["method_used"] == "pypdf" assert result["text"] == "Extracted with pypdf" mock_pymupdf.assert_called_once() mock_pypdf.assert_called_once() finally: if tmp_path.exists(): tmp_path.unlink() @pytest.mark.asyncio async def test_auto_extraction_both_methods_fail(self): """测试自动提取两种方法都失败""" with ( patch.object(self.processor, "_extract_with_pymupdf") as mock_pymupdf, patch.object(self.processor, "_extract_with_pypdf") as mock_pypdf, ): # 两种方法都失败 mock_pymupdf.side_effect = Exception("PyMuPDF failed") mock_pypdf.side_effect = Exception("pypdf failed") with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = Path(tmp_file.name) try: result = await self.processor._auto_extract(tmp_path) assert result["success"] is False assert ( "Both PyMuPDF and pypdf extraction methods failed" in result["error"] ) finally: if tmp_path.exists(): tmp_path.unlink() class TestMarkdownConversion: """测试Markdown转换功能""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() def test_basic_markdown_conversion(self): """测试基本Markdown转换""" text = "INTRODUCTION\n\nThis is a paragraph.\n\nSection Title:\n\nAnother paragraph." result = self.processor._convert_to_markdown(text) assert "# INTRODUCTION" in result assert "## Section Title:" in result assert "This is a paragraph." in result assert "Another paragraph." in result def test_heading_detection(self): """测试标题检测""" text = "MAIN TITLE\n\nSubsection Header:\n\nNormal text here." result = self.processor._convert_to_markdown(text) assert "# MAIN TITLE" in result assert "## Subsection Header:" in result assert "Normal text here." in result def test_empty_lines_handling(self): """测试空行处理""" text = "Line 1\n\n\nLine 2\n\n\n\nLine 3" result = self.processor._convert_to_markdown(text) lines = result.split("\n") assert "Line 1" in result assert "Line 2" in result assert "Line 3" in result def test_long_heading_not_converted(self): """测试长标题不被转换""" text = "THIS IS A VERY LONG TITLE THAT SHOULD NOT BE CONVERTED TO HEADING" result = self.processor._convert_to_markdown(text) # 由于标题太长(超过5个词),不应该转换为Markdown标题 assert result.strip() == text class TestPDFProcessing: """测试PDF完整处理流程""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() @pytest.mark.asyncio async def test_process_pdf_with_text_output(self): """测试PDF处理文本输出""" with patch.object(self.processor, "_extract_with_pymupdf") as mock_extract: mock_extract.return_value = { "success": True, "text": "Extracted PDF text", "pages_processed": 1, "total_pages": 1, "metadata": {"title": "Test PDF"}, } # 创建临时PDF文件 with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_file.write(b"fake PDF content") tmp_path = tmp_file.name try: result = await self.processor.process_pdf( tmp_path, method="pymupdf", output_format="text" ) assert result["success"] is True assert result["text"] == "Extracted PDF text" assert result["output_format"] == "text" assert result["method_used"] == "pymupdf" assert result["pages_processed"] == 1 assert result["word_count"] == 3 # "Extracted PDF text" assert "markdown" not in result finally: os.unlink(tmp_path) @pytest.mark.asyncio async def test_process_pdf_with_markdown_output(self): """测试PDF处理Markdown输出""" with patch.object(self.processor, "_extract_with_pymupdf") as mock_extract: mock_extract.return_value = { "success": True, "text": "TITLE\n\nContent paragraph.", "pages_processed": 1, "total_pages": 1, } with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_file.write(b"fake PDF content") tmp_path = tmp_file.name try: result = await self.processor.process_pdf( tmp_path, method="pymupdf", output_format="markdown" ) assert result["success"] is True assert "markdown" in result assert "# TITLE" in result["markdown"] assert "Content paragraph." in result["markdown"] assert result["output_format"] == "markdown" finally: os.unlink(tmp_path) @pytest.mark.asyncio @patch("aiohttp.ClientSession.get") async def test_process_pdf_from_url(self, mock_get): """测试从URL处理PDF""" # 模拟HTTP下载 mock_response = AsyncMock() mock_response.status = 200 mock_response.read = AsyncMock(return_value=b"fake PDF content") mock_get.return_value.__aenter__.return_value = mock_response with patch.object(self.processor, "_extract_with_pymupdf") as mock_extract: mock_extract.return_value = { "success": True, "text": "URL PDF content", "pages_processed": 1, "total_pages": 1, } result = await self.processor.process_pdf( "https://example.com/test.pdf", method="pymupdf" ) assert result["success"] is True assert result["source"] == "https://example.com/test.pdf" assert "URL PDF content" in result["text"] class TestBatchProcessing: """测试批量处理功能""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() @pytest.mark.asyncio async def test_empty_batch_processing(self): """测试空批量处理""" result = await self.processor.batch_process_pdfs([]) assert result["success"] is False assert result["error"] == "PDF sources list cannot be empty" @pytest.mark.asyncio async def test_successful_batch_processing(self): """测试成功的批量处理""" with patch.object(self.processor, "process_pdf") as mock_process: # 模拟成功的处理结果 mock_process.side_effect = [ { "success": True, "text": "PDF 1 content", "pages_processed": 2, "word_count": 10, "source": "pdf1.pdf", }, { "success": True, "text": "PDF 2 content", "pages_processed": 3, "word_count": 15, "source": "pdf2.pdf", }, {"success": False, "error": "Processing failed", "source": "pdf3.pdf"}, ] pdf_sources = ["pdf1.pdf", "pdf2.pdf", "pdf3.pdf"] result = await self.processor.batch_process_pdfs( pdf_sources, method="auto", output_format="markdown" ) assert result["success"] is True assert len(result["results"]) == 3 assert result["summary"]["total_pdfs"] == 3 assert result["summary"]["successful"] == 2 assert result["summary"]["failed"] == 1 assert result["summary"]["total_pages_processed"] == 5 # 2 + 3 assert result["summary"]["total_words_extracted"] == 25 # 10 + 15 assert result["summary"]["method_used"] == "auto" assert result["summary"]["output_format"] == "markdown" @pytest.mark.asyncio async def test_batch_processing_with_exceptions(self): """测试批量处理异常情况""" with patch.object(self.processor, "process_pdf") as mock_process: # 模拟处理异常 mock_process.side_effect = [ { "success": True, "text": "Success", "pages_processed": 1, "word_count": 5, }, Exception("Processing error"), {"success": False, "error": "Failed", "source": "pdf3.pdf"}, ] pdf_sources = ["pdf1.pdf", "pdf2.pdf", "pdf3.pdf"] result = await self.processor.batch_process_pdfs(pdf_sources) assert result["success"] is True assert len(result["results"]) == 3 assert result["results"][0]["success"] is True assert result["results"][1]["success"] is False assert "Processing error" in result["results"][1]["error"] assert result["results"][2]["success"] is False assert result["summary"]["successful"] == 1 assert result["summary"]["failed"] == 2 class TestCleanup: """测试清理功能""" def test_cleanup_temp_directory(self): """测试清理临时目录""" processor = PDFProcessor() temp_dir = processor.temp_dir # 验证临时目录存在 assert os.path.exists(temp_dir) # 执行清理 processor.cleanup() # 验证临时目录被删除 assert not os.path.exists(temp_dir) def test_cleanup_with_missing_directory(self): """测试清理不存在的目录""" processor = PDFProcessor() # 手动删除目录 import shutil shutil.rmtree(processor.temp_dir) # 清理应该不会抛出异常 processor.cleanup() # 应该正常执行 class TestErrorHandling: """测试错误处理""" def setup_method(self): """测试前准备""" self.processor = PDFProcessor() def teardown_method(self): """测试后清理""" self.processor.cleanup() @pytest.mark.asyncio async def test_extraction_method_failure(self): """测试提取方法失败""" with patch.object(self.processor, "_extract_with_pymupdf") as mock_extract: mock_extract.return_value = {"success": False, "error": "Extraction failed"} with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = tmp_file.name try: result = await self.processor.process_pdf(tmp_path, method="pymupdf") assert result["success"] is False assert result["error"] == "Extraction failed" finally: os.unlink(tmp_path) @pytest.mark.asyncio async def test_general_processing_exception(self): """测试处理过程中的一般异常""" with patch.object(self.processor, "_extract_with_pymupdf") as mock_extract: mock_extract.side_effect = Exception("Unexpected error") with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: tmp_path = tmp_file.name try: result = await self.processor.process_pdf(tmp_path, method="pymupdf") assert result["success"] is False assert "Unexpected error" in result["error"] finally: os.unlink(tmp_path) @pytest.mark.asyncio async def test_file_cleanup_after_error(self): """测试错误后的文件清理""" with ( patch.object(self.processor, "_download_pdf") as mock_download, patch.object(self.processor, "_extract_with_pymupdf") as mock_extract, ): # 模拟下载成功 temp_file = tempfile.NamedTemporaryFile( suffix=".pdf", dir=self.processor.temp_dir, delete=False ) temp_path = Path(temp_file.name) temp_file.close() mock_download.return_value = temp_path # 模拟提取失败 mock_extract.side_effect = Exception("Extraction error") # 文件应该存在 assert temp_path.exists() result = await self.processor.process_pdf("https://example.com/test.pdf") # 处理应该失败 assert result["success"] is False # 临时文件应该被清理 assert not temp_path.exists()

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server