Registry Review MCP Server

test_document_processing.py•7.58 KiB

"""Tests for document processing (Phase 2).

Covers three areas of ``registry_review_mcp.tools.document_tools``:

* document discovery and filename-based classification,
* PDF text extraction (whole file, page ranges, repeated calls),
* an end-to-end flow from session creation through discovery and extraction.

Every test that creates a session deletes it in a ``finally`` clause so that
a failing assertion does not leave session state behind.
"""

from pathlib import Path

import pytest

from registry_review_mcp.tools import session_tools, document_tools
from registry_review_mcp.utils.state import StateManager


class TestDocumentDiscovery:
    """Test document discovery functionality."""

    @pytest.mark.asyncio
    async def test_discover_documents_botany_farm(self, example_documents_path):
        """Test document discovery with real Botany Farm data.

        NOTE(review): ``example_documents_path`` is a fixture defined outside
        this file (presumably in conftest.py) -- confirm it points at the
        examples/22-23 directory referenced by the assertion message below.
        """
        # Create session
        session = await session_tools.create_session(
            project_name="Botany Farm Test",
            documents_path=str(example_documents_path),
            methodology="soil-carbon-v1.2.2",
        )
        session_id = session["session_id"]

        try:
            # Run discovery
            results = await document_tools.discover_documents(session_id)

            # Verify results
            assert results["documents_found"] > 0, "Should find documents in examples/22-23"
            assert "classification_summary" in results
            assert "documents" in results

            # Check that at least some PDFs were found
            assert results["documents_found"] >= 5, "Should find at least 5 PDFs"

            # Verify classification summary has expected types
            summary = results["classification_summary"]
            assert "project_plan" in summary or "baseline_report" in summary, (
                "Should classify at least one project plan or baseline report"
            )

            # Verify document structure
            first_doc = results["documents"][0]
            assert "document_id" in first_doc
            assert "filename" in first_doc
            assert "classification" in first_doc
            assert "confidence" in first_doc
            assert first_doc["document_id"].startswith("DOC-")
        finally:
            # Cleanup must run even when an assertion above fails; otherwise
            # the session created for this test would be left behind.
            await session_tools.delete_session(session_id)

    @pytest.mark.asyncio
    async def test_document_classification_patterns(self):
        """Test that classification patterns work correctly.

        ``classify_document_by_filename`` is awaited with a path string and
        its result is unpacked as ``(classification, confidence, method)``.
        Only the first case checks ``method``.
        """
        # Test project plan
        classification, confidence, method = await document_tools.classify_document_by_filename(
            "/path/to/4997Botany22_Public_Project_Plan.pdf"
        )
        assert classification == "project_plan"
        assert confidence >= 0.9
        assert method == "filename"

        # Test baseline report
        classification, confidence, _ = await document_tools.classify_document_by_filename(
            "/path/to/Baseline_Report_2022.pdf"
        )
        assert classification == "baseline_report"
        assert confidence >= 0.9

        # Test monitoring report
        classification, confidence, _ = await document_tools.classify_document_by_filename(
            "/path/to/Monitoring_Report_2023.pdf"
        )
        assert classification == "monitoring_report"
        assert confidence >= 0.8

        # Test GHG emissions
        classification, confidence, _ = await document_tools.classify_document_by_filename(
            "/path/to/GHG_Emissions_Report.pdf"
        )
        assert classification == "ghg_emissions"
        assert confidence >= 0.8

        # Test unknown
        classification, confidence, _ = await document_tools.classify_document_by_filename(
            "/path/to/random_file.pdf"
        )
        assert classification == "unknown"
        assert confidence <= 0.6


class TestPDFExtraction:
    """Test PDF text extraction.

    NOTE(review): ``pytest.mark.marker`` is a custom mark; presumably it is
    registered in the project's pytest configuration -- confirm.
    """

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_extract_pdf_text_basic(self, example_documents_path):
        """Test basic PDF text extraction."""
        # Find a PDF in the example data
        pdf_files = list(example_documents_path.glob("*.pdf"))
        if not pdf_files:
            pytest.skip("No PDF files found in example data")

        pdf_path = str(pdf_files[0])

        # Extract text
        results = await document_tools.extract_pdf_text(pdf_path)

        # Verify results
        assert results["filepath"] == pdf_path
        assert results["page_count"] > 0
        assert len(results["full_text"]) > 0
        assert len(results["pages"]) > 0

        # Verify page structure
        first_page = results["pages"][0]
        assert "page_number" in first_page
        assert "text" in first_page
        assert first_page["page_number"] == 1

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_extract_pdf_text_with_page_range(self, example_documents_path):
        """Test PDF extraction with specific page range."""
        pdf_files = list(example_documents_path.glob("*.pdf"))
        if not pdf_files:
            pytest.skip("No PDF files found in example data")

        pdf_path = str(pdf_files[0])

        # Extract pages 1-2
        results = await document_tools.extract_pdf_text(pdf_path, page_range=(1, 2))

        # Verify only 2 pages extracted
        assert len(results["pages"]) == 2
        assert results["pages"][0]["page_number"] == 1
        assert results["pages"][1]["page_number"] == 2

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_extract_pdf_text_caching(self, example_documents_path):
        """Test that PDF extraction results are cached.

        This only checks that two consecutive extractions of the same file
        return equal text and page counts; it does not observe the cache
        itself.
        """
        pdf_files = list(example_documents_path.glob("*.pdf"))
        if not pdf_files:
            pytest.skip("No PDF files found in example data")

        pdf_path = str(pdf_files[0])

        # First extraction
        results1 = await document_tools.extract_pdf_text(pdf_path)

        # Second extraction (should be from cache)
        results2 = await document_tools.extract_pdf_text(pdf_path)

        # Should be identical
        assert results1["full_text"] == results2["full_text"]
        assert results1["page_count"] == results2["page_count"]


class TestEndToEnd:
    """End-to-end workflow tests."""

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_full_discovery_workflow(self, example_documents_path):
        """Test complete discovery workflow from session to results."""
        # Step 1: Create session
        session = await session_tools.create_session(
            project_name="E2E Test Project",
            documents_path=str(example_documents_path),
            methodology="soil-carbon-v1.2.2",
            project_id="C06-9999",
        )
        session_id = session["session_id"]

        try:
            # Step 2: Discover documents
            discovery = await document_tools.discover_documents(session_id)
            assert discovery["documents_found"] > 0
            assert len(discovery["documents"]) == discovery["documents_found"]

            # Step 3: Verify session state updated
            updated_session = await session_tools.load_session(session_id)
            assert updated_session["statistics"]["documents_found"] == discovery["documents_found"]
            assert updated_session["workflow_progress"]["document_discovery"] == "completed"

            # Step 4: Verify documents.json was created
            state_manager = StateManager(session_id)
            docs_data = state_manager.read_json("documents.json")
            assert docs_data["total_count"] == discovery["documents_found"]
            assert len(docs_data["documents"]) == discovery["documents_found"]

            # Step 5: Extract text from first PDF
            pdf_docs = [d for d in discovery["documents"] if d["filename"].endswith(".pdf")]
            if pdf_docs:
                first_pdf = pdf_docs[0]
                text_results = await document_tools.extract_pdf_text(first_pdf["filepath"])
                assert text_results["page_count"] > 0
                assert len(text_results["full_text"]) > 100  # Should have substantial text
        finally:
            # Cleanup
            await session_tools.delete_session(session_id)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/gaiaaiagent/regen-registry-review-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_document_processing.py•7.58 KiB