# NOTE(review): the three lines below are non-Python text (MCP directory
# boilerplate) injected above the module docstring; as bare prose they break
# the file's syntax. Commented out pending removal.
# We provide all the information about MCP servers via our MCP API.
# curl -X GET 'https://glama.ai/api/mcp/v1/servers/gaiaaiagent/regen-registry-review-mcp'
# If you have feedback or need assistance with the MCP directory API, please join our Discord server
"""Tests for document processing (Phase 2)."""
import pytest
from pathlib import Path
from registry_review_mcp.tools import session_tools, document_tools
from registry_review_mcp.utils.state import StateManager
from registry_review_mcp.utils.patterns import is_spreadsheet_file
class TestDocumentDiscovery:
    """Test document discovery functionality."""

    @pytest.mark.asyncio
    async def test_discover_documents_botany_farm(self, example_documents_path):
        """Test document discovery with real Botany Farm data.

        Runs discovery against the example document set and checks the
        result counts, classification summary, and per-document schema.
        """
        # Create session
        session = await session_tools.create_session(
            project_name="Botany Farm Test",
            documents_path=str(example_documents_path),
            methodology="soil-carbon-v1.2.2",
        )
        session_id = session["session_id"]
        # Fix: cleanup now runs in a finally block so a failing assertion no
        # longer leaks the session (matches the convention already used by
        # TestEndToEnd and TestSpreadsheetIngestion).
        try:
            # Run discovery
            results = await document_tools.discover_documents(session_id)

            # Verify results
            assert results["documents_found"] > 0, "Should find documents in examples/22-23"
            assert "classification_summary" in results
            assert "documents" in results

            # Check that at least some PDFs were found
            assert results["documents_found"] >= 5, "Should find at least 5 PDFs"

            # Verify classification summary has expected types
            summary = results["classification_summary"]
            assert "project_plan" in summary or "baseline_report" in summary, \
                "Should classify at least one project plan or baseline report"

            # Verify document structure
            first_doc = results["documents"][0]
            assert "document_id" in first_doc
            assert "filename" in first_doc
            assert "classification" in first_doc
            assert "confidence" in first_doc
            assert first_doc["document_id"].startswith("DOC-")
        finally:
            # Cleanup
            await session_tools.delete_session(session_id)

    @pytest.mark.asyncio
    async def test_document_classification_patterns(self):
        """Test that classification patterns work correctly.

        Each case feeds a representative filename to the classifier and
        checks the resulting (classification, confidence, method) triple.
        """
        # Test project plan
        classification, confidence, method = await document_tools.classify_document_by_filename(
            "/path/to/4997Botany22_Public_Project_Plan.pdf"
        )
        assert classification == "project_plan"
        assert confidence >= 0.9
        assert method == "filename"

        # Test baseline report
        classification, confidence, method = await document_tools.classify_document_by_filename(
            "/path/to/Baseline_Report_2022.pdf"
        )
        assert classification == "baseline_report"
        assert confidence >= 0.9

        # Test monitoring report
        classification, confidence, method = await document_tools.classify_document_by_filename(
            "/path/to/Monitoring_Report_2023.pdf"
        )
        assert classification == "monitoring_report"
        assert confidence >= 0.8

        # Test GHG emissions
        classification, confidence, method = await document_tools.classify_document_by_filename(
            "/path/to/GHG_Emissions_Report.pdf"
        )
        assert classification == "ghg_emissions"
        assert confidence >= 0.8

        # Test unknown: unrecognized names fall through with low confidence
        classification, confidence, method = await document_tools.classify_document_by_filename(
            "/path/to/random_file.pdf"
        )
        assert classification == "unknown"
        assert confidence <= 0.6
class TestMappingConventionConsistency:
    """Regression tests for the classifier ↔ mapping naming convention bug.

    The classifier (classify_document_by_filename) produces classification labels.
    The mapper (_infer_document_types) expects those same labels when looking up
    documents by type. If the conventions diverge, requirements silently fail to
    match their correct documents and fall back to the project plan.

    Diagnosed 2026-02-07: land_tenure, ghg_emissions, and gis_shapefile were
    missed because the mapper used hyphens while the classifier used underscores.
    """

    def test_all_classifier_labels_recognized_by_mapper(self):
        """Every label the classifier can produce must appear in at least one
        mapper category's expected types."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        # All labels the classifier can produce (from classify_document_by_filename)
        classifier_labels = {
            "project_plan", "baseline_report", "monitoring_report",
            "ghg_emissions", "land_tenure", "gis_shapefile",
            "land_cover_map", "registry_review", "methodology_reference",
            "unknown",
        }

        # Collect every label the mapper can return across all checklist categories
        # Use the actual checklist categories from soil-carbon-v1.2.2
        import json
        from registry_review_mcp.config.settings import settings

        checklist_path = settings.get_checklist_path("soil-carbon-v1.2.2")
        with open(checklist_path) as f:
            checklist = json.load(f)

        mapper_labels = set()
        for req in checklist["requirements"]:
            category = req.get("category", "")
            evidence = req.get("accepted_evidence", "")
            types = _infer_document_types(category, evidence)
            mapper_labels.update(types)

        # These labels are informational (not evidence sources), so we don't
        # expect the mapper to reference them
        non_evidence_labels = {"unknown", "registry_review", "methodology_reference"}

        # Every evidence-producing classifier label should appear in mapper output
        evidence_labels = classifier_labels - non_evidence_labels
        missing = evidence_labels - mapper_labels
        assert not missing, (
            f"Classifier labels not recognized by mapper: {missing}. "
            f"Add these to _infer_document_types() in mapping_tools.py."
        )

    @pytest.mark.asyncio
    async def test_land_tenure_document_maps_to_land_tenure_requirement(self):
        """A document classified as land_tenure should map to REQ-002 (Land Tenure),
        not fall back to the project plan."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        types = _infer_document_types("Land Tenure", "Deeds, lease agreements")
        assert "land_tenure" in types, (
            f"Land Tenure requirement should look for 'land_tenure' documents, got: {types}"
        )

    @pytest.mark.asyncio
    async def test_ghg_emissions_document_maps_to_emissions_requirement(self):
        """A document classified as ghg_emissions should map to GHG Accounting requirements."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        types = _infer_document_types("GHG Accounting", "Proof of additionality")
        assert "ghg_emissions" in types or "project_plan" in types, (
            f"GHG Accounting should look for ghg_emissions or project_plan, got: {types}"
        )

    @pytest.mark.asyncio
    async def test_gis_shapefile_maps_to_gis_requirement(self):
        """A document classified as gis_shapefile should map to Project Area requirements."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        types = _infer_document_types(
            "Project Area",
            "GIS shapefiles and maps, with delineations of eligible and ineligible land",
        )
        assert "gis_shapefile" in types, (
            f"Project Area requirement should look for 'gis_shapefile' documents, got: {types}"
        )

    @pytest.mark.asyncio
    async def test_land_registry_documents_classified_as_land_tenure(self):
        """UK Land Registry 'Official Copy (Register)' files should classify as land_tenure."""
        test_cases = [
            "Official Copy (Register) - LT330529.pdf",
            "Official Copy (Register) - LT438680.pdf",
        ]
        for filename in test_cases:
            classification, confidence, method = await document_tools.classify_document_by_filename(filename)
            # Bug fix: the failure message previously interpolated the literal
            # text "(unknown)" instead of the filename under test, making
            # failures impossible to attribute to a specific file.
            assert classification == "land_tenure", (
                f"'{filename}' should classify as land_tenure, got: {classification}"
            )
            assert confidence >= 0.80

    @pytest.mark.asyncio
    async def test_land_cover_map_classified(self):
        """Land Cover Map PDFs should classify as land_cover_map."""
        classification, confidence, method = await document_tools.classify_document_by_filename(
            "Greens Lodge Farm - 2012 Land Cover Map.pdf"
        )
        assert classification == "land_cover_map", (
            f"Land cover map should classify as land_cover_map, got: {classification}"
        )

    @pytest.mark.asyncio
    async def test_land_cover_map_maps_to_project_area(self):
        """Land cover maps should be included in Project Area requirement mapping."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        types = _infer_document_types(
            "Project Area",
            "GIS shapefiles and maps, with delineations of eligible and ineligible land",
        )
        assert "land_cover_map" in types

    @pytest.mark.asyncio
    async def test_ecosystem_type_includes_spreadsheet_data(self):
        """Ecosystem Type requirements should accept spreadsheet and land cover data."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        types = _infer_document_types("Ecosystem Type", "Provide proof of land use")
        assert "spreadsheet_data" in types, (
            f"Ecosystem Type should include spreadsheet_data, got: {types}"
        )
        assert "land_cover_map" in types, (
            f"Ecosystem Type should include land_cover_map, got: {types}"
        )

    def test_no_hyphenated_labels_in_mapper(self):
        """The mapper should never return hyphenated labels. All labels must use
        underscores to match the classifier convention."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        # Test every checklist category
        import json
        from registry_review_mcp.config.settings import settings

        checklist_path = settings.get_checklist_path("soil-carbon-v1.2.2")
        with open(checklist_path) as f:
            checklist = json.load(f)

        for req in checklist["requirements"]:
            types = _infer_document_types(
                req.get("category", ""),
                req.get("accepted_evidence", ""),
            )
            for t in types:
                assert "-" not in t, (
                    f"Mapper returned hyphenated label '{t}' for requirement "
                    f"{req['requirement_id']} ({req['category']}). "
                    f"Use underscores to match classifier convention."
                )
class TestPDFExtraction:
    """Test PDF text extraction."""

    # NOTE(review): "@pytest.mark.marker" reads like a placeholder mark name —
    # confirm the intended custom marker (e.g. slow/integration) and register it.

    def _first_example_pdf(self, example_documents_path):
        """Return the first example-data PDF path as a str; skip the test if none exist."""
        candidates = list(example_documents_path.glob("*.pdf"))
        if not candidates:
            pytest.skip("No PDF files found in example data")
        return str(candidates[0])

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_extract_pdf_text_basic(self, example_documents_path):
        """Test basic PDF text extraction."""
        pdf_path = self._first_example_pdf(example_documents_path)

        # Extract text and sanity-check the top-level result fields.
        extracted = await document_tools.extract_pdf_text(pdf_path)
        assert extracted["filepath"] == pdf_path
        assert extracted["page_count"] > 0
        assert len(extracted["full_text"]) > 0
        assert len(extracted["pages"]) > 0

        # Each page entry carries a 1-indexed page number plus its text.
        page_one = extracted["pages"][0]
        assert "page_number" in page_one
        assert "text" in page_one
        assert page_one["page_number"] == 1

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_extract_pdf_text_with_page_range(self, example_documents_path):
        """Test PDF extraction with specific page range."""
        pdf_path = self._first_example_pdf(example_documents_path)

        # Request pages 1-2 and confirm exactly those pages come back.
        extracted = await document_tools.extract_pdf_text(pdf_path, page_range=(1, 2))
        assert len(extracted["pages"]) == 2
        assert extracted["pages"][0]["page_number"] == 1
        assert extracted["pages"][1]["page_number"] == 2

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_extract_pdf_text_caching(self, example_documents_path):
        """Test that PDF extraction results are cached."""
        pdf_path = self._first_example_pdf(example_documents_path)

        first = await document_tools.extract_pdf_text(pdf_path)
        second = await document_tools.extract_pdf_text(pdf_path)  # expected cache hit

        # Cached and fresh extractions must agree.
        assert first["full_text"] == second["full_text"]
        assert first["page_count"] == second["page_count"]
class TestEndToEnd:
    """End-to-end workflow tests."""

    @pytest.mark.marker
    @pytest.mark.asyncio
    async def test_full_discovery_workflow(self, example_documents_path):
        """Test complete discovery workflow from session to results."""
        # Step 1: Create session
        created = await session_tools.create_session(
            project_name="E2E Test Project",
            documents_path=str(example_documents_path),
            methodology="soil-carbon-v1.2.2",
            project_id="C06-9999",
        )
        sid = created["session_id"]
        try:
            # Step 2: Discover documents
            discovered = await document_tools.discover_documents(sid)
            n_found = discovered["documents_found"]
            assert n_found > 0
            assert len(discovered["documents"]) == n_found

            # Step 3: Verify session state updated
            reloaded = await session_tools.load_session(sid)
            assert reloaded["statistics"]["documents_found"] == n_found
            assert reloaded["workflow_progress"]["document_discovery"] == "completed"

            # Step 4: Verify documents.json was created
            persisted = StateManager(sid).read_json("documents.json")
            assert persisted["total_count"] == n_found
            assert len(persisted["documents"]) == n_found

            # Step 5: Extract text from first PDF
            pdfs = [doc for doc in discovered["documents"] if doc["filename"].endswith(".pdf")]
            if pdfs:
                extracted = await document_tools.extract_pdf_text(pdfs[0]["filepath"])
                assert extracted["page_count"] > 0
                assert len(extracted["full_text"]) > 100  # Should have substantial text
        finally:
            # Cleanup
            await session_tools.delete_session(sid)
class TestSpreadsheetIngestion:
    """Tests for spreadsheet (.xlsx, .csv) discovery, classification, and extraction."""

    @pytest.mark.asyncio
    async def test_spreadsheet_discovery(self, sample_xlsx, sample_csv, tmp_path):
        """Verify .xlsx and .csv files are found during document discovery."""
        created = await session_tools.create_session(
            project_name="Spreadsheet Discovery Test",
            documents_path=str(tmp_path),
            methodology="soil-carbon-v1.2.2",
        )
        sid = created["session_id"]
        try:
            discovery = await document_tools.discover_documents(sid)
            found_names = {entry["filename"] for entry in discovery["documents"]}
            assert "farm_data.xlsx" in found_names, "Should discover .xlsx files"
            assert "monitoring_data.csv" in found_names, "Should discover .csv files"
            assert discovery["documents_found"] >= 2
        finally:
            await session_tools.delete_session(sid)

    @pytest.mark.asyncio
    async def test_spreadsheet_extraction_xlsx(self, sample_xlsx):
        """Verify .xlsx extraction produces well-structured markdown."""
        from registry_review_mcp.extractors.spreadsheet_extractor import extract_spreadsheet

        extraction = await extract_spreadsheet(str(sample_xlsx))

        # Structural metadata for the two-sheet workbook.
        assert extraction["sheet_count"] == 2
        assert extraction["page_count"] == 2  # compatibility alias
        assert extraction["row_count"] == 8  # 5 farm + 3 tenure
        assert extraction["tables_found"] == 2
        assert extraction["extraction_method"] == "openpyxl"
        assert extraction["total_chars"] > 0

        # Rendered markdown carries sheet headers and cell content.
        rendered = extraction["markdown"]
        assert '--- Sheet "Farm Data" (1 of 2) ---' in rendered
        assert '--- Sheet "Land Tenure" (2 of 2) ---' in rendered
        assert "CE-001" in rendered
        assert "Alice" in rendered

    @pytest.mark.asyncio
    async def test_spreadsheet_extraction_csv(self, sample_csv):
        """Verify .csv extraction produces well-structured markdown."""
        from registry_review_mcp.extractors.spreadsheet_extractor import extract_spreadsheet

        extraction = await extract_spreadsheet(str(sample_csv))

        # A CSV is treated as a single-sheet document.
        assert extraction["sheet_count"] == 1
        assert extraction["page_count"] == 1
        assert extraction["row_count"] == 3
        assert extraction["extraction_method"] == "csv"

        rendered = extraction["markdown"]
        assert '--- Sheet "monitoring_data.csv" (1 of 1) ---' in rendered
        assert "SOC (g/kg)" in rendered
        assert "23.4" in rendered

    @pytest.mark.asyncio
    async def test_spreadsheet_classification(self, sample_land_tenure_xlsx):
        """Verify land_tenure_records.xlsx classifies as land_tenure, not generic spreadsheet_data."""
        label, score, how = await document_tools.classify_document_by_filename(
            str(sample_land_tenure_xlsx)
        )
        assert label == "land_tenure", (
            f"Expected 'land_tenure' for land_tenure_records.xlsx, got '{label}'"
        )
        assert score >= 0.80
        assert how == "filename"

    @pytest.mark.asyncio
    async def test_generic_spreadsheet_classification(self, sample_xlsx):
        """Verify a generically-named spreadsheet gets spreadsheet_data classification."""
        label, score, how = await document_tools.classify_document_by_filename(
            str(sample_xlsx)
        )
        assert label == "spreadsheet_data"
        assert how == "file_type"

    def test_spreadsheet_data_in_mapper(self):
        """Verify spreadsheet_data appears in mapper for relevant requirement categories."""
        from registry_review_mcp.tools.mapping_tools import _infer_document_types

        # Land tenure requirements should look for spreadsheet data
        assert "spreadsheet_data" in _infer_document_types("Land Tenure", "Deeds, lease agreements")
        # Monitoring requirements should look for spreadsheet data
        assert "spreadsheet_data" in _infer_document_types("Monitoring", "Sampling records")

    def test_is_spreadsheet_file_helper(self):
        """Verify the is_spreadsheet_file pattern helper."""
        # Accepted extensions, including a case-insensitivity check (DATA.XLSX).
        for name in ("data.xlsx", "records.csv", "report.tsv", "DATA.XLSX"):
            assert is_spreadsheet_file(name)
        # Non-spreadsheet formats are rejected.
        for name in ("report.pdf", "map.shp"):
            assert not is_spreadsheet_file(name)