Registry Review MCP Server

test_llm_extraction_integration.py•7.82 KiB

"""Integration tests for LLM extraction with real API calls."""

import pytest
from pathlib import Path

from registry_review_mcp.config.settings import settings
from registry_review_mcp.extractors.llm_extractors import (
    DateExtractor,
    extract_fields_with_llm,
)


# Module-wide marks: every test here is tagged "expensive", and the whole
# module is skipped unless settings provide both an Anthropic API key and the
# LLM-extraction flag.
pytestmark = [
    pytest.mark.expensive,
    pytest.mark.skipif(
        not (settings.anthropic_api_key and settings.llm_extraction_enabled),
        reason="LLM extraction not configured (set ANTHROPIC_API_KEY and enable LLM extraction)",
    ),
]


class TestRealAPIDateExtraction:
    """Exercise ``DateExtractor.extract`` with real Anthropic API calls.

    Each test passes a small markdown snippet, an empty second argument and a
    document name to the extractor, prints the returned fields (visible with
    ``pytest -s``) and then asserts on their ``field_type``, ``value`` and
    ``confidence`` attributes.
    """

    @pytest.mark.asyncio
    async def test_extract_simple_project_start_date(self):
        """A single start date under a heading is reported as 2022-01-01."""
        markdown = """
        # Project Information

        ## 1.8. Project Start Date

        01/01/2022. The project will be aligned with the calendar year, with annual
        monitoring rounds taking place in the August  March bracket when the soil is
        dormant.
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_project_plan.pdf")

        print(f"\n=== Extracted {len(results)} dates ===")
        for field in results:
            print(f"  {field.field_type}: {field.value} (confidence: {field.confidence})")
            print(f"    Reasoning: {field.reasoning}")

        # Verify we got results
        assert len(results) > 0, "Should extract at least one date"

        # Check for project start date
        project_start_dates = [f for f in results if f.field_type == "project_start_date"]
        assert len(project_start_dates) > 0, "Should find project start date"

        # Verify the date value; str() keeps the substring check valid even if
        # the extractor returns a non-string value (matches the handling in
        # test_date_format_flexibility).
        assert "2022-01-01" in str(project_start_dates[0].value)

        # Verify confidence is reasonable
        assert project_start_dates[0].confidence >= 0.7

    @pytest.mark.asyncio
    async def test_extract_multiple_date_types(self):
        """Several differently-labelled dates produce several field types."""
        markdown = """
        # Project Timeline

        Project Start Date: January 1, 2022

        Baseline assessment was conducted on March 15, 2022.

        Satellite imagery was acquired on June 20, 2022 for the baseline analysis.

        Field sampling occurred from August 15-20, 2022.

        The monitoring report was completed on December 31, 2022.
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_timeline.pdf")

        print(f"\n=== Extracted {len(results)} dates ===")
        for field in results:
            print(f"  {field.field_type}: {field.value} (confidence: {field.confidence})")

        # Should extract multiple dates
        assert len(results) >= 3, "Should extract at least 3 different dates"

        # Check for different date types
        date_types = {f.field_type for f in results}
        assert "project_start_date" in date_types
        assert "baseline_date" in date_types or "sampling_date" in date_types

    @pytest.mark.asyncio
    async def test_date_format_flexibility(self):
        """Long-form, MM/DD/YYYY and ISO dates are all recognised."""
        markdown = """
        Project Timeline:

        Project started: January 15, 2022

        Baseline conducted: 03/20/2022

        Field work: August 10-15, 2022

        Report submitted: 2022-12-31
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_formats.pdf")

        print(f"\n=== Extracted {len(results)} dates from various formats ===")
        for field in results:
            print(f"  {field.value} from: '{field.raw_text}' (type: {field.field_type})")

        # Should recognize multiple formats
        assert len(results) >= 3, "Should extract dates in multiple formats"

        # Verify different date formats were parsed.  The MM/DD/YYYY check
        # requires the full normalised date: a bare "03" substring would be
        # satisfied by unrelated values and would not prove the format was
        # understood.
        values = [str(f.value) for f in results]
        assert any("2022-01-15" in v for v in values), "Should parse 'January 15, 2022'"
        assert any("2022-03-20" in v for v in values), "Should parse '03/20/2022'"

    @pytest.mark.asyncio
    async def test_date_disambiguation(self):
        """Dates in different contexts receive different field types."""
        markdown = """
        Project Dates:

        The project started on 01/01/2022.

        Imagery was acquired on 06/15/2022 for baseline analysis.

        Soil sampling was conducted on 08/20/2022.
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_disambiguation.pdf")

        print("\n=== Date Classification ===")
        for field in results:
            print(f"  {field.field_type}: {field.value}")
            print(f"    Context: {field.raw_text}")

        # Check that dates are classified differently
        date_types = {f.field_type for f in results}
        assert len(date_types) >= 2, "Should classify dates into different types"

        # Verify specific classifications
        imagery_dates = [f for f in results if "imagery" in f.field_type]
        sampling_dates = [f for f in results if "sampling" in f.field_type]

        # At least one should be correctly classified
        assert len(imagery_dates) > 0 or len(sampling_dates) > 0


class TestCachingWithRealAPI:
    """Check that a repeated extraction is served quickly and identically."""

    @pytest.mark.asyncio
    async def test_caching_prevents_duplicate_api_calls(self):
        """Run the same extraction twice; the repeat must match and be fast.

        The document name embeds the current Unix time (presumably so that the
        first call misses any earlier cache entry -- confirm how
        ``DateExtractor`` keys its cache).
        """
        import time

        markdown = "Project started on 01/01/2022"
        doc_name = f"cache_test_{int(time.time())}.pdf"

        extractor = DateExtractor()

        # Durations use perf_counter(): it is monotonic and high resolution,
        # so a wall-clock adjustment during the test cannot skew the result.

        # First call - hits API
        start1 = time.perf_counter()
        results1 = await extractor.extract(markdown, [], doc_name)
        duration1 = time.perf_counter() - start1

        print(f"\nFirst call (API): {duration1:.2f}s, extracted {len(results1)} dates")

        # Second call - should use cache
        start2 = time.perf_counter()
        results2 = await extractor.extract(markdown, [], doc_name)
        duration2 = time.perf_counter() - start2

        print(f"Second call (cache): {duration2:.2f}s, extracted {len(results2)} dates")

        # Fail with a clear message instead of an IndexError/vacuous pass when
        # nothing was extracted at all.
        assert len(results1) > 0, "Should extract at least one date"

        # Verify results are identical, value by value
        assert len(results1) == len(results2)
        assert [f.value for f in results1] == [f.value for f in results2]

        # Second call should be much faster (< 100ms for cache hit)
        assert duration2 < 0.1, f"Cache hit should be fast, got {duration2}s"


class TestEndToEndExtraction:
    """Drive ``extract_fields_with_llm`` with an evidence.json-shaped payload."""

    @pytest.mark.asyncio
    async def test_extract_from_evidence_structure(self):
        """One evidence snippet with a start date yields at least one date field."""
        # Build the simulated evidence structure from the inside out.
        snippet = {
            "document_name": "Project Plan",
            "text": "Project Start Date: 01/01/2022. The project will be aligned with the calendar year.",
            "page": 4,
        }
        requirement = {
            "requirement_id": "REQ-007",
            "requirement_text": "Project start date",
            "evidence_snippets": [snippet],
        }
        evidence_data = {"evidence": [requirement]}

        results = await extract_fields_with_llm("test_session", evidence_data)
        extracted = results.get("dates", [])

        print("\n=== End-to-End Extraction ===")
        print(f"Dates extracted: {len(extracted)}")
        for item in extracted:
            print(f"  {item.field_type}: {item.value} (confidence: {item.confidence})")

        # Dates must be present ...
        assert len(extracted) > 0, "Should extract dates from evidence"

        # ... and the first one must carry a reasonable confidence.
        assert extracted[0].confidence >= 0.7, "Should have reasonable confidence"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/gaiaaiagent/regen-registry-review-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_llm_extraction_integration.py•7.82 KiB

"""Integration tests for LLM extraction with real API calls."""

import pytest
from pathlib import Path

from registry_review_mcp.config.settings import settings
from registry_review_mcp.extractors.llm_extractors import (
    DateExtractor,
    extract_fields_with_llm,
)


# Module-wide marks: every test here is tagged "expensive", and the whole
# module is skipped unless settings provide both an Anthropic API key and the
# LLM-extraction flag.
pytestmark = [
    pytest.mark.expensive,
    pytest.mark.skipif(
        not (settings.anthropic_api_key and settings.llm_extraction_enabled),
        reason="LLM extraction not configured (set ANTHROPIC_API_KEY and enable LLM extraction)",
    ),
]


class TestRealAPIDateExtraction:
    """Exercise ``DateExtractor.extract`` with real Anthropic API calls.

    Each test passes a small markdown snippet, an empty second argument and a
    document name to the extractor, prints the returned fields (visible with
    ``pytest -s``) and then asserts on their ``field_type``, ``value`` and
    ``confidence`` attributes.
    """

    @pytest.mark.asyncio
    async def test_extract_simple_project_start_date(self):
        """A single start date under a heading is reported as 2022-01-01."""
        markdown = """
        # Project Information

        ## 1.8. Project Start Date

        01/01/2022. The project will be aligned with the calendar year, with annual
        monitoring rounds taking place in the August  March bracket when the soil is
        dormant.
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_project_plan.pdf")

        print(f"\n=== Extracted {len(results)} dates ===")
        for field in results:
            print(f"  {field.field_type}: {field.value} (confidence: {field.confidence})")
            print(f"    Reasoning: {field.reasoning}")

        # Verify we got results
        assert len(results) > 0, "Should extract at least one date"

        # Check for project start date
        project_start_dates = [f for f in results if f.field_type == "project_start_date"]
        assert len(project_start_dates) > 0, "Should find project start date"

        # Verify the date value; str() keeps the substring check valid even if
        # the extractor returns a non-string value (matches the handling in
        # test_date_format_flexibility).
        assert "2022-01-01" in str(project_start_dates[0].value)

        # Verify confidence is reasonable
        assert project_start_dates[0].confidence >= 0.7

    @pytest.mark.asyncio
    async def test_extract_multiple_date_types(self):
        """Several differently-labelled dates produce several field types."""
        markdown = """
        # Project Timeline

        Project Start Date: January 1, 2022

        Baseline assessment was conducted on March 15, 2022.

        Satellite imagery was acquired on June 20, 2022 for the baseline analysis.

        Field sampling occurred from August 15-20, 2022.

        The monitoring report was completed on December 31, 2022.
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_timeline.pdf")

        print(f"\n=== Extracted {len(results)} dates ===")
        for field in results:
            print(f"  {field.field_type}: {field.value} (confidence: {field.confidence})")

        # Should extract multiple dates
        assert len(results) >= 3, "Should extract at least 3 different dates"

        # Check for different date types
        date_types = {f.field_type for f in results}
        assert "project_start_date" in date_types
        assert "baseline_date" in date_types or "sampling_date" in date_types

    @pytest.mark.asyncio
    async def test_date_format_flexibility(self):
        """Long-form, MM/DD/YYYY and ISO dates are all recognised."""
        markdown = """
        Project Timeline:

        Project started: January 15, 2022

        Baseline conducted: 03/20/2022

        Field work: August 10-15, 2022

        Report submitted: 2022-12-31
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_formats.pdf")

        print(f"\n=== Extracted {len(results)} dates from various formats ===")
        for field in results:
            print(f"  {field.value} from: '{field.raw_text}' (type: {field.field_type})")

        # Should recognize multiple formats
        assert len(results) >= 3, "Should extract dates in multiple formats"

        # Verify different date formats were parsed.  The MM/DD/YYYY check
        # requires the full normalised date: a bare "03" substring would be
        # satisfied by unrelated values and would not prove the format was
        # understood.
        values = [str(f.value) for f in results]
        assert any("2022-01-15" in v for v in values), "Should parse 'January 15, 2022'"
        assert any("2022-03-20" in v for v in values), "Should parse '03/20/2022'"

    @pytest.mark.asyncio
    async def test_date_disambiguation(self):
        """Dates in different contexts receive different field types."""
        markdown = """
        Project Dates:

        The project started on 01/01/2022.

        Imagery was acquired on 06/15/2022 for baseline analysis.

        Soil sampling was conducted on 08/20/2022.
        """

        extractor = DateExtractor()
        results = await extractor.extract(markdown, [], "test_disambiguation.pdf")

        print("\n=== Date Classification ===")
        for field in results:
            print(f"  {field.field_type}: {field.value}")
            print(f"    Context: {field.raw_text}")

        # Check that dates are classified differently
        date_types = {f.field_type for f in results}
        assert len(date_types) >= 2, "Should classify dates into different types"

        # Verify specific classifications
        imagery_dates = [f for f in results if "imagery" in f.field_type]
        sampling_dates = [f for f in results if "sampling" in f.field_type]

        # At least one should be correctly classified
        assert len(imagery_dates) > 0 or len(sampling_dates) > 0


class TestCachingWithRealAPI:
    """Check that a repeated extraction is served quickly and identically."""

    @pytest.mark.asyncio
    async def test_caching_prevents_duplicate_api_calls(self):
        """Run the same extraction twice; the repeat must match and be fast.

        The document name embeds the current Unix time (presumably so that the
        first call misses any earlier cache entry -- confirm how
        ``DateExtractor`` keys its cache).
        """
        import time

        markdown = "Project started on 01/01/2022"
        doc_name = f"cache_test_{int(time.time())}.pdf"

        extractor = DateExtractor()

        # Durations use perf_counter(): it is monotonic and high resolution,
        # so a wall-clock adjustment during the test cannot skew the result.

        # First call - hits API
        start1 = time.perf_counter()
        results1 = await extractor.extract(markdown, [], doc_name)
        duration1 = time.perf_counter() - start1

        print(f"\nFirst call (API): {duration1:.2f}s, extracted {len(results1)} dates")

        # Second call - should use cache
        start2 = time.perf_counter()
        results2 = await extractor.extract(markdown, [], doc_name)
        duration2 = time.perf_counter() - start2

        print(f"Second call (cache): {duration2:.2f}s, extracted {len(results2)} dates")

        # Fail with a clear message instead of an IndexError/vacuous pass when
        # nothing was extracted at all.
        assert len(results1) > 0, "Should extract at least one date"

        # Verify results are identical, value by value
        assert len(results1) == len(results2)
        assert [f.value for f in results1] == [f.value for f in results2]

        # Second call should be much faster (< 100ms for cache hit)
        assert duration2 < 0.1, f"Cache hit should be fast, got {duration2}s"


class TestEndToEndExtraction:
    """Drive ``extract_fields_with_llm`` with an evidence.json-shaped payload."""

    @pytest.mark.asyncio
    async def test_extract_from_evidence_structure(self):
        """One evidence snippet with a start date yields at least one date field."""
        # Build the simulated evidence structure from the inside out.
        snippet = {
            "document_name": "Project Plan",
            "text": "Project Start Date: 01/01/2022. The project will be aligned with the calendar year.",
            "page": 4,
        }
        requirement = {
            "requirement_id": "REQ-007",
            "requirement_text": "Project start date",
            "evidence_snippets": [snippet],
        }
        evidence_data = {"evidence": [requirement]}

        results = await extract_fields_with_llm("test_session", evidence_data)
        extracted = results.get("dates", [])

        print("\n=== End-to-End Extraction ===")
        print(f"Dates extracted: {len(extracted)}")
        for item in extracted:
            print(f"  {item.field_type}: {item.value} (confidence: {item.confidence})")

        # Dates must be present ...
        assert len(extracted) > 0, "Should extract dates from evidence"

        # ... and the first one must carry a reasonable confidence.
        assert extracted[0].confidence >= 0.7, "Should have reasonable confidence"