Solr MCP

Verified
MIT License
Overview InspectNew Schema Related Servers Reviews Score
solr-mcp
scripts
#!/usr/bin/env python3
"""
Diagnostic script to help debug search issues in Solr collections.
"""

import argparse
import asyncio
import httpx
import json
import os
import sys
from typing import Dict, Any, List, Optional

# Add project root to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))


async def get_collection_schema(collection: str) -> Dict[str, Any]:
    """Fetch the schema of a Solr collection from the local Solr instance.

    Args:
        collection: Solr collection name

    Returns:
        The decoded JSON body of the schema endpoint. When Solr answers with
        anything other than HTTP 200, the status code and response text are
        printed and an empty dict is returned.
    """
    schema_url = f"http://localhost:8983/solr/{collection}/schema"
    query_params = {"wt": "json"}

    async with httpx.AsyncClient() as http:
        reply = await http.get(schema_url, params=query_params, timeout=10.0)

        # Guard clause: report the failure and hand back an empty schema.
        if reply.status_code != 200:
            print(f"Error getting schema: {reply.status_code} - {reply.text}")
            return {}

        return reply.json()


async def get_collection_status(collection: str) -> Dict[str, Any]:
    """Get status details for a collection.

    The state is requested through the Collections API ``CLUSTERSTATUS``
    action, restricted to ``collection``. The Collections API does not
    document a ``STATUS`` action (``STATUS`` belongs to the CoreAdmin API),
    so asking for it is rejected by Solr and every collection would look
    missing.

    Args:
        collection: Solr collection name

    Returns:
        The decoded CLUSTERSTATUS response with an additional top-level
        ``"status"`` key that maps collection name to its cluster state.
        An empty dict is returned (after printing the reason) when Solr
        answers with a non-200 status or reports no state for the collection.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(
            "http://localhost:8983/solr/admin/collections",
            params={"action": "CLUSTERSTATUS", "collection": collection, "wt": "json"},
            timeout=10.0
        )

        if response.status_code != 200:
            print(f"Error getting collection status: {response.status_code} - {response.text}")
            return {}

        payload = response.json()
        collections = payload.get("cluster", {}).get("collections", {})
        if not collections:
            print(f"Error getting collection status: no cluster state returned for '{collection}'")
            return {}

        # Callers in this script test for a top-level "status" key, so expose
        # the per-collection state under that name alongside the raw payload.
        return {**payload, "status": collections}


async def get_document_count(collection: str) -> int:
    """Count the documents stored in a collection.

    A match-all query (``*:*``) is sent with ``rows=0`` and the total is read
    from ``response.numFound``.

    Args:
        collection: Solr collection name

    Returns:
        The ``numFound`` value of the response (0 when the key is absent).
        On a non-200 answer the status code and response text are printed
        and 0 is returned.
    """
    select_url = f"http://localhost:8983/solr/{collection}/select"
    count_params = {"q": "*:*", "rows": 0, "wt": "json"}

    async with httpx.AsyncClient() as http:
        reply = await http.get(select_url, params=count_params, timeout=10.0)

        if reply.status_code != 200:
            print(f"Error getting document count: {reply.status_code} - {reply.text}")
            return 0

        body = reply.json()
        return body.get("response", {}).get("numFound", 0)


async def get_document_sample(collection: str, num_docs: int = 3) -> List[Dict[str, Any]]:
    """Fetch a handful of documents from a collection via a match-all query.

    Args:
        collection: Solr collection name
        num_docs: Value sent as the ``rows`` parameter of the query

    Returns:
        The ``response.docs`` list of the reply (empty when the key is
        absent). On a non-200 answer the status code and response text are
        printed and an empty list is returned.
    """
    select_url = f"http://localhost:8983/solr/{collection}/select"
    sample_params = {"q": "*:*", "rows": num_docs, "wt": "json"}

    async with httpx.AsyncClient() as http:
        reply = await http.get(select_url, params=sample_params, timeout=10.0)

        if reply.status_code != 200:
            print(f"Error getting document sample: {reply.status_code} - {reply.text}")
            return []

        body = reply.json()
        return body.get("response", {}).get("docs", [])


def _build_field_query(field: str, search_term: str) -> str:
    """Compose the ``q`` parameter for a search of ``search_term`` in ``field``."""
    # "*" is only meaningful as a field name in the match-all query "*:*";
    # for any other term (and for an empty field name) send the bare term so
    # Solr applies its default-field handling instead of rejecting the query.
    if not field or (field == "*" and search_term.strip() != "*"):
        return search_term

    # Without grouping, "field:hello world" scopes only "hello" to the field
    # and sends "world" to the default field. Parentheses keep every word
    # on the requested field.
    if len(search_term.split()) > 1:
        return f"{field}:({search_term})"

    return f"{field}:{search_term}"


async def test_text_search(collection: str, field: str, search_term: str) -> Dict[str, Any]:
    """Test a text search on a specific field.

    The query is normally ``field:search_term``. Multi-word terms are grouped
    as ``field:(word1 word2)`` so that all words are matched against
    ``field``. When ``field`` is ``"*"`` (or empty) and the term is not
    ``"*"``, the bare term is sent as a general, default-field search;
    ``"*"`` with term ``"*"`` still produces the match-all query ``*:*``.

    Args:
        collection: Solr collection name
        field: Field to search in, or "*" for a general search
        search_term: Term to search for (Solr query syntax is passed through
            unescaped)

    Returns:
        The decoded JSON search response (at most 5 rows are requested), or
        an empty dict after printing the status code and response text when
        Solr answers with a non-200 status.
    """
    query = _build_field_query(field, search_term)

    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"http://localhost:8983/solr/{collection}/select",
            params={"q": query, "rows": 5, "wt": "json"},
            timeout=10.0
        )

        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error testing text search: {response.status_code} - {response.text}")
            return {}


async def analyze_text(collection: str, field_type: str, text: str) -> Dict[str, Any]:
    """Ask Solr's field analysis endpoint how a field type processes a text.

    Args:
        collection: Solr collection name
        field_type: Field type name, sent as ``analysis.fieldtype``
        text: Text to analyze, sent as ``analysis.fieldvalue``

    Returns:
        The decoded JSON analysis response, or an empty dict after printing
        the status code and response text when Solr answers with a non-200
        status.
    """
    analysis_url = f"http://localhost:8983/solr/{collection}/analysis/field"
    analysis_params = {
        "analysis.fieldtype": field_type,
        "analysis.fieldvalue": text,
        "wt": "json",
    }

    async with httpx.AsyncClient() as http:
        reply = await http.get(analysis_url, params=analysis_params, timeout=10.0)

        if reply.status_code != 200:
            print(f"Error analyzing text: {reply.status_code} - {reply.text}")
            return {}

        return reply.json()


def _clip(value: Any, limit: int = 100, clip_lists: bool = False) -> Any:
    """Shorten long strings (and optionally long lists) to ``limit`` characters plus "..."."""
    if isinstance(value, str):
        return value[:limit] + "..." if len(value) > limit else value
    if clip_lists and isinstance(value, list):
        rendered = str(value)
        if len(rendered) > limit:
            return rendered[:limit] + "..."
    return value


def _print_first_match(results: Dict[str, Any], wanted_keys: List[str]) -> None:
    """Print the ``wanted_keys`` of the first returned document, if there is one."""
    docs = results.get("response", {}).get("docs") or []
    if not docs:
        # numFound can be positive while no rows came back; nothing to show.
        return

    print("  First match:")
    for key, value in docs[0].items():
        if key in wanted_keys:
            print(f"    {key}: {_clip(value)}")


def _analysis_stages(phase: Any) -> List[Any]:
    """Return ``(stage_name, tokens)`` pairs for one analysis phase.

    Solr's JSON writer may render a phase either as a flat list that
    alternates stage name and token list, or as a mapping; both are accepted.
    """
    if isinstance(phase, dict):
        return list(phase.items())
    if isinstance(phase, list):
        return list(zip(phase[::2], phase[1::2]))
    return []


def _print_analysis(analysis: Dict[str, Any]) -> None:
    """Print the token texts produced by every stage of a field analysis response."""
    field_types = analysis.get("analysis", {}).get("field_types", {})
    for phases in field_types.values():
        if not isinstance(phases, dict):
            continue
        # Phases are the analysis chains Solr ran, e.g. "index" and "query".
        for phase_name, phase in phases.items():
            print(f"\n  {phase_name.capitalize()} analysis:")
            for stage_name, tokens in _analysis_stages(phase):
                if isinstance(tokens, str):
                    # Some stages report the transformed text rather than tokens.
                    texts = [tokens]
                else:
                    texts = [
                        token.get("text")
                        for token in tokens or []
                        if isinstance(token, dict) and "text" in token
                    ]
                # Stage names are fully qualified class names; show the short form.
                short_name = str(stage_name).rsplit(".", 1)[-1]
                print(f"    - {short_name}: {texts}")


async def diagnose_collection(collection: str, search_term: str = "bitcoin") -> None:
    """Run a comprehensive diagnosis on a collection and print the findings.

    Steps, in order: check that the collection status can be retrieved,
    print the document count, list text/string fields from the schema, show
    sample documents, run ``search_term`` against each listed field and as a
    general search, and print how the first text field type analyzes the term.
    The run stops early when the collection cannot be found or is empty.

    Args:
        collection: Solr collection name
        search_term: Term to use in search tests
    """
    print(f"\n=== Diagnosing Collection: {collection} ===\n")

    # Check if collection exists
    status = await get_collection_status(collection)
    if not status or "status" not in status:
        print(f"Error: Collection '{collection}' may not exist.")
        return

    # Get document count
    doc_count = await get_document_count(collection)
    print(f"Document count: {doc_count}")

    if doc_count == 0:
        print("No documents found in the collection. Please index some documents first.")
        return

    # Get schema details
    schema = await get_collection_schema(collection)
    if schema:
        schema_body = schema.get("schema", {})
        field_types = {ft.get("name"): ft for ft in schema_body.get("fieldTypes", [])}
        fields = {f.get("name"): f for f in schema_body.get("fields", [])}

        print("\nText fields in schema:")
        text_fields = []
        for name, field in fields.items():
            field_type = field.get("type")
            if field_type and ("text" in field_type.lower() or field_type == "string"):
                indexed = field.get("indexed", True)
                stored = field.get("stored", True)
                text_fields.append(name)
                print(f"  - {name} (type: {field_type}, indexed: {indexed}, stored: {stored})")

        # Get document sample
        print("\nSample documents:")
        docs = await get_document_sample(collection)
        for position, doc in enumerate(docs, start=1):
            print(f"\nDocument {position}:")
            for key, value in doc.items():
                # Sample output also clips long multi-valued (list) fields.
                print(f"  {key}: {_clip(value, clip_lists=True)}")

        # Test search on each text field
        print("\nSearch tests:")
        for field in text_fields:
            print(f"\nTesting search on field: {field}")
            results = await test_text_search(collection, field, search_term)
            num_found = results.get("response", {}).get("numFound", 0)
            print(f"  Query: {field}:{search_term}")
            print(f"  Results found: {num_found}")

            if num_found > 0:
                _print_first_match(results, [field, "id", "title", "score"])

        # Test general search
        print("\nTesting general search:")
        results = await test_text_search(collection, "*", search_term)
        num_found = results.get("response", {}).get("numFound", 0)
        print(f"  Query: {search_term}")
        print(f"  Results found: {num_found}")

        if num_found > 0:
            _print_first_match(results, ["id", "title", "score", "content"])

        # Analyze text processing
        print("\nText analysis for search term:")
        # Use the type of the first schema field whose type name contains "text".
        text_field_type = next(
            (
                field.get("type")
                for field in fields.values()
                if "text" in (field.get("type") or "").lower()
            ),
            None,
        )

        if text_field_type and text_field_type in field_types:
            print(f"  Using field type: {text_field_type}")
            analysis = await analyze_text(collection, text_field_type, search_term)
            _print_analysis(analysis)

    print("\n=== Diagnosis Complete ===")


async def main() -> None:
    """Parse the command-line options and diagnose the requested collection."""
    cli = argparse.ArgumentParser(description="Diagnose Solr search issues")

    # (flags, default, help) for each supported option.
    option_specs = (
        (("--collection", "-c"), "unified", "Collection name"),
        (("--term", "-t"), "bitcoin", "Search term to test with"),
    )
    for flags, default_value, help_text in option_specs:
        cli.add_argument(*flags, default=default_value, help=help_text)

    options = cli.parse_args()
    await diagnose_collection(options.collection, options.term)


if __name__ == "__main__":
    # Run the async entry point only when executed as a script, not on import.
    asyncio.run(main())