# utils.py • 10.1 kB
"""
Utility functions for the Smithsonian MCP server.
"""
from typing import Dict, Any, Optional, List
def mask_api_key(params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a copy of ``params`` with the API key masked.

    The input dictionary is never modified and never returned directly, so
    callers can safely mutate or log the result without affecting the
    parameters that are actually sent.

    Args:
        params: A dictionary of parameters.
    Returns:
        A new (shallow-copied) dictionary. If it contains an ``"api_key"``
        entry, that entry's value is replaced with ``"****"``.
    """
    # Always copy: returning the caller's own dict when no key is present
    # would make the result alias the input only in that case.
    masked_params = params.copy()
    if "api_key" in masked_params:
        masked_params["api_key"] = "****"
    return masked_params
def resolve_museum_code(museum_name: str) -> Optional[str]:
    """
    Translate a museum name or unit code into a Smithsonian unit code.

    Matching is case-insensitive and tried in this order; the first stage
    that produces a hit wins:
    1. The lower-cased, stripped name is a key of MUSEUM_MAP.
    2. The name with leading "smithsonian", "national museum of" and
       "museum of" removed is a key of MUSEUM_MAP.
    3. The upper-cased name is one of VALID_MUSEUM_CODES.
    4. A MUSEUM_MAP key is a substring of the name, or the name is a
       substring of a key (first such key in map order).
    5. More than half of a MUSEUM_MAP key's words appear in the name.

    Args:
        museum_name: Museum name or code to resolve
    Returns:
        The matching unit code, or None for empty/blank input or no match
    Examples (results depend on the contents of MUSEUM_MAP):
        resolve_museum_code("Smithsonian Asian Art Museum")  # -> "FSG"
        resolve_museum_code("Asian Art")                      # -> "FSG"
        resolve_museum_code("SAAM")                          # -> "SAAM"
        resolve_museum_code("Natural History Museum")        # -> "NMNH"
    """
    if not (museum_name and museum_name.strip()):
        return None

    # Imported lazily to avoid circular imports
    from .constants import MUSEUM_MAP, VALID_MUSEUM_CODES

    query = museum_name.lower().strip()

    # Stage 1: the query itself is a known name
    if query in MUSEUM_MAP:
        return MUSEUM_MAP[query]

    # Stage 2: peel off generic leading phrases, in this fixed order
    stripped = query
    for lead in ("smithsonian", "national museum of", "museum of"):
        lead_with_space = lead + " "
        if stripped.startswith(lead_with_space):
            stripped = stripped[len(lead_with_space):].strip()
    if stripped in MUSEUM_MAP:
        return MUSEUM_MAP[stripped]

    # Stage 3: the query is already a unit code
    as_code = query.upper()
    if as_code in VALID_MUSEUM_CODES:
        return as_code

    # Stage 4: substring containment in either direction (uses the full
    # query, not the prefix-stripped form)
    for known_name, known_code in MUSEUM_MAP.items():
        if known_name in query or query in known_name:
            return known_code

    # Stage 5: word overlap -- strictly more than half of the key's words
    query_words = set(query.split())
    for known_name, known_code in MUSEUM_MAP.items():
        key_words = set(known_name.split())
        shared = query_words & key_words
        if len(shared) / len(key_words) > 0.5:
            return known_code

    return None
def validate_url(url_str: Optional[str]) -> Optional[str]:
    """
    Check that a string is an HTTP(S) URL and return it as a string.

    The value is handed to pydantic's ``HttpUrl``; if that raises
    ``ValueError`` or ``TypeError``, or the resulting scheme is neither
    ``http`` nor ``https``, the URL is rejected.

    Args:
        url_str: The URL string to validate
    Returns:
        ``str()`` of the constructed ``HttpUrl`` when accepted, None otherwise
        (including for None or empty input).
        NOTE(review): the returned text is pydantic's rendering and may not be
        byte-identical to the input -- confirm against the installed version.
    Examples:
        validate_url("ftp://example.com")    # -> None
        validate_url("not-a-url")            # -> None
        validate_url(None)                   # -> None
    """
    if url_str:
        try:
            from pydantic import HttpUrl

            candidate = HttpUrl(url_str)
            is_web_scheme = candidate.scheme == 'http' or candidate.scheme == 'https'
            if is_web_scheme:
                return str(candidate)
        except (ValueError, TypeError):
            # Rejected by pydantic: treated the same as any other invalid URL
            return None
    return None
def prioritize_objects_by_unit_code(objects: List, unit_code: Optional[str]) -> List:
    """
    Move objects belonging to the searched unit to the front of the results.

    An object belongs to the unit when its ``id`` starts with
    ``"<unit_code>_"``, compared case-insensitively. Objects with a falsy
    ``id`` never match. Relative order is preserved within both the matching
    and the non-matching group.

    Args:
        objects: List of search-result objects exposing an ``id`` attribute
        unit_code: The unit code used in the search (e.g., "NMAH", "FSG")
    Returns:
        A new list with matching objects first; the input list itself is
        returned unchanged when ``unit_code`` or ``objects`` is empty/None
    """
    if not (unit_code and objects):
        return objects

    wanted_prefix = unit_code.lower() + "_"

    def belongs_elsewhere(item) -> bool:
        # False sorts before True, so matching objects come first
        record_id = item.id
        return not (record_id and record_id.lower().startswith(wanted_prefix))

    # sorted() is stable, which keeps the original order inside each group
    return sorted(objects, key=belongs_elsewhere)
def _normalize_museum_code(record_id_prefix: str) -> str:
    """Normalize record_id prefix to museum code key used in MUSEUM_URL_PATTERNS."""
    prefix = record_id_prefix.lower()
    # Handle NMNH sub-museums with long prefixes
    if prefix.startswith("nmnh"):
        if "invertebratezoology" in prefix:
            return "NMNHINV"
        elif "anthropology" in prefix:
            return "NMNHANTHRO"
        elif "education" in prefix:
            return "NMNHEDUCATION"
        elif "mineralsciences" in prefix:
            return "NMNHMINSCI"
        elif "paleobiology" in prefix:
            return "NMNHPALEO"
        # Add more as needed
        else:
            return prefix.upper()  # fallback
    return prefix.upper()
async def construct_url_from_record_id(record_id: Optional[str]) -> Optional[str]:
    """
    Construct a URL from a record_id using museum-specific URL patterns.

    The record_id is split at its first underscore into a museum prefix and
    an accession part. The prefix is normalized with
    ``_normalize_museum_code`` and looked up in ``MUSEUM_URL_PATTERNS``.
    When the pattern can be filled in from the record_id alone, the URL is
    built locally; otherwise the lookup is delegated to
    ``_get_url_from_api_record_id``.

    Args:
        record_id: The record identifier (e.g., "nmah_1448973", "fsg_F1900.47")
    Returns:
        Constructed URL string. None if the record_id is malformed (empty, no
        underscore, or nothing before/after the first underscore), or if the
        API fallback yields no URL.
    Examples (exact URLs depend on the entries in MUSEUM_URL_PATTERNS):
        await construct_url_from_record_id("nmah_1448973")
        # Returns: "https://americanhistory.si.edu/collections/object/nmah_1448973"
        await construct_url_from_record_id("fsg_F1900.47")
        # Returns: "https://asia.si.edu/object/F1900.47"
        await construct_url_from_record_id("nmnhinvertebratezoology_14688577")
        # Returns: "https://naturalhistory.si.edu/object/nmnhinvertebratezoology_14688577"
    """
    if not record_id:
        return None

    # Split on the first underscore only: accessions may contain underscores.
    record_id_prefix, _, accession = record_id.partition("_")
    if not record_id_prefix or not accession:
        # No underscore, or an empty half ("_123", "nmah_"): malformed.
        return None

    # Imported lazily, like the other project imports in this module
    from .constants import MUSEUM_URL_PATTERNS

    pattern = MUSEUM_URL_PATTERNS.get(_normalize_museum_code(record_id_prefix))
    if not pattern:
        # Unknown museum, fall back to API lookup
        return await _get_url_from_api_record_id(record_id)

    identifier_type = pattern["identifier"]
    base_url = pattern["base_url"]
    path_template = pattern["path_template"]

    # Only "record_ID" and "accession" can be derived from the record_id
    # itself; any other identifier type (record_link, guid, url, idsId, or an
    # unrecognized one) and any base_url placeholder needs data from the API.
    needs_api_data = (
        identifier_type not in ("record_ID", "accession")
        or "{record_link}" in base_url
        or "{guid}" in base_url
    )
    if needs_api_data:
        return await _get_url_from_api_record_id(record_id)

    url = base_url.rstrip("/")
    if not path_template:
        return url

    try:
        formatted_path = path_template.format(
            record_ID=record_id,
            accession=accession,
            url=record_id,  # fallback
            idsId=record_id,  # fallback
            guid=record_id,  # fallback
        )
    except (KeyError, ValueError, IndexError):
        # Template references an unknown/positional field or is malformed:
        # fall back to the API rather than failing.
        return await _get_url_from_api_record_id(record_id)
    return url + formatted_path
async def _get_url_from_api_record_id(record_id: str) -> Optional[str]:
    """
    Fallback function to get URL from API when pattern-based construction fails
    or when API data is required (record_link, guid, etc.).

    Runs a collection search using ``record_id`` as the query text, limited to
    a single result, and returns that result's ``record_link``.

    Args:
        record_id: The record identifier to search for
    Returns:
        ``str(record_link)`` of the first search hit, or None when there is no
        hit, the hit has no ``record_link``, or the lookup raised an exception.
        This is a best-effort lookup: errors are logged at debug level and
        never propagated from the search itself.
    """
    import logging

    from .api_client import SmithsonianAPIClient
    from .models import CollectionSearchFilter

    client = SmithsonianAPIClient()
    try:
        await client.connect()
        # Search for the object by record_id; only the top hit is needed.
        filters = CollectionSearchFilter(
            query=record_id,
            unit_code=None,
            object_type=None,
            date_start=None,
            date_end=None,
            maker=None,
            material=None,
            topic=None,
            has_images=None,
            is_cc0=None,
            on_view=None,
            limit=1,
            offset=0,
        )
        results = await client.search_collections(filters=filters)
        # NOTE(review): the first hit is not checked against record_id, so a
        # fuzzy search match could yield another object's link -- confirm.
        if results.objects and results.objects[0].record_link:
            return str(results.objects[0].record_link)
        # If that doesn't work, try direct lookup if the API supports it
        # For now, return None if we can't construct it
        return None
    except Exception:
        # Deliberately best-effort, but leave a trace so failures (bad
        # credentials, network errors, schema changes) can be diagnosed.
        logging.getLogger(__name__).debug(
            "API URL lookup failed for record_id %r", record_id, exc_info=True
        )
        return None
    finally:
        await client.disconnect()