"""Async client helpers for searching the INSPIRE-HEP literature API."""

from __future__ import annotations
from collections.abc import Iterable
from dataclasses import dataclass
from datetime import datetime
from typing import Any
import httpx

INSPIRE_BASE_URL = "https://inspirehep.net"
LITERATURE_ENDPOINT = "/api/literature"

# Request only the fields the normalizer consumes, keeping responses small.
DEFAULT_FIELDS = ",".join(
[
"titles.title",
"authors.full_name",
"abstracts.value",
"publication_info.year",
"citation_count",
"imprints.date",
"preprint_date",
"earliest_date",
"arxiv_eprints.value",
"external_system_identifiers.schema",
"external_system_identifiers.value",
]
)

# Abstracts are whitespace-collapsed and truncated to this many characters.
ABSTRACT_MAX_CHARS = 1200
# Author-count ceiling applied when a search excludes large collaborations.
DEFAULT_NON_COLLAB_MAX_AUTHORS = 10


@dataclass(frozen=True)
class SearchResult:
    """Normalized literature records returned by a single search call."""

    records: list[dict[str, Any]]


def build_title_query(title: str, large_collaboration: bool, year: int | None = None) -> str:
    """Build an INSPIRE query matching the given title as an exact phrase."""
    escaped = _escape_quotes(title)
    base_query = f'title "{escaped}"'
    return _apply_filters(base_query, large_collaboration, year)


def build_fulltext_query(fulltext: str, large_collaboration: bool, year: int | None = None) -> str:
    """Build an INSPIRE query matching a phrase against the fulltext index."""
    escaped = _escape_quotes(fulltext)
    base_query = f'ft "{escaped}"'
    return _apply_filters(base_query, large_collaboration, year)


def build_author_query(
    authors: list[str], large_collaboration: bool, year: int | None = None
) -> str:
    """Build an INSPIRE query that requires every listed author to match."""
    clauses = [f'author "{_escape_quotes(author)}"' for author in authors]
    base_query = " and ".join(clauses)
    return _apply_filters(base_query, large_collaboration, year)
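

# Illustrative composed query (the author name is a placeholder, not API output):
#
#     build_author_query(["Witten, Edward"], large_collaboration=False, year=1998)
#     -> 'author "Witten, Edward" and year 1998 and ac 1->10'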


class InspireHEPClient:
    """Thin async wrapper around the INSPIRE-HEP literature search endpoint."""

    def __init__(
        self,
        http_client: httpx.AsyncClient | None = None,
        timeout_seconds: float = 15.0,
    ) -> None:
        # Track ownership so close() only tears down clients we created.
        self._owns_client = http_client is None
        self._client = http_client or httpx.AsyncClient(
            base_url=INSPIRE_BASE_URL,
            timeout=timeout_seconds,
            headers={"User-Agent": "inspirehep-mcp/0.1.0"},
        )

    async def __aenter__(self) -> InspireHEPClient:
        return self

    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        await self.close()

    async def close(self) -> None:
        """Close the underlying HTTP client, but only if this instance created it."""
        if self._owns_client:
            await self._client.aclose()

    async def search_literature(
        self,
        *,
        query: str,
        limit: int,
        sort: str,
        fields: str = DEFAULT_FIELDS,
    ) -> SearchResult:
        """Run a literature search and return normalized, non-empty records."""
        response = await self._client.get(
            LITERATURE_ENDPOINT,
            params={"q": query, "size": limit, "sort": sort, "fields": fields},
        )
        response.raise_for_status()
        payload = response.json()
        hits = payload.get("hits", {}).get("hits", [])
        records = [normalize_record(hit.get("metadata", {})) for hit in hits]
        return SearchResult(records=[record for record in records if record])
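

# Ownership sketch (illustrative): when a caller injects its own
# httpx.AsyncClient, the wrapper does not close it on exit.
#
#     shared = httpx.AsyncClient(base_url=INSPIRE_BASE_URL)
#     async with InspireHEPClient(http_client=shared) as client:
#         ...                      # shared stays open after the block
#     await shared.aclose()        # closing is the caller's responsibility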


def normalize_record(metadata: dict[str, Any]) -> dict[str, Any]:
    """Flatten an INSPIRE metadata blob into a compact, JSON-friendly dict."""
title = _first_non_empty(
(item.get("title") for item in metadata.get("titles", []) if isinstance(item, dict))
)
abstract = _first_non_empty(
(item.get("value") for item in metadata.get("abstracts", []) if isinstance(item, dict))
)
authors = [
author.get("full_name")
for author in metadata.get("authors", [])
if isinstance(author, dict) and author.get("full_name")
]
result: dict[str, Any] = {
"title": title,
"authors": authors,
"abstract": _compact_abstract(abstract),
"year": _extract_year(metadata),
"citation_count": _extract_citation_count(metadata),
}
arxiv_url = _extract_arxiv_url(metadata)
if arxiv_url:
result["arxiv_url"] = arxiv_url
return {key: value for key, value in result.items() if value not in (None, "", [])}
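

# Pruning example (values illustrative): None, "", and [] are dropped from the
# result, but an integer 0 survives, so a zero citation count is preserved.
#
#     normalize_record({"titles": [{"title": "A Model"}], "citation_count": 0})
#     -> {"title": "A Model", "citation_count": 0}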


def _extract_year(metadata: dict[str, Any]) -> int | None:
    """Prefer an explicit publication year, then imprint, preprint, and earliest dates."""
publication_info = metadata.get("publication_info", [])
for entry in publication_info:
if isinstance(entry, dict) and isinstance(entry.get("year"), int):
return entry["year"]
    for entry in metadata.get("imprints", []):
        if not isinstance(entry, dict):
            continue
        parsed = _parse_year_from_date(entry.get("date"))
        if parsed is not None:
            return parsed
for key in ("preprint_date", "earliest_date"):
parsed = _parse_year_from_date(metadata.get(key))
if parsed is not None:
return parsed
return None
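

# Precedence example: an explicit publication_info year beats any date string.
#
#     _extract_year({"publication_info": [{"year": 2012}], "preprint_date": "2011-12-13"})
#     -> 2012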


def _extract_arxiv_url(metadata: dict[str, Any]) -> str | None:
    """Find an arXiv identifier in the metadata and turn it into an abs-page URL."""
for item in metadata.get("arxiv_eprints", []):
if not isinstance(item, dict):
continue
value = item.get("value")
url = _arxiv_abs_url(value)
if url:
return url
for item in metadata.get("external_system_identifiers", []):
if not isinstance(item, dict):
continue
schema = str(item.get("schema", "")).strip().lower()
if schema != "arxiv":
continue
url = _arxiv_abs_url(item.get("value"))
if url:
return url
return None


def _extract_citation_count(metadata: dict[str, Any]) -> int | None:
    # The field may be absent or malformed; anything non-int counts as missing.
    value = metadata.get("citation_count")
if isinstance(value, int):
return value
return None


def _arxiv_abs_url(raw_id: Any) -> str | None:
    if not isinstance(raw_id, str):
        return None
    identifier = raw_id.strip()
    # Identifiers may arrive prefixed ("arXiv:2101.00001"); strip the prefix
    # case-insensitively instead of deleting every occurrence of the substring.
    if identifier.lower().startswith("arxiv:"):
        identifier = identifier[len("arxiv:"):]
    if not identifier:
        return None
    return f"https://arxiv.org/abs/{identifier}"


def _escape_quotes(value: str) -> str:
    """Escape backslashes and double quotes so the value is safe inside a quoted phrase."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def _apply_filters(base_query: str, large_collaboration: bool, year: int | None) -> str:
    clauses = [base_query]
    if year is not None:
        clauses.append(f"year {year}")
    if not large_collaboration:
        # "ac" is INSPIRE's author-count filter; 1->N keeps records with at
        # most N authors, which screens out large-collaboration papers.
        clauses.append(f"ac 1->{DEFAULT_NON_COLLAB_MAX_AUTHORS}")
    return " and ".join(clauses)


def _first_non_empty(values: Iterable[Any]) -> str | None:
    """Return the first non-blank string in values, stripped of surrounding whitespace."""
for value in values:
if isinstance(value, str) and value.strip():
return value.strip()
return None


def _compact_abstract(value: str | None) -> str | None:
    """Collapse internal whitespace and truncate to ABSTRACT_MAX_CHARS."""
if not value:
return None
normalized = " ".join(value.split())
if len(normalized) <= ABSTRACT_MAX_CHARS:
return normalized
return f"{normalized[: ABSTRACT_MAX_CHARS - 3].rstrip()}..."


def _parse_year_from_date(value: Any) -> int | None:
    """Extract a year from "YYYY", "YYYY-MM", "YYYY-MM-DD", or ISO datetime strings."""
if not isinstance(value, str) or not value.strip():
return None
year_token = value.strip()[:4]
if year_token.isdigit():
return int(year_token)
try:
parsed = datetime.fromisoformat(value)
except ValueError:
return None
return parsed.year
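

if __name__ == "__main__":
    # Minimal smoke test, not part of the library surface. Illustrative only:
    # it hits the live INSPIRE API (network access required) and assumes
    # "mostrecent" is an accepted value for the API's sort parameter.
    import asyncio

    async def _demo() -> None:
        async with InspireHEPClient() as client:
            result = await client.search_literature(
                query=build_title_query("axion dark matter", large_collaboration=False),
                limit=3,
                sort="mostrecent",
            )
            for record in result.records:
                print(record.get("year"), record.get("title"))

    asyncio.run(_demo())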