arXiv MCP Server

Overview Inspect Schema Related Servers Score Discussions

server.py•19 kB

"""arXiv MCP server.

Exposes a handful of MCP tools (URL lookup, PDF download, text extraction,
metadata lookup and search) backed by the public arXiv API, and can be served
over either STDIO or HTTP.
"""

import argparse
import os
import re
import difflib
import json
import asyncio
import random
import uuid
from typing import Optional, Tuple, Dict, Any

import httpx
from fastmcp import Context, FastMCP
from starlette.responses import JSONResponse
from starlette.routing import Route
import uvicorn
import feedparser
import fitz

mcp = FastMCP("arxiv-server")

USER_AGENT = "arxiv-mcp/1.0"
ARXIV_API_BASE = "https://export.arxiv.org/api"
DEFAULT_TIMEOUT = 60.0
RETRY_ATTEMPTS = 3
RETRY_BASE = 0.5  # seconds
HTTP_LIMITS = httpx.Limits(max_keepalive_connections=5, max_connections=10)

# Ensure DOWNLOAD_PATH exists; fall back to ./downloads for local runs
DOWNLOAD_PATH = os.getenv("DOWNLOAD_PATH") or os.path.join(os.getcwd(), "downloads")
os.makedirs(DOWNLOAD_PATH, exist_ok=True)

# Accepts new-style IDs (1706.03762, optionally versioned) and old-style IDs
# (hep-th/9901001, math.GT/0309136), each with an optional "arXiv:" prefix.
ARXIV_ID_RE = re.compile(
    r"^(?:arXiv:)?(?P<id>[\d]{4}\.[\d]{4,5}(?:v\d+)?|[a-z\-]+(?:\.[A-Z]{2})?\/\d{7}(?:v\d+)?)$",
    re.IGNORECASE,
)


def _error(code: str, message: str, *, retry_after: Optional[int] = None) -> str:
    """Return a structured JSON error string.

    Args:
        code: Short machine-readable error code.
        message: Human-readable description of the failure.
        retry_after: Optional hint (seconds) for when the caller may retry.

    Returns:
        JSON document with ``status``, ``code``, ``message``, ``retry_after``
        and a freshly generated ``request_id``.
    """
    payload = {
        "status": "error",
        "code": code,
        "message": message,
        "retry_after": retry_after,
        "request_id": str(uuid.uuid4()),
    }
    return json.dumps(payload)


def _new_client() -> httpx.AsyncClient:
    """Build an HTTP client with the server-wide timeout and connection limits.

    Redirects are followed explicitly: httpx does not follow them by default,
    and ``raise_for_status()`` treats a 3xx answer as an error.
    """
    return httpx.AsyncClient(
        timeout=DEFAULT_TIMEOUT,
        limits=HTTP_LIMITS,
        follow_redirects=True,
    )


def _author_names(entry: Any) -> list:
    """Return the author names of a feed entry, tolerating missing authors."""
    names = []
    for author in getattr(entry, "authors", None) or []:
        name = author.get("name") if hasattr(author, "get") else None
        if name:
            names.append(name)
    return names


def _remove_quietly(path: str) -> None:
    """Delete *path* if it exists; a leftover partial file is not worth failing on."""
    try:
        os.remove(path)
    except OSError:
        pass


async def _retry_sleep(attempt: int) -> None:
    """Sleep before the next retry using exponential backoff with jitter."""
    base = RETRY_BASE * (2 ** attempt)
    await asyncio.sleep(base + random.random() * 0.25)


async def make_api_call(url: str, params: Dict[str, Any]) -> Optional[str]:
    """Make a request to the arXiv API with retries.

    Args:
        url: Endpoint to query.
        params: Query-string parameters.

    Returns:
        The response body as text, or ``None`` once all attempts have failed.
    """
    headers = {"User-Agent": USER_AGENT, "Accept": "application/atom+xml"}
    async with _new_client() as client:
        for attempt in range(RETRY_ATTEMPTS):
            try:
                resp = await client.get(url, params=params, headers=headers)
                resp.raise_for_status()
                return resp.text
            except Exception:
                # Deliberate best-effort: callers translate None into a
                # structured error instead of surfacing a traceback.
                if attempt < RETRY_ATTEMPTS - 1:
                    await _retry_sleep(attempt)
                    continue
                return None
    return None


async def get_pdf(url: str) -> Optional[bytes]:
    """Get a PDF document as bytes from arXiv.org with retries.

    Args:
        url: Direct PDF URL.

    Returns:
        The raw PDF bytes, or ``None`` once all attempts have failed.
    """
    headers = {"User-Agent": USER_AGENT, "Accept": "application/pdf"}
    async with _new_client() as client:
        for attempt in range(RETRY_ATTEMPTS):
            try:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                return resp.content
            except Exception:
                # Deliberate best-effort, see make_api_call.
                if attempt < RETRY_ATTEMPTS - 1:
                    await _retry_sleep(attempt)
                    continue
                return None
    return None


def find_best_match(target_title: str, entries: list, threshold: float = 0.8):
    """Find the entry whose title best matches the target title.

    Titles are compared case-insensitively with runs of whitespace collapsed,
    so line wraps inside a feed title do not lower the similarity score.

    Args:
        target_title: Title to look for.
        entries: Feed entries exposing a ``title`` attribute.
        threshold: Minimum similarity ratio (0..1) required to accept a match.

    Returns:
        The best-scoring entry, or ``None`` when nothing reaches ``threshold``.
    """
    wanted = " ".join(target_title.lower().split())
    best_entry = None
    best_score = 0.0
    for entry in entries:
        candidate = " ".join(entry.title.lower().split())
        score = difflib.SequenceMatcher(None, wanted, candidate).ratio()
        if score > best_score:
            best_score = score
            best_entry = entry
    if best_score >= threshold:
        return best_entry
    return None


async def fetch_information(title: str):
    """Get information about the article with the given title.

    Returns:
        The best-matching feed entry, or a plain ``str`` describing why no
        entry could be returned (callers distinguish the two with
        ``isinstance(..., str)``).
    """
    formatted_title = format_text(title)
    url = f"{ARXIV_API_BASE}/query"
    params = {
        "search_query": f"ti:{formatted_title}",
        "start": 0,
        "max_results": 25,
    }
    data = await make_api_call(url, params=params)
    if data is None:
        return "Unable to retrieve data from arXiv.org."

    feed = feedparser.parse(data)
    error_msg = (
        "Unable to extract information for the provided title. "
        "This issue may stem from an incorrect or incomplete title, "
        "or because the work has not been published on arXiv."
    )
    if not feed.entries:
        return error_msg

    best_match = find_best_match(target_title=formatted_title, entries=feed.entries)
    if best_match is None:
        return str(error_msg)
    return best_match


async def resolve_article(
    title: Optional[str] = None,
    arxiv_id: Optional[str] = None,
) -> Tuple[str, str] | str:
    """Resolve to a direct PDF URL and arXiv ID using either a title or an arXiv ID.

    Preference order: arxiv_id > title.

    Returns:
        ``(pdf_url, arxiv_id)`` on success, otherwise a structured error JSON
        string.
    """
    if arxiv_id:
        m = ARXIV_ID_RE.match(arxiv_id.strip())
        if not m:
            return _error("INVALID_ID", f"Not a valid arXiv ID: {arxiv_id}")
        vid = m.group("id")
        return (f"https://arxiv.org/pdf/{vid}", vid)

    if not title:
        return _error("MISSING_PARAM", "Provide either 'arxiv_id' or 'title'.")

    info = await fetch_information(title)
    if isinstance(info, str):
        return _error("NOT_FOUND", str(info))

    resolved_id = info.id.split("/abs/")[-1]
    direct_pdf_url = f"https://arxiv.org/pdf/{resolved_id}"
    return (direct_pdf_url, resolved_id)


def format_text(text: str) -> str:
    """Clean a text string for use in an arXiv query.

    Replaces literal ``\\n``/``\\t``/``\\r`` escape sequences and colons with
    spaces, drops single and double quotes, collapses whitespace and trims
    both ends.
    """
    # Remove common escape sequences
    text_without_escapes = re.sub(r"\\[ntr]", " ", text)
    # Replace colon with space
    text_without_colon = text_without_escapes.replace(":", " ")
    # Remove both single quotes and double quotes
    text_without_quotes = re.sub(r"['\"]", "", text_without_colon)
    # Collapse multiple spaces into one
    text_single_spaced = re.sub(r"\s+", " ", text_without_quotes)
    # Trim leading and trailing spaces
    return text_single_spaced.strip()


@mcp.tool()
async def get_article_url(title: Optional[str] = None, arxiv_id: Optional[str] = None) -> str:
    """
    Retrieve the direct PDF URL of an article on arXiv.org by title or arXiv ID.

    Args:
        title: Article title.
        arxiv_id: arXiv ID (e.g., 1706.03762 or arXiv:1706.03762v7).

    Returns:
        URL that can be used to retrieve the article, or structured error JSON.
    """
    result = await resolve_article(title=title, arxiv_id=arxiv_id)
    if isinstance(result, str):
        return result
    article_url, _ = result
    return article_url


@mcp.tool()
async def download_article(
    title: Optional[str] = None,
    arxiv_id: Optional[str] = None,
) -> str:
    """
    Download the article as a PDF file. Resolve by arXiv ID or title.

    Args:
        title: Article title.
        arxiv_id: arXiv ID.

    Returns:
        Success message or structured error JSON.
    """
    result = await resolve_article(title=title, arxiv_id=arxiv_id)
    if isinstance(result, str):
        return result
    article_url, resolved_id = result

    headers = {"User-Agent": USER_AGENT, "Accept": "application/pdf"}
    # Old-style IDs (e.g. "hep-th/9901001") contain a slash, which would point
    # into a sub-directory that does not exist; flatten it for the file name.
    safe_name = resolved_id.replace("/", "_")
    file_path = os.path.join(DOWNLOAD_PATH, f"{safe_name}.pdf")
    # Stream into a side file and rename on success so a failed transfer never
    # leaves a truncated PDF under the final name.
    partial_path = f"{file_path}.part"

    async with _new_client() as client:
        for attempt in range(RETRY_ATTEMPTS):
            try:
                async with client.stream("GET", article_url, headers=headers) as resp:
                    resp.raise_for_status()
                    with open(partial_path, "wb") as f:
                        async for chunk in resp.aiter_bytes():
                            if chunk:
                                f.write(chunk)
                os.replace(partial_path, file_path)
                return json.dumps({
                    "status": "ok",
                    "message": "Download successful.",
                    "path": file_path,
                })
            except Exception as e:
                _remove_quietly(partial_path)
                if attempt < RETRY_ATTEMPTS - 1:
                    await _retry_sleep(attempt)
                    continue
                return _error("DOWNLOAD_FAILED", f"Unable to retrieve or save the article: {e}")


@mcp.tool()
async def load_article_to_context(
    title: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    start_page: Optional[int] = None,
    end_page: Optional[int] = None,
    max_pages: Optional[int] = None,
    max_chars: Optional[int] = None,
    preview: bool = False,
) -> str:
    """
    Load the article text into context. Supports title or arXiv ID resolution and partial extraction.

    Args:
        title: Article title.
        arxiv_id: arXiv ID.
        start_page: 1-based start page (inclusive).
        end_page: 1-based end page (inclusive).
        max_pages: hard cap on number of pages to extract.
        max_chars: hard cap on number of characters to extract.
        preview: if True, only validate availability and return minimal info.

    Returns:
        Article text or structured error JSON.
    """
    result = await resolve_article(title=title, arxiv_id=arxiv_id)
    if isinstance(result, str):
        return result
    article_url, resolved_id = result

    if preview:
        # Lightweight availability check
        try:
            async with _new_client() as client:
                head = await client.head(article_url, headers={"User-Agent": USER_AGENT})
                ok = head.status_code < 400
        except Exception:
            ok = False
        return json.dumps({
            "status": "ok" if ok else "error",
            "reachable": ok,
            "arxiv_id": resolved_id,
            "url": article_url,
        })

    pdf_bytes = await get_pdf(article_url)
    if pdf_bytes is None:
        return _error("FETCH_FAILED", "Unable to retrieve the article from arXiv.org.")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        return _error("PDF_OPEN_FAILED", f"Unable to open PDF: {e}")

    # Close the document on every exit path, including the BAD_RANGE return.
    try:
        total_pages = doc.page_count

        # Normalize page bounds (1-based inputs)
        s = max(1, start_page) if start_page else 1
        e = min(end_page, total_pages) if end_page else total_pages
        if s > e:
            return _error("BAD_RANGE", f"Invalid page range [{s}, {e}] for total_pages={total_pages}")

        # Apply max_pages cap
        if max_pages is not None:
            e = min(e, s + max_pages - 1)

        parts = []
        chars = 0
        for p in range(s - 1, e):
            page_text = doc.load_page(p).get_text()
            if not page_text:
                continue
            if max_chars is not None and chars + len(page_text) > max_chars:
                # Take only what still fits, then stop.
                remain = max_chars - chars
                if remain > 0:
                    parts.append(page_text[:remain])
                break
            parts.append(page_text)
            chars += len(page_text)

        return "".join(parts)
    finally:
        doc.close()


@mcp.tool()
async def get_details(title: Optional[str] = None, arxiv_id: Optional[str] = None) -> str:
    """
    Retrieve metadata of an article by title or arXiv ID.

    Args:
        title: Article title.
        arxiv_id: arXiv ID.

    Returns:
        JSON string containing article details or structured error JSON.
    """
    if arxiv_id:
        # Quick path via ID
        res = await resolve_article(arxiv_id=arxiv_id)
        if isinstance(res, str):
            return res
        _, vid = res
        # Look the ID up through `id_list`, the API's dedicated parameter for
        # ID lookups, rather than an `id:` term inside `search_query`.
        params = {"id_list": vid, "start": 0, "max_results": 1}
        data = await make_api_call(f"{ARXIV_API_BASE}/query", params=params)
        if data is None:
            return _error("API_ERROR", "Unable to retrieve data from arXiv.org.")
        feed = feedparser.parse(data)
        if not feed.entries:
            return _error("NOT_FOUND", f"No metadata for {vid}")
        info = feed.entries[0]
    else:
        if not title:
            # Avoid sending an empty "ti:" query to the API.
            return _error("MISSING_PARAM", "Provide either 'arxiv_id' or 'title'.")
        info = await fetch_information(title)
        if isinstance(info, str):
            return _error("NOT_FOUND", str(info))

    entry_id = info.id
    link = info.link
    article_title = info.title
    authors = _author_names(info)

    vid = entry_id.split("/abs/")[-1]
    direct_pdf_url = f"https://arxiv.org/pdf/{vid}"

    updated = getattr(info, "updated", "Unknown")
    published = getattr(info, "published", "Unknown")
    summary = getattr(info, "summary", "Unknown")

    info_dict = {
        "arXiv ID": vid,
        "Title": article_title,
        "Authors": authors,
        "Link": link,
        "Direct PDF URL": direct_pdf_url,
        "Published": published,
        "Updated": updated,
        "Summary": summary,
    }
    return json.dumps(info_dict)


@mcp.tool()
async def search_arxiv(
    ctx: Context,
    all_fields: Optional[str] = None,
    title: Optional[str] = None,
    author: Optional[str] = None,
    abstract: Optional[str] = None,
    start: int = 0,
    max_results: int = 10,
) -> Any:
    """
    Performs a search query on the arXiv API based on specified parameters and returns matching article metadata.
    This function allows for flexible querying of the arXiv database. Only parameters that are explicitly provided
    will be included in the final search query. At least one search field must be provided.

    Args:
        all_fields: General keyword search across all metadata fields including title, abstract, authors, comments, and categories.
        title: Keyword(s) to search for within the titles of articles.
        author: Author name(s) to filter results by.
        abstract: Keyword(s) to search for within article abstracts.
        start: Index of the first result to return; used for paginating through search results. Defaults to 0.
        max_results: Maximum number of results to return (1-50).

    Returns:
        A mapping of article title to {"arXiv ID": ..., "Authors": [...]};
        otherwise, a structured error JSON string.
    """
    prefixed_params = []
    if author:
        author = format_text(author)
        prefixed_params.append(f"au:{author}")
    if all_fields:
        all_fields = format_text(all_fields)
        prefixed_params.append(f"all:{all_fields}")
    if title:
        title = format_text(title)
        prefixed_params.append(f"ti:{title}")
    if abstract:
        abstract = format_text(abstract)
        prefixed_params.append(f"abs:{abstract}")

    if not prefixed_params:
        # An empty search_query is never a meaningful request.
        return _error(
            "MISSING_PARAM",
            "Provide at least one of 'all_fields', 'title', 'author' or 'abstract'.",
        )

    # Construct search query
    search_query = " AND ".join(prefixed_params)
    params = {
        "search_query": search_query,
        "start": max(0, start),
        "max_results": max(1, min(max_results, 50)),
    }

    await ctx.info("Calling the API")
    response = await make_api_call(f"{ARXIV_API_BASE}/query", params=params)
    if response is None:
        return _error("API_ERROR", "Unable to retrieve data from arXiv.org.")

    feed = feedparser.parse(response)
    error_msg = (
        "Unable to extract information for your query. "
        "This issue may stem from an incorrect search query."
    )
    if not feed.entries:
        return _error("NOT_FOUND", error_msg)

    entries: Dict[str, Dict[str, Any]] = {}
    await ctx.info("Extracting information")
    for entry in feed.entries:
        article_title = entry.title
        arxiv_id = entry.id.split("/abs/")[-1]
        entries[article_title] = {"arXiv ID": arxiv_id, "Authors": _author_names(entry)}
    return entries


def _resolve_port(arg_port: Optional[int]) -> int:
    """Pick the HTTP port: CLI argument, then $MCP_PORT, then $PORT, then 8081.

    Raises:
        ValueError: If the selected environment variable is not an integer.
    """
    if arg_port is not None:
        return arg_port
    for key in ("MCP_PORT", "PORT"):
        value = os.getenv(key)
        if value:
            try:
                return int(value)
            except ValueError as exc:
                raise ValueError(f"Invalid integer for {key}: {value}") from exc
    return 8081


def main():
    """Parse CLI/environment configuration and run the server over HTTP or STDIO."""
    parser = argparse.ArgumentParser(description="Run the arXiv MCP server.")
    parser.add_argument(
        "--transport",
        choices=("stdio", "http"),
        help="Transport to use. Defaults to HTTP when PORT is set, otherwise STDIO.",
    )
    parser.add_argument("--host", help="Host to bind for HTTP transport (default 0.0.0.0).")
    parser.add_argument("--port", type=int, help="Port to bind for HTTP transport.")
    parser.add_argument("--path", help="HTTP endpoint path (default /mcp).")
    args = parser.parse_args()

    transport = args.transport or os.getenv("MCP_TRANSPORT")
    if not transport:
        transport = "http" if os.getenv("PORT") else "stdio"

    if transport == "http":
        host = args.host or os.getenv("MCP_HOST") or os.getenv("HOST") or "0.0.0.0"
        port = _resolve_port(args.port)
        path = (
            args.path
            or os.getenv("MCP_HTTP_PATH")
            or os.getenv("FASTMCP_STREAMABLE_HTTP_PATH")
            or os.getenv("FASTMCP_HTTP_PATH")
            or "/mcp"
        )
        if not path.startswith("/"):
            path = f"/{path}"

        app = mcp.http_app(path=path, transport="http")

        primary_route = None
        for route in app.router.routes:
            if getattr(route, "path", None) == path:
                primary_route = route
                break

        if primary_route is not None:
            primary_route.methods = {"GET", "POST", "OPTIONS"}
            # Mirror the MCP endpoint under the well-known path. Only possible
            # when the primary route was found, since its endpoint is reused.
            if path != "/.well-known/mcp":
                app.router.routes.append(
                    Route(
                        "/.well-known/mcp",
                        endpoint=primary_route.endpoint,
                        methods=["GET", "POST", "OPTIONS"],
                    )
                )

        config_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "$id": "https://arxiv-mcp/.well-known/mcp-config",
            "title": "ArXiv MCP Session Configuration",
            "description": "Optional session configuration for the arXiv MCP server.",
            "x-query-style": "dot+bracket",
            "type": "object",
            "properties": {
                "downloadPath": {
                    "type": "string",
                    "title": "Download path",
                    "description": "Override the download directory for saved PDFs.",
                    "default": "/data",
                }
            },
            "required": [],
            "additionalProperties": False,
        }

        async def config_endpoint(request):
            """Serve the static session-configuration JSON schema."""
            return JSONResponse(config_schema)

        app.router.routes.append(
            Route("/.well-known/mcp-config", endpoint=config_endpoint, methods=["GET"])
        )

        log_level = os.getenv("FASTMCP_LOG_LEVEL") or "INFO"
        uvicorn_config = {
            "timeout_graceful_shutdown": 0,
            "lifespan": "on",
            "log_level": log_level.lower(),
        }

        print(f"Starting arxiv-server via HTTP on {host}:{port}{path}")
        config = uvicorn.Config(app, host=host, port=port, **uvicorn_config)
        server = uvicorn.Server(config)
        asyncio.run(server.serve())
    else:
        print("Starting arxiv-server via STDIO transport")
        mcp.run(transport="stdio")


if __name__ == "__main__":
    main()

Implementation Reference

Latest Blog Posts

Code Execution with MCP: Architecting Agentic Efficiency
By Om-Shree-0709 on December 14, 2025.
Model Context Protocol Proxies: Enabling Enterprise Control with Virtual MCPs
By Om-Shree-0709 on December 9, 2025.
AI Security
Virtual MCP
Kubernetes Operator
The State of MCP in 2025: Who's Building What and Why It Matters
By punkpeye on December 7, 2025.
mcp
startups

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/lecigarevolant/arxiv-mcp-server-gpt'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.