ArXiv MCP Server

Overview Schema Related Servers Score Discussions

search.py•18.3 KiB

"""Search functionality for the arXiv MCP server.""" import arxiv import json import logging import httpx import xml.etree.ElementTree as ET from typing import Dict, Any, List, Optional from datetime import datetime, timezone from dateutil import parser import mcp.types as types from ..config import Settings logger = logging.getLogger("arxiv-mcp-server") settings = Settings() # arXiv API endpoint for raw queries (bypasses arxiv package URL encoding issues) # Use HTTPS to avoid redirect from http -> https ARXIV_API_URL = "https://export.arxiv.org/api/query" # XML namespaces used in arXiv Atom feed ARXIV_NS = { "atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom", } # Valid arXiv category prefixes for validation VALID_CATEGORIES = { "cs", "econ", "eess", "math", "physics", "q-bio", "q-fin", "stat", "astro-ph", "cond-mat", "gr-qc", "hep-ex", "hep-lat", "hep-ph", "hep-th", "math-ph", "nlin", "nucl-ex", "nucl-th", "quant-ph", } async def _raw_arxiv_search( query: str, max_results: int = 10, sort_by: str = "relevance", date_from: Optional[str] = None, date_to: Optional[str] = None, categories: Optional[List[str]] = None, ) -> List[Dict[str, Any]]: """ Perform arXiv search using raw HTTP requests. This bypasses the arxiv Python package to avoid URL encoding issues with date filters. The arxiv package encodes '+' as '%2B' which breaks the submittedDate:[YYYYMMDD+TO+YYYYMMDD] syntax. 
""" # Build query components query_parts = [] if query.strip(): query_parts.append(f"({query})") # Add category filtering if categories: category_filter = " OR ".join(f"cat:{cat}" for cat in categories) query_parts.append(f"({category_filter})") # Add date filtering using arXiv API syntax if date_from or date_to: try: if date_from: start_date = parser.parse(date_from).strftime("%Y%m%d0000") else: start_date = "199107010000" # arXiv started July 1991 if date_to: end_date = parser.parse(date_to).strftime("%Y%m%d2359") else: end_date = datetime.now().strftime("%Y%m%d2359") # CRITICAL: This must NOT be URL-encoded. The '+' in '+TO+' must remain literal. date_filter = f"submittedDate:[{start_date}+TO+{end_date}]" query_parts.append(date_filter) logger.debug(f"Added date filter: {date_filter}") except (ValueError, TypeError) as e: logger.error(f"Error parsing dates: {e}") raise ValueError(f"Invalid date format. Use YYYY-MM-DD format: {e}") if not query_parts: raise ValueError("No search criteria provided") # Combine query parts with AND (space in arXiv = AND) final_query = " AND ".join(query_parts) logger.debug(f"Raw API query: {final_query}") # Map sort parameter to arXiv API values sort_map = { "relevance": "relevance", "date": "submittedDate", } sort_order = "descending" # Build the URL manually to avoid encoding the '+' in date ranges # We encode most parameters but carefully preserve '+TO+' in date filters base_params = f"max_results={max_results}&sortBy={sort_map.get(sort_by, 'relevance')}&sortOrder={sort_order}" # Manually construct search_query parameter # We need to encode spaces and special chars BUT NOT the '+' in '+TO+' # Strategy: encode the query parts separately, then join with encoded AND encoded_query = ( final_query.replace(" AND ", "+AND+").replace(" OR ", "+OR+").replace(" ", "+") ) # But we need to be careful about existing '+TO+' - it should stay as-is # Since we built the date filter with literal '+TO+', it's already correct url = 
f"{ARXIV_API_URL}?search_query={encoded_query}&{base_params}" logger.debug(f"Raw API URL: {url}") # Make the request async with httpx.AsyncClient(timeout=30.0) as client: response = await client.get(url) response.raise_for_status() # Parse the Atom XML response return _parse_arxiv_atom_response(response.text) def _parse_arxiv_atom_response(xml_text: str) -> List[Dict[str, Any]]: """Parse arXiv Atom XML response into paper dictionaries.""" results = [] try: root = ET.fromstring(xml_text) for entry in root.findall("atom:entry", ARXIV_NS): # Extract paper ID from the id URL id_elem = entry.find("atom:id", ARXIV_NS) if id_elem is None or id_elem.text is None: continue # ID format: http://arxiv.org/abs/XXXX.XXXXX or http://arxiv.org/abs/category/XXXXXXX paper_id = id_elem.text.split("/abs/")[-1] # Remove version suffix for short ID short_id = paper_id.split("v")[0] if "v" in paper_id else paper_id # Title title_elem = entry.find("atom:title", ARXIV_NS) title = ( title_elem.text.strip().replace("\n", " ") if title_elem is not None and title_elem.text else "" ) # Authors authors = [] for author in entry.findall("atom:author", ARXIV_NS): name_elem = author.find("atom:name", ARXIV_NS) if name_elem is not None and name_elem.text: authors.append(name_elem.text) # Abstract/Summary summary_elem = entry.find("atom:summary", ARXIV_NS) abstract = ( summary_elem.text.strip().replace("\n", " ") if summary_elem is not None and summary_elem.text else "" ) # Categories categories = [] for cat in entry.findall("arxiv:primary_category", ARXIV_NS): term = cat.get("term") if term: categories.append(term) for cat in entry.findall("atom:category", ARXIV_NS): term = cat.get("term") if term and term not in categories: categories.append(term) # Published date published_elem = entry.find("atom:published", ARXIV_NS) published = ( published_elem.text if published_elem is not None and published_elem.text else "" ) # PDF URL pdf_url = None for link in entry.findall("atom:link", ARXIV_NS): if 
link.get("title") == "pdf": pdf_url = link.get("href") break if not pdf_url: pdf_url = f"http://arxiv.org/pdf/{paper_id}" results.append( { "id": short_id, "title": title, "authors": authors, "abstract": abstract, "categories": categories, "published": published, "url": pdf_url, "resource_uri": f"arxiv://{short_id}", } ) except ET.ParseError as e: logger.error(f"Failed to parse arXiv XML response: {e}") raise ValueError(f"Failed to parse arXiv API response: {e}") return results search_tool = types.Tool( name="search_papers", description="""Search for papers on arXiv with advanced filtering and query optimization. QUERY CONSTRUCTION GUIDELINES: - Use QUOTED PHRASES for exact matches: "multi-agent systems", "neural networks", "machine learning" - Combine related concepts with OR: "AI agents" OR "software agents" OR "intelligent agents" - Use field-specific searches for precision: - ti:"exact title phrase" - search in titles only - au:"author name" - search by author - abs:"keyword" - search in abstracts only - Use ANDNOT to exclude unwanted results: "machine learning" ANDNOT "survey" - For best results, use 2-4 core concepts rather than long keyword lists ADVANCED SEARCH PATTERNS: - Field + phrase: ti:"transformer architecture" for papers with exact title phrase - Multiple fields: au:"Smith" AND ti:"quantum" for author Smith's quantum papers - Exclusions: "deep learning" ANDNOT ("survey" OR "review") to exclude survey papers - Broad + narrow: "artificial intelligence" AND (robotics OR "computer vision") CATEGORY FILTERING (highly recommended for relevance): - cs.AI: Artificial Intelligence - cs.MA: Multi-Agent Systems - cs.LG: Machine Learning - cs.CL: Computation and Language (NLP) - cs.CV: Computer Vision - cs.RO: Robotics - cs.HC: Human-Computer Interaction - cs.CR: Cryptography and Security - cs.DB: Databases EXAMPLES OF EFFECTIVE QUERIES: - ti:"reinforcement learning" with categories: ["cs.LG", "cs.AI"] - for RL papers by title - au:"Hinton" AND "deep learning" 
with categories: ["cs.LG"] - for Hinton's deep learning work - "multi-agent" ANDNOT "survey" with categories: ["cs.MA"] - exclude survey papers - abs:"transformer" AND ti:"attention" with categories: ["cs.CL"] - attention papers with transformer abstracts DATE FILTERING: Use YYYY-MM-DD format for historical research: - date_to: "2015-12-31" - for foundational/classic work (pre-2016) - date_from: "2020-01-01" - for recent developments (post-2020) - Both together for specific time periods RESULT QUALITY: Results sorted by RELEVANCE (most relevant papers first), not just newest papers. This ensures you get the most pertinent results regardless of publication date. TIPS FOR FOUNDATIONAL RESEARCH: - Use date_to: "2010-12-31" to find classic papers on BDI, SOAR, ACT-R - Combine with field searches: ti:"BDI" AND abs:"belief desire intention" - Try author searches: au:"Rao" AND "BDI" for Anand Rao's foundational BDI work""", inputSchema={ "type": "object", "properties": { "query": { "type": "string", "description": 'Search query using quoted phrases for exact matches (e.g., \'"machine learning" OR "deep learning"\') or specific technical terms. Avoid overly broad or generic terms.', }, "max_results": { "type": "integer", "description": "Maximum number of results to return (default: 10, max: 50). Use 15-20 for comprehensive searches.", }, "date_from": { "type": "string", "description": "Start date for papers (YYYY-MM-DD format). Use to find recent work, e.g., '2023-01-01' for last 2 years.", }, "date_to": { "type": "string", "description": "End date for papers (YYYY-MM-DD format). Use with date_from to find historical work, e.g., '2020-12-31' for older research.", }, "categories": { "type": "array", "items": {"type": "string"}, "description": "Strongly recommended: arXiv categories to focus search (e.g., ['cs.AI', 'cs.MA'] for agent research, ['cs.LG'] for ML, ['cs.CL'] for NLP, ['cs.CV'] for vision). 
Greatly improves relevance.", }, "sort_by": { "type": "string", "enum": ["relevance", "date"], "description": "Sort results by 'relevance' (most relevant first, default) or 'date' (newest first). Use 'relevance' for focused searches, 'date' for recent developments.", }, }, "required": ["query"], }, ) def _validate_categories(categories: List[str]) -> bool: """Validate that all provided categories are valid arXiv categories.""" for category in categories: if "." in category: prefix = category.split(".")[0] else: prefix = category if prefix not in VALID_CATEGORIES: logger.warning(f"Unknown category prefix: {prefix}") return False return True def _optimize_query(query: str) -> str: """Minimal query optimization - preserve user intent while fixing obvious issues.""" # Don't modify queries with existing field specifiers (ti:, au:, abs:, cat:) if any( field in query for field in ["ti:", "au:", "abs:", "cat:", "AND", "OR", "ANDNOT"] ): logger.debug("Field-specific or boolean query detected - no optimization") return query # Don't modify queries that are already quoted if query.startswith('"') and query.endswith('"'): logger.debug("Pre-quoted query detected - no optimization") return query # For very long queries (>10 terms), suggest user be more specific rather than auto-converting terms = query.split() if len(terms) > 10: logger.warning( f"Very long query ({len(terms)} terms) - consider using quotes for phrases or field-specific searches" ) # Only optimization: preserve the original query exactly as intended return query def _process_paper(paper: arxiv.Result) -> Dict[str, Any]: """Process paper information with resource URI.""" return { "id": paper.get_short_id(), "title": paper.title, "authors": [author.name for author in paper.authors], "abstract": paper.summary, "categories": paper.categories, "published": paper.published.isoformat(), "url": paper.pdf_url, "resource_uri": f"arxiv://{paper.get_short_id()}", } async def handle_search(arguments: Dict[str, Any]) -> 
List[types.TextContent]: """Handle paper search requests with improved arXiv API integration. Uses raw HTTP requests when date filtering is requested to avoid URL encoding issues with the arxiv Python package. Falls back to the arxiv package for non-date queries for better compatibility. """ try: max_results = min(int(arguments.get("max_results", 10)), settings.MAX_RESULTS) base_query = arguments["query"] date_from_arg = arguments.get("date_from") date_to_arg = arguments.get("date_to") categories = arguments.get("categories") sort_by_arg = arguments.get("sort_by", "relevance") logger.debug( f"Starting search with query: '{base_query}', max_results: {max_results}" ) # Validate categories if provided if categories and not _validate_categories(categories): return [ types.TextContent( type="text", text="Error: Invalid category provided. Please check arXiv category names.", ) ] # Use raw HTTP API when date filtering is requested # This bypasses the arxiv package's URL encoding which breaks date syntax if date_from_arg or date_to_arg: logger.debug( f"Date filtering requested - using raw API: {date_from_arg} to {date_to_arg}" ) try: optimized_query = ( _optimize_query(base_query) if base_query.strip() else "" ) results = await _raw_arxiv_search( query=optimized_query, max_results=max_results, sort_by=sort_by_arg, date_from=date_from_arg, date_to=date_to_arg, categories=categories, ) logger.info( f"Raw API search completed: {len(results)} results returned" ) response_data = {"total_results": len(results), "papers": results} return [ types.TextContent( type="text", text=json.dumps(response_data, indent=2) ) ] except httpx.HTTPStatusError as e: logger.error(f"arXiv API HTTP error: {e}") return [ types.TextContent( type="text", text=f"Error: arXiv API HTTP error - {str(e)}" ) ] except ValueError as e: return [types.TextContent(type="text", text=f"Error: {str(e)}")] # For non-date queries, use the arxiv package (more robust parsing) client = arxiv.Client() # Build query 
components query_parts = [] # Add base query with optimization if base_query.strip(): optimized_query = _optimize_query(base_query) query_parts.append(f"({optimized_query})") if optimized_query != base_query: logger.debug(f"Optimized query: '{base_query}' -> '{optimized_query}'") # Add category filtering if categories: category_filter = " OR ".join(f"cat:{cat}" for cat in categories) query_parts.append(f"({category_filter})") logger.debug(f"Added category filter: {category_filter}") # Combine query parts if not query_parts: return [ types.TextContent( type="text", text="Error: No search criteria provided" ) ] # Combine query parts - arXiv uses space for AND by default final_query = " ".join(query_parts) logger.debug(f"Final arXiv query: {final_query}") # Determine sort method if sort_by_arg == "date": sort_criterion = arxiv.SortCriterion.SubmittedDate logger.debug("Using date sorting (newest first)") else: sort_criterion = arxiv.SortCriterion.Relevance logger.debug("Using relevance sorting (most relevant first)") search = arxiv.Search( query=final_query, max_results=max_results, sort_by=sort_criterion, ) # Process results results = [] for paper in client.results(search): if len(results) >= max_results: break results.append(_process_paper(paper)) logger.info(f"Search completed: {len(results)} results returned") response_data = {"total_results": len(results), "papers": results} return [ types.TextContent(type="text", text=json.dumps(response_data, indent=2)) ] except arxiv.ArxivError as e: logger.error(f"ArXiv API error: {e}") return [ types.TextContent(type="text", text=f"Error: ArXiv API error - {str(e)}") ] except Exception as e: logger.error(f"Unexpected search error: {e}") return [types.TextContent(type="text", text=f"Error: {str(e)}")]

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/blazickjp/arxiv-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

search.py•18.3 KiB