mcp-simple-arxiv

MIT License

Overview | Inspect (New) | Schema | Related Servers | Reviews | Score

mcp-simple-arxiv
mcp_simple_arxiv

"""
arXiv API client with rate limiting.
"""

import asyncio
import logging
import time
from datetime import datetime, timedelta
from typing import Optional, Dict, List, Any

import feedparser
import httpx

logger = logging.getLogger(__name__)

class ArxivClient:
    """
    arXiv API client with built-in rate limiting.

    Every request made through this client is serialized behind a lock and
    spaced so that no more than 1 request is started every 3 seconds.
    Results are returned as plain dictionaries (see ``_parse_entry``).
    """

    # Minimum spacing between two consecutive API requests, in seconds.
    _MIN_INTERVAL_SECONDS = 3.0
    # Upper bound applied to ``max_results`` before a search is sent.
    _MAX_RESULTS_PER_REQUEST = 2000

    def __init__(self):
        # HTTPS endpoint: httpx does not follow redirects by default, so a
        # redirect from a plain-HTTP URL would surface as an HTTP error.
        self.base_url = "https://export.arxiv.org/api/query"
        # time.monotonic() reading taken when the last request was released,
        # or None if no request has been made yet.
        self._last_request: Optional[float] = None
        self._lock = asyncio.Lock()

    async def _wait_for_rate_limit(self) -> None:
        """
        Sleep as needed so requests are at least 3 seconds apart.

        Uses a monotonic clock so that wall-clock adjustments (NTP, DST,
        manual changes) can neither shorten nor lengthen the wait.
        """
        async with self._lock:
            if self._last_request is not None:
                elapsed = time.monotonic() - self._last_request
                remaining = self._MIN_INTERVAL_SECONDS - elapsed
                if remaining > 0:
                    await asyncio.sleep(remaining)
            self._last_request = time.monotonic()

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace (including newlines) to one space."""
        return " ".join(text.split())

    def _get_html_url(self, arxiv_id: str) -> str:
        """
        Construct the HTML version URL for a paper.

        The HTML version URL is not provided by the API, so it is built from
        the paper ID. The ID is used exactly as given, including any version
        suffix (e.g. ``v2``).

        Args:
            arxiv_id: arXiv paper ID, e.g. "2103.08220" or "2103.08220v2".

        Returns:
            URL of the form ``https://arxiv.org/html/<arxiv_id>``.
        """
        return f"https://arxiv.org/html/{arxiv_id}"

    def _parse_entry(self, entry: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse a feed entry into a paper dictionary.

        Args:
            entry: One entry of the parsed Atom feed. Nested items (links,
                authors, tags, primary category) may be dicts or objects
                exposing the same fields as attributes.

        Returns:
            Dictionary with the keys ``id``, ``title``, ``authors``,
            ``primary_category``, ``categories`` (primary category excluded),
            ``published``, ``updated``, ``summary``, ``comment``,
            ``journal_ref``, ``doi``, ``pdf_url``, ``abstract_url`` and
            ``html_url``. Missing text fields become empty strings; missing
            URLs and a missing primary category become ``None``.
        """
        # Pick the PDF link and the abstract-page link by their MIME type.
        pdf_url = None
        abstract_url = None
        for link in entry.get('links', []):
            if not isinstance(link, dict):
                continue
            link_type = link.get('type')
            if link_type == 'application/pdf':
                pdf_url = link.get('href')
            elif link_type == 'text/html':
                abstract_url = link.get('href')

        # The entry id is the abstract URL; the paper ID follows "/abs/".
        paper_id = entry.get('id', '').split("/abs/")[-1].rstrip()

        # The HTML version URL is derived from the ID, not read from the feed.
        html_url = self._get_html_url(paper_id) if paper_id else None

        authors = []
        for author in entry.get('authors', []):
            if isinstance(author, dict) and 'name' in author:
                authors.append(author['name'])
            elif hasattr(author, 'name'):
                authors.append(author.name)

        primary_category = None
        if 'arxiv_primary_category' in entry:
            primary = entry['arxiv_primary_category']
            if isinstance(primary, dict):
                primary_category = primary.get('term')
            elif hasattr(primary, 'term'):
                primary_category = primary.term

        categories = []
        for category in entry.get('tags', []):
            if isinstance(category, dict) and 'term' in category:
                categories.append(category['term'])
            elif hasattr(category, 'term'):
                categories.append(category.term)

        # The primary category is reported separately, so drop it from the
        # list of additional categories.
        if primary_category and primary_category in categories:
            categories.remove(primary_category)

        return {
            "id": paper_id,
            "title": self._clean_text(entry.get('title', '')),
            "authors": authors,
            "primary_category": primary_category,
            "categories": categories,
            "published": entry.get('published', ''),
            "updated": entry.get('updated', ''),
            "summary": self._clean_text(entry.get('summary', '')),
            "comment": self._clean_text(entry.get('arxiv_comment', '')),
            "journal_ref": entry.get('arxiv_journal_ref', ''),
            "doi": entry.get('arxiv_doi', ''),
            "pdf_url": pdf_url,
            "abstract_url": abstract_url,  # URL to abstract page
            "html_url": html_url,  # URL to HTML version if available
        }

    async def _fetch_feed(self, params: Dict[str, Any], action: str) -> Any:
        """
        Perform one rate-limited API request and return the parsed feed.

        Args:
            params: Query parameters sent to the API endpoint.
            action: Short description of the operation, used in the error
                log message (e.g. "searching").

        Returns:
            The parsed feed, guaranteed to contain an ``entries`` key.

        Raises:
            ValueError: On any HTTP error, or if the response cannot be
                interpreted as a feed with entries.
        """
        await self._wait_for_rate_limit()

        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(self.base_url, params=params)
                response.raise_for_status()
            except httpx.HTTPError as e:
                logger.error("HTTP error while %s: %s", action, e)
                # Callers only need to handle ValueError; keep the original
                # exception attached as the cause for debugging.
                raise ValueError(f"arXiv API HTTP error: {str(e)}") from e

        feed = feedparser.parse(response.text)
        if not isinstance(feed, dict) or 'entries' not in feed:
            logger.error("Invalid response from arXiv API")
            logger.debug("Response text: %s...", response.text[:1000])
            raise ValueError("Invalid response from arXiv API")
        return feed

    async def search(self, query: str, max_results: int = 10) -> List[Dict[str, Any]]:
        """
        Search arXiv papers, newest submissions first.

        The query string supports arXiv's advanced search syntax:
        - Search in title: ti:"search terms"
        - Search in abstract: abs:"search terms"
        - Search by author: au:"author name"
        - Combine terms with: AND, OR, ANDNOT
        - Filter by category: cat:cs.AI

        Examples:
        - "machine learning"  (searches all fields)
        - ti:"neural networks" AND cat:cs.AI  (title with category)
        - au:bengio AND ti:"deep learning"  (author and title)

        Args:
            query: Search query in the syntax described above.
            max_results: Maximum number of papers to return; values above
                2000 are reduced to 2000.

        Returns:
            List of paper dictionaries (see ``_parse_entry``); an empty list
            when nothing matches.

        Raises:
            ValueError: On an HTTP error or an invalid API response.
        """
        params = {
            "search_query": query,
            # Ensure max_results is within the per-request API limit.
            "max_results": min(max_results, self._MAX_RESULTS_PER_REQUEST),
            "sortBy": "submittedDate",  # Default to newest papers first
            "sortOrder": "descending",
        }

        feed = await self._fetch_feed(params, "searching")

        # Empty results are ok - the comprehension then yields an empty list.
        entries = feed.get('entries') or []
        return [self._parse_entry(entry) for entry in entries]

    async def get_paper(self, paper_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific paper.

        Args:
            paper_id: arXiv paper ID (e.g., "2103.08220")

        Returns:
            Dictionary containing paper metadata, including:
            - Basic metadata (title, authors, dates)
            - Categories (primary and others)
            - Abstract and comments
            - URLs (abstract page, PDF version, HTML version if available)
            - DOI if available

        Raises:
            ValueError: If the paper is not found, on an HTTP error, or on
                an invalid API response.
        """
        params = {
            "id_list": paper_id,
            "max_results": 1,
        }

        feed = await self._fetch_feed(params, "fetching paper")

        entries = feed.get('entries')
        if not entries:
            raise ValueError(f"Paper not found: {paper_id}")

        return self._parse_entry(entries[0])

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andybrandt/mcp-simple-arxiv'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.