Skip to main content
Glama
andybrandt

mcp-simple-arxiv

by andybrandt

get_paper_data

Retrieve detailed academic paper information including abstracts and available formats from arXiv using a paper ID.

Instructions

Get detailed information about a specific paper including abstract and available formats.

Input Schema

Table | JSON Schema

| Name | Required | Description | Default |
| --- | --- | --- | --- |
| paper_id | Yes | | |

Implementation Reference

  • The MCP tool handler for 'get_paper_data', registered via the @app.tool decorator with annotations. It takes paper_id: str, fetches the paper using ArxivClient, then formats and returns the detailed paper information as a string.
    @app.tool(
        annotations={
            "title": "Get arXiv Paper Data",
            "readOnlyHint": True,
            "openWorldHint": True
        }
    )
    async def get_paper_data(paper_id: str) -> str:
        """Get detailed information about a specific paper including abstract and available formats."""
        # NOTE(review): the docstring above is presumably surfaced to MCP clients
        # as the tool description, so it is deliberately left unchanged — confirm
        # before expanding it.
        #
        # paper_id is passed straight to arxiv_client.get_paper(); any exception
        # raised there propagates to the caller unchanged.
        #
        # Returns a plain-text report with these sections, in order: title,
        # metadata, abstract, access options and (only when a comment exists)
        # additional information.
        paper = await arxiv_client.get_paper(paper_id)

        # Collect the pieces and join once instead of growing a string with +=.
        parts = [f"Title: {paper['title']}\n\n"]

        # Metadata section
        parts.append("Metadata:\n")
        parts.append(f"- Authors: {', '.join(paper['authors'])}\n")
        parts.append(f"- Published: {paper['published']}\n")
        parts.append(f"- Last Updated: {paper['updated']}\n")

        # Join the category fragments so a missing primary category does not
        # leave a dangling ", " in front of "Additional:".
        category_bits = []
        if paper['primary_category']:
            category_bits.append(f"Primary: {paper['primary_category']}")
        if paper['categories']:
            category_bits.append(f"Additional: {', '.join(paper['categories'])}")
        parts.append(f"- Categories: {', '.join(category_bits)}\n")

        if paper['doi']:
            parts.append(f"- DOI: {paper['doi']}\n")
        if paper["journal_ref"]:
            parts.append(f"- Journal Reference: {paper['journal_ref']}\n")

        # Abstract section
        parts.append("\nAbstract:\n")
        parts.append(paper["summary"] or "")
        parts.append("\n")

        # Access options section. Each URL may be None when the feed entry had
        # no matching link, so every line is emitted only when a URL exists
        # (concatenating None would raise TypeError).
        parts.append("\nAccess Options:\n")
        if paper["abstract_url"]:
            parts.append(f"- Abstract page: {paper['abstract_url']}\n")
        if paper["html_url"]:  # Add HTML version if available
            parts.append(f"- Full text HTML version: {paper['html_url']}\n")
        if paper["pdf_url"]:
            parts.append(f"- PDF version: {paper['pdf_url']}\n")

        # Additional information section (shown only when there is a comment)
        comment = paper["comment"]
        if comment:
            parts.append("\nAdditional Information:\n")
            parts.append(f"- Comment: {comment}\n")

        return "".join(parts)
  • Helper method in the ArxivClient class that queries the arXiv API by paper_id, parses the Atom feed response, and returns the structured paper metadata used by the tool handler.
    async def get_paper(self, paper_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific paper.
        
        Args:
            paper_id: arXiv paper ID (e.g., "2103.08220"). Surrounding
                whitespace is ignored.
            
        Returns:
            Dictionary containing paper metadata, including:
            - Basic metadata (title, authors, dates)
            - Categories (primary and others)
            - Abstract and comments
            - URLs (abstract page, PDF version, HTML version if available)
            - DOI if available

        Raises:
            ValueError: If paper_id is empty, if the HTTP request fails, if
                the response cannot be read as a feed with entries, or if the
                feed contains no entry for the requested ID.
        """
        # Reject unusable IDs up front so no rate-limit slot or network
        # round-trip is spent on a request that cannot succeed.
        paper_id = "" if paper_id is None else str(paper_id).strip()
        if not paper_id:
            raise ValueError("Paper ID must not be empty")

        await self._wait_for_rate_limit()
        
        params = {
            "id_list": paper_id,
            "max_results": 1
        }
        
        async with httpx.AsyncClient(timeout=20.0) as client:
            # Only the network calls can raise httpx.HTTPError, so the try
            # body is limited to them.
            try:
                response = await client.get(self.base_url, params=params)
                response.raise_for_status()
            except httpx.HTTPError as e:
                logger.error("HTTP error while fetching paper: %s", e)
                # Chain the original error so the root cause stays visible.
                raise ValueError(f"arXiv API HTTP error: {str(e)}") from e

        feed = feedparser.parse(response.text)
        if not isinstance(feed, dict) or 'entries' not in feed:
            logger.error("Invalid response from arXiv API")
            logger.debug("Response text: %s...", response.text[:1000])
            raise ValueError("Invalid response from arXiv API")

        if not feed.get('entries'):
            raise ValueError(f"Paper not found: {paper_id}")

        return self._parse_entry(feed.entries[0])
  • Supporting helper that parses an individual arXiv Atom feed entry into the standardized paper dictionary format used by get_paper.
    def _parse_entry(self, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one feed entry into the paper dictionary.

        Args:
            entry: A single feed entry; read via mapping access (``get``,
                ``in``, ``[]``).

        Returns:
            Dictionary with the keys ``id``, ``title``, ``authors``,
            ``primary_category``, ``categories``, ``published``, ``updated``,
            ``summary``, ``comment``, ``journal_ref``, ``doi``, ``pdf_url``,
            ``abstract_url`` and ``html_url``. The URL values are None when
            the entry has no matching link (or, for ``html_url``, no ID).
        """

        def collect(items, field):
            """Gather ``field`` from each item, by key or by attribute."""
            found = []
            for item in items:
                if isinstance(item, dict) and field in item:
                    found.append(item[field])
                elif hasattr(item, field):
                    found.append(getattr(item, field))
            return found

        # Links are told apart by MIME type: the PDF link and the HTML link
        # (the abstract page). The last matching link of each type wins.
        pdf_link = None
        abs_link = None
        for item in entry.get('links', []):
            if not isinstance(item, dict):
                continue
            link_type = item.get('type')
            if link_type == 'application/pdf':
                pdf_link = item.get('href')
            elif link_type == 'text/html':
                abs_link = item.get('href')

        # The paper ID is whatever follows "/abs/" in the entry's id field.
        raw_id = entry.get('id', '')
        arxiv_id = raw_id.split("/abs/")[-1].rstrip()

        # An HTML-version URL is only derived when an ID was found.
        html_link = None
        if arxiv_id:
            html_link = self._get_html_url(arxiv_id)

        author_names = collect(entry.get('authors', []), 'name')

        # Primary category may be a mapping or an object exposing ``term``.
        main_category = None
        if 'arxiv_primary_category' in entry:
            primary = entry['arxiv_primary_category']
            if isinstance(primary, dict):
                main_category = primary.get('term')
            elif hasattr(primary, 'term'):
                main_category = primary.term

        # All tags, minus one occurrence of the primary category so it is not
        # listed twice.
        other_categories = collect(entry.get('tags', []), 'term')
        if main_category and main_category in other_categories:
            other_categories.remove(main_category)

        title = self._clean_text(entry.get('title', ''))
        summary = self._clean_text(entry.get('summary', ''))
        comment = self._clean_text(entry.get('arxiv_comment', ''))

        paper = {
            "id": arxiv_id,
            "title": title,
            "authors": author_names,
            "primary_category": main_category,
            "categories": other_categories,
            "published": entry.get('published', ''),
            "updated": entry.get('updated', ''),
            "summary": summary,
            "comment": comment,
            "journal_ref": entry.get('arxiv_journal_ref', ''),
            "doi": entry.get('arxiv_doi', ''),
            "pdf_url": pdf_link,
            "abstract_url": abs_link,  # URL to abstract page
            "html_url": html_link,  # URL to HTML version if available
        }
        return paper

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andybrandt/mcp-simple-arxiv'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.