download_paper
Download arXiv research papers by ID to access full text content for reading and analysis.
Instructions
Download a paper and create a resource for it
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| paper_id | Yes | The arXiv ID of the paper to download | |
| check_status | No | If true, only check conversion status without downloading | false |
Implementation Reference
# The core handler function that orchestrates downloading an arXiv paper as
# PDF using arxiv library, converts it to Markdown using pymupdf4llm, tracks
# status, and returns JSON status updates. Handles existing papers, status
# checks, and errors.

# Strong references to in-flight conversion tasks. The event loop itself only
# keeps weak references to tasks, so a task nobody else references can be
# garbage collected before it finishes.
_background_tasks = set()


def _text_response(payload: Dict[str, Any]) -> List[types.TextContent]:
    """Wrap *payload* as the single JSON text item every handler path returns."""
    return [types.TextContent(type="text", text=json.dumps(payload))]


def _download_pdf(paper_id: str, pdf_path: Any) -> bool:
    """Fetch the PDF for *paper_id* to *pdf_path*; return False if arXiv has no such paper.

    Runs in a worker thread. "Not found" is reported through the return value
    rather than by letting ``StopIteration`` escape, because ``StopIteration``
    cannot be raised into an asyncio Future.
    """
    # pdf_path is used via .parent / .name (path-like object from get_paper_path).
    client = arxiv.Client()
    paper = next(client.results(arxiv.Search(id_list=[paper_id])), None)
    if paper is None:
        return False
    paper.download_pdf(dirpath=pdf_path.parent, filename=pdf_path.name)
    return True


async def handle_download(arguments: Dict[str, Any]) -> List[types.TextContent]:
    """Handle paper download and conversion requests.

    Args:
        arguments: Tool arguments. ``paper_id`` is required; ``check_status``
            (default ``False``) asks only for the current state without
            starting a download.

    Returns:
        A one-element list whose text is a JSON object with a ``status`` key:
        ``"success"`` (Markdown file exists), ``"unknown"`` (status check with
        nothing tracked and no file), the tracked status of an existing entry,
        ``"converting"`` (download finished, conversion started in the
        background), or ``"error"``.

    Errors are never raised to the caller; they are reported as an
    ``"error"`` payload.
    """
    try:
        paper_id = arguments["paper_id"]
        check_status = arguments.get("check_status", False)

        # Status-only request: report what is known, never start a download.
        if check_status:
            status = conversion_statuses.get(paper_id)
            if status:
                return _text_response(
                    {
                        "status": status.status,
                        "started_at": status.started_at.isoformat(),
                        "completed_at": (
                            status.completed_at.isoformat()
                            if status.completed_at
                            else None
                        ),
                        "error": status.error,
                        "message": f"Paper conversion {status.status}",
                    }
                )
            md_path = get_paper_path(paper_id, ".md")
            if md_path.exists():
                return _text_response(
                    {
                        "status": "success",
                        "message": "Paper is ready",
                        "resource_uri": f"file://{md_path}",
                    }
                )
            return _text_response(
                {
                    "status": "unknown",
                    "message": "No download or conversion in progress",
                }
            )

        # Already converted: nothing to do.
        md_path = get_paper_path(paper_id, ".md")
        if md_path.exists():
            return _text_response(
                {
                    "status": "success",
                    "message": "Paper already available",
                    "resource_uri": f"file://{md_path}",
                }
            )

        # An entry already exists for this paper: report it instead of
        # starting a second download.
        if paper_id in conversion_statuses:
            status = conversion_statuses[paper_id]
            return _text_response(
                {
                    "status": status.status,
                    "message": f"Paper conversion {status.status}",
                    "started_at": status.started_at.isoformat(),
                }
            )

        # Start a new download and conversion.
        pdf_path = get_paper_path(paper_id, ".pdf")
        status = ConversionStatus(
            paper_id=paper_id, status="downloading", started_at=datetime.now()
        )
        conversion_statuses[paper_id] = status

        # The arXiv lookup and PDF download are blocking network calls; run
        # them in a worker thread so the event loop keeps serving requests.
        downloaded = False
        try:
            downloaded = await asyncio.to_thread(_download_pdf, paper_id, pdf_path)
        finally:
            if not downloaded:
                # Drop the entry on any failure (not found, network error,
                # cancellation). Otherwise it would stay "downloading" forever
                # and the check above would block every retry.
                conversion_statuses.pop(paper_id, None)

        if not downloaded:
            return _text_response(
                {
                    "status": "error",
                    "message": f"Paper {paper_id} not found on arXiv",
                }
            )

        status.status = "converting"

        # Run the conversion in a thread, keeping a reference to the task
        # until it completes so it cannot be collected mid-run.
        task = asyncio.create_task(
            asyncio.to_thread(convert_pdf_to_markdown, paper_id, pdf_path)
        )
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)

        return _text_response(
            {
                "status": "converting",
                "message": "Paper downloaded, conversion started",
                "started_at": status.started_at.isoformat(),
            }
        )

    except Exception as e:
        # Handler boundary: report the failure to the client as a payload.
        return _text_response({"status": "error", "message": f"Error: {str(e)}"})
# The Tool object defining the input schema for the download_paper tool,
# including paper_id (required) and optional check_status.

# JSON Schema for the tool's arguments, kept separate from the Tool
# construction so the two read independently.
_DOWNLOAD_PAPER_INPUT_SCHEMA = {
    "type": "object",
    "properties": {
        "paper_id": {
            "type": "string",
            "description": "The arXiv ID of the paper to download",
        },
        "check_status": {
            "type": "boolean",
            "description": "If true, only check conversion status without downloading",
            "default": False,
        },
    },
    "required": ["paper_id"],
}

download_tool = types.Tool(
    name="download_paper",
    description="Download a paper and create a resource for it",
    inputSchema=_DOWNLOAD_PAPER_INPUT_SCHEMA,
)
# src/arxiv_mcp_server/server.py:41-45 (registration)
# Registration of the download_paper tool via the list_tools MCP handler,
# which returns the download_tool object among others.
@server.list_tools()
async def list_tools() -> List[types.Tool]:
    """Return the arXiv research tools this server advertises."""
    available_tools = [search_tool, download_tool, list_tool, read_tool]
    return available_tools
- src/arxiv_mcp_server/server.py:54-55 (registration) Dispatch/registration in the call_tool handler that routes 'download_paper' calls to the handle_download function: `elif name == "download_paper": return await handle_download(arguments)`
# Helper function to convert downloaded PDF to Markdown format
# asynchronously, updates conversion status.
def convert_pdf_to_markdown(paper_id: str, pdf_path: Path) -> None:
    """Convert PDF to Markdown in a separate thread.

    Writes the Markdown next to where ``get_paper_path(paper_id, ".md")``
    points and records the outcome on the paper's entry in
    ``conversion_statuses`` (if one exists). Never raises: failures are logged
    and stored on the status entry instead.

    Args:
        paper_id: arXiv ID used to locate the output path and status entry.
        pdf_path: Location of the already-downloaded PDF.
    """
    import os  # local import: only needed for the atomic rename/cleanup below

    tmp_path = None
    try:
        logger.info(f"Starting conversion for {paper_id}")
        markdown = pymupdf4llm.to_markdown(pdf_path, show_progress=False)
        md_path = get_paper_path(paper_id, ".md")

        # Write to a temporary sibling and rename into place. The existence of
        # the .md file is what the download handler treats as "paper ready",
        # so a partially written file must never be visible under that name.
        tmp_path = f"{md_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(markdown)
        os.replace(tmp_path, md_path)
        tmp_path = None  # renamed away; nothing left to clean up

        status = conversion_statuses.get(paper_id)
        if status:
            status.status = "success"
            status.completed_at = datetime.now()

        # NOTE: the source PDF is left in place; no cleanup is performed here.
        logger.info(f"Conversion completed for {paper_id}")

    except Exception as e:
        logger.error(f"Conversion failed for {paper_id}: {str(e)}")
        if tmp_path is not None:
            # Best-effort removal of the partial temp file; the conversion
            # error recorded below is the failure that matters.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        status = conversion_statuses.get(paper_id)
        if status:
            status.status = "error"
            status.completed_at = datetime.now()
            status.error = str(e)