from mcp.server.fastmcp import FastMCP, Context
from contextlib import asynccontextmanager
from collections.abc import AsyncIterator
from dataclasses import dataclass
from dotenv import load_dotenv
import feedparser
import aiohttp
import asyncio
import json
import os
import io
import PyPDF2
# Load settings from a .env file (if one is present) before any os.getenv()
# lookup in this module runs; HOST, PORT and TRANSPORT are read that way.
load_dotenv()
# Default number of results to return when a tool call does not pass `limit`.
DEFAULT_RESULT_LIMIT = 5
@dataclass
class ArxivContext:
    """Lifespan state handed to every tool call of the arXiv MCP server.

    Attributes:
        session: The aiohttp client session used for outgoing HTTP requests.
    """

    session: aiohttp.ClientSession
@asynccontextmanager
async def arxiv_lifespan(server: FastMCP) -> AsyncIterator[ArxivContext]:
    """
    Open one HTTP session for the lifetime of the server and close it on exit.

    Args:
        server: The FastMCP server instance (not used by this function)

    Yields:
        ArxivContext: The context wrapping the open aiohttp session
    """
    # Leaving the `async with` closes the session, whether the server shuts
    # down normally or an exception propagates through the yield.
    async with aiohttp.ClientSession() as http_session:
        yield ArxivContext(session=http_session)
# Initialize the FastMCP server with the arXiv lifespan context.
# HOST and PORT come from the environment, defaulting to 0.0.0.0 and 8060.
mcp = FastMCP(
    "arxiv-mcp",
    description="MCP server for retrieving papers from arXiv based on keywords",
    lifespan=arxiv_lifespan,
    host=os.getenv("HOST", "0.0.0.0"),
    # Environment values are always strings; convert explicitly so the port
    # is handed over as an integer instead of relying on implicit coercion.
    port=int(os.getenv("PORT", "8060")),
)
async def fetch_arxiv_papers(
    session: aiohttp.ClientSession,
    query: str,
    limit: int = DEFAULT_RESULT_LIMIT,
) -> list:
    """
    Fetch papers from arXiv based on the query.

    The query is sent to the arXiv export API as ``search_query=all:<query>``
    and the returned Atom feed is parsed with feedparser.

    Args:
        session: The aiohttp session
        query: The search query
        limit: Maximum number of results to return

    Returns:
        List of paper metadata dicts with the keys ``id``, ``title``,
        ``authors``, ``summary``, ``published``, ``updated``, ``link``,
        ``pdf_link`` and ``categories``

    Raises:
        Exception: If the API answers with an HTTP status other than 200
    """
    # Hand the query over as request parameters so it gets percent-encoded.
    # Splicing it into the URL by hand lets characters such as '&' or '#'
    # cut the query short or inject extra parameters.
    params = {
        "search_query": f"all:{query}",
        "start": "0",
        "max_results": str(limit),
    }
    async with session.get("http://export.arxiv.org/api/query", params=params) as response:
        if response.status != 200:
            raise Exception(f"Failed to fetch papers: HTTP {response.status}")
        content = await response.text()

    feed = feedparser.parse(content)
    papers = []
    for entry in feed.entries:
        # The entry id is a URL; the paper ID is whatever follows "/abs/"
        paper_id = entry.id.split('/abs/')[-1]
        papers.append({
            "id": paper_id,
            "title": entry.title,
            # Entries without author data yield an empty list instead of raising
            "authors": [author.get("name", "") for author in entry.get("authors", [])],
            "summary": entry.summary,
            "published": entry.published,
            "updated": entry.updated,
            "link": entry.link,
            "pdf_link": f"http://arxiv.org/pdf/{paper_id}",
            "categories": [tag.term for tag in entry.get("tags", [])],
        })
    return papers
def _extract_pdf_text(pdf_bytes: bytes, max_pages: int, max_chars: int) -> str:
    """Return the text of the first ``max_pages`` pages, cut to ``max_chars`` characters."""
    reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
    page_count = min(max_pages, len(reader.pages))
    # A page without extractable text may yield nothing; count it as empty
    # text instead of failing the whole extraction.
    text = "".join(reader.pages[i].extract_text() or "" for i in range(page_count))
    if len(text) > max_chars:
        text = text[:max_chars] + "... [truncated]"
    return text


async def fetch_paper_content(
    session: aiohttp.ClientSession,
    paper_id: str,
    max_pages: int = 5,
    max_chars: int = 5000,
) -> str:
    """
    Attempt to fetch and extract the content of a paper.

    Only the first pages are read and the result is truncated, because a full
    extraction could be too large to return.

    Args:
        session: The aiohttp session
        paper_id: The arXiv paper ID
        max_pages: Number of leading PDF pages to extract text from
        max_chars: Maximum number of characters kept before truncating

    Returns:
        Extracted text from the paper, or a message starting with
        "Failed to fetch PDF:" / "Error extracting paper content:" when the
        download or the extraction fails. This function does not raise.
    """
    try:
        pdf_url = f"http://arxiv.org/pdf/{paper_id}"
        async with session.get(pdf_url) as response:
            if response.status != 200:
                return f"Failed to fetch PDF: HTTP {response.status}"
            pdf_content = await response.read()
        # PDF parsing is synchronous CPU work; run it in a worker thread so
        # the event loop keeps serving other requests meanwhile.
        return await asyncio.to_thread(_extract_pdf_text, pdf_content, max_pages, max_chars)
    except Exception as e:
        return f"Error extracting paper content: {str(e)}"
def format_paper_to_markdown(paper: dict, content: str = None) -> str:
"""
Format paper metadata and content to Markdown.
Args:
paper: Paper metadata
content: Paper content (if available)
Returns:
Markdown formatted string
"""
md = f"# {paper['title']}\n\n"
# Authors
md += "## Authors\n"
md += ", ".join(paper['authors']) + "\n\n"
# Publication info
md += f"**Published:** {paper['published']}\n"
md += f"**Last Updated:** {paper['updated']}\n"
md += f"**arXiv ID:** {paper['id']}\n"
md += f"**Categories:** {', '.join(paper['categories'])}\n\n"
# Links
md += f"**Paper Link:** [{paper['link']}]({paper['link']})\n"
md += f"**PDF Link:** [{paper['pdf_link']}]({paper['pdf_link']})\n\n"
# Summary
md += "## Abstract\n"
md += paper['summary'] + "\n\n"
# Content (if available)
if content and content.startswith("Error") is False:
md += "## Content Preview\n"
md += "```\n" + content + "\n```\n\n"
return md
@mcp.tool()
async def search_arxiv(ctx: Context, query: str, limit: int = DEFAULT_RESULT_LIMIT) -> str:
    """Search for papers on arXiv based on keywords.

    This tool searches arXiv for papers matching the provided keywords and returns
    the top results with their metadata in a structured format.

    Args:
        ctx: The MCP server provided context
        query: Search keywords or phrases
        limit: Maximum number of results to return (default: 5)

    Returns:
        The matching papers as indented JSON, a notice when nothing matched,
        or an error message when the search fails
    """
    try:
        # The HTTP session lives in the lifespan context of the server
        http_session = ctx.request_context.lifespan_context.session
        matches = await fetch_arxiv_papers(http_session, query, limit)
        if matches:
            return json.dumps(matches, indent=2)
        return "No papers found matching your query."
    except Exception as exc:
        # Failures are reported as text rather than raised to the caller
        return f"Error searching arXiv: {exc!s}"
@mcp.tool()
async def get_paper_details(ctx: Context, paper_id: str, include_content: bool = False) -> str:
    """Get detailed information about a specific arXiv paper.

    This tool retrieves detailed metadata for a specific paper and optionally
    attempts to extract its content.

    Args:
        ctx: The MCP server provided context
        paper_id: The arXiv paper ID (e.g., "2104.08653")
        include_content: Whether to attempt to extract paper content (default: False)

    Returns:
        The paper formatted as Markdown, a notice when the paper was not
        found, or an error message when the lookup fails
    """
    try:
        http_session = ctx.request_context.lifespan_context.session
        # NOTE(review): fetch_arxiv_papers prepends "all:" to its query, so the
        # request carries "all:id:<paper_id>" - confirm that arXiv resolves
        # this to the intended paper.
        matches = await fetch_arxiv_papers(http_session, f"id:{paper_id}", 1)
        if not matches:
            return f"Paper with ID {paper_id} not found."
        # The PDF is only downloaded when the caller asked for the content
        extracted = await fetch_paper_content(http_session, paper_id) if include_content else None
        return format_paper_to_markdown(matches[0], extracted)
    except Exception as exc:
        return f"Error retrieving paper details: {exc!s}"
@mcp.tool()
async def search_and_summarize(ctx: Context, query: str, limit: int = DEFAULT_RESULT_LIMIT) -> str:
    """Search arXiv and provide a comprehensive summary of the top papers.

    This tool searches arXiv for papers matching the provided keywords and
    returns their metadata and abstracts as one Markdown document. The paper
    PDFs themselves are not downloaded.

    Args:
        ctx: The MCP server provided context
        query: Search keywords or phrases
        limit: Maximum number of results to return (default: 5)

    Returns:
        The Markdown document, a notice when nothing matched, or an error
        message when the search fails
    """
    try:
        http_session = ctx.request_context.lifespan_context.session
        matches = await fetch_arxiv_papers(http_session, query, limit)
        if not matches:
            return "No papers found matching your query."

        header = (
            f"# arXiv Search Results for: {query}\n\n"
            f"Found {len(matches)} papers matching your query.\n\n"
        )
        # One Markdown section per paper, numbered from 1
        sections = []
        for rank, entry in enumerate(matches, start=1):
            sections.append("".join([
                f"## {rank}. {entry['title']}\n\n",
                "### Authors\n",
                ", ".join(entry['authors']) + "\n\n",
                f"**Published:** {entry['published']}\n",
                f"**arXiv ID:** {entry['id']}\n",
                f"**Categories:** {', '.join(entry['categories'])}\n\n",
                f"**Paper Link:** [{entry['link']}]({entry['link']})\n",
                f"**PDF Link:** [{entry['pdf_link']}]({entry['pdf_link']})\n\n",
                "### Abstract\n",
                entry['summary'] + "\n\n",
            ]))
        # A horizontal rule goes between papers, not after the last one
        return header + "---\n\n".join(sections)
    except Exception as exc:
        return f"Error searching and summarizing arXiv papers: {exc!s}"
async def main():
    """Run the MCP server on the transport named by the TRANSPORT variable.

    "sse" (also the default when the variable is unset) selects the SSE
    transport; any other value selects stdio.
    """
    use_sse = os.getenv("TRANSPORT", "sse") == 'sse'
    serve = mcp.run_sse_async if use_sse else mcp.run_stdio_async
    await serve()


if __name__ == "__main__":
    asyncio.run(main())