Skip to main content
Glama

ArxivSearcher MCP Server

by emi-dm
remote_arxiv_mcp.py (27.2 kB)
import json
import logging
import os
import re
from collections import Counter
from datetime import datetime
from typing import Any, Dict, List

import arxiv
import fitz
import pandas as pd
from fastmcp import FastMCP
from sklearn.feature_extraction.text import TfidfVectorizer

# Server instance that every prompt, resource and tool in this file registers on.
mcp = FastMCP("Arxiv Searcher 🚀", port=8001, host="0.0.0.0")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


# MCP Prompts and Descriptions
#
# Each prompt only formats a natural-language instruction; none of them
# contacts arXiv. The docstrings are left untouched because they double as the
# human-readable description of the registered prompt.


# NOTE: a tool with the same Python name (`search_by_author`) is defined later
# in this file and rebinds the module-level name after this prompt is registered.
@mcp.prompt
def search_by_author(author_name: str) -> str:
    """Generates a prompt to search for papers by an author."""
    return f"Search for the latest papers by '{author_name}' on Arxiv."


@mcp.prompt
def search_by_recent_topic(topic: str) -> str:
    """Generates a prompt to search for recent papers on a topic."""
    return (
        f"Show me the most relevant papers on '{topic}' "
        "from the last year on Arxiv."
    )


@mcp.prompt
def search_by_keyword_in_title(keyword: str) -> str:
    """Generates a prompt to search for a specific keyword in paper titles."""
    return f"Find papers with the keyword '{keyword}' directly in the title."


@mcp.prompt
def search_author_and_topic(author_name: str, topic: str) -> str:
    """Generates a combined prompt to search for papers by an author on a specific topic."""
    return f"Look for papers by '{author_name}' on the topic of '{topic}'."


@mcp.prompt
def search_in_category(topic: str, category: str) -> str:
    """Generates a prompt to search for a topic within a specific arXiv category."""
    return f"Search for '{topic}' within the arXiv category '{category}'."


@mcp.prompt
def get_latest_papers_in_category(category: str = "cs.LG") -> str:
    """Generates a prompt to get the most recent submissions in a category."""
    return f"What are the 5 newest papers in the '{category}' category?"


@mcp.prompt
def get_paper_by_id(arxiv_id: str) -> str:
    """Generates a prompt to fetch a specific paper by its ID."""
    return f"Get me the details for the paper with ArXiv ID: '{arxiv_id}'."
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------

# Words too common to be useful as search keywords.
_STOP_WORDS = frozenset(
    {
        "a", "an", "and", "the", "of", "in", "for", "to", "with", "on",
        "is", "are", "was", "were", "it",
    }
)

# Lower bound used for the submittedDate range when no start date is supplied.
_EARLIEST_SUBMITTED_DATE = "19910814"

# Formats accepted by export_search_results.
_EXPORT_FORMATS = ("bibtex", "csv", "json", "markdown")

_SEARCH_TIPS = """
**Advanced ArXiv Search Guide for LLMs**

This guide details how to build effective search queries for the ArXiv API.

**1. Logical Operators:**
- `AND`: Finds documents containing ALL terms.
  - Example: `electron AND positron`
- `OR`: Finds documents containing ANY of the terms.
  - Example: `GAN OR "Generative Adversarial Network"`
- `ANDNOT`: Excludes documents containing the term. (Less common, use with care).
  - Example: `transformer ANDNOT "electrical"`

**2. Specific Field Searching:**
You can limit your search to specific fields of a paper using prefixes.
- `ti:` (Title): Searches only in the paper's title.
  - Example: `ti:"Quantum Computing"`
- `au:` (Author): Searches by author name. For exact names, use quotes.
  - Example: `au:"Yann LeCun"`
- `abs:` (Abstract): Searches only in the abstract.
  - Example: `abs:"self-attention mechanism"`
- `cat:` (Category): Filters by a specific ArXiv category.
  - Example: `cat:cs.AI`

**3. Combining Searches (Advanced Examples):**
- **Author and Title:** Find papers by a specific author that contain a keyword in the title.
  - `au:"Yoshua Bengio" AND ti:consciousness`
- **Topic in Multiple Categories:** Search for a topic across several categories of interest.
  - `(ti:"language model" OR abs:"language model") AND (cat:cs.CL OR cat:cs.LG)`
- **Exclude a Secondary Topic:** Search for 'transformers' in computer vision, but exclude those that mention NLP.
  - `(abs:"computer vision" AND ti:transformer) ANDNOT abs:NLP`

**4. Date Syntax:**
- The `search_papers` tool handles dates with the `start_date` and `end_date` parameters.
- It is preferable to use these parameters instead of including dates directly in the `query` for greater precision.
"""


def _parse_date(value: str, *, end_of_year: bool = False) -> str:
    """Convert 'YYYY-MM-DD' or 'YYYY' into the 'YYYYMMDD' form used in queries.

    A bare year maps to January 1st, or to December 31st when *end_of_year*
    is true. Raises ValueError with a readable message for any other input.
    """
    value = value.strip()
    try:
        return datetime.strptime(value, "%Y-%m-%d").strftime("%Y%m%d")
    except ValueError:
        pass  # not a full date; try the year-only form below
    try:
        parsed = datetime.strptime(value, "%Y")
    except ValueError:
        raise ValueError(
            f"Invalid date '{value}'. Expected YYYY-MM-DD or YYYY."
        ) from None
    if end_of_year:
        parsed = parsed.replace(month=12, day=31)
    return parsed.strftime("%Y%m%d")


def _date_range_clause(start_date: str | None, end_date: str | None) -> str | None:
    """Build the `submittedDate:[... TO ...]` clause, or None if no date is given."""
    if not (start_date or end_date):
        return None
    start = _parse_date(start_date) if start_date else _EARLIEST_SUBMITTED_DATE
    if end_date:
        end = _parse_date(end_date, end_of_year=True)
    else:
        end = datetime.now().strftime("%Y%m%d")
    return f"submittedDate:[{start} TO {end}]"


def _extract_keywords(words) -> list:
    """Drop stop words and very short words; strip quotes that would break the query."""
    cleaned = (word.replace('"', "") for word in words)
    return [w for w in cleaned if w.lower() not in _STOP_WORDS and len(w) > 2]


def _keyword_clause(keywords) -> str:
    """OR together a title/abstract match for every keyword."""
    return " OR ".join(f'(ti:"{kw}" OR abs:"{kw}")' for kw in keywords)


def _paper_to_dict(result) -> dict:
    """Flatten an arxiv result object into the plain dict returned by the search tools."""
    return {
        "title": result.title,
        "authors": [a.name for a in result.authors],
        "summary": result.summary,
        "pdf_url": result.pdf_url,
        "published_date": result.published.strftime("%Y-%m-%d"),
        "arxiv_id": result.entry_id.split("/")[-1],
        "categories": result.categories,
    }


# ---------------------------------------------------------------------------
# Resources
# ---------------------------------------------------------------------------


@mcp.resource("data://arxiv_categories")
def get_arxiv_categories() -> str:
    """Provides the list of Arxiv categories in JSON format."""
    with open("arxiv_categories.json", "r", encoding="utf-8") as f:
        categories = json.load(f)  # parsing also validates the file
    # Return text, as the annotation promises, rather than the parsed object.
    return json.dumps(categories, ensure_ascii=False)


@mcp.resource("data://search_tips")
def get_search_tips() -> str:
    """Returns a document with tips for searching on Arxiv."""
    return _SEARCH_TIPS


@mcp.resource("data://mcp_readme")
def get_mcp_readme() -> str:
    """Provides the project's README.md file."""
    with open("README.md", "r", encoding="utf-8") as file:
        return file.read()


@mcp.resource("data://downloaded_papers")
def list_downloaded_papers() -> str:
    """
    Reads all downloaded PDF files, extracts their content, and returns it in a
    structured XML-like format: <Paper {title}> {content} </Paper>.

    Requires the 'PyMuPDF' library to be installed (`pip install PyMuPDF`).
    Files that cannot be processed are logged and skipped.
    """
    directory = "downloaded_papers"
    if not os.path.exists(directory):
        return ""  # Return empty string if no directory

    papers_content = []
    # Sorted so the output order does not depend on the filesystem.
    pdf_files = sorted(f for f in os.listdir(directory) if f.endswith(".pdf"))

    for filename in pdf_files:
        try:
            # 1. Extract arXiv ID (with version) from filename
            arxiv_id = os.path.splitext(filename)[0]

            # 2. Fetch paper metadata (title) from arXiv API
            search = arxiv.Search(id_list=[arxiv_id])
            paper_meta = next(search.results(), None)
            title = paper_meta.title if paper_meta else f"Unknown Title for {arxiv_id}"

            # 3. Read PDF content
            filepath = os.path.join(directory, filename)
            with fitz.open(filepath) as doc:
                content = "".join(page.get_text() for page in doc)

            # 4. Format the output for this paper
            papers_content.append(f"<Paper {title} > {content} </Paper>")
        except Exception as e:
            # Best effort: one unreadable file must not hide the others.
            logging.error(f"Failed to process file {filename}: {e}")
            continue

    return "\n\n".join(papers_content)


# ---------------------------------------------------------------------------
# MCP Tools
# ---------------------------------------------------------------------------


@mcp.tool
async def search_papers(
    query: str,
    max_results: int = 10,
    start_date: str | None = None,
    end_date: str | None = None,
    sort_by_relevance: bool = True,
    category: str = "cs.SE",
) -> dict:
    """
    Search for papers on arXiv. It can parse natural language queries, extracting keywords and years for filtering.

    :param query: The base search query. Can be natural language.
    :param max_results: The maximum number of results to return.
    :param start_date: The start date for the search period (YYYY-MM-DD or YYYY). Overrides years in query.
    :param end_date: The end date for the search period (YYYY-MM-DD or YYYY). Overrides years in query.
    :param sort_by_relevance: If True, sorts by relevance. If False, sorts by submission date.
    :param category: The arXiv category to search in (e.g., 'cs.AI', 'cs.CL', 'cs.SE').
    :return: Dict with 'query_used' and 'results', or a dict with an 'error' key
        when a date is malformed or no search criteria remain.
    """
    # Extract years from query to use as date filters if not provided explicitly
    years_in_query = re.findall(r"\b(20\d{2})\b", query)
    query_text = re.sub(r"\b(20\d{2})\b", "", query).strip()

    # Use provided dates or fall back to dates from query
    effective_start_date = start_date
    if not effective_start_date and years_in_query:
        effective_start_date = min(years_in_query)
    effective_end_date = end_date
    if not effective_end_date and years_in_query:
        effective_end_date = max(years_in_query)

    query_parts = []
    keywords = _extract_keywords(query_text.split())
    if keywords:
        # Join keywords with OR for broader results
        query_parts.append(f"({_keyword_clause(keywords)})")
    else:
        # Fall back to the whole text, but never emit an empty ti:""/abs:"" clause
        # (happens when the query held only a year or stop words).
        phrase = query_text.replace('"', "").strip()
        if phrase:
            query_parts.append(f'(ti:"{phrase}" OR abs:"{phrase}")')

    if category:
        query_parts.append(f"cat:{category}")

    try:
        date_clause = _date_range_clause(effective_start_date, effective_end_date)
    except ValueError as exc:
        return {"error": str(exc)}
    if date_clause:
        query_parts.append(date_clause)

    if not query_parts:
        return {"error": "Empty query: provide search terms, a category, or a date range."}

    final_query = " AND ".join(query_parts)
    logging.info("[arxiv-search] Query sent: %s", final_query)

    sort_criterion = (
        arxiv.SortCriterion.Relevance
        if sort_by_relevance
        else arxiv.SortCriterion.SubmittedDate
    )
    search = arxiv.Search(
        query=final_query,
        max_results=max_results,
        sort_by=sort_criterion,
        sort_order=arxiv.SortOrder.Descending,
    )

    # Results carry 'arxiv_id' and 'categories' so they can be fed straight
    # into analyze_paper_trends (which reads 'categories').
    results = [_paper_to_dict(r) for r in search.results()]
    return {"query_used": final_query, "results": results}


@mcp.tool
async def get_paper_details(arxiv_id: str) -> dict:
    """
    Get detailed information about a specific paper by ArXiv ID.

    :param arxiv_id: The ArXiv ID (e.g., '2301.12345')
    :return: Dict of paper metadata, or a dict with an 'error' key.
    """
    try:
        search = arxiv.Search(id_list=[arxiv_id])
        # A default avoids StopIteration, whose str() is empty and would
        # produce an error message with no explanation.
        paper = next(search.results(), None)
        if paper is None:
            return {"error": f"Paper with ID {arxiv_id} not found."}

        return {
            "title": paper.title,
            "authors": [a.name for a in paper.authors],
            "summary": paper.summary,
            "pdf_url": paper.pdf_url,
            "published_date": paper.published.strftime("%Y-%m-%d"),
            "updated_date": paper.updated.strftime("%Y-%m-%d"),
            "categories": paper.categories,
            "primary_category": paper.primary_category,
            "arxiv_id": paper.entry_id.split("/")[-1],
            "doi": paper.doi,
            "journal_ref": paper.journal_ref,
            "comment": paper.comment,
        }
    except Exception as e:
        return {"error": f"Failed to fetch paper details: {str(e)}"}


@mcp.tool
async def search_by_author(
    author_name: str,
    max_results: int = 20,
    category: str | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
) -> dict:
    """
    Search papers by a specific author.

    :param author_name: Name of the author to search for
    :param max_results: Maximum number of results
    :param category: Optional category filter (e.g., 'cs.SE', 'cs.AI')
    :param start_date: Optional start date filter (YYYY-MM-DD or YYYY)
    :param end_date: Optional end date filter (YYYY-MM-DD or YYYY)
    :return: Dict with 'author', 'query_used', 'total_results' and 'results',
        or a dict with an 'error' key when a date is malformed.
    """
    query_parts = [f'au:"{author_name}"']
    if category:
        query_parts.append(f"cat:{category}")

    # Add date range if specified
    try:
        date_clause = _date_range_clause(start_date, end_date)
    except ValueError as exc:
        return {"error": str(exc)}
    if date_clause:
        query_parts.append(date_clause)

    final_query = " AND ".join(query_parts)
    logging.info("[arxiv-search] Author query: %s", final_query)

    search = arxiv.Search(
        query=final_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )
    results = [_paper_to_dict(r) for r in search.results()]

    return {
        "author": author_name,
        "query_used": final_query,
        "total_results": len(results),
        "results": results,
    }


def _analyze_authors(results) -> dict:
    """Count papers per author and summarise collaboration sizes."""
    author_lists = [paper.get("authors", []) for paper in results]
    author_counts = Counter(name for authors in author_lists for name in authors)
    return {
        "type": "authors",
        "total_unique_authors": len(author_counts),
        "most_prolific_authors": author_counts.most_common(10),
        "collaboration_stats": {
            "avg_authors_per_paper": sum(len(a) for a in author_lists) / len(results),
            "single_author_papers": sum(1 for a in author_lists if len(a) == 1),
            "multi_author_papers": sum(1 for a in author_lists if len(a) > 1),
        },
    }


def _analyze_timeline(results) -> dict:
    """Count papers per publication year (the part of 'published_date' before the first '-')."""
    dates = (paper.get("published_date", "") for paper in results)
    year_counts = Counter(date.split("-")[0] for date in dates if date)
    return {
        "type": "timeline",
        "papers_by_year": dict(sorted(year_counts.items())),
        "most_active_year": year_counts.most_common(1)[0] if year_counts else None,
        "total_years_span": len(year_counts),
    }


def _analyze_categories(results) -> dict:
    """Count how often each category appears across the papers."""
    category_counts = Counter(
        cat for paper in results for cat in paper.get("categories", [])
    )
    return {
        "type": "categories",
        "total_categories": len(category_counts),
        "most_common_categories": category_counts.most_common(10),
        "category_distribution": dict(category_counts),
    }


def _analyze_keywords(results) -> dict:
    """Rank terms from titles and abstracts by summed TF-IDF score."""
    documents = [
        f"{paper.get('title', '')} {paper.get('summary', '')}" for paper in results
    ]
    try:
        vectorizer = TfidfVectorizer(
            max_features=50, stop_words="english", ngram_range=(1, 2), min_df=2
        )
        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.sum(axis=0).A1
        ranked = sorted(
            zip(feature_names, scores), key=lambda item: item[1], reverse=True
        )
        return {
            "type": "keywords",
            # Plain str/float so the result serialises without numpy types.
            "top_keywords": [(str(term), float(score)) for term, score in ranked[:20]],
            "total_unique_terms": len(feature_names),
        }
    except Exception as e:
        # TF-IDF could not be computed: report why and fall back to raw
        # word frequencies so the caller still gets something usable.
        word_counts = Counter(
            word
            for doc in documents
            for word in re.findall(r"[a-z]{3,}", doc.lower())
            if word not in _STOP_WORDS
        )
        return {
            "type": "keywords",
            "error": f"Could not perform keyword analysis: {str(e)}",
            "fallback_word_count": dict(word_counts.most_common(20)),
        }


@mcp.tool
async def analyze_paper_trends(
    papers: List[Dict[str, Any]], analysis_type: str = "authors"
) -> dict:
    """
    Analyze trends in a collection of papers.

    :param papers: List of papers from search_papers results (the full result
        dict with a 'results' key is accepted too)
    :param analysis_type: Type of analysis ('authors', 'keywords', 'timeline', 'categories')
    :return: Analysis dict including 'total_papers_analyzed', or a dict with an
        'error' key for invalid input or an unsupported analysis type.
    """
    if isinstance(papers, dict) and "results" in papers:
        results = papers["results"]
    elif isinstance(papers, list):
        results = papers
    else:
        return {
            "error": "Invalid papers format. Expected list or dict with 'results' key."
        }

    if not results:
        return {"error": "No papers to analyze"}

    analyzers = {
        "authors": _analyze_authors,
        "timeline": _analyze_timeline,
        "categories": _analyze_categories,
        "keywords": _analyze_keywords,
    }
    analyzer = analyzers.get(analysis_type)
    if analyzer is None:
        # Previously an unknown type silently returned only the paper count.
        return {
            "error": f"Unsupported analysis type: {analysis_type}. "
            f"Expected one of: {', '.join(analyzers)}."
        }

    analysis = analyzer(results)
    analysis["total_papers_analyzed"] = len(results)
    return analysis


@mcp.tool
async def find_related_papers(
    paper_title: str,
    max_results: int = 10,
    similarity_threshold: float = 0.7,
    category: str | None = None,
) -> dict:
    """
    Find papers related to a given paper title using keyword similarity.

    :param paper_title: Title of the reference paper
    :param max_results: Maximum number of related papers to return
    :param similarity_threshold: Minimum similarity score (0.0 to 1.0)
    :param category: Optional category filter
    :return: Dict describing the related papers, or a dict with an 'error' key.
    """
    try:
        # Extract keywords from the title; de-duplicate (order preserved) so a
        # repeated word neither bloats the query nor counts twice in the score.
        words = (word.lower() for word in re.findall(r"\b\w+\b", paper_title))
        keywords = list(dict.fromkeys(_extract_keywords(words)))

        if not keywords:
            return {"error": "No meaningful keywords found in title"}

        # Create search query from keywords
        query_parts = [f"({_keyword_clause(keywords)})"]
        if category:
            query_parts.append(f"cat:{category}")
        final_query = " AND ".join(query_parts)

        search = arxiv.Search(
            query=final_query,
            max_results=max_results * 2,  # Get more results to filter by similarity
            sort_by=arxiv.SortCriterion.Relevance,
            sort_order=arxiv.SortOrder.Descending,
        )

        results = []
        for r in search.results():
            # Similarity = share of title keywords that occur (as substrings)
            # in the candidate's title + abstract.
            paper_text = f"{r.title} {r.summary}".lower()
            matches = sum(1 for kw in keywords if kw in paper_text)
            similarity = matches / len(keywords)
            if similarity < similarity_threshold:
                continue

            summary = r.summary
            if len(summary) > 500:
                summary = summary[:500] + "..."
            results.append(
                {
                    "title": r.title,
                    "authors": [a.name for a in r.authors],
                    "summary": summary,
                    "pdf_url": r.pdf_url,
                    "published_date": r.published.strftime("%Y-%m-%d"),
                    "similarity_score": round(similarity, 3),
                    "arxiv_id": r.entry_id.split("/")[-1],
                }
            )

        # Sort by similarity score and limit results
        results.sort(key=lambda x: x["similarity_score"], reverse=True)
        results = results[:max_results]

        return {
            "reference_title": paper_title,
            "keywords_used": keywords,
            "similarity_threshold": similarity_threshold,
            "total_related_found": len(results),
            "related_papers": results,
        }
    except Exception as e:
        return {"error": f"Failed to find related papers: {str(e)}"}


def _bare_arxiv_id(paper: dict) -> str | None:
    """Return the paper's arXiv ID without version suffix, or None if unknown."""
    pdf_url = paper.get("pdf_url") or ""
    # Take everything after /pdf/ and drop an optional 'vN' version and '.pdf'
    # suffix. (Stopping at the first 'v' would truncate IDs such as
    # 'solv-int/9901001'.)
    match = re.search(r"/pdf/(.+?)(?:v\d+)?(?:\.pdf)?$", pdf_url)
    if match:
        return match.group(1)
    raw_id = paper.get("arxiv_id") or ""
    return re.sub(r"v\d+$", "", raw_id) or None


def _render_bibtex(papers, query_info: str) -> str:
    """Render papers as BibTeX @article entries preceded by a comment header."""
    export_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entries = [f"% Query: {query_info}\n% Exported: {export_time}\n"]
    used_keys = set()

    for paper in papers:
        authors = paper.get("authors", ["unknown"])
        year = paper.get("published_date", "unknown").split("-")[0]

        lastname = "unknown"
        if authors and isinstance(authors, list) and authors[0] != "unknown":
            lastname = authors[0].split(" ")[-1]
        # Keys must be plain ASCII alphanumerics; fall back if nothing survives.
        lastname = re.sub(r"[^a-zA-Z0-9]", "", lastname).lower() or "unknown"

        # Handle duplicates by appending _1, _2, ...
        base_key = f"{lastname}{year}"
        key = base_key
        suffix = 1
        while key in used_keys:
            key = f"{base_key}_{suffix}"
            suffix += 1
        used_keys.add(key)

        title = paper.get("title", "No Title Provided")
        author_str = " and ".join(paper.get("authors", []))
        pdf_url = paper.get("pdf_url", "")

        arxiv_id = _bare_arxiv_id(paper)
        # Never present the citation key as if it were an arXiv identifier.
        journal = f"arXiv preprint arXiv:{arxiv_id}" if arxiv_id else "arXiv preprint"

        entries.append(
            f"""@article{{{key},
  title = {{{title}}},
  author = {{{author_str}}},
  year = {{{year}}},
  journal = {{{journal}}},
  url = {{{pdf_url}}}
}}"""
        )

    return "\n\n".join(entries)


def _render_markdown(papers) -> str:
    """Render papers as Markdown sections separated by horizontal rules."""
    md_entries = []
    for paper in papers:
        title = paper.get("title", "N/A")
        authors = ", ".join(paper.get("authors", ["N/A"]))
        date = paper.get("published_date", "N/A")
        url = paper.get("pdf_url", "#")
        summary = paper.get("summary", "N/A").replace("\n", " ")
        md_entries.append(
            f"""### {title}\n**Authors:** {authors}\n**Published:** {date}\n**[PDF Link]({url})**\n> {summary}\n"""
        )
    return "\n---\n".join(md_entries)


@mcp.tool
async def export_search_results(
    results: Dict[str, Any],
    format: str = "bibtex",
    filename: str | None = None,
    save_path: str | None = None,
) -> dict:
    """
    Export search results to various formats.

    :param results: Results from search_papers or other search functions
        (a dict with a 'results' key, or a plain list of papers)
    :param format: Export format ('bibtex', 'csv', 'json', 'markdown')
    :param filename: Output filename (without extension)
    :param save_path: Directory to save the file (default: current directory)
    :return: Dict with 'success', the saved path and a content preview, or a
        dict with 'success': False and an 'error' message.
    """
    try:
        # Extract papers from results
        if isinstance(results, dict) and "results" in results:
            papers = results["results"]
            query_info = results.get("query_used", "N/A")
        elif isinstance(results, list):
            papers = results
            query_info = "N/A"  # a bare list carries no query metadata
        else:
            return {
                "success": False,
                "error": "Invalid results format. Expected a list of papers or a dict with a 'results' key.",
            }

        if not papers:
            return {"success": False, "error": "No papers to export."}

        # Validate before touching the filesystem so bad input leaves no
        # stray directory behind.
        if format not in _EXPORT_FORMATS:
            return {"success": False, "error": f"Unsupported format: {format}"}

        if save_path is None:
            save_path = os.getcwd()
        os.makedirs(save_path, exist_ok=True)

        # Generate default filename if not provided
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"arxiv_search_{timestamp}"

        full_path = os.path.join(save_path, f"{filename}.{format}")

        if format == "csv":
            df = pd.DataFrame(papers)
            df.to_csv(full_path, index=False, encoding="utf-8-sig")
            content = df.to_string()
        else:
            if format == "bibtex":
                content = _render_bibtex(papers, query_info)
            elif format == "json":
                content = json.dumps(papers, indent=4)
            else:  # markdown
                content = _render_markdown(papers)
            with open(full_path, "w", encoding="utf-8") as f:
                f.write(content)

        return {
            "success": True,
            "format": format,
            "saved_path": full_path,
            "papers_exported": len(papers),
            "content_preview": content[:500] + ("..." if len(content) > 500 else ""),
        }
    except Exception as e:
        return {"success": False, "error": f"Failed to export results: {str(e)}"}


@mcp.tool
async def download_paper(arxiv_id: str, directory: str = "downloaded_papers") -> dict:
    """
    Downloads the PDF of a paper to a local directory on the server.
    NOTE: In a stateless/free hosting environment, this file is temporary and will be deleted when the server restarts or sleeps.

    :param arxiv_id: The ArXiv ID of the paper to download (e.g., '2301.12345').
    :param directory: The local directory where the paper will be saved.
    :return: Dict with 'success' and either the local path or an 'error' message.
    """
    try:
        # Ensure the download directory exists
        os.makedirs(directory, exist_ok=True)

        search = arxiv.Search(id_list=[arxiv_id])
        paper = next(search.results())  # StopIteration here means "not found"

        # Keep only digits, 'v' and '.' in the filename; everything else
        # becomes '_' to avoid issues with special characters.
        clean_id = re.sub(r'[^0-9v.]', '_', arxiv_id)
        filename = f"{clean_id}.pdf"

        # Download the paper to the specified directory
        paper.download_pdf(dirpath=directory, filename=filename)
        filepath = os.path.join(directory, filename)
        logging.info(f"Paper {arxiv_id} downloaded to {filepath}")

        return {
            "success": True,
            "arxiv_id": arxiv_id,
            "local_path": filepath,
            "message": f"Paper is temporarily available at the server path: {filepath}"
        }
    except StopIteration:
        logging.error(f"Paper with ID {arxiv_id} not found.")
        return {"success": False, "error": f"Paper with ID {arxiv_id} not found."}
    except Exception as e:
        logging.error(f"Failed to download paper {arxiv_id}: {e}")
        return {"success": False, "error": f"An unexpected error occurred: {str(e)}"}


if __name__ == "__main__":
    mcp.run(transport="streamable-http")

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/emi-dm/Arxiv-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.