import asyncio
import os
import traceback
from typing import Any, Dict, List, Literal, Optional, cast
from loguru import logger
from fastmcp import FastMCP
from pydantic import BaseModel, Field, field_validator, model_validator
import typer
from xlin import xmap_async
from .types import Paper, PaperSource, paper2text
from .sources.arxiv import ArxivSearcher
from .sources.pubmed import PubMedSearcher
from .sources.biorxiv import BioRxivSearcher
from .sources.medrxiv import MedRxivSearcher
from .sources.google_scholar import GoogleScholarSearcher
from .sources.iacr import IACRSearcher
from .sources.semantic import SemanticSearcher
from .sources.crossref import CrossRefSearcher
# from .academic_platforms.hub import SciHubSearcher
# Initialize MCP server
mcp = FastMCP("academic_mcp")
SAVE_PATH = os.getenv("ACADEMIC_MCP_DOWNLOAD_PATH", "./downloads")
os.makedirs(SAVE_PATH, exist_ok=True)
# Instances of searchers
arxiv_searcher = ArxivSearcher()
pubmed_searcher = PubMedSearcher()
biorxiv_searcher = BioRxivSearcher()
medrxiv_searcher = MedRxivSearcher()
google_scholar_searcher = GoogleScholarSearcher()
iacr_searcher = IACRSearcher()
semantic_searcher = SemanticSearcher()
crossref_searcher = CrossRefSearcher()
# scihub_searcher = SciHubSearcher()
engine2searcher: Dict[str, PaperSource] = {
"arxiv": arxiv_searcher,
"pubmed": pubmed_searcher,
"biorxiv": biorxiv_searcher,
"medrxiv": medrxiv_searcher,
"google_scholar": google_scholar_searcher,
"iacr": iacr_searcher,
"semantic": semantic_searcher,
"crossref": crossref_searcher,
# "scihub": scihub_searcher,
}
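# To add a new source, instantiate its PaperSource implementation above and
# register it here; `paper_search` fans out over these keys when a query
# does not specify a searcher.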
# region paper_search
class PaperQuery(BaseModel):
searcher: Optional[Literal["arxiv", "pubmed", "biorxiv", "medrxiv", "google_scholar", "iacr", "semantic", "crossref"]] = Field(
default=None,
description="The academic platform to search from. None means searching from all available platforms.",
)
query: str = Field(
...,
min_length=1,
max_length=500,
description="Search query string. Must be between 1 and 500 characters.",
)
max_results: int = Field(
default=10,
ge=1,
le=100,
description="Maximum number of results to return. Must be between 1 and 100.",
)
fetch_details: Optional[bool] = Field(
default=True,
description="""[Only applicable to searcher == 'iacr']
Whether to fetch detailed information for each paper.""",
)
year: Optional[str] = Field(
default=None,
pattern=r"^\d{4}(-\d{4})?|\d{4}-|-\d{4}$",
description="""[Only applicable to searcher == 'semantic']
Year filter for Semantic Scholar search. Valid formats:
- Single year: '2019'
- Year range: '2016-2020'
- From year onwards: '2010-'
- Up to year: '-2015'""",
)
kwargs: Optional[Dict[str, Any]] = Field(
default=None,
description="""[Only applicable to searcher == 'crossref']
Additional search parameters:
- filter: CrossRef filter string (e.g., 'has-full-text:true,from-pub-date:2020')
- sort: Sort field ('relevance', 'published', 'updated', 'deposited', etc.)
- order: Sort order ('asc' or 'desc')""",
)
@field_validator('query')
@classmethod
def validate_query(cls, v: str) -> str:
"""Validate and clean the query string."""
v = v.strip()
if not v:
raise ValueError("Query cannot be empty or whitespace only")
return v
@model_validator(mode='after')
def validate_searcher_specific_params(self) -> 'PaperQuery':
"""Validate that searcher-specific parameters are only used with appropriate searchers."""
if self.year is not None and self.searcher not in [None, 'semantic']:
raise ValueError("'year' parameter is only applicable when searcher is 'semantic' or None")
if self.kwargs is not None and self.searcher not in [None, 'crossref']:
raise ValueError("'kwargs' parameter is only applicable when searcher is 'crossref' or None")
        if self.fetch_details is False and self.searcher not in [None, 'iacr']:
            raise ValueError("'fetch_details' parameter is only applicable when searcher is 'iacr' or None")
return self
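# A minimal usage sketch (illustrative values, not from a real query log):
# a Semantic Scholar query with a year-range filter passes validation, while
# the same `year` combined with another searcher raises a ValueError from
# validate_searcher_specific_params.
#
#   PaperQuery(searcher="semantic", query="protein folding", max_results=5, year="2016-2020")  # ok
#   PaperQuery(searcher="arxiv", query="protein folding", year="2016-2020")                    # raises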
# Async adapter for the synchronous searchers: run the blocking `search`
# call in a worker thread so it doesn't stall the event loop.
async def async_search(searcher: PaperSource, query: str, max_results: int, **kwargs) -> List[Paper]:
    if "year" in kwargs:
        return await asyncio.to_thread(searcher.search, query, year=kwargs["year"], max_results=max_results)
    return await asyncio.to_thread(searcher.search, query, max_results=max_results)
def expand_query(query_list: List[PaperQuery]) -> List[PaperQuery]:
expanded_queries = []
for query in query_list:
if query.searcher:
expanded_queries.append(query)
else:
# Expand to all available platforms
for engine in engine2searcher.keys():
expanded_query = query.model_copy(update={"searcher": engine})
expanded_queries.append(expanded_query)
return expanded_queries
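# Example of the fan-out (hypothetical query, shown for illustration):
#
#   expand_query([PaperQuery(query="graph neural networks", max_results=3)])
#   # -> one PaperQuery per key in engine2searcher:
#   #    [PaperQuery(searcher="arxiv", ...), PaperQuery(searcher="pubmed", ...), ...]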
async def async_search_per_query(query: PaperQuery) -> List[Paper]:
    searcher = engine2searcher.get(query.searcher)
    if not searcher:
        return []
    # Searcher-specific call signatures; the underlying implementations are
    # synchronous, so run them in a worker thread.
    if query.searcher == "iacr":
        return await asyncio.to_thread(iacr_searcher.search, query.query, query.max_results, query.fetch_details)
    if query.searcher == "semantic":
        return await asyncio.to_thread(semantic_searcher.search, query.query, query.year, query.max_results)
    if query.searcher == "crossref":
        return await asyncio.to_thread(crossref_searcher.search, query.query, query.max_results, **(query.kwargs or {}))
    return await async_search(searcher, query.query, query.max_results)
async def async_search_per_query_list(query_list: List[PaperQuery]) -> List[Paper]:
    all_papers = await asyncio.gather(*[async_search_per_query(query) for query in query_list])
    # Flatten the per-query result lists into one list of papers.
    return [paper for batch in all_papers for paper in batch]
@mcp.tool(
name="paper_search",
description="""Search academic papers from multiple sources.
## Available sources: arXiv, PubMed, bioRxiv, medRxiv, Google Scholar, IACR ePrint Archive, Semantic Scholar, CrossRef.
## Input Constraints:
- query: 1-500 characters, required, cannot be empty
- max_results: 1-100, default is 10
- year: Valid formats: '2019', '2016-2020', '2010-', '-2015' (only for semantic)
- fetch_details: boolean (only for iacr)
- kwargs: dict (only for crossref)
## Example:
paper_search([
{"searcher": "arxiv", "query": "machine learning", "max_results": 5},
{"searcher": "pubmed", "query": "cancer immunotherapy", "max_results": 3},
{"searcher": "iacr", "query": "cryptography", "max_results": 3, "fetch_details": true},
{"searcher": "semantic", "query": "climate change", "max_results": 4, "year": "2015-2020"},
{"searcher": "crossref", "query": "deep learning", "max_results": 2, "kwargs": {"filter": "from-pub-date:2020,has-full-text:true"}},
{"query": "deep learning", "max_results": 2}
])
""",
)
async def paper_search(query_list: List[PaperQuery]) -> str:
    expanded_queries = expand_query(query_list)
    papers = await xmap_async(expanded_queries, async_search_per_query_list, is_async_work_func=True, desc="Searching papers", is_batch_work_func=True, batch_size=1)
    # Skip per-query error placeholders; render every real paper to text.
    texts = [paper2text(cast(Paper, paper)) for paper in papers if not (isinstance(paper, dict) and "error" in paper)]
    return "\n\n".join(texts) if texts else "No papers found."
# endregion paper_search
# region paper_download
class PaperDownloadQuery(BaseModel):
searcher: Literal["arxiv", "pubmed", "biorxiv", "medrxiv", "google_scholar", "iacr", "semantic", "crossref"] = Field(
description="The academic platform to download from."
)
paper_id: str = Field(
...,
min_length=1,
max_length=200,
description="""The unique identifier of the paper to download. Format depends on the searcher:
- arxiv: arXiv ID (e.g., '2106.12345')
- pubmed: PubMed ID/PMID (e.g., '32790614')
- biorxiv: bioRxiv DOI (e.g., '10.1101/2020.01.01.123456')
- medrxiv: medRxiv DOI (e.g., '10.1101/2020.01.01.123456')
- iacr: IACR paper ID (e.g., '2009/101')
- semantic: Semantic Scholar ID or prefixed ID (e.g., 'DOI:10.18653/v1/N18-3011', 'ARXIV:2106.15928')
- crossref: DOI (e.g., '10.1038/s41586-020-2649-2')"""
)
@field_validator('paper_id')
@classmethod
def validate_paper_id(cls, v: str) -> str:
"""Validate and clean the paper ID."""
v = v.strip()
if not v:
raise ValueError("Paper ID cannot be empty or whitespace only")
return v
async def async_download_per_query(query: PaperDownloadQuery) -> str:
    searcher = engine2searcher.get(query.searcher)
    if not searcher:
        return f"Searcher '{query.searcher}' not found."
    try:
        # download_pdf is synchronous; run it in a worker thread so concurrent
        # downloads don't block the event loop.
        pdf_path = await asyncio.to_thread(searcher.download_pdf, query.paper_id, SAVE_PATH)
        return pdf_path
except Exception as e:
logger.error(f"Error downloading paper {query.paper_id} from {query.searcher}: {e}\n{traceback.format_exc()}")
return f"Error downloading paper {query.paper_id} from {query.searcher}: {e}"
@mcp.tool(
name="paper_download",
description="""Download academic paper PDFs from multiple sources.
## Input Constraints:
- searcher: Required, must be one of the supported platforms
- paper_id: Required, 1-200 characters, cannot be empty
## Paper ID formats:
- arXiv: Use the arXiv ID (e.g., "2106.12345").
- PubMed: Use the PubMed ID (PMID) (e.g., "32790614").
- bioRxiv: Use the bioRxiv DOI (e.g., "10.1101/2020.01.01.123456").
- medRxiv: Use the medRxiv DOI (e.g., "10.1101/2020.01.01.123456").
- Google Scholar: Direct PDF download is not supported; please use the paper URL to access the publisher's website.
- IACR: Use the IACR paper ID (e.g., "2009/101").
- Semantic Scholar: Use a Semantic Scholar paper identifier in one of the following formats:
- Semantic Scholar ID (e.g., "649def34f8be52c8b66281af98ae884c09aef38b")
- DOI:<doi> (e.g., "DOI:10.18653/v1/N18-3011")
- ARXIV:<id> (e.g., "ARXIV:2106.15928")
- MAG:<id> (e.g., "MAG:112218234")
- ACL:<id> (e.g., "ACL:W12-3903")
- PMID:<id> (e.g., "PMID:19872477")
- PMCID:<id> (e.g., "PMCID:2323736")
- URL:<url> (e.g., "URL:https://arxiv.org/abs/2106.15928v1")
## Returns:
List of paths to the downloaded PDF files.
## Example:
paper_download([
{"searcher": "arxiv", "paper_id": "2106.12345"},
{"searcher": "pubmed", "paper_id": "32790614"},
{"searcher": "biorxiv", "paper_id": "10.1101/2020.01.01.123456"},
{"searcher": "semantic", "paper_id": "DOI:10.18653/v1/N18-3011"}
])
""",
)
async def paper_download(query_list: List[PaperDownloadQuery]) -> List[str]:
    pdf_paths = await xmap_async(query_list, async_download_per_query, is_async_work_func=True, desc="Downloading papers")
    return pdf_paths
# endregion paper_download
# region paper_read
@mcp.tool(
name="paper_read",
description="""Read and extract text content from academic paper PDFs from multiple sources.
## Input Constraints:
- searcher: Required, must be one of: arxiv, pubmed, biorxiv, medrxiv, iacr, semantic, crossref
- paper_id: Required, 1-200 characters, cannot be empty
## Example:
### arXiv
paper_read({"searcher": "arxiv", "paper_id": "2106.12345"})  # paper_id is the arXiv ID.
### PubMed
paper_read({"searcher": "pubmed", "paper_id": "32790614"})  # paper_id is the PubMed ID (PMID).
### bioRxiv
paper_read({"searcher": "biorxiv", "paper_id": "10.1101/2020.01.01.123456"})  # paper_id is the bioRxiv DOI.
### medRxiv
paper_read({"searcher": "medrxiv", "paper_id": "10.1101/2020.01.01.123456"})  # paper_id is the medRxiv DOI.
### IACR
paper_read({"searcher": "iacr", "paper_id": "2009/101"})  # paper_id is the IACR paper ID.
### Semantic Scholar
paper_read({"searcher": "semantic", "paper_id": "DOI:10.18653/v1/N18-3011"})
where paper_id is a Semantic Scholar paper identifier in one of the following formats:
- Semantic Scholar ID (e.g., "649def34f8be52c8b66281af98ae884c09aef38b")
- DOI:<doi> (e.g., "DOI:10.18653/v1/N18-3011")
- ARXIV:<id> (e.g., "ARXIV:2106.15928")
- MAG:<id> (e.g., "MAG:112218234")
- ACL:<id> (e.g., "ACL:W12-3903")
- PMID:<id> (e.g., "PMID:19872477")
- PMCID:<id> (e.g., "PMCID:2323736")
- URL:<url> (e.g., "URL:https://arxiv.org/abs/2106.15928v1")
### CrossRef
paper_read({"searcher": "crossref", "paper_id": "10.1038/s41586-020-2649-2"})  # paper_id is the DOI.
""")
async def paper_read(
searcher: Literal["arxiv", "pubmed", "biorxiv", "medrxiv", "iacr", "semantic", "crossref"],
paper_id: str = Field(
...,
min_length=1,
max_length=200,
description="The unique identifier of the paper to read (format depends on searcher)"
),
) -> str:
try:
# Validate paper_id
paper_id = paper_id.strip()
if not paper_id:
return "Error: paper_id cannot be empty or whitespace only"
searcher_instance = engine2searcher.get(searcher)
if not searcher_instance:
return f"Searcher '{searcher}' not found or not supported."
        # read_paper is synchronous; run it in a worker thread.
        text = await asyncio.to_thread(searcher_instance.read_paper, paper_id, SAVE_PATH)
return text
except Exception as e:
logger.error(f"Error converting paper to text: {e}\n{traceback.format_exc()}")
return f"Error converting paper to text: {e}"
# endregion paper_read
app = typer.Typer(add_completion=False)
@app.callback(invoke_without_command=True)
def run(
host: str = typer.Option("127.0.0.1", help="Bind host (SSE/HTTP only)."),
port: int = typer.Option(8000, min=1, max=65535, help="Bind port (SSE/HTTP only)."),
debug: bool = typer.Option(False, help="Enable debug logging."),
transport: Optional[Literal["stdio", "sse", "streamable-http", "http"]] = typer.Option(
None,
"--transport",
"-t",
help="Transport method. One of: stdio, sse, streamable-http, http. Default is stdio; if host/port are set, defaults to sse.",
),
) -> None:
"""运行 Academic MCP 服务器。
默认使用 stdio(适配 MCP 客户端)。如需网络服务(SSE/HTTP),设置环境变量:
- `ACADEMIC_MCP_TRANSPORT=sse` 或 `ACADEMIC_MCP_TRANSPORT=streamable-http`
"""
log_level = "debug" if debug else "info"
if not transport or transport == "stdio":
logger.info("Starting Academic MCP server with stdio transport")
mcp.run(transport="stdio", log_level=log_level)
return
logger.info(f"Starting Academic MCP server on {host}:{port} with transport '{transport}'")
mcp.run(transport=transport, host=host, port=port, log_level=log_level)
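# Example invocations (a sketch; "academic-mcp" stands in for however the
# console script is actually exposed by this package):
#
#   academic-mcp                                  # stdio transport (default)
#   academic-mcp --transport sse --port 8000      # SSE server on 127.0.0.1:8000
#   academic-mcp -t streamable-http --host 0.0.0.0 --port 8080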
def main() -> None:
"""Console script entrypoint."""
app()
if __name__ == "__main__":
main()