"""
Semantic Scholar MCP Server
===========================
Production MCP server providing direct access to Semantic Scholar's
database of 200M+ academic papers within Claude Desktop.
Tools Provided:
- semantic_scholar_search_papers: Advanced paper search with filters
- semantic_scholar_get_paper: Full paper details with citations/references
- semantic_scholar_search_authors: Find researchers by name
- semantic_scholar_get_author: Author profiles and publications
- semantic_scholar_recommendations: AI-powered related paper discovery
- semantic_scholar_bulk_papers: Batch retrieval (up to 500 papers)
Configuration:
API Key (choose one):
- Environment variable: Set SEMANTIC_SCHOLAR_API_KEY
- Per-request: Pass api_key parameter to any tool (takes priority over env var)
Get a free key at: https://www.semanticscholar.org/product/api
Author: Santiago Maniches
- ORCID: https://orcid.org/0009-0005-6480-1987
- LinkedIn: https://www.linkedin.com/in/santiago-maniches/
Organization: TOPOLOGICA LLC
- Website: https://topologica.ai
- Email: santiago@topologica.ai
License: MIT
Repository: https://github.com/smaniches/semantic-scholar-mcp
Copyright (c) 2025 TOPOLOGICA LLC. All rights reserved.
"""
from __future__ import annotations
import json
import logging
import os
from enum import Enum
from typing import Any, Dict, List, Optional
import httpx
from mcp.server.fastmcp import FastMCP
from pydantic import BaseModel, ConfigDict, Field
# ═══════════════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════════
# API Key: Set via environment variable (user provides their own key)
# Get free key at: https://www.semanticscholar.org/product/api
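# Illustrative example (the exact command, args, and server name depend on how this
# package is installed; none of these values are prescribed by this module): a
# Claude Desktop claude_desktop_config.json entry for this server might look like
#
#   "mcpServers": {
#     "semantic-scholar": {
#       "command": "python",
#       "args": ["-m", "semantic_scholar_mcp"],
#       "env": {"SEMANTIC_SCHOLAR_API_KEY": "<your-key>"}
#     }
#   }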
SEMANTIC_SCHOLAR_API_KEY: str = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "")
SEMANTIC_SCHOLAR_API_BASE: str = "https://api.semanticscholar.org/graph/v1"
DEFAULT_TIMEOUT: float = 30.0
# Field sets for comprehensive paper metadata
PAPER_FIELDS: List[str] = [
"paperId", "corpusId", "url", "title", "abstract", "venue", "publicationVenue",
"year", "referenceCount", "citationCount", "influentialCitationCount",
"isOpenAccess", "openAccessPdf", "fieldsOfStudy", "s2FieldsOfStudy",
"publicationTypes", "publicationDate", "journal", "citationStyles",
"authors", "externalIds", "tldr"
]
AUTHOR_FIELDS: List[str] = [
"authorId", "externalIds", "url", "name", "aliases", "affiliations",
"homepage", "paperCount", "citationCount", "hIndex"
]
# Logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger("semantic_scholar_mcp")
# ═══════════════════════════════════════════════════════════════════════════════
# MCP SERVER
# ═══════════════════════════════════════════════════════════════════════════════
mcp = FastMCP(
"semantic_scholar_mcp",
instructions="""
Semantic Scholar MCP Server - Access 200M+ academic papers.
Created by Santiago Maniches (ORCID: 0009-0005-6480-1987)
TOPOLOGICA LLC - https://topologica.ai
Supports DOI, ArXiv, PubMed, ACL, and Semantic Scholar IDs.
"""
)
# ═══════════════════════════════════════════════════════════════════════════════
# PYDANTIC INPUT MODELS
# ═══════════════════════════════════════════════════════════════════════════════
class ResponseFormat(str, Enum):
MARKDOWN = "markdown"
JSON = "json"
class PaperSearchInput(BaseModel):
model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
query: str = Field(..., description="Search query", min_length=1, max_length=500)
year: Optional[str] = Field(default=None, description="Year filter: '2024', '2020-2024', '2020-'")
fields_of_study: Optional[List[str]] = Field(default=None, description="Filter by fields: ['Computer Science', 'Biology']")
publication_types: Optional[List[str]] = Field(default=None, description="Filter: 'Review', 'JournalArticle'")
open_access_only: bool = Field(default=False, description="Only return open access papers")
min_citation_count: Optional[int] = Field(default=None, description="Minimum citations", ge=0)
limit: int = Field(default=10, description="Max results (1-100)", ge=1, le=100)
offset: int = Field(default=0, description="Pagination offset", ge=0)
response_format: ResponseFormat = Field(default=ResponseFormat.MARKDOWN, description="Output format")
api_key: Optional[str] = Field(default=None, description="API key (overrides SEMANTIC_SCHOLAR_API_KEY env var)")
class PaperDetailsInput(BaseModel):
model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
paper_id: str = Field(..., description="Paper ID: S2 ID, DOI:xxx, ARXIV:xxx, PMID:xxx, CorpusId:xxx", min_length=1)
include_citations: bool = Field(default=False, description="Include citing papers")
include_references: bool = Field(default=False, description="Include referenced papers")
citations_limit: int = Field(default=10, description="Max citations to return", ge=1, le=100)
references_limit: int = Field(default=10, description="Max references to return", ge=1, le=100)
response_format: ResponseFormat = Field(default=ResponseFormat.MARKDOWN, description="Output format")
api_key: Optional[str] = Field(default=None, description="API key (overrides SEMANTIC_SCHOLAR_API_KEY env var)")
class AuthorSearchInput(BaseModel):
model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
query: str = Field(..., description="Author name to search", min_length=1, max_length=200)
limit: int = Field(default=10, description="Max results", ge=1, le=100)
offset: int = Field(default=0, description="Pagination offset", ge=0)
response_format: ResponseFormat = Field(default=ResponseFormat.MARKDOWN, description="Output format")
api_key: Optional[str] = Field(default=None, description="API key (overrides SEMANTIC_SCHOLAR_API_KEY env var)")
class AuthorDetailsInput(BaseModel):
model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
author_id: str = Field(..., description="Semantic Scholar author ID", min_length=1)
include_papers: bool = Field(default=True, description="Include publications")
papers_limit: int = Field(default=20, description="Max papers to return", ge=1, le=100)
response_format: ResponseFormat = Field(default=ResponseFormat.MARKDOWN, description="Output format")
api_key: Optional[str] = Field(default=None, description="API key (overrides SEMANTIC_SCHOLAR_API_KEY env var)")
class PaperRecommendationsInput(BaseModel):
model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
paper_id: str = Field(..., description="Seed paper ID for recommendations", min_length=1)
limit: int = Field(default=10, description="Max recommendations", ge=1, le=100)
response_format: ResponseFormat = Field(default=ResponseFormat.MARKDOWN, description="Output format")
api_key: Optional[str] = Field(default=None, description="API key (overrides SEMANTIC_SCHOLAR_API_KEY env var)")
class BulkPaperInput(BaseModel):
model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
paper_ids: List[str] = Field(..., description="List of paper IDs (max 500)", min_length=1, max_length=500)
response_format: ResponseFormat = Field(default=ResponseFormat.JSON, description="Output format")
api_key: Optional[str] = Field(default=None, description="API key (overrides SEMANTIC_SCHOLAR_API_KEY env var)")
# ═══════════════════════════════════════════════════════════════════════════════
# HTTP CLIENT
# ═══════════════════════════════════════════════════════════════════════════════
def _get_headers(api_key: Optional[str] = None) -> Dict[str, str]:
"""Build request headers. User-provided api_key takes priority over env var."""
headers = {"Accept": "application/json", "Content-Type": "application/json"}
effective_key = api_key or SEMANTIC_SCHOLAR_API_KEY
if effective_key:
headers["x-api-key"] = effective_key
return headers
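# For example, _get_headers("abc123") sends x-api-key: abc123 even when the
# SEMANTIC_SCHOLAR_API_KEY env var is also set, while _get_headers(None) falls back
# to the env var; with neither present, requests go out unauthenticated.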
async def _make_request(
    method: str,
    endpoint: str,
    params: Optional[Dict[str, Any]] = None,
    json_body: Optional[Dict[str, Any]] = None,
    api_key: Optional[str] = None,
) -> Any:
    """Make an HTTP request to the Semantic Scholar API.

    Returns the decoded JSON body: a dict for most endpoints, a list for the
    paper/batch endpoint.
    """
url = f"{SEMANTIC_SCHOLAR_API_BASE}/{endpoint}"
headers = _get_headers(api_key)
async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
try:
if method == "GET":
resp = await client.get(url, params=params, headers=headers)
else:
resp = await client.post(url, params=params, json=json_body, headers=headers)
resp.raise_for_status()
return resp.json()
except httpx.HTTPStatusError as e:
_handle_error(e.response.status_code, api_key)
except httpx.TimeoutException:
raise Exception("Request timed out")
return {}
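# Typical internal usage (query values are illustrative only):
#   data = await _make_request("GET", "paper/search",
#                              params={"query": "graph neural networks", "limit": 5,
#                                      "fields": "title,year,citationCount"})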
def _handle_error(status: int, api_key: Optional[str] = None) -> None:
"""Handle API errors with contextual messages."""
if status == 401:
if api_key:
msg = "Auth failed. Check your provided API key."
else:
msg = "Auth failed. Set SEMANTIC_SCHOLAR_API_KEY env var or provide api_key parameter."
raise Exception(f"API Error ({status}): {msg}")
if status == 403:
if api_key:
msg = "Forbidden. Your provided API key may be invalid or expired."
else:
msg = "Forbidden. Check SEMANTIC_SCHOLAR_API_KEY env var or provide api_key parameter."
raise Exception(f"API Error ({status}): {msg}")
errors = {
400: "Bad request. Check syntax.",
404: "Not found. Check ID format.",
429: "Rate limited. Wait and retry.",
500: "Server error. Try later.",
503: "Service unavailable."
}
raise Exception(f"API Error ({status}): {errors.get(status, 'Unknown')}")
# ═══════════════════════════════════════════════════════════════════════════════
# FORMATTING UTILITIES
# ═══════════════════════════════════════════════════════════════════════════════
def _format_paper_markdown(paper: Dict[str, Any]) -> str:
lines = []
title = paper.get("title", "Unknown Title")
year = paper.get("year", "N/A")
lines.append(f"### {title} ({year})")
authors = paper.get("authors", [])
if authors:
names = [a.get("name", "?") for a in authors[:5]]
if len(authors) > 5:
names.append(f"... +{len(authors)-5} more")
lines.append(f"**Authors:** {', '.join(names)}")
venue = paper.get("venue") or (paper.get("publicationVenue") or {}).get("name")
if venue:
lines.append(f"**Venue:** {venue}")
citations = paper.get("citationCount", 0)
influential = paper.get("influentialCitationCount", 0)
lines.append(f"**Citations:** {citations} ({influential} influential)")
pdf_info = paper.get("openAccessPdf") or {}
if pdf_info.get("url"):
lines.append(f"**Open Access:** [PDF]({pdf_info['url']})")
fields = paper.get("fieldsOfStudy") or []
if fields:
lines.append(f"**Fields:** {', '.join(fields[:5])}")
tldr = paper.get("tldr") or {}
if tldr.get("text"):
lines.append(f"**TL;DR:** {tldr['text']}")
abstract = paper.get("abstract")
if abstract:
lines.append(f"**Abstract:** {abstract[:500]}..." if len(abstract) > 500 else f"**Abstract:** {abstract}")
ext_ids = paper.get("externalIds") or {}
ids = []
if ext_ids.get("DOI"): ids.append(f"DOI: {ext_ids['DOI']}")
if ext_ids.get("ArXiv"): ids.append(f"ArXiv: {ext_ids['ArXiv']}")
if ext_ids.get("PubMed"): ids.append(f"PMID: {ext_ids['PubMed']}")
if ids:
lines.append(f"**IDs:** {', '.join(ids)}")
if paper.get("url"):
lines.append(f"**Link:** [{paper.get('paperId')}]({paper['url']})")
lines.append("")
return "\n".join(lines)
def _format_author_markdown(author: Dict[str, Any]) -> str:
lines = [f"### {author.get('name', 'Unknown')}"]
affiliations = author.get("affiliations") or []
if affiliations:
lines.append(f"**Affiliations:** {', '.join(affiliations[:3])}")
lines.append(f"**h-index:** {author.get('hIndex')} | **Papers:** {author.get('paperCount', 0)} | **Citations:** {author.get('citationCount', 0)}")
if author.get("homepage"):
lines.append(f"**Homepage:** {author['homepage']}")
if author.get("url"):
lines.append(f"**Profile:** [{author.get('authorId')}]({author['url']})")
lines.append("")
return "\n".join(lines)
# ═══════════════════════════════════════════════════════════════════════════════
# MCP TOOLS
# ═══════════════════════════════════════════════════════════════════════════════
@mcp.tool(name="semantic_scholar_search_papers")
async def search_papers(params: PaperSearchInput) -> str:
"""Search for academic papers. Supports boolean operators (AND, OR, NOT), phrase search with quotes."""
logger.info(f"Searching: {params.query}")
api_params = {"query": params.query, "offset": params.offset, "limit": params.limit, "fields": ",".join(PAPER_FIELDS)}
if params.year: api_params["year"] = params.year
if params.fields_of_study: api_params["fieldsOfStudy"] = ",".join(params.fields_of_study)
if params.publication_types: api_params["publicationTypes"] = ",".join(params.publication_types)
if params.open_access_only: api_params["openAccessPdf"] = ""
    if params.min_citation_count is not None: api_params["minCitationCount"] = params.min_citation_count
response = await _make_request("GET", "paper/search", params=api_params, api_key=params.api_key)
total, papers = response.get("total", 0), response.get("data", [])
if params.response_format == ResponseFormat.JSON:
return json.dumps({"query": params.query, "total": total, "papers": papers}, indent=2)
lines = [f"## Search Results: \"{params.query}\"", f"**Found:** {total} papers (showing {params.offset + 1}-{params.offset + len(papers)})", ""]
for paper in papers:
lines.append(_format_paper_markdown(paper))
if total > params.offset + len(papers):
lines.append(f"*Use offset={params.offset + params.limit} to see more results*")
return "\n".join(lines)
@mcp.tool(name="semantic_scholar_get_paper")
async def get_paper_details(params: PaperDetailsInput) -> str:
"""Get paper details. Accepts: S2 ID, DOI:xxx, ARXIV:xxx, PMID:xxx, CorpusId:xxx"""
logger.info(f"Getting paper: {params.paper_id}")
paper = await _make_request("GET", f"paper/{params.paper_id}", params={"fields": ",".join(PAPER_FIELDS)}, api_key=params.api_key)
result = {"paper": paper}
if params.include_citations:
cit = await _make_request("GET", f"paper/{params.paper_id}/citations", params={"fields": ",".join(PAPER_FIELDS), "limit": params.citations_limit}, api_key=params.api_key)
result["citations"] = cit.get("data", [])
if params.include_references:
ref = await _make_request("GET", f"paper/{params.paper_id}/references", params={"fields": ",".join(PAPER_FIELDS), "limit": params.references_limit}, api_key=params.api_key)
result["references"] = ref.get("data", [])
if params.response_format == ResponseFormat.JSON:
return json.dumps(result, indent=2)
lines = ["## Paper Details", "", _format_paper_markdown(paper)]
if result.get("citations"):
lines.extend(["---", f"### Citing Papers ({len(result['citations'])} shown)", ""])
for c in result["citations"]:
p = c.get("citingPaper", {})
if p: lines.append(f"- **{p.get('title', '?')}** ({p.get('year', '')}) - {p.get('citationCount', 0)} citations")
if result.get("references"):
lines.extend(["---", f"### References ({len(result['references'])} shown)", ""])
for r in result["references"]:
p = r.get("citedPaper", {})
if p: lines.append(f"- **{p.get('title', '?')}** ({p.get('year', '')}) - {p.get('citationCount', 0)} citations")
return "\n".join(lines)
@mcp.tool(name="semantic_scholar_search_authors")
async def search_authors(params: AuthorSearchInput) -> str:
"""Search for academic authors by name."""
logger.info(f"Searching authors: {params.query}")
response = await _make_request("GET", "author/search", params={"query": params.query, "offset": params.offset, "limit": params.limit, "fields": ",".join(AUTHOR_FIELDS)}, api_key=params.api_key)
total, authors = response.get("total", 0), response.get("data", [])
if params.response_format == ResponseFormat.JSON:
return json.dumps({"query": params.query, "total": total, "authors": authors}, indent=2)
lines = [f"## Author Search: \"{params.query}\"", f"**Found:** {total} authors", ""]
for author in authors:
lines.append(_format_author_markdown(author))
return "\n".join(lines)
@mcp.tool(name="semantic_scholar_get_author")
async def get_author_details(params: AuthorDetailsInput) -> str:
"""Get author profile with optional publications list."""
logger.info(f"Getting author: {params.author_id}")
author = await _make_request("GET", f"author/{params.author_id}", params={"fields": ",".join(AUTHOR_FIELDS)}, api_key=params.api_key)
result = {"author": author}
if params.include_papers:
papers = await _make_request("GET", f"author/{params.author_id}/papers", params={"fields": ",".join(PAPER_FIELDS), "limit": params.papers_limit}, api_key=params.api_key)
result["papers"] = papers.get("data", [])
if params.response_format == ResponseFormat.JSON:
return json.dumps(result, indent=2)
lines = ["## Author Profile", "", _format_author_markdown(author)]
if result.get("papers"):
lines.extend(["---", f"### Publications ({len(result['papers'])} shown)", ""])
for p in result["papers"]:
lines.append(f"- **{p.get('title', '?')}** ({p.get('year', '')}) - {p.get('citationCount', 0)} citations")
return "\n".join(lines)
@mcp.tool(name="semantic_scholar_recommendations")
async def get_recommendations(params: PaperRecommendationsInput) -> str:
"""Get paper recommendations based on a seed paper."""
logger.info(f"Recommendations for: {params.paper_id}")
    # The recommendations API lives outside the graph/v1 base URL, so it bypasses _make_request.
    async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
        resp = await client.get(
            f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{params.paper_id}",
            params={"fields": ",".join(PAPER_FIELDS), "limit": params.limit},
            headers=_get_headers(params.api_key),
        )
        resp.raise_for_status()
        data = resp.json()
papers = data.get("recommendedPapers", [])
if params.response_format == ResponseFormat.JSON:
return json.dumps({"seed": params.paper_id, "recommendations": papers}, indent=2)
lines = [f"## Recommendations", f"**Seed:** {params.paper_id}", f"**Found:** {len(papers)}", ""]
for paper in papers:
lines.append(_format_paper_markdown(paper))
return "\n".join(lines)
@mcp.tool(name="semantic_scholar_bulk_papers")
async def get_bulk_papers(params: BulkPaperInput) -> str:
"""Retrieve multiple papers in a single request (max 500)."""
logger.info(f"Bulk retrieval: {len(params.paper_ids)} papers")
response = await _make_request("POST", "paper/batch", params={"fields": ",".join(PAPER_FIELDS)}, json_body={"ids": params.paper_ids}, api_key=params.api_key)
papers = response if isinstance(response, list) else response.get("data", [])
if params.response_format == ResponseFormat.JSON:
return json.dumps({"requested": len(params.paper_ids), "retrieved": len(papers), "papers": papers}, indent=2)
lines = [f"## Bulk Retrieval", f"**Requested:** {len(params.paper_ids)} | **Retrieved:** {len(papers)}", ""]
for paper in papers:
if paper: lines.append(_format_paper_markdown(paper))
return "\n".join(lines)
# ═══════════════════════════════════════════════════════════════════════════════
# ENTRY POINT
# ═══════════════════════════════════════════════════════════════════════════════
def main():
"""Run the MCP server."""
if not SEMANTIC_SCHOLAR_API_KEY:
logger.warning("SEMANTIC_SCHOLAR_API_KEY not set. You can provide api_key per-request or use rate-limited public access (1 req/sec).")
mcp.run()
if __name__ == "__main__":
main()
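# Note: mcp.run() uses FastMCP's default transport (stdio), so running this module
# directly starts a stdio server; in normal use an MCP client such as Claude Desktop
# launches the process and manages its lifetime.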