from typing import List
import requests
import os
from datetime import datetime, timedelta
from ..paper import Paper
from PyPDF2 import PdfReader


class PaperSource:
    """Abstract base class for paper sources."""

    def search(self, query: str, **kwargs) -> List[Paper]:
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError


class BioRxivSearcher(PaperSource):
    """Searcher for bioRxiv papers."""

    BASE_URL = "https://api.biorxiv.org/details/biorxiv"

    def __init__(self):
        self.session = requests.Session()
        # Explicitly bypass any proxies configured in the environment.
        self.session.proxies = {'http': None, 'https': None}
        self.timeout = 30     # seconds per request
        self.max_retries = 3  # attempts per request

    def search(self, query: str, max_results: int = 10, days: int = 30) -> List[Paper]:
        """
        Search for papers on bioRxiv by category within the last N days.

        Args:
            query: Category name to search for (e.g., "cell biology").
            max_results: Maximum number of papers to return.
            days: Number of days to look back for papers.

        Returns:
            List of Paper objects matching the category within the specified date range.
        """
        # Date range: from N days ago up to today.
        end_date = datetime.now().strftime('%Y-%m-%d')
        start_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
        # bioRxiv category names are lowercase with underscores (e.g., "cell_biology").
        category = query.lower().replace(' ', '_')
        papers = []
        cursor = 0
        while len(papers) < max_results:
            url = f"{self.BASE_URL}/{start_date}/{end_date}/{cursor}"
            if category:
                url += f"?category={category}"
            # Fetch one page of results, retrying on transient network errors.
            data = None
            for attempt in range(1, self.max_retries + 1):
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    response.raise_for_status()
                    data = response.json()
                    break  # Success; stop retrying.
                except requests.exceptions.RequestException as e:
                    if attempt == self.max_retries:
                        print(f"Failed to connect to bioRxiv API after {self.max_retries} attempts: {e}")
                    else:
                        print(f"Attempt {attempt} failed, retrying...")
            if data is None:
                break  # All retries failed; return whatever has been collected.
            collection = data.get('collection', [])
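            # Rough shape of a 'collection' entry, abridged to the fields
            # consumed below (illustrative values, not an exhaustive schema):
            #   {"doi": "10.1101/2024.01.01.573333", "title": "...",
            #    "authors": "Doe, J.; Smith, A.", "abstract": "...",
            #    "date": "2024-01-15", "version": "2", "category": "cell_biology"}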
            for item in collection:
                try:
                    date = datetime.strptime(item['date'], '%Y-%m-%d')
                    papers.append(Paper(
                        paper_id=item['doi'],
                        title=item['title'],
                        authors=item['authors'].split('; '),
                        abstract=item['abstract'],
                        url=f"https://www.biorxiv.org/content/{item['doi']}v{item.get('version', '1')}",
                        pdf_url=f"https://www.biorxiv.org/content/{item['doi']}v{item.get('version', '1')}.full.pdf",
                        published_date=date,
                        updated_date=date,
                        source="biorxiv",
                        categories=[item['category']],
                        keywords=[],
                        doi=item['doi']
                    ))
                except Exception as e:
                    print(f"Error parsing bioRxiv entry: {e}")
            if len(collection) < 100:
                break  # The API returns pages of 100; a short page means no more results.
            cursor += 100  # Advance to the next page.
        return papers[:max_results]

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Download a PDF for a given paper ID from bioRxiv.

        Args:
            paper_id: The DOI of the paper.
            save_path: Directory to save the PDF.

        Returns:
            Path to the downloaded PDF file.
        """
        if not paper_id:
            raise ValueError("Invalid paper_id: paper_id is empty")
        # Note: this always requests version 1 of the preprint.
        pdf_url = f"https://www.biorxiv.org/content/{paper_id}v1.full.pdf"
        tries = 0
        while tries < self.max_retries:
            try:
                # Send a browser-like User-Agent to avoid potential 403 errors.
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                response = self.session.get(pdf_url, timeout=self.timeout, headers=headers)
                response.raise_for_status()
                os.makedirs(save_path, exist_ok=True)
                output_file = os.path.join(save_path, f"{paper_id.replace('/', '_')}.pdf")
                with open(output_file, 'wb') as f:
                    f.write(response.content)
                return output_file
            except requests.exceptions.RequestException as e:
                tries += 1
                if tries == self.max_retries:
                    raise Exception(f"Failed to download PDF after {self.max_retries} attempts: {e}")
                print(f"Attempt {tries} failed, retrying...")

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """
        Read a paper and convert it to plain text.

        Args:
            paper_id: bioRxiv DOI.
            save_path: Directory where the PDF is/will be saved.

        Returns:
            str: The extracted text content of the paper.
        """
pdf_path = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
if not os.path.exists(pdf_path):
pdf_path = self.download_pdf(paper_id, save_path)
try:
reader = PdfReader(pdf_path)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text.strip()
except Exception as e:
print(f"Error reading PDF for paper {paper_id}: {e}")
return ""