biorxiv.py (6.33 kB)
from typing import List
import requests
import os
from datetime import datetime, timedelta
from ..paper import Paper
from PyPDF2 import PdfReader


class PaperSource:
    """Abstract base class for paper sources"""

    def search(self, query: str, **kwargs) -> List[Paper]:
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError


class BioRxivSearcher(PaperSource):
    """Searcher for bioRxiv papers"""

    BASE_URL = "https://api.biorxiv.org/details/biorxiv"

    def __init__(self):
        self.session = requests.Session()
        # Disable any inherited proxy settings so requests go directly to the API
        self.session.proxies = {'http': None, 'https': None}
        self.timeout = 30
        self.max_retries = 3

    def search(self, query: str, max_results: int = 10, days: int = 30) -> List[Paper]:
        """
        Search for papers on bioRxiv by category within the last N days.

        Args:
            query: Category name to search for (e.g., "cell biology").
            max_results: Maximum number of papers to return.
            days: Number of days to look back for papers.

        Returns:
            List of Paper objects matching the category within the specified date range.
        """
        # Calculate date range: last N days
        end_date = datetime.now().strftime('%Y-%m-%d')
        start_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')

        # Format category: lowercase and replace spaces with underscores
        category = query.lower().replace(' ', '_')

        papers = []
        cursor = 0
        while len(papers) < max_results:
            url = f"{self.BASE_URL}/{start_date}/{end_date}/{cursor}"
            if category:
                url += f"?category={category}"

            tries = 0
            exhausted = False  # set on the last page of results or a hard failure
            while tries < self.max_retries:
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    response.raise_for_status()
                    data = response.json()
                    collection = data.get('collection', [])
                    for item in collection:
                        try:
                            date = datetime.strptime(item['date'], '%Y-%m-%d')
                            papers.append(Paper(
                                paper_id=item['doi'],
                                title=item['title'],
                                authors=item['authors'].split('; '),
                                abstract=item['abstract'],
                                url=f"https://www.biorxiv.org/content/{item['doi']}v{item.get('version', '1')}",
                                pdf_url=f"https://www.biorxiv.org/content/{item['doi']}v{item.get('version', '1')}.full.pdf",
                                published_date=date,
                                updated_date=date,
                                source="biorxiv",
                                categories=[item['category']],
                                keywords=[],
                                doi=item['doi']
                            ))
                        except Exception as e:
                            print(f"Error parsing bioRxiv entry: {e}")
                    if len(collection) < 100:
                        exhausted = True  # No more results: the API pages in blocks of 100
                    cursor += 100
                    break  # Exit retry loop on success
                except requests.exceptions.RequestException as e:
                    tries += 1
                    if tries == self.max_retries:
                        print(f"Failed to connect to bioRxiv API after {self.max_retries} attempts: {e}")
                        exhausted = True
                        break
                    print(f"Attempt {tries} failed, retrying...")
            if exhausted:
                break

        return papers[:max_results]

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Download a PDF for a given paper ID from bioRxiv.

        Args:
            paper_id: The DOI of the paper.
            save_path: Directory to save the PDF.

        Returns:
            Path to the downloaded PDF file.
        """
        if not paper_id:
            raise ValueError("Invalid paper_id: paper_id is empty")

        pdf_url = f"https://www.biorxiv.org/content/{paper_id}v1.full.pdf"
        tries = 0
        while tries < self.max_retries:
            try:
                # Add User-Agent to avoid potential 403 errors
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                response = self.session.get(pdf_url, timeout=self.timeout, headers=headers)
                response.raise_for_status()

                os.makedirs(save_path, exist_ok=True)
                output_file = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
                with open(output_file, 'wb') as f:
                    f.write(response.content)
                return output_file
            except requests.exceptions.RequestException as e:
                tries += 1
                if tries == self.max_retries:
                    raise Exception(f"Failed to download PDF after {self.max_retries} attempts: {e}")
                print(f"Attempt {tries} failed, retrying...")

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """
        Read a paper and convert it to text format.

        Args:
            paper_id: bioRxiv DOI
            save_path: Directory where the PDF is/will be saved

        Returns:
            str: The extracted text content of the paper
        """
        pdf_path = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
        if not os.path.exists(pdf_path):
            pdf_path = self.download_pdf(paper_id, save_path)

        try:
            reader = PdfReader(pdf_path)
            text = ""
            for page in reader.pages:
                # extract_text() may return None for image-only pages
                text += (page.extract_text() or "") + "\n"
            return text.strip()
        except Exception as e:
            print(f"Error reading PDF for paper {paper_id}: {e}")
            return ""

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/TitanSneaker/paper-search-mcp-openai'
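
The same call from Python, as a minimal sketch (the shape of the returned JSON is not documented here, so it is simply printed):

    import requests

    # Equivalent of the curl command above
    resp = requests.get("https://glama.ai/api/mcp/v1/servers/TitanSneaker/paper-search-mcp-openai")
    resp.raise_for_status()
    print(resp.json())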

If you have feedback or need assistance with the MCP directory API, please join our Discord server.