Paper Search MCP

by openags
google_scholar.py (6.32 kB)
from typing import List, Optional
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
import random
from ..paper import Paper
import logging

logger = logging.getLogger(__name__)


class PaperSource:
    """Abstract base class for paper sources"""

    def search(self, query: str, **kwargs) -> List[Paper]:
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError


class GoogleScholarSearcher(PaperSource):
    """Custom implementation of Google Scholar paper search"""

    SCHOLAR_URL = "https://scholar.google.com/scholar"
    BROWSERS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
    ]

    def __init__(self):
        self._setup_session()

    def _setup_session(self):
        """Initialize session with random user agent"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': random.choice(self.BROWSERS),
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9'
        })

    def _extract_year(self, text: str) -> Optional[int]:
        """Extract year from publication info"""
        for word in text.split():
            if word.isdigit() and 1900 <= int(word) <= datetime.now().year:
                return int(word)
        return None

    def _parse_paper(self, item) -> Optional[Paper]:
        """Parse single paper entry from HTML"""
        try:
            # Extract main paper elements
            title_elem = item.find('h3', class_='gs_rt')
            info_elem = item.find('div', class_='gs_a')
            abstract_elem = item.find('div', class_='gs_rs')

            if not title_elem or not info_elem:
                return None

            # Process title and URL
            title = title_elem.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '')
            link = title_elem.find('a', href=True)
            url = link['href'] if link else ''

            # Process author info
            info_text = info_elem.get_text()
            authors = [a.strip() for a in info_text.split('-')[0].split(',')]
            year = self._extract_year(info_text)

            # Create paper object
            return Paper(
                paper_id=f"gs_{hash(url)}",
                title=title,
                authors=authors,
                abstract=abstract_elem.get_text() if abstract_elem else "",
                url=url,
                pdf_url="",
                published_date=datetime(year, 1, 1) if year else None,
                updated_date=None,
                source="google_scholar",
                categories=[],
                keywords=[],
                doi="",
                citations=0
            )
        except Exception as e:
            logger.warning(f"Failed to parse paper: {e}")
            return None

    def search(self, query: str, max_results: int = 10) -> List[Paper]:
        """Search Google Scholar with custom parameters"""
        papers = []
        start = 0
        results_per_page = min(10, max_results)

        while len(papers) < max_results:
            try:
                # Construct search parameters
                params = {
                    'q': query,
                    'start': start,
                    'hl': 'en',
                    'as_sdt': '0,5'  # Include articles and citations
                }

                # Make request with random delay
                time.sleep(random.uniform(1.0, 3.0))
                response = self.session.get(self.SCHOLAR_URL, params=params)

                if response.status_code != 200:
                    logger.error(f"Search failed with status {response.status_code}")
                    break

                # Parse results
                soup = BeautifulSoup(response.text, 'html.parser')
                results = soup.find_all('div', class_='gs_ri')

                if not results:
                    break

                # Process each result
                for item in results:
                    if len(papers) >= max_results:
                        break
                    paper = self._parse_paper(item)
                    if paper:
                        papers.append(paper)

                start += results_per_page

            except Exception as e:
                logger.error(f"Search error: {e}")
                break

        return papers[:max_results]

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Google Scholar doesn't support direct PDF downloads

        Raises:
            NotImplementedError: Always raises this error
        """
        raise NotImplementedError(
            "Google Scholar doesn't provide direct PDF downloads. "
            "Please use the paper URL to access the publisher's website."
        )

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """
        Google Scholar doesn't support direct paper reading

        Returns:
            str: Message indicating the feature is not supported
        """
        return (
            "Google Scholar doesn't support direct paper reading. "
            "Please use the paper URL to access the full text on the publisher's website."
        )


if __name__ == "__main__":
    # Test Google Scholar searcher
    searcher = GoogleScholarSearcher()

    print("Testing search functionality...")
    query = "machine learning"
    max_results = 5

    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"\nFound {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"\n{i}. {paper.title}")
            print(f"   Authors: {', '.join(paper.authors)}")
            print(f"   Citations: {paper.citations}")
            print(f"   URL: {paper.url}")
    except Exception as e:
        print(f"Error during search: {e}")
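A minimal usage sketch (not part of the repository): the import path below assumes the file lives under a paper_search_mcp.academic_platforms package, which is implied by the relative "from ..paper import Paper" import but not confirmed here. That relative import also means the __main__ block above only runs when the file is executed as a module inside its package (python -m ...), not as a standalone script.

# Hedged usage sketch; the package path is an assumption, adjust it to the
# repository's actual layout.
from paper_search_mcp.academic_platforms.google_scholar import GoogleScholarSearcher

searcher = GoogleScholarSearcher()

# Keep max_results small: Scholar rate-limits scrapers aggressively, and a
# CAPTCHA page or non-200 response makes search() return whatever it has
# collected so far (possibly an empty list).
papers = searcher.search("transformer language models", max_results=5)
for paper in papers:
    print(paper.title, paper.url)

# PDF download is deliberately unsupported for this source.
if papers:
    try:
        searcher.download_pdf(papers[0].paper_id, "./downloads")
    except NotImplementedError as exc:
        print(exc)

One caveat on the IDs this source produces: paper_id is built from Python's built-in hash(), and string hashing is randomized per interpreter process, so the same URL yields a different ID on each run. A stable digest (for example hashlib.md5 over the URL) would make IDs reproducible across runs.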
