google_scholar.py (8.98 kB)
# all_in_mcp/academic_platforms/google_scholar.py
import logging
import random
import time
from datetime import datetime
from typing import Optional

import requests
from bs4 import BeautifulSoup

from ..models.paper import Paper
from .base import PaperSource

logger = logging.getLogger(__name__)


class GoogleScholarSearcher(PaperSource):
    """Google Scholar paper search implementation"""

    SCHOLAR_URL = "https://scholar.google.com/scholar"
    BROWSERS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    ]

    def __init__(self):
        """Initialize Google Scholar searcher"""
        self._setup_session()

    def _setup_session(self):
        """Initialize session with random user agent"""
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": random.choice(self.BROWSERS),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept-Encoding": "gzip, deflate",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }
        )

    def _extract_year(self, text: str) -> Optional[int]:
        """Extract publication year from text"""
        words = text.replace(",", " ").replace("-", " ").split()
        for word in words:
            if word.isdigit() and 1900 <= int(word) <= datetime.now().year:
                return int(word)
        return None

    def _extract_citations(self, item) -> int:
        """Extract citation count from paper item"""
        try:
            citation_elem = item.find("div", class_="gs_fl")
            if citation_elem:
                citation_link = citation_elem.find(
                    "a", string=lambda text: text and "Cited by" in text
                )
                if citation_link:
                    citation_text = citation_link.get_text()
                    # Extract number from "Cited by X" text
                    citation_num = "".join(filter(str.isdigit, citation_text))
                    return int(citation_num) if citation_num else 0
            return 0
        except Exception:
            return 0

    def _parse_paper(self, item) -> Optional[Paper]:
        """Parse a single paper entry from HTML"""
        try:
            # Extract main paper elements
            title_elem = item.find("h3", class_="gs_rt")
            info_elem = item.find("div", class_="gs_a")
            abstract_elem = item.find("div", class_="gs_rs")

            if not title_elem or not info_elem:
                return None

            # Process title and URL
            title_text = title_elem.get_text(strip=True)
            # Remove common prefixes
            title = (
                title_text.replace("[PDF]", "")
                .replace("[HTML]", "")
                .replace("[BOOK]", "")
                .strip()
            )

            link = title_elem.find("a", href=True)
            url = link["href"] if link else ""

            # Process author and publication info
            info_text = info_elem.get_text()
            info_parts = info_text.split(" - ")

            # Extract authors (usually the first part before the first dash)
            authors_text = info_parts[0] if info_parts else ""
            authors = [a.strip() for a in authors_text.split(",") if a.strip()]

            # Extract year from the info text
            year = self._extract_year(info_text)

            # Extract abstract
            abstract = abstract_elem.get_text(strip=True) if abstract_elem else ""

            # Extract citations
            citations = self._extract_citations(item)

            # Generate a paper ID based on the URL or title
            paper_id = f"gs_{abs(hash(url if url else title))}"

            # Create paper object
            return Paper(
                paper_id=paper_id,
                title=title,
                authors=authors,
                abstract=abstract,
                url=url,
                pdf_url="",  # Google Scholar doesn't provide direct PDF links
                published_date=datetime(year, 1, 1) if year else datetime.now(),
                updated_date=None,
                source="google_scholar",
                categories=[],
                keywords=[],
                doi="",
                citations=citations,
                references=[],
                extra={"info_text": info_text},
            )
        except Exception as e:
            logger.warning(f"Failed to parse paper: {e}")
            return None

    def search(self, query: str, max_results: int = 10, **kwargs) -> list[Paper]:
        """
        Search Google Scholar for papers

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            **kwargs: Additional search parameters (e.g., year_low, year_high)

        Returns:
            List of Paper objects
        """
        papers = []
        start = 0
        results_per_page = min(10, max_results)

        # Extract additional parameters
        year_low = kwargs.get("year_low")
        year_high = kwargs.get("year_high")

        while len(papers) < max_results:
            try:
                # Construct search parameters
                params = {
                    "q": query,
                    "start": start,
                    "hl": "en",
                    "as_sdt": "0,5",  # Include articles and citations
                    "num": results_per_page,
                }

                # Add year filters if provided
                if year_low:
                    params["as_ylo"] = year_low
                if year_high:
                    params["as_yhi"] = year_high

                # Make request with random delay to avoid rate limiting
                time.sleep(random.uniform(1.0, 3.0))
                response = self.session.get(self.SCHOLAR_URL, params=params, timeout=30)

                if response.status_code != 200:
                    logger.error(f"Search failed with status {response.status_code}")
                    break

                # Parse results
                soup = BeautifulSoup(response.text, "html.parser")
                results = soup.find_all("div", class_="gs_ri")

                if not results:
                    logger.info("No more results found")
                    break

                # Process each result
                for item in results:
                    if len(papers) >= max_results:
                        break
                    paper = self._parse_paper(item)
                    if paper:
                        papers.append(paper)

                start += results_per_page

            except requests.exceptions.RequestException as e:
                logger.error(f"Network error during search: {e}")
                break
            except Exception as e:
                logger.error(f"Search error: {e}")
                break

        return papers[:max_results]

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Google Scholar doesn't support direct PDF downloads

        Args:
            paper_id: Paper identifier
            save_path: Directory to save the PDF

        Raises:
            NotImplementedError: Always raises this error
        """
        raise NotImplementedError(
            "Google Scholar doesn't provide direct PDF downloads. "
            "Please use the paper URL to access the publisher's website."
        )

    def read_paper(
        self,
        paper_id: str,
        save_path: str = "./downloads",
        start_page: int | None = None,
        end_page: int | None = None,
    ) -> str:
        """
        Google Scholar doesn't support direct paper reading

        Args:
            paper_id: Paper identifier
            save_path: Directory where papers are stored
            start_page: Starting page number (1-indexed, inclusive). Defaults to 1.
            end_page: Ending page number (1-indexed, inclusive). Defaults to last page.

        Returns:
            Message indicating the feature is not supported
        """
        return (
            "Google Scholar doesn't support direct paper reading. "
            "Please use the paper URL to access the full text on the publisher's website."
        )
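
A minimal usage sketch (not part of the original file): it assumes the all_in_mcp package above is installed and importable, and that Paper exposes the constructor fields seen in _parse_paper (title, published_date, citations, url) as attributes. Google Scholar may still rate-limit or CAPTCHA-block scrapers despite the randomized delays, so treat this as illustrative rather than production-ready.

# usage_sketch.py -- hypothetical example, not part of the repository
from all_in_mcp.academic_platforms.google_scholar import GoogleScholarSearcher

searcher = GoogleScholarSearcher()

# year_low/year_high are forwarded to Scholar's as_ylo/as_yhi query parameters
papers = searcher.search(
    "retrieval augmented generation",
    max_results=5,
    year_low=2020,
    year_high=2024,
)

for paper in papers:
    # Attribute names mirror the keyword arguments passed to Paper(...) above
    print(f"{paper.title} ({paper.published_date.year}) - cited by {paper.citations}")
    print(f"  {paper.url}")

Note that download_pdf and read_paper are intentionally unsupported for this source; callers should follow paper.url to the publisher's site instead.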
