from typing import List, Optional
from datetime import datetime
from bs4 import BeautifulSoup
import hashlib
import requests
import time
import random
from loguru import logger
from ..types import Paper, PaperSource

class GoogleScholarSearcher(PaperSource):
    """Custom implementation of Google Scholar paper search."""

    SCHOLAR_URL = "https://scholar.google.com/scholar"

    BROWSERS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    ]
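
    # Google Scholar has no official search API, so this source scrapes the
    # HTML results page. Rotating a plausible User-Agent lowers (but does not
    # eliminate) the chance of being served a CAPTCHA instead of results.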
    def __init__(self):
        self._setup_session()

    def _setup_session(self):
        """Initialize session with random user agent"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': random.choice(self.BROWSERS),
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9'
        })
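
    # The 'gs_a' byline usually reads "Authors - Venue, Year - Domain", so a
    # plausible four-digit year can be picked out of the raw text.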
    def _extract_year(self, text: str) -> Optional[int]:
        """Extract year from publication info"""
        for word in text.split():
            # Tolerate trailing punctuation such as "2020," or "(2020)"
            word = word.strip('.,()')
            if word.isdigit() and 1900 <= int(word) <= datetime.now().year:
                return int(word)
        return None
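
    # Each result sits in a 'gs_ri' block: 'gs_rt' holds the title link,
    # 'gs_a' the author/venue byline, and 'gs_rs' the abstract snippet.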
    def _parse_paper(self, item) -> Optional[Paper]:
        """Parse single paper entry from HTML"""
        try:
            # Extract main paper elements
            title_elem = item.find('h3', class_='gs_rt')
            info_elem = item.find('div', class_='gs_a')
            abstract_elem = item.find('div', class_='gs_rs')
            if not title_elem or not info_elem:
                return None

            # Process title and URL
            title = title_elem.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '')
            link = title_elem.find('a', href=True)
            url = link['href'] if link else ''

            # Process author info
            info_text = info_elem.get_text()
            authors = [a.strip() for a in info_text.split('-')[0].split(',')]
            year = self._extract_year(info_text)

            # Hash the URL deterministically so the same result maps to the
            # same ID across runs (built-in hash() is salted per process)
            url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:16]
            return Paper(
                paper_id=f"gs_{url_hash}",
                title=title,
                authors=authors,
                abstract=abstract_elem.get_text(strip=True) if abstract_elem else "",
                url=url,
                pdf_url="",
                published_date=datetime(year, 1, 1) if year else None,
                updated_date=None,
                source="google_scholar",
                categories=[],
                keywords=[],
                doi="",
                citations=0,  # citation counts are not parsed from the snippet
            )
        except Exception as e:
            logger.warning(f"Failed to parse paper: {e}")
            return None
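
    # Scholar serves roughly 10 results per page; paging is driven by the
    # 'start' query parameter, and a randomized delay keeps the crawl polite.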
    def search(self, query: str, max_results: int = 10) -> List[Paper]:
        """Search Google Scholar, paging through results up to max_results."""
        papers = []
        start = 0
        while len(papers) < max_results:
            try:
                # Construct search parameters
                params = {
                    'q': query,
                    'start': start,
                    'hl': 'en',
                    'as_sdt': '0,5'  # Include articles and citations
                }

                # Make request with a random delay to avoid rate limiting
                time.sleep(random.uniform(1.0, 3.0))
                response = self.session.get(self.SCHOLAR_URL, params=params)
                if response.status_code != 200:
                    logger.error(f"Search failed with status {response.status_code}")
                    break

                # Parse results
                soup = BeautifulSoup(response.text, 'html.parser')
                results = soup.find_all('div', class_='gs_ri')
                if not results:
                    break

                # Process each result
                for item in results:
                    if len(papers) >= max_results:
                        break
                    paper = self._parse_paper(item)
                    if paper:
                        papers.append(paper)

                # Advance by the number of results actually returned rather
                # than a fixed page size, so a short page does not skip entries
                start += len(results)
            except Exception as e:
                logger.error(f"Search error: {e}")
                break

        return papers[:max_results]
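
    # PaperSource also defines download/read hooks, but Scholar only links
    # out to publisher pages, so both are explicitly unsupported here.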
    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Google Scholar doesn't support direct PDF downloads.

        Raises:
            NotImplementedError: Always raises this error
        """
        raise NotImplementedError(
            "Google Scholar doesn't provide direct PDF downloads. "
            "Please use the paper URL to access the publisher's website."
        )

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """
        Google Scholar doesn't support direct paper reading.

        Returns:
            str: Message indicating the feature is not supported
        """
        return (
            "Google Scholar doesn't support direct paper reading. "
            "Please use the paper URL to access the full text on the publisher's website."
        )

if __name__ == "__main__":
    # Test Google Scholar searcher
    searcher = GoogleScholarSearcher()
    print("Testing search functionality...")
    query = "machine learning"
    max_results = 5
    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"\nFound {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"\n{i}. {paper.title}")
            print(f"   Authors: {', '.join(paper.authors)}")
            print(f"   Citations: {paper.citations}")
            print(f"   URL: {paper.url}")
    except Exception as e:
        print(f"Error during search: {e}")