apaper_search_google_scholar_papers

Search academic papers from Google Scholar using specific queries, result limits, and publication year filters to support research needs.

Instructions

Search academic papers from Google Scholar

Args:
- query: Search query string (e.g., 'machine learning', 'neural networks')
- max_results: Maximum number of papers to return (default: 10)
- year_low: Minimum publication year (optional)
- year_high: Maximum publication year (optional)

Input Schema

Name         Required  Description                                                         Default
query        Yes       Search query string (e.g., 'machine learning', 'neural networks')  -
max_results  No        Maximum number of papers to return                                  10
year_low     No        Minimum publication year (optional)                                 -
year_high    No        Maximum publication year (optional)                                 -
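For illustration, a hypothetical MCP tools/call request for this tool might look like the following. The JSON-RPC envelope and id are assumptions based on the standard MCP wire format, not taken from this server's documentation.

    # Hypothetical tools/call payload (standard MCP JSON-RPC shape; the id
    # and argument values are illustrative).
    request = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {
            "name": "apaper_search_google_scholar_papers",
            "arguments": {
                "query": "neural networks",
                "max_results": 5,
                "year_low": 2020,   # optional
                "year_high": 2024,  # optional
            },
        },
    }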

Implementation Reference

  • The MCP tool handler 'search_google_scholar_papers'. It validates parameters, calls the underlying searcher, formats the results, and returns the response string. Note: it is likely registered as 'apaper_search_google_scholar_papers' because the FastMCP server is named 'apaper'.
    @mcp.tool()
    def search_google_scholar_papers(
        query: str,
        max_results: int = 10,
        year_low: int | str | None = None,
        year_high: int | str | None = None,
    ) -> str:
        """
        Search academic papers from Google Scholar
        
        Args:
            query: Search query string (e.g., 'machine learning', 'neural networks')
            max_results: Maximum number of papers to return (default: 10)
            year_low: Minimum publication year (optional)
            year_high: Maximum publication year (optional)
        """
        try:
            # Convert string parameters to integers if needed
            year_low_int = None
            year_high_int = None
            
            if year_low is not None:
                year_low_int = int(year_low)
            
            if year_high is not None:
                year_high_int = int(year_high)
            
            papers = google_scholar_searcher.search(
                query,
                max_results=max_results,
                year_low=year_low_int,
                year_high=year_high_int,
            )
    
            year_filter_msg = ""
            if year_low or year_high:
                year_range = f" ({year_low or 'earliest'}-{year_high or 'latest'})"
                year_filter_msg = f" in year range{year_range}"
    
            if not papers:
                return f"No papers found for query: {query}{year_filter_msg}"
    
            result_text = f"Found {len(papers)} Google Scholar papers for query '{query}'{year_filter_msg}:\n\n"
            for i, paper in enumerate(papers, 1):
                result_text += f"{i}. **{paper.title}**\n"
                result_text += f"   - Authors: {', '.join(paper.authors)}\n"
                if paper.citations > 0:
                    result_text += f"   - Citations: {paper.citations}\n"
                if paper.published_date and paper.published_date.year > 1900:
                    result_text += f"   - Year: {paper.published_date.year}\n"
                if paper.url:
                    result_text += f"   - URL: {paper.url}\n"
                if paper.abstract:
                    # Truncate abstract for readability
                    abstract_preview = (
                        paper.abstract[:300] + "..."
                        if len(paper.abstract) > 300
                        else paper.abstract
                    )
                    result_text += f"   - Abstract: {abstract_preview}\n"
                result_text += "\n"
    
            return result_text
        except ValueError:
            return "Error: Invalid year format. Please provide valid integers for year_low and year_high."
        except Exception as e:
            return f"Error searching Google Scholar: {str(e)}"
  • Instantiation of the GoogleScholarSearcher helper class used by the handler.
    google_scholar_searcher = GoogleScholarSearcher()
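As a usage sketch (names as defined above; performs a live request to Google Scholar), the year filters are forwarded to the search method through **kwargs:

    # Illustrative direct use of the shared searcher instance.
    papers = google_scholar_searcher.search(
        "reinforcement learning", max_results=5, year_low=2020, year_high=2023
    )
    for p in papers:
        print(p.published_date.year, p.title, f"(cited by {p.citations})")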
  • FastMCP server initialization with name 'apaper', which likely prefixes tool names like 'apaper_search_google_scholar_papers'.
    mcp = FastMCP("apaper")
  • The GoogleScholarSearcher class, which provides the scraping session setup, parsing helpers, and the core search implementation: web scraping with requests and BeautifulSoup, parsing results into Paper objects.
    class GoogleScholarSearcher(PaperSource):
        """Google Scholar paper search implementation"""
    
        SCHOLAR_URL = "https://scholar.google.com/scholar"
        BROWSERS = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
        ]
    
        def __init__(self):
            """Initialize Google Scholar searcher"""
            self._setup_session()
    
        def _setup_session(self):
            """Initialize session with random user agent"""
            self.session = requests.Session()
            self.session.headers.update(
                {
                    "User-Agent": random.choice(self.BROWSERS),
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate",
                    "DNT": "1",
                    "Connection": "keep-alive",
                    "Upgrade-Insecure-Requests": "1",
                }
            )
    
        def _extract_year(self, text: str) -> Optional[int]:
            """Extract publication year from text"""
            words = text.replace(",", " ").replace("-", " ").split()
            for word in words:
                if word.isdigit() and 1900 <= int(word) <= datetime.now().year:
                    return int(word)
            return None
    
        def _extract_citations(self, item) -> int:
            """Extract citation count from paper item"""
            try:
                citation_elem = item.find("div", class_="gs_fl")
                if citation_elem:
                    citation_link = citation_elem.find(
                        "a", string=lambda text: text and "Cited by" in text
                    )
                    if citation_link:
                        citation_text = citation_link.get_text()
                        # Extract number from "Cited by X" text
                        citation_num = "".join(filter(str.isdigit, citation_text))
                        return int(citation_num) if citation_num else 0
                return 0
            except Exception:
                return 0
    
        def _parse_paper(self, item) -> Optional[Paper]:
            """Parse a single paper entry from HTML"""
            try:
                # Extract main paper elements
                title_elem = item.find("h3", class_="gs_rt")
                info_elem = item.find("div", class_="gs_a")
                abstract_elem = item.find("div", class_="gs_rs")
    
                if not title_elem or not info_elem:
                    return None
    
                # Process title and URL
                title_text = title_elem.get_text(strip=True)
                # Remove common prefixes
                title = (
                    title_text.replace("[PDF]", "")
                    .replace("[HTML]", "")
                    .replace("[BOOK]", "")
                    .strip()
                )
    
                link = title_elem.find("a", href=True)
                url = link["href"] if link else ""
    
                # Process author and publication info
                info_text = info_elem.get_text()
                info_parts = info_text.split(" - ")
    
                # Extract authors (usually the first part before the first dash)
                authors_text = info_parts[0] if info_parts else ""
                authors = [a.strip() for a in authors_text.split(",") if a.strip()]
    
                # Extract year from the info text
                year = self._extract_year(info_text)
    
                # Extract abstract
                abstract = abstract_elem.get_text(strip=True) if abstract_elem else ""
    
                # Extract citations
                citations = self._extract_citations(item)
    
                # Generate a paper ID based on the URL or title
                paper_id = f"gs_{abs(hash(url if url else title))}"
    
                # Create paper object
                return Paper(
                    paper_id=paper_id,
                    title=title,
                    authors=authors,
                    abstract=abstract,
                    url=url,
                    pdf_url="",  # Google Scholar doesn't provide direct PDF links
                    published_date=datetime(year, 1, 1) if year else datetime.now(),
                    updated_date=None,
                    source="google_scholar",
                    categories=[],
                    keywords=[],
                    doi="",
                    citations=citations,
                    references=[],
                    extra={"info_text": info_text},
                )
            except Exception as e:
                logger.warning(f"Failed to parse paper: {e}")
                return None
    
        def search(self, query: str, max_results: int = 10, **kwargs) -> list[Paper]:
            """
            Search Google Scholar for papers
    
            Args:
                query: Search query string
                max_results: Maximum number of results to return
                **kwargs: Additional search parameters (e.g., year_low, year_high)
    
            Returns:
                List of Paper objects
            """
            papers = []
            start = 0
            results_per_page = min(10, max_results)
    
            # Extract additional parameters
            year_low = kwargs.get("year_low")
            year_high = kwargs.get("year_high")
    
            while len(papers) < max_results:
                try:
                    # Construct search parameters
                    params = {
                        "q": query,
                        "start": start,
                        "hl": "en",
                        "as_sdt": "0,5",  # Include articles and citations
                        "num": results_per_page,
                    }
    
                    # Add year filters if provided
                    if year_low:
                        params["as_ylo"] = year_low
                    if year_high:
                        params["as_yhi"] = year_high
    
                    # Make request with random delay to avoid rate limiting
                    time.sleep(random.uniform(1.0, 3.0))
                    response = self.session.get(self.SCHOLAR_URL, params=params, timeout=30)
    
                    if response.status_code != 200:
                        logger.error(f"Search failed with status {response.status_code}")
                        break
    
                    # Parse results
                    soup = BeautifulSoup(response.text, "html.parser")
                    results = soup.find_all("div", class_="gs_ri")
    
                    if not results:
                        logger.info("No more results found")
                        break
    
                    # Process each result
                    for item in results:
                        if len(papers) >= max_results:
                            break
    
                        paper = self._parse_paper(item)
                        if paper:
                            papers.append(paper)
    
                    start += results_per_page
    
                except requests.exceptions.RequestException as e:
                    logger.error(f"Network error during search: {e}")
                    break
                except Exception as e:
                    logger.error(f"Search error: {e}")
                    break
    
            return papers[:max_results]
    
        def download_pdf(self, paper_id: str, save_path: str) -> str:
            """
            Google Scholar doesn't support direct PDF downloads
    
            Args:
                paper_id: Paper identifier
                save_path: Directory to save the PDF
    
            Raises:
                NotImplementedError: Always, since Google Scholar doesn't expose direct PDF links
            """
            raise NotImplementedError(
                "Google Scholar doesn't provide direct PDF downloads. "
                "Please use the paper URL to access the publisher's website."
            )
    
        def read_paper(
            self,
            paper_id: str,
            save_path: str = "./downloads",
            start_page: int | None = None,
            end_page: int | None = None,
        ) -> str:
            """
            Google Scholar doesn't support direct paper reading
    
            Args:
                paper_id: Paper identifier
                save_path: Directory where papers are stored
                start_page: Starting page number (1-indexed, inclusive). Defaults to 1.
                end_page: Ending page number (1-indexed, inclusive). Defaults to last page.
    
            Returns:
                Message indicating the feature is not supported
            """
            return (
                "Google Scholar doesn't support direct paper reading. "
                "Please use the paper URL to access the full text on the publisher's website."
            )
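Because download_pdf always raises and read_paper only returns an explanatory message, callers should guard accordingly; a brief illustrative sketch (the paper ID is hypothetical):

    # Caller-side handling of the unsupported operations above.
    try:
        google_scholar_searcher.download_pdf("gs_12345", "./downloads")
    except NotImplementedError as exc:
        print(exc)  # fall back to the paper's URL on the publisher's site
    print(google_scholar_searcher.read_paper("gs_12345"))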
