from typing import List, Optional
from datetime import datetime
from bs4 import BeautifulSoup
import hashlib
import requests
import time
import random
from loguru import logger
from ..types import Paper, PaperSource

class GoogleScholarSearcher(PaperSource):
    """Custom implementation of Google Scholar paper search."""

    SCHOLAR_URL = "https://scholar.google.com/scholar"

    BROWSERS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    ]
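
    # Google Scholar has no official search API, so this source scrapes the
    # HTML results page. Rotating a plausible User-Agent lowers (but does not
    # eliminate) the chance of being served a CAPTCHA instead of results.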
    def __init__(self):
        self._setup_session()

    def _setup_session(self):
        """Initialize session with random user agent"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': random.choice(self.BROWSERS),
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9'
        })
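
    # The 'gs_a' byline usually reads "Authors - Venue, Year - Domain", so a
    # plausible four-digit year can be picked out of the raw text.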
    def _extract_year(self, text: str) -> Optional[int]:
        """Extract year from publication info"""
        for word in text.split():
            # Tolerate trailing punctuation such as "2020," or "(2020)"
            word = word.strip('.,()')
            if word.isdigit() and 1900 <= int(word) <= datetime.now().year:
                return int(word)
        return None
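
    # Each result sits in a 'gs_ri' block: 'gs_rt' holds the title link,
    # 'gs_a' the author/venue byline, and 'gs_rs' the abstract snippet.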
    def _parse_paper(self, item) -> Optional[Paper]:
        """Parse single paper entry from HTML"""
        try:
            # Extract main paper elements
            title_elem = item.find('h3', class_='gs_rt')
            info_elem = item.find('div', class_='gs_a')
            abstract_elem = item.find('div', class_='gs_rs')
            if not title_elem or not info_elem:
                return None

            # Process title and URL
            title = title_elem.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '')
            link = title_elem.find('a', href=True)
            url = link['href'] if link else ''

            # Process author info
            info_text = info_elem.get_text()
            authors = [a.strip() for a in info_text.split('-')[0].split(',')]
            year = self._extract_year(info_text)

            # Hash the URL deterministically so the same result maps to the
            # same ID across runs (built-in hash() is salted per process)
            url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:16]
            return Paper(
                paper_id=f"gs_{url_hash}",
                title=title,
                authors=authors,
                abstract=abstract_elem.get_text(strip=True) if abstract_elem else "",
                url=url,
                pdf_url="",
                published_date=datetime(year, 1, 1) if year else None,
                updated_date=None,
                source="google_scholar",
                categories=[],
                keywords=[],
                doi="",
                citations=0,  # citation counts are not parsed from the snippet
            )
        except Exception as e:
            logger.warning(f"Failed to parse paper: {e}")
            return None
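
    # Scholar serves roughly 10 results per page; paging is driven by the
    # 'start' query parameter, and a randomized delay keeps the crawl polite.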
    def search(self, query: str, max_results: int = 10) -> List[Paper]:
        """Search Google Scholar, paging through results up to max_results."""
        papers = []
        start = 0
        while len(papers) < max_results:
            try:
                # Construct search parameters
                params = {
                    'q': query,
                    'start': start,
                    'hl': 'en',
                    'as_sdt': '0,5'  # Include articles and citations
                }

                # Make request with a random delay to avoid rate limiting
                time.sleep(random.uniform(1.0, 3.0))
                response = self.session.get(self.SCHOLAR_URL, params=params)
                if response.status_code != 200:
                    logger.error(f"Search failed with status {response.status_code}")
                    break

                # Parse results
                soup = BeautifulSoup(response.text, 'html.parser')
                results = soup.find_all('div', class_='gs_ri')
                if not results:
                    break

                # Process each result
                for item in results:
                    if len(papers) >= max_results:
                        break
                    paper = self._parse_paper(item)
                    if paper:
                        papers.append(paper)

                # Advance by the number of results actually returned rather
                # than a fixed page size, so a short page does not skip entries
                start += len(results)
            except Exception as e:
                logger.error(f"Search error: {e}")
                break

        return papers[:max_results]
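
    # PaperSource also defines download/read hooks, but Scholar only links
    # out to publisher pages, so both are explicitly unsupported here.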
    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Google Scholar doesn't support direct PDF downloads.

        Raises:
            NotImplementedError: Always raises this error
        """
        raise NotImplementedError(
            "Google Scholar doesn't provide direct PDF downloads. "
            "Please use the paper URL to access the publisher's website."
        )

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """
        Google Scholar doesn't support direct paper reading.

        Returns:
            str: Message indicating the feature is not supported
        """
        return (
            "Google Scholar doesn't support direct paper reading. "
            "Please use the paper URL to access the full text on the publisher's website."
        )

if __name__ == "__main__":
    # Test Google Scholar searcher
    searcher = GoogleScholarSearcher()
    print("Testing search functionality...")
    query = "machine learning"
    max_results = 5
    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"\nFound {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"\n{i}. {paper.title}")
            print(f"   Authors: {', '.join(paper.authors)}")
            print(f"   Citations: {paper.citations}")
            print(f"   URL: {paper.url}")
    except Exception as e:
        print(f"Error during search: {e}")