sci_hub.py
# paper_search_mcp/academic_platforms/sci_hub.py
"""
SciHubFetcher - Sci-Hub PDF downloader

Downloads academic paper PDFs via Sci-Hub (only papers published before 2023).

Note: Using Sci-Hub may be subject to legal restrictions in some regions.
Make sure you understand your local laws and regulations before use.
"""
from pathlib import Path
import re
import hashlib
import logging
import os
import subprocess
import shutil
from typing import Optional
from datetime import datetime

import requests
from bs4 import BeautifulSoup
import pymupdf4llm

logger = logging.getLogger(__name__)

# Available Sci-Hub mirrors (ordered by availability, updated 2024/2025)
SCIHUB_MIRRORS = [
    "https://sci-hub.ru",
    "https://sci-hub.wf",
    "https://sci-hub.ren",
    "https://sci-hub.se",
    "https://sci-hub.st"
]


class SciHubFetcher:
    """Sci-Hub PDF downloader

    Downloads paper PDFs from Sci-Hub by DOI.

    Limitations:
        - Only papers published before 2023 are supported
        - A valid DOI is required

    Environment variables:
        - SCIHUB_MIRROR: custom Sci-Hub mirror URL
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        timeout: int = 30
    ):
        """Initialize the Sci-Hub downloader

        Args:
            base_url: Sci-Hub mirror URL (defaults to the environment variable or the first built-in mirror)
            timeout: request timeout in seconds
        """
        self.base_url = (
            base_url
            or os.environ.get('SCIHUB_MIRROR')
            or SCIHUB_MIRRORS[0]
        ).rstrip("/")
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        logger.info(f"SciHub initialized with mirror: {self.base_url}")

    def _download_with_curl(self, url: str, file_path: str) -> bool:
        """Download a PDF with curl (more reliable)

        Args:
            url: PDF URL
            file_path: destination path

        Returns:
            True on success
        """
        if not shutil.which('curl'):
            return False

        try:
            result = subprocess.run(
                [
                    'curl',
                    '-L',  # follow redirects
                    '-o', file_path,
                    '--connect-timeout', '30',
                    '--max-time', '300',  # 5 minutes max
                    '-k',  # allow insecure SSL (Sci-Hub certificate issues)
                    '-f',  # fail with an error code on HTTP errors
                    '-s',  # silent mode
                    '--retry', '3',
                    '--retry-delay', '2',
                    '-A', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    url
                ],
                capture_output=True,
                text=True,
                timeout=360
            )

            if result.returncode == 0 and os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                # Check that this is a valid PDF (at least 10 KB and starts with %PDF)
                if file_size > 10000:
                    with open(file_path, 'rb') as f:
                        header = f.read(4)
                    if header == b'%PDF':
                        logger.info(f"PDF downloaded with curl: {file_path} ({file_size} bytes)")
                        return True
                    else:
                        logger.warning("Downloaded file is not a PDF")
                        os.remove(file_path)
                        return False
                else:
                    logger.warning(f"Downloaded file too small: {file_size} bytes")
                    os.remove(file_path)
                    return False
            else:
                logger.warning(f"curl failed: {result.stderr}")
                return False

        except subprocess.TimeoutExpired:
            logger.warning("curl download timed out")
            return False
        except Exception as e:
            logger.warning(f"curl error: {e}")
            return False

    def download_pdf(self, doi: str, save_path: Optional[str] = None) -> str:
        """Download a paper PDF by DOI

        Tries curl first (more reliable) and falls back to requests on failure.

        Args:
            doi: paper DOI (e.g. "10.1038/nature12373")
            save_path: output directory (defaults to ~/paper_downloads)

        Returns:
            Path of the downloaded file, or an error message
        """
        if not doi or not doi.strip():
            return "Error: DOI is empty"

        doi = doi.strip()

        # If no path was given, use paper_downloads under the user's home directory
        output_dir = Path(save_path) if save_path else Path.home() / "paper_downloads"
        output_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Resolve the PDF URL (requests is needed here to parse the HTML)
            pdf_url = self._get_pdf_url(doi)
            if not pdf_url:
                return f"Error: Could not find PDF for DOI {doi} on Sci-Hub"

            # Build the output file path
            clean_doi = re.sub(r'[^\w\-_.]', '_', doi)
            file_path = output_dir / f"scihub_{clean_doi}.pdf"

            # Method 1: try curl first (more reliable)
            if self._download_with_curl(pdf_url, str(file_path)):
                return str(file_path)

            logger.info("curl failed, falling back to requests...")

            # Method 2: fall back to requests (with retries)
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    response = self.session.get(
                        pdf_url,
                        verify=False,
                        timeout=(30, 180),  # 30 s connect, 180 s read
                        stream=True
                    )
                    if response.status_code != 200:
                        logger.warning(f"Download failed with status {response.status_code}")
                        continue

                    # Stream the response to disk
                    with open(file_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)

                    # Verify that the file is a PDF
                    with open(file_path, 'rb') as f:
                        header = f.read(4)
                    if header != b'%PDF':
                        logger.warning("Downloaded file is not a PDF")
                        os.remove(file_path)
                        continue

                    logger.info(f"PDF downloaded with requests: {file_path}")
                    return str(file_path)

                except requests.exceptions.Timeout:
                    logger.warning(f"Timeout (attempt {attempt + 1}/{max_retries})")
                except Exception as e:
                    logger.warning(f"Download error (attempt {attempt + 1}/{max_retries}): {e}")

            return f"Error: Could not download PDF for DOI {doi}"

        except Exception as e:
            logger.error(f"Download failed for {doi}: {e}")
            return f"Error downloading PDF: {e}"

    def read_paper(self, doi: str, save_path: Optional[str] = None) -> str:
        """Download a paper and extract its text

        Args:
            doi: paper DOI
            save_path: output directory

        Returns:
            Extracted Markdown text, or an error message
        """
        # Download the PDF first
        result = self.download_pdf(doi, save_path)
        if result.startswith("Error"):
            return result

        pdf_path = result
        try:
            text = pymupdf4llm.to_markdown(pdf_path, show_progress=False)
            logger.info(f"Extracted {len(text)} characters from {pdf_path}")

            if not text.strip():
                return f"PDF downloaded to {pdf_path}, but no text could be extracted."

            # Prepend metadata
            metadata = f"# Paper: {doi}\n\n"
            metadata += f"**DOI**: https://doi.org/{doi}\n"
            metadata += f"**PDF**: {pdf_path}\n"
            metadata += "**Source**: Sci-Hub\n\n"
            metadata += "---\n\n"

            return metadata + text

        except Exception as e:
            logger.error(f"Failed to extract text: {e}")
            return f"Error extracting text: {e}"

    def _get_pdf_url(self, doi: str) -> Optional[str]:
        """Get the direct PDF link from Sci-Hub"""
        try:
            # If the input is already a direct PDF URL, return it as-is
            if doi.endswith('.pdf'):
                return doi

            search_url = f"{self.base_url}/{doi}"
            response = self.session.get(search_url, verify=False, timeout=self.timeout)

            if response.status_code != 200:
                logger.warning(f"Sci-Hub returned status {response.status_code}")
                return None

            # Check whether the article was found
            if "article not found" in response.text.lower():
                logger.warning(f"Article not found on Sci-Hub: {doi}")
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Method 1: embed tag (most common on modern Sci-Hub)
            embed = soup.find('embed', {'type': 'application/pdf'})
            logger.debug(f"Found embed tag: {embed}")
            if embed:
                src = embed.get('src') if hasattr(embed, 'get') else None
                logger.debug(f"Embed src: {src}")
                if src and isinstance(src, str):
                    pdf_url = self._normalize_url(src)
                    logger.debug(f"Returning PDF URL from embed: {pdf_url}")
                    return pdf_url

            # Method 2: iframe (fallback)
            iframe = soup.find('iframe')
            if iframe:
                src = iframe.get('src') if hasattr(iframe, 'get') else None
                if src and isinstance(src, str):
                    pdf_url = self._normalize_url(src)
                    logger.debug(f"Returning PDF URL from iframe: {pdf_url}")
                    return pdf_url

            # Method 3: onclick handler on the download button
            for button in soup.find_all('button'):
                onclick = button.get('onclick', '') if hasattr(button, 'get') else ''
                if isinstance(onclick, str) and 'pdf' in onclick.lower():
                    match = re.search(r"location\.href='([^']+)'", onclick)
                    if match:
                        pdf_url = self._normalize_url(match.group(1))
                        logger.debug(f"Returning PDF URL from button: {pdf_url}")
                        return pdf_url

            # Method 4: direct download links
            for link in soup.find_all('a'):
                href = link.get('href', '') if hasattr(link, 'get') else ''
                if isinstance(href, str) and href and ('pdf' in href.lower() or href.endswith('.pdf')):
                    if href.startswith('http'):
                        logger.debug(f"Returning PDF URL from link: {href}")
                        return href
                    else:
                        pdf_url = self._normalize_url(href)
                        logger.debug(f"Returning PDF URL from link: {pdf_url}")
                        return pdf_url

            logger.warning(f"No PDF URL found in Sci-Hub page for: {doi}")
            return None

        except Exception as e:
            logger.error(f"Error getting PDF URL: {e}")
            return None

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL"""
        if url.startswith('//'):
            return 'https:' + url
        elif url.startswith('/'):
            return self.base_url + url
        return url

    def _generate_filename(self, doi: str, response: requests.Response) -> str:
        """Generate a file name"""
        # Sanitize the DOI for use as a file name
        clean_doi = re.sub(r'[^\w\-_.]', '_', doi)
        # Add a short hash to avoid collisions
        content_hash = hashlib.md5(response.content).hexdigest()[:6]
        return f"scihub_{clean_doi}_{content_hash}.pdf"


def check_paper_year(published_date: Optional[datetime], cutoff_year: int = 2023) -> bool:
    """Check whether a paper was published before the cutoff year

    Args:
        published_date: publication date
        cutoff_year: cutoff year (default 2023)

    Returns:
        True if the paper was published before the cutoff year
    """
    if not published_date:
        return False
    return published_date.year < cutoff_year


# ============================================================
# Test code
# ============================================================
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Disable SSL warnings (testing only)
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    fetcher = SciHubFetcher()

    print("=" * 60)
    print("Testing SciHubFetcher...")
    print("=" * 60)

    # Test with a known older paper DOI
    test_doi = "10.1080/13504851.2021.1890325"  # a paper from 2021

    print(f"\nDownloading: {test_doi}")
    result = fetcher.download_pdf(test_doi)
    print(f"Result: {result}")

    if not result.startswith("Error"):
        print("\n✅ Download successful!")
    else:
        print("\n❌ Download failed (this may be due to Sci-Hub availability)")
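
The snippet below is a minimal usage sketch, not part of the module itself: it shows how check_paper_year can gate a call to read_paper before touching Sci-Hub. The import path follows the file header comment, the DOI reuses the one from the test block above, and the publication date is an illustrative placeholder; actual success depends on mirror availability.

from datetime import datetime

from paper_search_mcp.academic_platforms.sci_hub import SciHubFetcher, check_paper_year

fetcher = SciHubFetcher()                  # picks up SCIHUB_MIRROR from the environment if set

doi = "10.1080/13504851.2021.1890325"      # same DOI as in the module's test block
published = datetime(2021, 3, 1)           # illustrative publication date (paper is from 2021)

if check_paper_year(published):            # True only for papers published before 2023
    markdown = fetcher.read_paper(doi)     # downloads the PDF and returns Markdown text
    if markdown.startswith("Error"):
        print(markdown)
    else:
        print(markdown[:500])              # preview the extracted text
else:
    print("Paper is too recent for Sci-Hub; use another source.")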
