Skip to main content
Glama
h-lu
by h-lu
arxiv.py (11.8 kB)
# paper_search_mcp/academic_platforms/arxiv.py
"""
ArxivSearcher - search, download, and read arXiv papers.

2025 best-practice version:
- Uses PyMuPDF4LLM instead of PyPDF2 for better table and formula extraction
- Emits Markdown, which is friendlier to LLMs
- Supports several table-detection strategies
"""
import logging
import os
from datetime import datetime
from typing import List, Literal, Optional

import feedparser
import pymupdf
import pymupdf4llm
import requests

from ..paper import Paper

logger = logging.getLogger(__name__)


class PaperSource:
    """Abstract base class for paper sources.

    Every method raises ``NotImplementedError``; concrete sources override
    them.
    """

    def search(self, query: str, **kwargs) -> List[Paper]:
        """Search the source for *query* and return the matching papers."""
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Download the PDF of *paper_id* and return the local file path."""
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        """Return the extracted content of *paper_id*."""
        raise NotImplementedError


class ArxivSearcher(PaperSource):
    """Searcher for arXiv papers.

    Features:
    - Search arXiv papers
    - Download PDF files
    - Extract paper content (Markdown or plain text)

    2025 best practices:
    - PDF text extraction through PyMuPDF4LLM
    - Tables are converted to Markdown tables automatically
    - Several table-detection strategies are supported
    """

    BASE_URL = "http://export.arxiv.org/api/query"

    # Timestamp layout used by the `published` / `updated` fields of the feed.
    _DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

    # Human-readable description of each table-detection strategy.
    # These are runtime strings and are deliberately left untranslated.
    TABLE_STRATEGIES = {
        "lines_strict": "严格模式:只检测有完整边框线的表格",
        "lines": "线条模式:检测有部分边框线的表格",
        "text": "文本模式:基于文本对齐检测表格(适合无边框表格)",
        "explicit": "显式模式:只检测 PDF 中明确标记的表格",
    }

    def search(self, query: str, max_results: int = 10) -> List[Paper]:
        """Search arXiv for papers.

        Args:
            query: Search terms; arXiv query syntax is supported,
                e.g. "ti:attention" (title) or "au:hinton" (author).
            max_results: Maximum number of results to return.

        Returns:
            List[Paper]: The parsed papers, newest submission first. An empty
            list is returned when the API request fails; entries that cannot
            be parsed are logged and skipped.
        """
        params = {
            'search_query': query,
            'max_results': max_results,
            'sortBy': 'submittedDate',
            'sortOrder': 'descending'
        }

        try:
            response = requests.get(self.BASE_URL, params=params, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error("arXiv API request failed: %s", e)
            return []

        feed = feedparser.parse(response.content)
        papers = []

        for entry in feed.entries:
            # Best effort: one malformed entry must not discard the others.
            try:
                papers.append(self._parse_entry(entry))
            except Exception as e:
                logger.warning("Error parsing arXiv entry: %s", e)

        return papers

    @classmethod
    def _parse_entry(cls, entry) -> Paper:
        """Convert one feedparser entry of the arXiv Atom feed into a Paper."""
        # Entry ids look like "http://arxiv.org/abs/<id>". Old-style ids
        # contain a slash ("hep-th/9901001v1"), so keep everything after
        # "/abs/" rather than only the last path segment.
        _, marker, tail = entry.id.partition('/abs/')
        paper_id = tail if marker else entry.id.split('/')[-1]

        pdf_url = next(
            (link.href for link in entry.links
             if link.get('type') == 'application/pdf'),
            ''
        )

        return Paper(
            paper_id=paper_id,
            title=entry.title.replace('\n', ' ').strip(),
            authors=[author.name for author in entry.authors],
            abstract=entry.summary.replace('\n', ' ').strip(),
            url=entry.id,
            pdf_url=pdf_url,
            published_date=datetime.strptime(entry.published, cls._DATE_FORMAT),
            updated_date=datetime.strptime(entry.updated, cls._DATE_FORMAT),
            source='arxiv',
            categories=[tag.term for tag in entry.get('tags', [])],
            keywords=[],
            # feedparser exposes the namespaced <arxiv:doi> element as
            # "arxiv_doi"; the plain "doi" key is kept as a fallback.
            doi=entry.get('arxiv_doi') or entry.get('doi', '')
        )

    @staticmethod
    def _pdf_path(paper_id: str, save_path: str) -> str:
        """Return the local PDF path used for *paper_id* inside *save_path*."""
        # Old-style ids contain '/', which must not become a sub-directory.
        safe_id = paper_id.replace('/', '_').replace(':', '_')
        return os.path.join(save_path, f"{safe_id}.pdf")

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Download the PDF of an arXiv paper.

        Args:
            paper_id: arXiv paper ID (e.g. '2106.12345' or '2106.12345v2').
            save_path: Directory to save into; created if missing.

        Returns:
            str: Path of the PDF file. An already existing file is reused
            without contacting arXiv.

        Raises:
            RuntimeError: If the HTTP download fails.
        """
        # Make sure the target directory exists.
        os.makedirs(save_path, exist_ok=True)

        output_file = self._pdf_path(paper_id, save_path)

        # Reuse a previously downloaded file.
        if os.path.exists(output_file):
            logger.info(f"PDF already exists: {output_file}")
            return output_file

        pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        try:
            response = requests.get(pdf_url, timeout=60)
            response.raise_for_status()
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to download PDF: {e}") from e

        # Write to a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated file that a later call would treat
        # as a valid cached PDF.
        partial_file = f"{output_file}.part"
        try:
            with open(partial_file, 'wb') as f:
                f.write(response.content)
            os.replace(partial_file, output_file)
        finally:
            # Only still present when the write or the rename failed.
            if os.path.exists(partial_file):
                os.remove(partial_file)

        logger.info(f"PDF downloaded: {output_file}")
        return output_file

    def read_paper(
        self,
        paper_id: str,
        save_path: str,
        output_format: Literal["markdown", "text"] = "markdown",
        table_strategy: Literal["lines_strict", "lines", "text", "explicit"] = "lines_strict",
        pages: Optional[List[int]] = None
    ) -> str:
        """Read a paper and extract its content.

        Extraction uses PyMuPDF4LLM and supports:
        - Markdown output (recommended, LLM friendly)
        - Automatic conversion of tables to Markdown tables
        - Several table-detection strategies

        Args:
            paper_id: arXiv paper ID.
            save_path: Directory where the PDF is stored.
            output_format: Output format.
                - "markdown": Markdown (recommended, includes tables)
                - "text": plain text
                Any value other than "markdown" selects plain text.
            table_strategy: Table-detection strategy.
                - "lines_strict": only tables with complete borders
                - "lines": tables with partial borders
                - "text": alignment based (suits borderless tables)
                - "explicit": only explicitly marked tables
            pages: Pages to extract (0-indexed); None means all pages.

        Returns:
            str: The extracted paper content.
        """
        # Make sure the PDF is available locally.
        pdf_path = self._ensure_pdf_downloaded(paper_id, save_path)

        if output_format == "markdown":
            return self._extract_markdown(pdf_path, table_strategy, pages)
        return self._extract_text(pdf_path, pages)

    def _ensure_pdf_downloaded(self, paper_id: str, save_path: str) -> str:
        """Return the local PDF path, downloading the file first if needed."""
        pdf_path = self._pdf_path(paper_id, save_path)
        if not os.path.exists(pdf_path):
            pdf_path = self.download_pdf(paper_id, save_path)
        return pdf_path

    def _extract_markdown(
        self,
        pdf_path: str,
        table_strategy: str,
        pages: Optional[List[int]] = None
    ) -> str:
        """Extract the content as Markdown using PyMuPDF4LLM.

        If Markdown extraction raises, the error is logged and the result of
        plain-text extraction is returned instead.
        """
        try:
            return pymupdf4llm.to_markdown(
                pdf_path,
                pages=pages,
                table_strategy=table_strategy,
                show_progress=False
            )
        except Exception as e:
            logger.error("Markdown extraction failed: %s", e)
            # Fall back to plain-text extraction.
            logger.info("Falling back to plain text extraction")
            return self._extract_text(pdf_path, pages)

    def _extract_text(
        self,
        pdf_path: str,
        pages: Optional[List[int]] = None
    ) -> str:
        """Extract the content as plain text using PyMuPDF.

        Page numbers outside the document are skipped. A falsy *pages*
        (None or an empty list) selects every page. On failure the error is
        logged and an "Error extracting text: ..." string is returned rather
        than raising.
        """
        try:
            doc = pymupdf.open(pdf_path)
            try:
                page_count = len(doc)
                wanted = pages if pages else range(page_count)
                return "\n".join(
                    doc[page_num].get_text()
                    for page_num in wanted
                    if 0 <= page_num < page_count
                )
            finally:
                # Close the document even when a page fails to extract.
                doc.close()
        except Exception as e:
            logger.error("Text extraction failed: %s", e)
            return f"Error extracting text: {e}"


def _main() -> None:
    """Manual smoke test: search, download, then extract Markdown and text."""
    logging.basicConfig(level=logging.INFO)

    searcher = ArxivSearcher()
    save_path = "./downloads"

    # 1. Search
    print("=" * 60)
    print("1. Testing search functionality...")
    print("=" * 60)

    query = "machine learning"
    max_results = 3
    papers = []

    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"Found {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"{i}. {paper.title[:60]}... (ID: {paper.paper_id})")
    except Exception as e:
        print(f"Error during search: {e}")

    # The remaining steps all need at least one search result.
    if not papers:
        return

    paper_id = papers[0].paper_id

    # 2. PDF download
    print("\n" + "=" * 60)
    print("2. Testing PDF download functionality...")
    print("=" * 60)

    try:
        pdf_path = searcher.download_pdf(paper_id, save_path)
        print(f"PDF downloaded successfully: {pdf_path}")
    except Exception as e:
        print(f"Error during PDF download: {e}")

    # 3. Markdown extraction
    print("\n" + "=" * 60)
    print("3. Testing Markdown extraction (PyMuPDF4LLM)...")
    print("=" * 60)

    try:
        # save_path is a required argument of read_paper.
        md_content = searcher.read_paper(
            paper_id,
            save_path,
            output_format="markdown",
            table_strategy="lines_strict"
        )

        # Save the Markdown next to the PDF in the downloads directory.
        safe_id = paper_id.replace('/', '_').replace(':', '_')
        md_file_path = os.path.join(save_path, f"{safe_id}.md")
        with open(md_file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        print(f"Markdown saved to: {md_file_path}")
        print("\nFirst 1000 characters of Markdown content:")
        print("-" * 40)
        print(md_content[:1000])
        print("-" * 40)
        print(f"\nTotal length: {len(md_content)} characters")
    except Exception as e:
        print(f"Error during paper reading: {e}")

    # 4. Plain-text extraction
    print("\n" + "=" * 60)
    print("4. Testing plain text extraction...")
    print("=" * 60)

    try:
        text_content = searcher.read_paper(
            paper_id,
            save_path,
            output_format="text"
        )
        print("\nFirst 500 characters of text content:")
        print("-" * 40)
        print(text_content[:500])
        print("-" * 40)
        print(f"\nTotal length: {len(text_content)} characters")
    except Exception as e:
        print(f"Error during paper reading: {e}")


if __name__ == "__main__":
    _main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/h-lu/paper-search-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.