# arxiv.py (4.94 kB)
# paper_search_mcp/sources/arxiv.py
from typing import List
from datetime import datetime
import requests
import feedparser
from ..paper import Paper
from PyPDF2 import PdfReader
import os
class PaperSource:
    """Common interface shared by paper sources.

    Every method here simply raises ``NotImplementedError``; a concrete
    source overrides the operations it supports.
    """

    def search(self, query: str, **kwargs) -> List[Paper]:
        """Return the papers matching ``query`` (not implemented here)."""
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Fetch the PDF for ``paper_id`` into ``save_path`` (not implemented here)."""
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        """Return the text of paper ``paper_id`` (not implemented here)."""
        raise NotImplementedError
class ArxivSearcher(PaperSource):
    """Searcher for arXiv papers.

    Queries the arXiv Atom API, downloads PDFs from arxiv.org and extracts
    their text with PyPDF2.
    """

    BASE_URL = "http://export.arxiv.org/api/query"
    # Seconds to wait for arXiv before giving up instead of blocking forever.
    REQUEST_TIMEOUT = 30

    def search(self, query: str, max_results: int = 10) -> List[Paper]:
        """Search arXiv and return the matching papers.

        Args:
            query: Query string sent as the API's ``search_query``.
            max_results: Maximum number of entries to request.

        Returns:
            The papers parsed from the returned feed, requested in
            descending submission-date order. Entries that fail to parse
            are reported on stdout and skipped.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                non-2xx HTTP status.
        """
        params = {
            'search_query': query,
            'max_results': max_results,
            'sortBy': 'submittedDate',
            'sortOrder': 'descending'
        }
        response = requests.get(
            self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
        )
        # Surface HTTP errors instead of parsing an error body as a feed.
        response.raise_for_status()
        feed = feedparser.parse(response.content)

        papers = []
        for entry in feed.entries:
            # One malformed entry must not discard the rest of the results.
            try:
                papers.append(self._parse_entry(entry))
            except Exception as e:
                print(f"Error parsing arXiv entry: {e}")
        return papers

    @staticmethod
    def _parse_entry(entry) -> Paper:
        """Convert a single feedparser entry into a ``Paper``."""
        date_format = '%Y-%m-%dT%H:%M:%SZ'
        # Optional fields are read with .get() so that a missing list or a
        # link without a ``type`` does not drop the whole paper.
        pdf_url = next(
            (
                link.href
                for link in entry.get('links', [])
                if link.get('type') == 'application/pdf'
            ),
            '',
        )
        return Paper(
            paper_id=entry.id.split('/')[-1],
            title=entry.title,
            authors=[author.name for author in entry.get('authors', [])],
            abstract=entry.summary,
            url=entry.id,
            pdf_url=pdf_url,
            published_date=datetime.strptime(entry.published, date_format),
            updated_date=datetime.strptime(entry.updated, date_format),
            source='arxiv',
            categories=[tag.term for tag in entry.get('tags', [])],
            keywords=[],
            # feedparser stores the namespaced <arxiv:doi> element under
            # ``arxiv_doi``; the plain ``doi`` key is kept as a fallback.
            doi=entry.get('arxiv_doi') or entry.get('doi', ''),
        )

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Download a paper's PDF into ``save_path``.

        Args:
            paper_id: arXiv paper ID.
            save_path: Directory to store the PDF in; created if missing.

        Returns:
            Path of the written file, ``{save_path}/{paper_id}.pdf``.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                non-2xx HTTP status.
            OSError: If the directory or file cannot be created.
        """
        pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
        # Fail before touching the disk so an HTML error page is never saved
        # under a .pdf name (and later mistaken for a cached download).
        response.raise_for_status()
        os.makedirs(save_path, exist_ok=True)
        output_file = f"{save_path}/{paper_id}.pdf"
        with open(output_file, 'wb') as f:
            f.write(response.content)
        return output_file

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Read a paper and convert it to text format.

        Args:
            paper_id: arXiv paper ID.
            save_path: Directory where the PDF is/will be saved.

        Returns:
            The extracted text of the paper, or ``""`` if the PDF could not
            be fetched (HTTP error) or parsed; the failure is printed.

        Raises:
            requests.RequestException: For connection failures or timeouts
                while downloading (HTTP status errors are reported and
                yield ``""`` instead).
            OSError: If the downloaded file cannot be written.
        """
        # First ensure we have the PDF, downloading it only when absent.
        pdf_path = f"{save_path}/{paper_id}.pdf"
        if not os.path.exists(pdf_path):
            try:
                pdf_path = self.download_pdf(paper_id, save_path)
            except requests.HTTPError as e:
                print(f"Error downloading PDF for paper {paper_id}: {e}")
                return ""

        try:
            reader = PdfReader(pdf_path)
            # Treat a page that yields no text as empty so a single page
            # cannot abort the whole extraction; join once at the end.
            page_texts = [
                (page.extract_text() or "") + "\n" for page in reader.pages
            ]
            return "".join(page_texts).strip()
        except Exception as e:
            print(f"Error reading PDF for paper {paper_id}: {e}")
            return ""
if __name__ == "__main__":
    # Manual smoke test of ArxivSearcher: search, PDF download, text extraction.
    searcher = ArxivSearcher()

    # Exercise the search functionality.
    print("Testing search functionality...")
    query = "machine learning"
    max_results = 5
    # Start with an empty list so the ``if papers`` checks below still work
    # (instead of raising NameError) when the search itself fails.
    papers = []
    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"Found {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"{i}. {paper.title} (ID: {paper.paper_id})")
    except Exception as e:
        print(f"Error during search: {e}")

    # Exercise the PDF download functionality.
    if papers:
        print("\nTesting PDF download functionality...")
        paper_id = papers[0].paper_id
        save_path = "./downloads"  # must exist; created just below
        try:
            os.makedirs(save_path, exist_ok=True)
            pdf_path = searcher.download_pdf(paper_id, save_path)
            print(f"PDF downloaded successfully: {pdf_path}")
        except Exception as e:
            print(f"Error during PDF download: {e}")

    # Exercise the paper reading functionality.
    if papers:
        print("\nTesting paper reading functionality...")
        paper_id = papers[0].paper_id
        try:
            text_content = searcher.read_paper(paper_id)
            print("\nFirst 500 characters of the paper content:")
            print(text_content[:500] + "...")
            print(f"\nTotal length of extracted text: {len(text_content)} characters")
        except Exception as e:
            print(f"Error during paper reading: {e}")