# paper_search_mcp/academic_platforms/crossref.py
"""
CrossRefSearcher - CrossRef 引用数据库搜索
2025 最佳实践版本:
- Polite Pool 支持(mailto 参数)
- 环境变量配置
- 指数退避重试
- Session 复用
"""
from typing import List, Optional, Dict, Any
from datetime import datetime
import requests
import time
import os
import logging
from ..paper import Paper
logger = logging.getLogger(__name__)
class PaperSource:
    """Interface that every paper-source backend must implement."""

    def search(self, query: str, **kwargs) -> List[Paper]:
        """Search the source for papers matching *query*."""
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Download the PDF for *paper_id* into *save_path*; return the file path."""
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        """Return the textual content of paper *paper_id*."""
        raise NotImplementedError
class CrossRefSearcher(PaperSource):
    """CrossRef citation-database searcher.

    Searches scholarly paper metadata through the CrossRef REST API.

    Practices applied:
    - Send a ``mailto`` parameter to enter the "Polite Pool" (better rate limits)
    - Set a User-Agent so CrossRef can contact the operator if needed
    - Exponential backoff on rate limiting and transient request failures
    - Reuse a single requests.Session for connection pooling

    Environment variables:
        CROSSREF_MAILTO: contact email (recommended)
    """

    BASE_URL = "https://api.crossref.org"

    def __init__(
        self,
        mailto: Optional[str] = None,
        timeout: int = 30,
        max_retries: int = 3
    ):
        """Initialize the CrossRef searcher.

        Args:
            mailto: Contact email (defaults to the CROSSREF_MAILTO env var).
            timeout: Per-request timeout in seconds.
            max_retries: Maximum retry attempts per request.
        """
        self.mailto = mailto or os.environ.get('CROSSREF_MAILTO', '')
        self.timeout = timeout
        self.max_retries = max_retries
        # One shared Session: connection pooling + default headers on every call.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': f'paper_search_mcp/1.0 (mailto:{self.mailto})',
            'Accept': 'application/json'
        })
        if self.mailto:
            logger.info(f"Using Polite Pool with mailto: {self.mailto}")
        else:
            logger.warning("No CROSSREF_MAILTO set. Consider setting it for better rate limits.")

    def _make_request(
        self,
        url: str,
        params: dict,
        retry_count: int = 0
    ) -> Optional[requests.Response]:
        """GET *url* with exponential-backoff retries.

        Retries up to ``self.max_retries`` times on HTTP 429 and on any
        requests-level failure (connection error, timeout, HTTP error status).

        Returns:
            The successful response, or None once retries are exhausted.
        """
        try:
            response = self.session.get(url, params=params, timeout=self.timeout)
            if response.status_code == 429:
                # Rate limited: back off exponentially before retrying.
                if retry_count < self.max_retries:
                    wait_time = (2 ** retry_count) + 1
                    logger.warning(f"Rate limited (429), retrying in {wait_time}s...")
                    time.sleep(wait_time)
                    return self._make_request(url, params, retry_count + 1)
                logger.error(f"Rate limited after {self.max_retries} retries")
                return None
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            if retry_count < self.max_retries:
                wait_time = 2 ** retry_count
                logger.warning(f"Request failed, retrying in {wait_time}s: {e}")
                time.sleep(wait_time)
                return self._make_request(url, params, retry_count + 1)
            logger.error(f"Request failed: {e}")
            return None

    def search(self, query: str, max_results: int = 10, **kwargs) -> List[Paper]:
        """
        Search CrossRef database for papers.

        Args:
            query: Search query string
            max_results: Maximum number of results to return (default: 10)
            **kwargs: Additional parameters like filters, sort, etc.

        Returns:
            List of Paper objects (empty list on any failure).
        """
        try:
            params = {
                'query': query,
                'rows': min(max_results, 1000),  # CrossRef API max is 1000
                'sort': 'relevance',
                'order': 'desc'
            }
            # Caller-supplied options override the defaults above.
            if 'filter' in kwargs:
                params['filter'] = kwargs['filter']
            if 'sort' in kwargs:
                params['sort'] = kwargs['sort']
            if 'order' in kwargs:
                params['order'] = kwargs['order']
            # Polite Pool parameter (only when a contact address is configured).
            if self.mailto:
                params['mailto'] = self.mailto

            url = f"{self.BASE_URL}/works"
            response = self._make_request(url, params)
            if not response:
                return []

            data = response.json()
            papers = []
            items = data.get('message', {}).get('items', [])
            for item in items:
                try:
                    paper = self._parse_crossref_item(item)
                    if paper:
                        papers.append(paper)
                except Exception as e:
                    # Skip malformed records rather than failing the whole search.
                    logger.warning(f"Error parsing CrossRef item: {e}")
                    continue
            return papers
        except requests.RequestException as e:
            logger.error(f"Error searching CrossRef: {e}")
            return []
        except Exception as e:
            logger.error(f"Unexpected error in CrossRef search: {e}")
            return []

    def _parse_crossref_item(self, item: Dict[str, Any]) -> Optional[Paper]:
        """Parse a CrossRef API item into a Paper object.

        Returns None (and logs) if the record cannot be parsed.
        NOTE(review): CrossRef abstracts may contain JATS XML markup; it is
        passed through as-is — confirm downstream consumers handle that.
        """
        try:
            # Extract basic information
            doi = item.get('DOI', '')
            title = self._extract_title(item)
            authors = self._extract_authors(item)
            abstract = item.get('abstract', '')

            # Prefer the print/online publication date, then 'issued',
            # then the deposit-creation date.
            published_date = self._extract_date(item, 'published')
            if not published_date:
                published_date = self._extract_date(item, 'issued')
            if not published_date:
                published_date = self._extract_date(item, 'created')
            # Default to epoch if no date found
            if not published_date:
                published_date = datetime(1970, 1, 1)

            # Extract URLs
            url = item.get('URL', f"https://doi.org/{doi}" if doi else '')
            pdf_url = self._extract_pdf_url(item)

            # Extract additional metadata
            container_title = self._extract_container_title(item)
            publisher = item.get('publisher', '')
            categories = [item.get('type', '')]

            # Extract subjects/keywords if available
            subjects = item.get('subject', [])
            keywords = subjects if isinstance(subjects, list) else []

            return Paper(
                paper_id=doi,
                title=title,
                authors=authors,
                abstract=abstract,
                doi=doi,
                published_date=published_date,
                pdf_url=pdf_url,
                url=url,
                source='crossref',
                categories=categories,
                keywords=keywords,
                citations=item.get('is-referenced-by-count', 0),
                extra={
                    'publisher': publisher,
                    'container_title': container_title,
                    'volume': item.get('volume', ''),
                    'issue': item.get('issue', ''),
                    'page': item.get('page', ''),
                    'issn': item.get('ISSN', []),
                    'isbn': item.get('ISBN', []),
                    'crossref_type': item.get('type', ''),
                    'member': item.get('member', ''),
                    'prefix': item.get('prefix', '')
                }
            )
        except Exception as e:
            logger.error(f"Error parsing CrossRef item: {e}")
            return None

    def _extract_title(self, item: Dict[str, Any]) -> str:
        """Extract title from CrossRef item (API returns it as a list)."""
        titles = item.get('title', [])
        if isinstance(titles, list) and titles:
            return titles[0]
        return str(titles) if titles else ''

    def _extract_authors(self, item: Dict[str, Any]) -> List[str]:
        """Extract author names ("given family") from CrossRef item."""
        authors = []
        author_list = item.get('author', [])
        for author in author_list:
            if isinstance(author, dict):
                given = author.get('given', '')
                family = author.get('family', '')
                if given and family:
                    authors.append(f"{given} {family}")
                elif family:
                    authors.append(family)
                elif given:
                    authors.append(given)
        return authors

    def _extract_date(self, item: Dict[str, Any], date_field: str) -> Optional[datetime]:
        """Extract a date from a CrossRef 'date-parts' field.

        CrossRef dates are partial: [[year]], [[year, month]] or
        [[year, month, day]]; missing parts default to 1.

        Returns:
            A datetime, or None if the field is absent or invalid.
        """
        date_info = item.get(date_field, {})
        if not date_info:
            return None
        date_parts = date_info.get('date-parts', [])
        if not date_parts or not date_parts[0]:
            return None
        parts = date_parts[0]
        try:
            year = parts[0] if len(parts) > 0 else 1970
            month = parts[1] if len(parts) > 1 else 1
            day = parts[2] if len(parts) > 2 else 1
            return datetime(year, month, day)
        except (ValueError, TypeError, IndexError):
            # TypeError covers null components, e.g. date-parts [[2021, None]].
            return None

    def _extract_container_title(self, item: Dict[str, Any]) -> str:
        """Extract container title (journal/book title) from CrossRef item."""
        container_titles = item.get('container-title', [])
        if isinstance(container_titles, list) and container_titles:
            return container_titles[0]
        return str(container_titles) if container_titles else ''

    def _extract_pdf_url(self, item: Dict[str, Any]) -> str:
        """Extract a PDF URL from CrossRef item, if the record advertises one."""
        # Check for link in the resource field
        resource = item.get('resource', {})
        if resource:
            primary = resource.get('primary', {})
            if primary and primary.get('URL', '').endswith('.pdf'):
                return primary['URL']
        # Check in links array
        links = item.get('link', [])
        for link in links:
            if isinstance(link, dict):
                content_type = link.get('content-type', '')
                if 'pdf' in content_type.lower():
                    return link.get('URL', '')
        return ''

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        CrossRef doesn't provide direct PDF downloads.

        Args:
            paper_id: DOI of the paper
            save_path: Directory to save the PDF

        Raises:
            NotImplementedError: Always raises this error as CrossRef doesn't provide direct PDF access
        """
        message = ("CrossRef does not provide direct PDF downloads. "
                   "CrossRef is a citation database that provides metadata about academic papers. "
                   "To access the full text, please use the paper's DOI or URL to visit the publisher's website.")
        raise NotImplementedError(message)

    def read_paper(self, paper_id: str, save_path: str = "") -> str:
        """
        CrossRef doesn't provide direct paper content access.

        Args:
            paper_id: DOI of the paper
            save_path: Directory for potential PDF storage (unused; now optional
                since no file is ever written)

        Returns:
            str: Error message indicating PDF reading is not supported
        """
        message = ("CrossRef papers cannot be read directly through this tool. "
                   "CrossRef is a citation database that provides metadata about academic papers. "
                   "Only metadata and abstracts are available through CrossRef's API. "
                   "To access the full text, please use the paper's DOI or URL to visit the publisher's website.")
        return message

    def get_paper_by_doi(self, doi: str) -> Optional[Paper]:
        """
        Get a specific paper by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            Paper object if found, None otherwise
        """
        try:
            url = f"{self.BASE_URL}/works/{doi}"
            # FIX: use the configured contact address (previously a hard-coded
            # example.org address) so this call also benefits from the Polite
            # Pool, and honor self.timeout for consistency with search().
            params = {'mailto': self.mailto} if self.mailto else {}
            response = self.session.get(url, params=params, timeout=self.timeout)
            if response.status_code == 404:
                logger.warning(f"DOI not found in CrossRef: {doi}")
                return None
            response.raise_for_status()
            data = response.json()
            item = data.get('message', {})
            return self._parse_crossref_item(item)
        except requests.RequestException as e:
            logger.error(f"Error fetching DOI {doi} from CrossRef: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error fetching DOI {doi}: {e}")
            return None
if __name__ == "__main__":
    # Smoke tests for CrossRefSearcher against the live CrossRef API.
    searcher = CrossRefSearcher()

    # Test search functionality
    print("Testing search functionality...")
    query = "machine learning"
    max_results = 5
    papers = []
    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"Found {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"{i}. {paper.title} (DOI: {paper.doi})")
            print(f" Authors: {', '.join(paper.authors[:3])}{'...' if len(paper.authors) > 3 else ''}")
            print(f" Published: {paper.published_date.year}")
            print(f" Citations: {paper.citations}")
            publisher = paper.extra.get('publisher', 'N/A') if paper.extra else 'N/A'
            print(f" Publisher: {publisher}")
            print()
    except Exception as e:
        print(f"Error during search: {e}")

    # Test DOI lookup functionality
    if papers:
        print("Testing DOI lookup functionality...")
        test_doi = papers[0].doi
        try:
            paper = searcher.get_paper_by_doi(test_doi)
            if paper:
                print(f"Successfully retrieved paper by DOI: {paper.title}")
            else:
                print("Failed to retrieve paper by DOI")
        except Exception as e:
            print(f"Error during DOI lookup: {e}")

    # Test PDF download functionality (expected to raise: unsupported by CrossRef)
    if papers:
        print("\nTesting PDF download functionality...")
        paper_id = papers[0].doi
        try:
            # No path binding needed: this call always raises.
            searcher.download_pdf(paper_id, "./downloads")
        except NotImplementedError as e:
            print(f"Expected error: {e}")

    # Test paper reading functionality (returns an "unsupported" message)
    if papers:
        print("\nTesting paper reading functionality...")
        paper_id = papers[0].doi
        # FIX: read_paper() requires a save_path argument; the original call
        # omitted it and raised TypeError before printing anything.
        message = searcher.read_paper(paper_id, "./downloads")
        print(f"Message: {message}")