from typing import List, Optional
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import random
import os
import logging
from ..paper import Paper
import pymupdf4llm
logger = logging.getLogger(__name__)
class PaperSource:
"""Abstract base class for paper sources"""
def search(self, query: str, **kwargs) -> List[Paper]:
raise NotImplementedError
def download_pdf(self, paper_id: str, save_path: str) -> str:
raise NotImplementedError
def read_paper(self, paper_id: str, save_path: str) -> str:
raise NotImplementedError
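# Usage sketch (illustrative only; assumes a concrete source such as IACRSearcher below,
# and "./downloads" is a placeholder directory):
#   source: PaperSource = IACRSearcher()
#   papers = source.search("secret sharing", max_results=5)
#   pdf_path = source.download_pdf(papers[0].paper_id, "./downloads")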
class IACRSearcher(PaperSource):
"""IACR ePrint Archive paper search implementation"""
IACR_SEARCH_URL = "https://eprint.iacr.org/search"
IACR_BASE_URL = "https://eprint.iacr.org"
BROWSERS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]
def __init__(self):
self._setup_session()
def _setup_session(self):
"""Initialize session with random user agent"""
self.session = requests.Session()
self.session.headers.update(
{
"User-Agent": random.choice(self.BROWSERS),
"Accept": "text/html,application/xhtml+xml",
"Accept-Language": "en-US,en;q=0.9",
}
)
def _parse_date(self, date_str: str) -> Optional[datetime]:
"""Parse date from IACR format (e.g., '2025-06-02')"""
try:
return datetime.strptime(date_str.strip(), "%Y-%m-%d")
except ValueError:
logger.warning(f"Could not parse date: {date_str}")
return None
def _parse_paper(self, item, fetch_details: bool = True) -> Optional[Paper]:
"""Parse single paper entry from IACR HTML and optionally fetch detailed info"""
try:
# Extract paper ID from the search result
header_div = item.find("div", class_="d-flex")
if not header_div:
return None
# Get paper ID from the link
paper_link = header_div.find("a", class_="paperlink")
if not paper_link:
return None
paper_id = paper_link.get_text(strip=True) # e.g., "2025/1014"
if fetch_details:
# Fetch detailed information for this paper
logger.info(f"Fetching detailed info for paper {paper_id}")
detailed_paper = self.get_paper_details(paper_id)
if detailed_paper:
return detailed_paper
else:
logger.warning(
f"Could not fetch details for {paper_id}, falling back to search result parsing"
)
# Fallback: parse from search results if detailed fetch fails or is disabled
paper_url = self.IACR_BASE_URL + paper_link["href"]
# Get PDF URL
pdf_link = header_div.find("a", href=True, string="(PDF)")
pdf_url = self.IACR_BASE_URL + pdf_link["href"] if pdf_link else ""
# Get last updated date
last_updated_elem = header_div.find("small", class_="ms-auto")
updated_date = None
if last_updated_elem:
date_text = last_updated_elem.get_text(strip=True)
if "Last updated:" in date_text:
date_str = date_text.replace("Last updated:", "").strip()
updated_date = self._parse_date(date_str)
# Get content from the second div
content_div = item.find("div", class_="ms-md-4")
if not content_div:
return None
# Extract title
title_elem = content_div.find("strong")
title = title_elem.get_text(strip=True) if title_elem else ""
# Extract authors
authors_elem = content_div.find("span", class_="fst-italic")
authors = []
if authors_elem:
authors_text = authors_elem.get_text(strip=True)
authors = [author.strip() for author in authors_text.split(",")]
# Extract category
category_elem = content_div.find("small", class_="badge")
categories = []
if category_elem:
category_text = category_elem.get_text(strip=True)
categories = [category_text]
# Extract abstract
abstract_elem = content_div.find("p", class_="search-abstract")
abstract = abstract_elem.get_text(strip=True) if abstract_elem else ""
# Create paper object with search result data
published_date = updated_date if updated_date else datetime(1900, 1, 1)
return Paper(
paper_id=paper_id,
title=title,
authors=authors,
abstract=abstract,
url=paper_url,
pdf_url=pdf_url,
published_date=published_date,
updated_date=updated_date,
source="iacr",
categories=categories,
keywords=[],
doi="",
citations=0,
)
except Exception as e:
logger.warning(f"Failed to parse IACR paper: {e}")
return None
def search(
self, query: str, max_results: int = 10, fetch_details: bool = True
) -> List[Paper]:
"""
Search IACR ePrint Archive
Args:
query: Search query string
max_results: Maximum number of results to return
fetch_details: Whether to fetch detailed information for each paper (slower but more complete)
Returns:
List[Paper]: List of paper objects
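        Example:
            Illustrative usage (requires network access; results vary over time):
                searcher = IACRSearcher()
                papers = searcher.search("secret sharing", max_results=5)
                for paper in papers:
                    print(paper.paper_id, paper.title)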
"""
papers = []
try:
# Construct search parameters
params = {"q": query}
# Make request
            response = self.session.get(
                self.IACR_SEARCH_URL, params=params, timeout=30
            )
if response.status_code != 200:
logger.error(f"IACR search failed with status {response.status_code}")
return papers
# Parse results
soup = BeautifulSoup(response.text, "html.parser")
# Find all paper entries - they are divs with class "mb-4"
results = soup.find_all("div", class_="mb-4")
if not results:
logger.info("No results found for the query")
return papers
# Process each result
for i, item in enumerate(results):
if len(papers) >= max_results:
break
logger.info(f"Processing paper {i+1}/{min(len(results), max_results)}")
paper = self._parse_paper(item, fetch_details=fetch_details)
if paper:
papers.append(paper)
except Exception as e:
logger.error(f"IACR search error: {e}")
return papers[:max_results]
def download_pdf(self, paper_id: str, save_path: str) -> str:
"""
Download PDF from IACR ePrint Archive
Args:
paper_id: IACR paper ID (e.g., "2025/1014")
save_path: Path to save the PDF
Returns:
str: Path to downloaded file or error message
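        Example:
            Illustrative usage (requires network access; "./downloads" is a placeholder directory):
                searcher = IACRSearcher()
                path = searcher.download_pdf("2025/1014", "./downloads")
                print(path)  # e.g. ./downloads/iacr_2025_1014.pdf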
"""
try:
pdf_url = f"{self.IACR_BASE_URL}/{paper_id}.pdf"
response = self.session.get(pdf_url)
if response.status_code == 200:
filename = f"{save_path}/iacr_{paper_id.replace('/', '_')}.pdf"
with open(filename, "wb") as f:
f.write(response.content)
return filename
else:
return f"Failed to download PDF: HTTP {response.status_code}"
except Exception as e:
logger.error(f"PDF download error: {e}")
return f"Error downloading PDF: {e}"
def read_paper(self, paper_id: str, save_path: str) -> str:
"""下载并提取 IACR 论文文本
使用 PyMuPDF4LLM 提取 Markdown 格式。
Args:
paper_id: IACR paper ID (e.g., "2009/101")
save_path: 保存目录
Returns:
str: 提取的 Markdown 文本或错误信息
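        Example:
            Illustrative usage (requires network access; "./papers" is a placeholder directory):
                searcher = IACRSearcher()
                markdown = searcher.read_paper("2009/101", "./papers")
                print(markdown[:500])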
"""
try:
            # Fetch the paper's details (title, authors, PDF URL)
paper = self.get_paper_details(paper_id)
if not paper or not paper.pdf_url:
return f"Error: Could not find PDF URL for paper {paper_id}"
            # Download the PDF
            pdf_response = self.session.get(paper.pdf_url, timeout=30)
pdf_response.raise_for_status()
            # Save the PDF to disk
os.makedirs(save_path, exist_ok=True)
filename = f"iacr_{paper_id.replace('/', '_')}.pdf"
pdf_path = os.path.join(save_path, filename)
with open(pdf_path, "wb") as f:
f.write(pdf_response.content)
            # Extract Markdown text with PyMuPDF4LLM
text = pymupdf4llm.to_markdown(pdf_path, show_progress=False)
logger.info(f"Extracted {len(text)} characters from {pdf_path}")
if not text.strip():
return f"PDF downloaded to {pdf_path}, but no text could be extracted."
            # Prepend paper metadata to the extracted text
metadata = f"# {paper.title}\n\n"
metadata += f"**Authors**: {', '.join(paper.authors)}\n"
metadata += f"**Published**: {paper.published_date}\n"
metadata += f"**URL**: {paper.url}\n"
metadata += f"**PDF**: {pdf_path}\n\n"
metadata += "---\n\n"
return metadata + text
except requests.RequestException as e:
logger.error(f"Error downloading PDF: {e}")
return f"Error downloading PDF: {e}"
except Exception as e:
logger.error(f"Read paper error: {e}")
return f"Error reading paper: {e}"
def get_paper_details(self, paper_id: str) -> Optional[Paper]:
"""
Fetch detailed information for a specific IACR paper
Args:
paper_id: IACR paper ID (e.g., "2009/101") or full URL
Returns:
            Optional[Paper]: Detailed paper object with full metadata, or None if the fetch fails
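        Example:
            Illustrative usage (requires network access):
                searcher = IACRSearcher()
                paper = searcher.get_paper_details("2009/101")
                if paper:
                    print(paper.title, ", ".join(paper.keywords))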
"""
try:
# Handle both paper ID and full URL
if paper_id.startswith("http"):
paper_url = paper_id
# Extract paper ID from URL
parts = paper_url.split("/")
if len(parts) >= 2:
paper_id = f"{parts[-2]}/{parts[-1]}"
else:
paper_url = f"{self.IACR_BASE_URL}/{paper_id}"
# Make request
            response = self.session.get(paper_url, timeout=30)
if response.status_code != 200:
logger.error(
f"Failed to fetch paper details: HTTP {response.status_code}"
)
return None
# Parse the page
soup = BeautifulSoup(response.text, "html.parser")
# Extract title from h3 element
title = ""
title_elem = soup.find("h3", class_="mb-3")
if title_elem:
title = title_elem.get_text(strip=True)
# Extract authors from the italic paragraph
authors = []
author_elem = soup.find("p", class_="fst-italic")
if author_elem:
author_text = author_elem.get_text(strip=True)
# Split by " and " to get individual authors
                authors = [
                    author.strip()
                    for author in author_text.replace(" and ", ",").split(",")
                    if author.strip()
                ]
# Extract abstract from the paragraph with white-space: pre-wrap style
abstract = ""
abstract_p = soup.find("p", style="white-space: pre-wrap;")
if abstract_p:
abstract = abstract_p.get_text(strip=True)
# Extract metadata using a simpler, safer approach
publication_info = ""
keywords = []
history_entries = []
last_updated = None
# Extract publication info
page_text = soup.get_text()
lines = page_text.split("\n")
# Find publication info
for i, line in enumerate(lines):
if "Publication info" in line and i + 1 < len(lines):
publication_info = lines[i + 1].strip()
break
# Find keywords using CSS selector for keyword badges
try:
keyword_elements = soup.select("a.badge.bg-secondary.keyword")
keywords = [elem.get_text(strip=True) for elem in keyword_elements]
            except Exception:
                keywords = []
# Find history entries
history_found = False
for i, line in enumerate(lines):
if "History" in line and ":" not in line:
history_found = True
continue
elif (
history_found
and ":" in line
and not line.strip().startswith("Short URL")
):
history_entries.append(line.strip())
# Try to extract the last updated date from the first history entry
if not last_updated:
date_str = line.split(":")[0].strip()
try:
last_updated = datetime.strptime(date_str, "%Y-%m-%d")
except ValueError:
pass
elif history_found and (
line.strip().startswith("Short URL")
or line.strip().startswith("License")
):
break
# Combine history entries
history = "; ".join(history_entries) if history_entries else ""
# Construct PDF URL
pdf_url = f"{self.IACR_BASE_URL}/{paper_id}.pdf"
# Use last updated date or current date as published date
published_date = last_updated if last_updated else datetime.now()
return Paper(
paper_id=paper_id,
title=title,
authors=authors,
abstract=abstract,
url=paper_url,
pdf_url=pdf_url,
published_date=published_date,
updated_date=last_updated,
source="iacr",
categories=[],
keywords=keywords,
doi="",
citations=0,
extra={"publication_info": publication_info, "history": history},
)
except Exception as e:
logger.error(f"Error fetching paper details for {paper_id}: {e}")
return None
if __name__ == "__main__":
# Test IACR searcher
searcher = IACRSearcher()
print("Testing IACR search functionality...")
query = "secret sharing"
max_results = 2
print("\n" + "=" * 60)
print("1. Testing search with detailed information (slower but complete)")
print("=" * 60)
try:
papers = searcher.search(query, max_results=max_results, fetch_details=True)
print(f"\nFound {len(papers)} papers for query '{query}' (with details):")
for i, paper in enumerate(papers, 1):
print(f"\n{i}. {paper.title}")
print(f" Paper ID: {paper.paper_id}")
print(f" Authors: {', '.join(paper.authors)}")
print(f" Categories: {', '.join(paper.categories)}")
print(f" Keywords: {', '.join(paper.keywords)}")
print(f" Last Updated: {paper.updated_date}")
print(f" URL: {paper.url}")
print(f" PDF: {paper.pdf_url}")
if paper.abstract:
print(f" Abstract: {paper.abstract[:200]}...")
if paper.extra:
pub_info = paper.extra.get("publication_info", "")
if pub_info:
print(f" Publication Info: {pub_info}")
except Exception as e:
print(f"Error during detailed search: {e}")
print("\n" + "=" * 60)
print("2. Testing search with compact information only (faster)")
print("=" * 60)
try:
papers_compact = searcher.search(
query, max_results=max_results, fetch_details=False
)
print(f"\nFound {len(papers_compact)} papers for query '{query}' (compact):")
for i, paper in enumerate(papers_compact, 1):
print(f"\n{i}. {paper.title}")
print(f" Paper ID: {paper.paper_id}")
print(f" Authors: {', '.join(paper.authors)}")
print(f" Categories: {', '.join(paper.categories)}")
print(f" Keywords: {', '.join(paper.keywords)} (from search)")
if paper.abstract:
print(f" Abstract: {paper.abstract[:150]}...")
except Exception as e:
print(f"Error during compact search: {e}")
print("\n" + "=" * 60)
print("3. Testing manual paper details fetching")
print("=" * 60)
test_paper_id = "2009/101"
try:
paper_details = searcher.get_paper_details(test_paper_id)
if paper_details:
print(f"\nManual fetch for paper {test_paper_id}:")
print(f"Title: {paper_details.title}")
print(f"Authors: {', '.join(paper_details.authors)}")
print(f"Keywords: {', '.join(paper_details.keywords)}")
print(
f"Publication Info: {paper_details.extra.get('publication_info', 'N/A') if paper_details.extra else 'N/A'}"
)
print(
f"History: {paper_details.extra.get('history', 'N/A') if paper_details.extra else 'N/A'}"
)
print(f"Abstract: {paper_details.abstract[:200]}...")
else:
print(f"Could not fetch details for paper {test_paper_id}")
except Exception as e:
print(f"Error fetching paper details: {e}")