by h-lu
iacr.py (18.3 kB)
from typing import List, Optional
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import logging
from ..paper import Paper
import pymupdf4llm

logger = logging.getLogger(__name__)


class PaperSource:
    """Abstract base class for paper sources"""

    def search(self, query: str, **kwargs) -> List[Paper]:
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError


class IACRSearcher(PaperSource):
    """IACR ePrint Archive paper search implementation"""

    IACR_SEARCH_URL = "https://eprint.iacr.org/search"
    IACR_BASE_URL = "https://eprint.iacr.org"

    BROWSERS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    ]

    def __init__(self):
        self._setup_session()

    def _setup_session(self):
        """Initialize session with random user agent"""
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": random.choice(self.BROWSERS),
                "Accept": "text/html,application/xhtml+xml",
                "Accept-Language": "en-US,en;q=0.9",
            }
        )

    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """Parse date from IACR format (e.g., '2025-06-02')"""
        try:
            return datetime.strptime(date_str.strip(), "%Y-%m-%d")
        except ValueError:
            logger.warning(f"Could not parse date: {date_str}")
            return None

    def _parse_paper(self, item, fetch_details: bool = True) -> Optional[Paper]:
        """Parse single paper entry from IACR HTML and optionally fetch detailed info"""
        try:
            # Extract paper ID from the search result
            header_div = item.find("div", class_="d-flex")
            if not header_div:
                return None

            # Get paper ID from the link
            paper_link = header_div.find("a", class_="paperlink")
            if not paper_link:
                return None
            paper_id = paper_link.get_text(strip=True)  # e.g., "2025/1014"

            if fetch_details:
                # Fetch detailed information for this paper
                logger.info(f"Fetching detailed info for paper {paper_id}")
                detailed_paper = self.get_paper_details(paper_id)
                if detailed_paper:
                    return detailed_paper
                else:
                    logger.warning(
                        f"Could not fetch details for {paper_id}, falling back to search result parsing"
                    )

            # Fallback: parse from search results if detailed fetch fails or is disabled
            paper_url = self.IACR_BASE_URL + paper_link["href"]

            # Get PDF URL
            pdf_link = header_div.find("a", href=True, string="(PDF)")
            pdf_url = self.IACR_BASE_URL + pdf_link["href"] if pdf_link else ""

            # Get last updated date
            last_updated_elem = header_div.find("small", class_="ms-auto")
            updated_date = None
            if last_updated_elem:
                date_text = last_updated_elem.get_text(strip=True)
                if "Last updated:" in date_text:
                    date_str = date_text.replace("Last updated:", "").strip()
                    updated_date = self._parse_date(date_str)

            # Get content from the second div
            content_div = item.find("div", class_="ms-md-4")
            if not content_div:
                return None

            # Extract title
            title_elem = content_div.find("strong")
            title = title_elem.get_text(strip=True) if title_elem else ""

            # Extract authors
            authors_elem = content_div.find("span", class_="fst-italic")
            authors = []
            if authors_elem:
                authors_text = authors_elem.get_text(strip=True)
                authors = [author.strip() for author in authors_text.split(",")]

            # Extract category
            category_elem = content_div.find("small", class_="badge")
            categories = []
            if category_elem:
                category_text = category_elem.get_text(strip=True)
                categories = [category_text]

            # Extract abstract
            abstract_elem = content_div.find("p", class_="search-abstract")
            abstract = abstract_elem.get_text(strip=True) if abstract_elem else ""

            # Create paper object with search result data
            published_date = updated_date if updated_date else datetime(1900, 1, 1)

            return Paper(
                paper_id=paper_id,
                title=title,
                authors=authors,
                abstract=abstract,
                url=paper_url,
                pdf_url=pdf_url,
                published_date=published_date,
                updated_date=updated_date,
                source="iacr",
                categories=categories,
                keywords=[],
                doi="",
                citations=0,
            )

        except Exception as e:
            logger.warning(f"Failed to parse IACR paper: {e}")
            return None

    def search(
        self, query: str, max_results: int = 10, fetch_details: bool = True
    ) -> List[Paper]:
        """
        Search IACR ePrint Archive

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            fetch_details: Whether to fetch detailed information for each paper
                (slower but more complete)

        Returns:
            List[Paper]: List of paper objects
        """
        papers = []
        try:
            # Construct search parameters
            params = {"q": query}

            # Make request
            response = self.session.get(self.IACR_SEARCH_URL, params=params)
            if response.status_code != 200:
                logger.error(f"IACR search failed with status {response.status_code}")
                return papers

            # Parse results
            soup = BeautifulSoup(response.text, "html.parser")

            # Find all paper entries - they are divs with class "mb-4"
            results = soup.find_all("div", class_="mb-4")
            if not results:
                logger.info("No results found for the query")
                return papers

            # Process each result
            for i, item in enumerate(results):
                if len(papers) >= max_results:
                    break
                logger.info(f"Processing paper {i+1}/{min(len(results), max_results)}")
                paper = self._parse_paper(item, fetch_details=fetch_details)
                if paper:
                    papers.append(paper)

        except Exception as e:
            logger.error(f"IACR search error: {e}")

        return papers[:max_results]

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Download PDF from IACR ePrint Archive

        Args:
            paper_id: IACR paper ID (e.g., "2025/1014")
            save_path: Path to save the PDF

        Returns:
            str: Path to downloaded file or error message
        """
        try:
            pdf_url = f"{self.IACR_BASE_URL}/{paper_id}.pdf"
            response = self.session.get(pdf_url)

            if response.status_code == 200:
                filename = f"{save_path}/iacr_{paper_id.replace('/', '_')}.pdf"
                with open(filename, "wb") as f:
                    f.write(response.content)
                return filename
            else:
                return f"Failed to download PDF: HTTP {response.status_code}"

        except Exception as e:
            logger.error(f"PDF download error: {e}")
            return f"Error downloading PDF: {e}"

    def read_paper(self, paper_id: str, save_path: str) -> str:
        """Download an IACR paper and extract its text.

        Uses PyMuPDF4LLM to extract Markdown-formatted text.

        Args:
            paper_id: IACR paper ID (e.g., "2009/101")
            save_path: Directory to save the PDF to

        Returns:
            str: Extracted Markdown text or an error message
        """
        try:
            # Fetch paper details
            paper = self.get_paper_details(paper_id)
            if not paper or not paper.pdf_url:
                return f"Error: Could not find PDF URL for paper {paper_id}"

            # Download the PDF
            pdf_response = requests.get(paper.pdf_url, timeout=30)
            pdf_response.raise_for_status()

            # Save the PDF
            os.makedirs(save_path, exist_ok=True)
            filename = f"iacr_{paper_id.replace('/', '_')}.pdf"
            pdf_path = os.path.join(save_path, filename)
            with open(pdf_path, "wb") as f:
                f.write(pdf_response.content)

            # Extract text with PyMuPDF4LLM
            text = pymupdf4llm.to_markdown(pdf_path, show_progress=False)
            logger.info(f"Extracted {len(text)} characters from {pdf_path}")

            if not text.strip():
                return f"PDF downloaded to {pdf_path}, but no text could be extracted."
            # Prepend metadata
            metadata = f"# {paper.title}\n\n"
            metadata += f"**Authors**: {', '.join(paper.authors)}\n"
            metadata += f"**Published**: {paper.published_date}\n"
            metadata += f"**URL**: {paper.url}\n"
            metadata += f"**PDF**: {pdf_path}\n\n"
            metadata += "---\n\n"

            return metadata + text

        except requests.RequestException as e:
            logger.error(f"Error downloading PDF: {e}")
            return f"Error downloading PDF: {e}"
        except Exception as e:
            logger.error(f"Read paper error: {e}")
            return f"Error reading paper: {e}"

    def get_paper_details(self, paper_id: str) -> Optional[Paper]:
        """
        Fetch detailed information for a specific IACR paper

        Args:
            paper_id: IACR paper ID (e.g., "2009/101") or full URL

        Returns:
            Paper: Detailed paper object with full metadata
        """
        try:
            # Handle both paper ID and full URL
            if paper_id.startswith("http"):
                paper_url = paper_id
                # Extract paper ID from URL
                parts = paper_url.split("/")
                if len(parts) >= 2:
                    paper_id = f"{parts[-2]}/{parts[-1]}"
            else:
                paper_url = f"{self.IACR_BASE_URL}/{paper_id}"

            # Make request
            response = self.session.get(paper_url)
            if response.status_code != 200:
                logger.error(
                    f"Failed to fetch paper details: HTTP {response.status_code}"
                )
                return None

            # Parse the page
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract title from h3 element
            title = ""
            title_elem = soup.find("h3", class_="mb-3")
            if title_elem:
                title = title_elem.get_text(strip=True)

            # Extract authors from the italic paragraph
            authors = []
            author_elem = soup.find("p", class_="fst-italic")
            if author_elem:
                author_text = author_elem.get_text(strip=True)
                # Split by " and " to get individual authors
                authors = [
                    author.strip()
                    for author in author_text.replace(" and ", ",").split(",")
                ]

            # Extract abstract from the paragraph with white-space: pre-wrap style
            abstract = ""
            abstract_p = soup.find("p", style="white-space: pre-wrap;")
            if abstract_p:
                abstract = abstract_p.get_text(strip=True)

            # Extract metadata using a simpler, safer approach
            publication_info = ""
            keywords = []
            history_entries = []
            last_updated = None

            # Extract publication info
            page_text = soup.get_text()
            lines = page_text.split("\n")

            # Find publication info
            for i, line in enumerate(lines):
                if "Publication info" in line and i + 1 < len(lines):
                    publication_info = lines[i + 1].strip()
                    break

            # Find keywords using CSS selector for keyword badges
            try:
                keyword_elements = soup.select("a.badge.bg-secondary.keyword")
                keywords = [elem.get_text(strip=True) for elem in keyword_elements]
            except Exception:
                keywords = []

            # Find history entries
            history_found = False
            for i, line in enumerate(lines):
                if "History" in line and ":" not in line:
                    history_found = True
                    continue
                elif (
                    history_found
                    and ":" in line
                    and not line.strip().startswith("Short URL")
                ):
                    history_entries.append(line.strip())
                    # Try to extract the last updated date from the first history entry
                    if not last_updated:
                        date_str = line.split(":")[0].strip()
                        try:
                            last_updated = datetime.strptime(date_str, "%Y-%m-%d")
                        except ValueError:
                            pass
                elif history_found and (
                    line.strip().startswith("Short URL")
                    or line.strip().startswith("License")
                ):
                    break

            # Combine history entries
            history = "; ".join(history_entries) if history_entries else ""

            # Construct PDF URL
            pdf_url = f"{self.IACR_BASE_URL}/{paper_id}.pdf"

            # Use last updated date or current date as published date
            published_date = last_updated if last_updated else datetime.now()

            return Paper(
                paper_id=paper_id,
                title=title,
                authors=authors,
                abstract=abstract,
                url=paper_url,
                pdf_url=pdf_url,
                published_date=published_date,
                updated_date=last_updated,
                source="iacr",
                categories=[],
                keywords=keywords,
                doi="",
                citations=0,
                extra={"publication_info": publication_info, "history": history},
            )

        except Exception as e:
            logger.error(f"Error fetching paper details for {paper_id}: {e}")
            return None


if __name__ == "__main__":
    # Test IACR searcher
    searcher = IACRSearcher()

    print("Testing IACR search functionality...")
    query = "secret sharing"
    max_results = 2

    print("\n" + "=" * 60)
    print("1. Testing search with detailed information (slower but complete)")
    print("=" * 60)
    try:
        papers = searcher.search(query, max_results=max_results, fetch_details=True)
        print(f"\nFound {len(papers)} papers for query '{query}' (with details):")
        for i, paper in enumerate(papers, 1):
            print(f"\n{i}. {paper.title}")
            print(f"   Paper ID: {paper.paper_id}")
            print(f"   Authors: {', '.join(paper.authors)}")
            print(f"   Categories: {', '.join(paper.categories)}")
            print(f"   Keywords: {', '.join(paper.keywords)}")
            print(f"   Last Updated: {paper.updated_date}")
            print(f"   URL: {paper.url}")
            print(f"   PDF: {paper.pdf_url}")
            if paper.abstract:
                print(f"   Abstract: {paper.abstract[:200]}...")
            if paper.extra:
                pub_info = paper.extra.get("publication_info", "")
                if pub_info:
                    print(f"   Publication Info: {pub_info}")
    except Exception as e:
        print(f"Error during detailed search: {e}")

    print("\n" + "=" * 60)
    print("2. Testing search with compact information only (faster)")
    print("=" * 60)
    try:
        papers_compact = searcher.search(
            query, max_results=max_results, fetch_details=False
        )
        print(f"\nFound {len(papers_compact)} papers for query '{query}' (compact):")
        for i, paper in enumerate(papers_compact, 1):
            print(f"\n{i}. {paper.title}")
            print(f"   Paper ID: {paper.paper_id}")
            print(f"   Authors: {', '.join(paper.authors)}")
            print(f"   Categories: {', '.join(paper.categories)}")
            print(f"   Keywords: {', '.join(paper.keywords)} (from search)")
            if paper.abstract:
                print(f"   Abstract: {paper.abstract[:150]}...")
    except Exception as e:
        print(f"Error during compact search: {e}")

    print("\n" + "=" * 60)
    print("3. Testing manual paper details fetching")
    print("=" * 60)
    test_paper_id = "2009/101"
    try:
        paper_details = searcher.get_paper_details(test_paper_id)
        if paper_details:
            print(f"\nManual fetch for paper {test_paper_id}:")
            print(f"Title: {paper_details.title}")
            print(f"Authors: {', '.join(paper_details.authors)}")
            print(f"Keywords: {', '.join(paper_details.keywords)}")
            print(
                f"Publication Info: {paper_details.extra.get('publication_info', 'N/A') if paper_details.extra else 'N/A'}"
            )
            print(
                f"History: {paper_details.extra.get('history', 'N/A') if paper_details.extra else 'N/A'}"
            )
            print(f"Abstract: {paper_details.abstract[:200]}...")
        else:
            print(f"Could not fetch details for paper {test_paper_id}")
    except Exception as e:
        print(f"Error fetching paper details: {e}")
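
For reference, a minimal usage sketch of the two PDF helpers, which the `__main__` demo above does not exercise. The import path, the "2009/101" paper ID, and the "./downloads" directory are illustrative assumptions, not part of the file itself.

# Minimal usage sketch (assumed module path and example arguments).
import os
from paper_search_mcp.academic_platforms.iacr import IACRSearcher  # assumed package layout

searcher = IACRSearcher()

# download_pdf() expects an existing directory and returns the saved file path,
# or an error string on failure.
os.makedirs("./downloads", exist_ok=True)
pdf_path = searcher.download_pdf("2009/101", "./downloads")
print(pdf_path)

# read_paper() downloads the PDF and returns Markdown text prefixed with a metadata header.
markdown = searcher.read_paper("2009/101", "./downloads")
print(markdown[:300])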
