arxiv_client.py
"""arXiv client for searching AI/ML papers."""

from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional

import arxiv


class ArxivClient:
"""Client for searching arXiv papers."""
# arXiv categories for AI/ML research
AI_CATEGORIES = [
"cs.AI", # Artificial Intelligence
"cs.CL", # Computation and Language
"cs.LG", # Machine Learning
"cs.CV", # Computer Vision
"cs.NE", # Neural and Evolutionary Computing
"cs.RO", # Robotics
"cs.HC", # Human-Computer Interaction
"cs.IR", # Information Retrieval
"stat.ML", # Statistics - Machine Learning
"q-bio.QM", # Quantitative Methods
"q-bio.GN", # Genomics
"q-bio.BM", # Biomolecules
"physics.comp-ph", # Computational Physics
"eess.AS", # Audio and Speech Processing
"eess.IV", # Image and Video Processing
    ]

    # Keywords by AI research area
KEYWORDS_BY_AREA = {
"llm": ["large language model", "LLM", "GPT", "transformer", "BERT",
"instruction tuning", "prompt", "fine-tuning", "RLHF", "alignment"],
"vision": ["vision language model", "CLIP", "multimodal", "text-to-image",
"diffusion model", "stable diffusion", "image generation", "video generation"],
"generative": ["generative model", "GAN", "VAE", "flow-based model", "autoregressive model"],
"robotics": ["robot learning", "embodied AI", "manipulation", "navigation",
"imitation learning", "sim-to-real"],
"bioinfo": ["protein folding", "drug discovery", "molecule generation",
"genomics", "AlphaFold", "antibody design"],
"science": ["physics-informed neural network", "scientific machine learning",
"AI4Science", "molecular dynamics"],
"rl": ["reinforcement learning", "multi-agent", "policy gradient",
"Q-learning", "offline RL", "reward modeling"],
"graph": ["graph neural network", "knowledge graph", "molecular graph",
"graph representation learning"],
"efficient": ["model compression", "quantization", "pruning", "knowledge distillation",
"efficient transformer", "LoRA", "parameter-efficient fine-tuning"],
"safety": ["AI safety", "adversarial robustness", "interpretability",
"explainability", "fairness", "bias"],
"emerging": ["federated learning", "continual learning", "meta-learning",
"few-shot learning", "zero-shot learning", "neuromorphic computing"],
}
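
    # Example (illustrative): KEYWORDS_BY_AREA["llm"] is OR-joined by
    # search_papers() into a phrase query such as
    # ("large language model" OR "LLM" OR "GPT" OR ...).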

    def __init__(self):
        """Initialize arXiv client."""
        # arxiv.Client() also accepts page_size, delay_seconds, and
        # num_retries for tuning pagination and rate limiting; the
        # defaults are used here.
        self.client = arxiv.Client()

    def search_papers(
self,
keywords: Optional[List[str]] = None,
categories: Optional[List[str]] = None,
days: int = 7,
max_results: int = 50,
sort_by: arxiv.SortCriterion = arxiv.SortCriterion.SubmittedDate,
) -> List[Dict]:
"""Search for papers on arXiv.
Args:
keywords: List of keywords to search for (OR condition)
categories: List of arXiv categories to filter by (default: all AI categories)
days: Number of days to look back
max_results: Maximum number of results to return
sort_by: Sort criterion (SubmittedDate, Relevance, or LastUpdatedDate)
Returns:
List of paper dictionaries
"""
# Build query
query_parts = []
# Add keyword search
if keywords:
keyword_query = " OR ".join(f'"{kw}"' for kw in keywords)
query_parts.append(f"({keyword_query})")
# Add category filter
if categories is None:
categories = self.AI_CATEGORIES
if categories:
cat_query = " OR ".join(f"cat:{cat}" for cat in categories)
query_parts.append(f"({cat_query})")
        # Combine query parts; the "all" fallback is only reached when an
        # empty category list is passed explicitly, since categories
        # defaults to AI_CATEGORIES above
        query = " AND ".join(query_parts) if query_parts else "all"
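        # Illustrative example of a built query (derived from the logic
        # above): keywords=["LLM", "RLHF"] with categories=["cs.CL"] yields
        #   ("LLM" OR "RLHF") AND (cat:cs.CL)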
        # Calculate the cutoff as a timezone-aware UTC datetime
        date_from = datetime.now(timezone.utc) - timedelta(days=days)
# Search arXiv
search = arxiv.Search(
query=query,
max_results=max_results,
sort_by=sort_by,
sort_order=arxiv.SortOrder.Descending,
)
        results = []
        for paper in self.client.results(search):
            # Normalize to timezone-aware datetimes before comparing, so
            # the date filter cannot raise TypeError on naive values
            published_dt = paper.published
            if published_dt.tzinfo is None:
                published_dt = published_dt.replace(tzinfo=timezone.utc)
            updated_dt = paper.updated
            if updated_dt.tzinfo is None:
                updated_dt = updated_dt.replace(tzinfo=timezone.utc)
            # Skip papers outside the lookback window; `continue` rather
            # than `break` because results are only date-ordered when
            # sorting by SubmittedDate
            if published_dt < date_from:
                continue
results.append({
"title": paper.title,
"authors": [author.name for author in paper.authors],
"summary": paper.summary,
"published": published_dt.isoformat(),
"updated": updated_dt.isoformat(),
"url": paper.entry_id,
"pdf_url": paper.pdf_url,
"categories": paper.categories,
"primary_category": paper.primary_category,
"source": "arxiv",
})
return results

    def get_latest_by_area(self, area: str, days: int = 7, max_results: int = 20) -> List[Dict]:
        """Get latest papers for a specific research area.

        Args:
            area: Research area key (e.g., 'llm', 'vision', 'robotics').
            days: Number of days to look back.
            max_results: Maximum number of results.

        Returns:
            List of paper dictionaries.
        """
keywords = self.KEYWORDS_BY_AREA.get(area.lower(), [])
if not keywords:
raise ValueError(f"Unknown area: {area}. Valid areas: {list(self.KEYWORDS_BY_AREA.keys())}")
return self.search_papers(
keywords=keywords,
days=days,
max_results=max_results,
)

    def get_latest_papers(self, days: int = 7, max_results: int = 100) -> List[Dict]:
        """Get latest papers across all AI categories.

        Args:
            days: Number of days to look back.
            max_results: Maximum number of results.

        Returns:
            List of paper dictionaries.
        """
return self.search_papers(
keywords=None,
categories=self.AI_CATEGORIES,
days=days,
max_results=max_results,
)
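

# Minimal usage sketch (illustrative, not part of the client): assumes
# network access and the `arxiv` package installed. The area name,
# keyword, and result counts below are arbitrary example values.
if __name__ == "__main__":
    client = ArxivClient()

    # Recent LLM papers from the last 3 days
    for paper in client.get_latest_by_area("llm", days=3, max_results=5):
        print(f"{paper['published'][:10]}  {paper['title']}")

    # Custom search: diffusion-model papers in cs.CV from the last week
    papers = client.search_papers(
        keywords=["diffusion model"],
        categories=["cs.CV"],
        days=7,
        max_results=5,
    )
    print(f"Found {len(papers)} recent diffusion-model papers")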