# test_services.py (12.2 kB)
"""Integration tests for arXiv services.
These tests make real API calls to arXiv.org.
They are slower but ensure the service works with the actual API.
"""
import shutil
import tempfile
from pathlib import Path
import pytest
from arxiv.models import ArxivEntry, ArxivSearchResult, Author
from arxiv.services import ArxivService
@pytest.fixture
def temp_dir():
    """Provide a scratch directory for test downloads.

    Yields:
        str: Path of a freshly created temporary directory. The directory
        and its contents are removed when the fixture is torn down.
    """
    # The context manager ties removal to creation, so cleanup also runs
    # when the generator is closed or an exception is thrown into it.
    with tempfile.TemporaryDirectory() as tmp:
        yield tmp
@pytest.fixture
def service(temp_dir):
    """Build the ArxivService under test.

    The instance downloads into the per-test ``temp_dir`` and is configured
    with a 0.5 second rate-limit delay.
    """
    svc = ArxivService(rate_limit_delay=0.5, download_dir=temp_dir)
    return svc
class TestArxivServiceSearch:
    """Exercise ``ArxivService.search`` with real queries."""

    def test_search_basic(self, service):
        """A simple title query comes back as a non-empty result page."""
        result = service.search(
            query="ti:attention is all you need",
            max_results=5,
        )

        assert isinstance(result, ArxivSearchResult)
        assert result.total_results > 0
        assert len(result.entries) > 0
        assert result.items_per_page == 5

    def test_search_by_title(self, service):
        """Title search respects the limit and returns filled-in entries."""
        limit = 3
        result = service.search(query="ti:transformer", max_results=limit)

        assert len(result.entries) <= limit
        for paper in result.entries:
            assert isinstance(paper, ArxivEntry)
            assert paper.title
            assert paper.summary
            assert len(paper.authors) > 0

    def test_search_by_author(self, service):
        """Author search returns at least one paper crediting that author."""
        result = service.search(query="au:Hinton", max_results=5)
        assert len(result.entries) > 0

        # One matching author anywhere in the page is enough.
        found_hinton = any(
            "Hinton" in person.name
            for paper in result.entries
            for person in paper.authors
        )
        assert found_hinton, "Expected to find Hinton in author list"

    def test_search_by_category(self, service):
        """Every hit of a category search carries that category."""
        result = service.search(query="cat:cs.AI", max_results=5)
        assert len(result.entries) > 0

        for paper in result.entries:
            assert "cs.AI" in paper.categories or paper.primary_category == "cs.AI"

    def test_search_combined_query(self, service):
        """A title-AND-category query honours the limit and the category."""
        result = service.search(query="ti:neural AND cat:cs.LG", max_results=3)

        assert len(result.entries) <= 3
        for paper in result.entries:
            assert "cs.LG" in paper.categories or paper.primary_category == "cs.LG"

    def test_search_pagination(self, service):
        """Two consecutive pages report their offsets and differ in content."""
        first_page = service.search(query="ti:neural", max_results=2, start=0)
        second_page = service.search(query="ti:neural", max_results=2, start=2)

        assert first_page.start_index == 0
        assert second_page.start_index == 2
        assert len(first_page.entries) == 2
        assert len(second_page.entries) == 2

        # The two pages must not contain the same set of papers.
        first_ids = {paper.arxiv_id for paper in first_page.entries}
        second_ids = {paper.arxiv_id for paper in second_page.entries}
        assert first_ids != second_ids, "Pagination should return different results"

    def test_search_sort_by_date(self, service):
        """Results sorted by submission date arrive newest first."""
        result = service.search(
            query="cat:cs.AI",
            max_results=5,
            sort_by="submittedDate",
            sort_order="descending",
        )
        assert len(result.entries) > 0

        # The published dates, as returned, must already be in descending order.
        dates = [paper.published for paper in result.entries]
        newest_first = sorted(dates, reverse=True)
        assert dates == newest_first

    def test_search_no_results(self, service):
        """A nonsense title query yields an empty result."""
        # A title this unlikely should match nothing.
        result = service.search(
            query='ti:"xyzabcdefghijklmnopqrstuvwxyz123456789"',
            max_results=5,
        )

        assert result.total_results == 0
        assert len(result.entries) == 0
class TestArxivServiceGet:
    """Tests for the get functionality (metadata lookup and PDF download)."""

    def test_get_by_id(self, service):
        """Test getting a specific paper by ID."""
        # Use a well-known paper: "Attention Is All You Need"
        arxiv_id = "1706.03762"
        entry = service.get(arxiv_id, download_pdf=False)

        assert isinstance(entry, ArxivEntry)
        assert arxiv_id in entry.arxiv_id
        # The case-insensitive match already covers the exact-case spelling.
        assert "attention" in entry.title.lower()
        assert len(entry.authors) > 0
        assert entry.pdf_url is not None

    def test_get_with_arxiv_prefix(self, service):
        """Test getting a paper with arXiv: prefix."""
        entry = service.get("arXiv:1706.03762", download_pdf=False)

        assert isinstance(entry, ArxivEntry)
        assert "1706.03762" in entry.arxiv_id

    def test_get_with_version(self, service):
        """Test getting a paper with version number."""
        entry = service.get("1706.03762v1", download_pdf=False)

        assert isinstance(entry, ArxivEntry)
        assert "1706.03762" in entry.arxiv_id

    def test_get_nonexistent_id(self, service):
        """Test that a non-existent paper ID raises ValueError."""
        with pytest.raises(ValueError, match="No paper found"):
            service.get("9999.99999", download_pdf=False)

    def test_get_with_download(self, service, temp_dir):
        """Test getting a paper with PDF download."""
        arxiv_id = "1706.03762"
        entry = service.get(arxiv_id, download_pdf=True)

        # Check entry is returned
        assert isinstance(entry, ArxivEntry)

        # Check PDF was written into the download directory
        pdf_path = Path(temp_dir) / f"{arxiv_id}.pdf"
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

    def test_get_download_exists_no_force(self, service, temp_dir):
        """Test that existing PDFs are not re-downloaded without force."""
        arxiv_id = "1706.03762"
        pdf_path = Path(temp_dir) / f"{arxiv_id}.pdf"

        # First download
        service.get(arxiv_id, download_pdf=True, force_download=False)
        # Fail with a clear assertion rather than an OSError from stat().
        assert pdf_path.exists()
        before = pdf_path.stat()

        # Second download without force - should not re-download
        service.get(arxiv_id, download_pdf=True, force_download=False)
        after = pdf_path.stat()

        # File should be untouched: same size and same modification time
        assert after.st_size == before.st_size
        assert after.st_mtime == before.st_mtime

    def test_get_download_exists_with_force(self, service, temp_dir):
        """Test that force flag causes re-download."""
        import time

        arxiv_id = "1706.03762"
        pdf_path = Path(temp_dir) / f"{arxiv_id}.pdf"

        # First download
        service.get(arxiv_id, download_pdf=True, force_download=False)
        # Fail with a clear assertion rather than an OSError from stat().
        assert pdf_path.exists()
        first_mtime = pdf_path.stat().st_mtime

        # Wait a moment to ensure mtime would differ
        time.sleep(0.1)

        # Second download with force - should re-download
        service.get(arxiv_id, download_pdf=True, force_download=True)

        # File should have been rewritten, giving it a newer mtime
        assert pdf_path.stat().st_mtime > first_mtime
class TestArxivServiceDownloadPDF:
    """Tests for the download_pdf functionality."""

    def test_download_pdf_by_id(self, service, temp_dir):
        """Test downloading a PDF by arXiv ID."""
        arxiv_id = "1706.03762"
        pdf_path = service.download_pdf(arxiv_id)

        assert pdf_path is not None
        assert pdf_path.exists()
        assert pdf_path.name == f"{arxiv_id}.pdf"
        assert pdf_path.stat().st_size > 0

    def test_download_pdf_force(self, service, temp_dir):
        """Test that a forced download rewrites an existing PDF."""
        import time

        arxiv_id = "1706.03762"

        # First download
        pdf_path1 = service.download_pdf(arxiv_id, force_download=False)
        # download_pdf can return None (see test_download_pdf_invalid_id);
        # assert here so a failed download is reported clearly.
        assert pdf_path1 is not None
        first_mtime = pdf_path1.stat().st_mtime

        # Wait a moment so a rewritten file gets a later mtime
        time.sleep(0.1)

        # Force download
        pdf_path2 = service.download_pdf(arxiv_id, force_download=True)
        assert pdf_path2 is not None
        assert pdf_path2.stat().st_mtime > first_mtime

    def test_download_pdf_invalid_id(self, service):
        """Test that downloading with an invalid ID returns None."""
        result = service.download_pdf("9999.99999")
        assert result is None
class TestArxivEntry:
    """Checks on the ArxivEntry objects returned by the service."""

    def test_entry_properties(self, service):
        """A fetched entry has every expected field filled in."""
        paper = service.get("1706.03762", download_pdf=False)

        # Identity and text fields
        assert paper.id
        assert paper.title
        assert paper.summary
        assert len(paper.summary) > 100  # Should have substantial content

        # Author list: non-empty, each a named Author
        assert len(paper.authors) > 0
        for person in paper.authors:
            assert isinstance(person, Author)
            assert person.name

        # Timestamps
        assert paper.published
        assert paper.updated

        # Classification
        assert paper.primary_category
        assert len(paper.categories) > 0

        # Link data
        assert len(paper.links) > 0
        assert paper.pdf_url

    def test_entry_arxiv_id_property(self, service):
        """``arxiv_id`` contains the bare identifier and is not a URL."""
        paper = service.get("1706.03762", download_pdf=False)
        extracted = paper.arxiv_id

        assert "1706.03762" in extracted
        assert not extracted.startswith("http")

    def test_entry_str_representation(self, service):
        """``str(entry)`` includes a label for each key field."""
        paper = service.get("1706.03762", download_pdf=False)
        rendered = str(paper)

        expected_labels = (
            "ID:",
            "Title:",
            "Authors:",
            "Published:",
            "Categories:",
            "Abstract:",
        )
        for label in expected_labels:
            assert label in rendered
class TestArxivSearchResult:
    """Checks on the ArxivSearchResult objects returned by a search."""

    def test_search_result_properties(self, service):
        """Paging metadata on a search result matches the request."""
        page = service.search(query="ti:neural", max_results=3)

        assert page.total_results > 0
        assert page.start_index >= 0
        assert page.items_per_page == 3
        assert len(page.entries) <= 3

    def test_search_result_str_representation(self, service):
        """``str(result)`` includes the summary labels and the first result."""
        page = service.search(query="ti:transformer", max_results=2)
        rendered = str(page)

        expected_labels = (
            "Total results:",
            "Showing:",
            "Items per page:",
            "Result 1:",
        )
        for label in expected_labels:
            assert label in rendered
class TestRateLimiting:
    """Tests for rate limiting functionality."""

    def test_rate_limiting_enforced(self, temp_dir):
        """Test that two consecutive searches take at least the configured delay."""
        import time

        delay = 0.5
        service = ArxivService(download_dir=temp_dir, rate_limit_delay=delay)

        # perf_counter never goes backwards, unlike the wall clock, so a
        # system clock adjustment mid-test cannot skew the measurement.
        started = time.perf_counter()

        # Make two consecutive searches
        service.search(query="ti:neural", max_results=1)
        service.search(query="ti:machine", max_results=1)

        elapsed = time.perf_counter() - started

        # NOTE(review): this is only a lower bound; the time spent in the two
        # searches themselves may exceed the delay - confirm that is acceptable.
        assert elapsed >= delay
class TestDownloadDirectory:
    """Tests for download directory management."""

    def test_download_dir_created(self):
        """Test that download directory is created if it doesn't exist."""
        with tempfile.TemporaryDirectory() as tmpdir:
            download_dir = Path(tmpdir) / "downloads"
            assert not download_dir.exists()

            # Constructing the service is the action under test; the
            # instance itself is not needed afterwards.
            ArxivService(download_dir=str(download_dir))

            assert download_dir.exists()
            assert download_dir.is_dir()

    def test_download_dir_default(self):
        """Test default download directory."""
        import os

        # The constructor creates its download directory (see
        # test_download_dir_created), so run from a scratch working
        # directory to avoid leaving ./.arxiv behind.
        original_cwd = os.getcwd()
        with tempfile.TemporaryDirectory() as tmpdir:
            os.chdir(tmpdir)
            try:
                service = ArxivService()
                assert service.download_dir == Path("./.arxiv")
            finally:
                # Leave the scratch directory before it is removed.
                os.chdir(original_cwd)