"""Tests for Naver news collector."""
import pytest
from unittest.mock import Mock, AsyncMock, patch, MagicMock
from datetime import datetime, timezone
import aiohttp
import json
from src.collectors.naver_collector import NaverNewsCollector, NaverAPIError
from src.collectors.base_collector import CollectorError
class TestNaverNewsCollector:
    """Unit tests for ``NaverNewsCollector``.

    No test performs real network I/O: either ``aiohttp.ClientSession.get``
    is patched, or the collector's ``_make_api_request`` coroutine is
    replaced with an ``AsyncMock``.

    NOTE(review): tests that call ``collector.initialize()`` never release
    whatever that call sets up; if the collector exposes a teardown method
    it should be awaited in a fixture finalizer -- confirm against the
    collector implementation.
    """

    @staticmethod
    def _install_response(mock_get, response):
        """Wire *response* as the value yielded by the patched ``get``.

        Configures the patched ``get`` so that using its return value as an
        async context manager yields *response* and does not suppress
        exceptions (``__aexit__`` returns ``False``).
        """
        mock_get.return_value.__aenter__.return_value = response
        mock_get.return_value.__aexit__.return_value = False

    @pytest.fixture
    def collector(self):
        """Create a NaverNewsCollector instance with dummy credentials."""
        return NaverNewsCollector(
            client_id="test_client_id",
            client_secret="test_client_secret",
        )

    @pytest.fixture
    def sample_naver_response(self):
        """Return a sample Naver search API payload containing two items."""
        return {
            "items": [
                {
                    "title": "테스트 <b>뉴스</b> 제목",
                    "description": "테스트 뉴스 내용입니다.",
                    "link": "https://news.naver.com/main/read.nhn?oid=001&aid=123456",
                    "pubDate": "Tue, 23 Jul 2024 14:30:00 +0900",
                    "originallink": "https://example.com/news/123",
                },
                {
                    "title": "두번째 <b>테스트</b> 뉴스",
                    "description": "두번째 뉴스 내용입니다.",
                    "link": "https://news.naver.com/main/read.nhn?oid=002&aid=789012",
                    "pubDate": "Tue, 23 Jul 2024 13:15:00 +0900",
                    "originallink": "https://example.com/news/789",
                },
            ],
            "lastBuildDate": "Tue, 23 Jul 2024 14:30:00 +0900",
            "total": 1234,
            "start": 1,
            "display": 2,
        }

    def test_naver_collector_initialization(self, collector):
        """Constructor stores credentials and exposes the expected defaults."""
        assert collector.source_name == "naver"
        assert collector.client_id == "test_client_id"
        assert collector.client_secret == "test_client_secret"
        assert collector.base_url == "https://openapi.naver.com/v1/search/news.json"
        assert collector.max_results_per_request == 100

    def test_naver_collector_missing_credentials(self):
        """An empty client id or client secret raises ``ValueError``."""
        with pytest.raises(ValueError):
            NaverNewsCollector(client_id="", client_secret="test")
        with pytest.raises(ValueError):
            NaverNewsCollector(client_id="test", client_secret="")

    def test_build_api_url(self, collector):
        """The built URL carries the query, paging and sort parameters."""
        url = collector._build_api_url(
            keyword="테스트",
            start=1,
            display=10,
            sort="date",
        )
        # Two separate asserts: the original single expression
        # ``A and B or C`` parsed as ``(A and B) or C`` and therefore skipped
        # the "query=" check whenever the percent-encoded keyword was present.
        assert "query=" in url
        # The keyword may appear raw or percent-encoded (UTF-8).
        assert "테스트" in url or "%ED%85%8C%EC%8A%A4%ED%8A%B8" in url
        assert "start=1" in url
        assert "display=10" in url
        assert "sort=date" in url
        assert url.startswith("https://openapi.naver.com/v1/search/news.json")

    def test_build_headers(self, collector):
        """Request headers carry both credentials and a User-Agent."""
        headers = collector._build_headers()
        assert headers["X-Naver-Client-Id"] == "test_client_id"
        assert headers["X-Naver-Client-Secret"] == "test_client_secret"
        assert "User-Agent" in headers

    @pytest.mark.asyncio
    async def test_make_api_request_success(self, collector, sample_naver_response):
        """A 200 response is returned as the decoded JSON payload."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            mock_response = AsyncMock()
            mock_response.status = 200
            mock_response.json.return_value = sample_naver_response
            self._install_response(mock_get, mock_response)

            await collector.initialize()
            result = await collector._make_api_request("테스트", start=1, display=10)

            assert result == sample_naver_response
            mock_get.assert_called_once()

    @pytest.mark.asyncio
    async def test_make_api_request_http_error(self, collector):
        """A 400 response raises ``NaverAPIError``."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            mock_response = AsyncMock()
            mock_response.status = 400
            mock_response.text.return_value = "Bad Request"
            self._install_response(mock_get, mock_response)

            await collector.initialize()
            with pytest.raises(NaverAPIError):
                await collector._make_api_request("테스트")

    @pytest.mark.asyncio
    async def test_make_api_request_network_error(self, collector):
        """An ``aiohttp.ClientError`` from ``get`` raises ``CollectorError``."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            mock_get.side_effect = aiohttp.ClientError("Network error")

            await collector.initialize()
            with pytest.raises(CollectorError):
                await collector._make_api_request("테스트")

    @pytest.mark.asyncio
    async def test_collect_basic(self, collector, sample_naver_response):
        """``collect`` returns the items of a single API page unmodified."""
        collector._make_api_request = AsyncMock(return_value=sample_naver_response)

        result = await collector.collect(keyword="테스트", limit=10)

        assert len(result) == 2
        # Titles still contain the <b> markup at this stage.
        assert result[0]["title"] == "테스트 <b>뉴스</b> 제목"
        assert result[1]["title"] == "두번째 <b>테스트</b> 뉴스"

    @pytest.mark.asyncio
    async def test_collect_with_pagination(self, collector, sample_naver_response):
        """A limit above the per-request maximum triggers a second request."""
        # A full first page (100 items, the per-request maximum asserted in
        # the initialization test) followed by a 50-item second page.
        full_page_response = {
            "items": [sample_naver_response["items"][0]] * 100,
            "total": 1234,
            "start": 1,
            "display": 100,
        }
        second_page_response = {
            "items": [sample_naver_response["items"][1]] * 50,
            "total": 1234,
            "start": 101,
            "display": 50,
        }
        collector._make_api_request = AsyncMock(side_effect=[
            full_page_response,
            second_page_response,
        ])

        result = await collector.collect(keyword="테스트", limit=150)

        assert collector._make_api_request.call_count == 2
        assert len(result) == 150

    @pytest.mark.asyncio
    async def test_collect_with_date_filter(self, collector, sample_naver_response):
        """``collect`` accepts a date range and still returns a list."""
        from datetime import date

        collector._make_api_request = AsyncMock(return_value=sample_naver_response)

        result = await collector.collect(
            keyword="테스트",
            start_date=date(2024, 7, 20),
            end_date=date(2024, 7, 23),
        )

        # The original ``len(result) >= 0`` could never fail. How many items
        # survive depends on filtering rules not visible from this file, so
        # only the return type and the fact that the API was queried are
        # pinned here.
        assert isinstance(result, list)
        collector._make_api_request.assert_awaited()

    @pytest.mark.asyncio
    async def test_parse_news_item(self, collector):
        """``parse`` maps a raw API item onto the normalized article fields."""
        raw_item = {
            "title": "테스트 <b>뉴스</b> 제목",
            "description": "테스트 뉴스 내용입니다.",
            "link": "https://news.naver.com/main/read.nhn?oid=001&aid=123456",
            "pubDate": "Tue, 23 Jul 2024 14:30:00 +0900",
            "originallink": "https://example.com/news/123",
        }

        parsed = await collector.parse(raw_item)

        assert parsed["title"] == "테스트 뉴스 제목"  # HTML tags removed
        assert parsed["content"] == "테스트 뉴스 내용입니다."
        assert parsed["url"] == "https://news.naver.com/main/read.nhn?oid=001&aid=123456"
        assert parsed["original_url"] == "https://example.com/news/123"
        assert parsed["source"] == "naver"
        assert isinstance(parsed["published_at"], datetime)

    def test_parse_date(self, collector):
        """Well-formed Naver ``pubDate`` strings parse to aware datetimes."""
        test_dates = [
            "Tue, 23 Jul 2024 14:30:00 +0900",
            "Mon, 01 Jan 2024 00:00:00 +0900",
            # 31 Dec 2024 is a Tuesday; the original fixture said "Wed".
            "Tue, 31 Dec 2024 23:59:59 +0900",
        ]
        for date_str in test_dates:
            parsed_date = collector._parse_date(date_str)
            assert isinstance(parsed_date, datetime), date_str
            assert parsed_date.tzinfo is not None, date_str

    def test_parse_invalid_date(self, collector):
        """Unparseable input still yields a ``datetime`` instead of raising."""
        invalid_dates = [
            "Invalid date format",
            "",
            None,
            "2024-07-23",  # Wrong format
        ]
        for invalid_date in invalid_dates:
            parsed_date = collector._parse_date(invalid_date)
            # Presumably the fallback is the current time; only the type is
            # checked here -- confirm against the collector implementation.
            assert isinstance(parsed_date, datetime), repr(invalid_date)

    def test_clean_html_tags(self, collector):
        """HTML tags are stripped while the enclosed text is preserved."""
        test_cases = [
            ("테스트 <b>뉴스</b> 제목", "테스트 뉴스 제목"),
            ("<em>강조된</em> <strong>내용</strong>", "강조된 내용"),
            ("일반 텍스트", "일반 텍스트"),
            ("", ""),
        ]
        for html_text, expected in test_cases:
            assert collector._clean_html_tags(html_text) == expected, html_text

    @pytest.mark.asyncio
    async def test_collect_empty_results(self, collector):
        """An API page without items produces an empty result list."""
        empty_response = {
            "items": [],
            "total": 0,
            "start": 1,
            "display": 0,
        }
        collector._make_api_request = AsyncMock(return_value=empty_response)

        result = await collector.collect(keyword="존재하지않는키워드")

        assert result == []

    @pytest.mark.asyncio
    async def test_collect_with_sort_options(self, collector, sample_naver_response):
        """``collect`` issues an API request for each supported sort option."""
        request_mock = AsyncMock(return_value=sample_naver_response)
        collector._make_api_request = request_mock

        for sort_option in ("date", "sim"):
            # Reset the recorded calls (the configured return value is kept)
            # so the assertion reflects this iteration only; previously the
            # second check was satisfied by the first call.
            request_mock.reset_mock()
            await collector.collect(keyword="테스트", sort=sort_option)
            assert request_mock.call_args is not None, sort_option

    def test_validate_sort_option(self, collector):
        """Valid sort options pass through; anything else becomes ``date``."""
        assert collector._validate_sort_option("date") == "date"
        assert collector._validate_sort_option("sim") == "sim"
        assert collector._validate_sort_option("invalid") == "date"
        assert collector._validate_sort_option("") == "date"

    @pytest.mark.asyncio
    async def test_rate_limiting_applied(self, collector, sample_naver_response):
        """``collect`` still works when the rate limiter is a mock."""
        collector._make_api_request = AsyncMock(return_value=sample_naver_response)
        collector.rate_limiter = AsyncMock()

        result = await collector.collect(keyword="테스트")

        # Actual limiter usage is exercised through the full pipeline
        # (collect_and_process); here we only verify that collection
        # completes with the limiter swapped out.
        assert isinstance(result, list)

    def test_extract_article_id(self, collector):
        """Article ids are ``<oid>_<aid>``; non-Naver URLs give ``None``."""
        test_urls = [
            ("https://news.naver.com/main/read.nhn?oid=001&aid=123456", "001_123456"),
            ("https://news.naver.com/main/read.nhn?oid=022&aid=789012", "022_789012"),
            ("https://other-site.com/news", None),
        ]
        for url, expected_id in test_urls:
            assert collector._extract_article_id(url) == expected_id, url

    @pytest.mark.asyncio
    async def test_collect_with_full_content_fetch(self, collector, sample_naver_response):
        """With ``fetch_full_content=True`` each article is fetched once."""
        collector._make_api_request = AsyncMock(return_value=sample_naver_response)
        collector._fetch_full_content = AsyncMock(return_value="전체 기사 내용입니다.")

        await collector.collect(keyword="테스트", fetch_full_content=True)

        assert collector._fetch_full_content.call_count == len(sample_naver_response["items"])

    @pytest.mark.asyncio
    async def test_fetch_full_content(self, collector):
        """Article body text is extracted from the fetched HTML page."""
        html_content = """
        <html>
        <body>
        <div class="go_trans _article_content">
        <p>전체 기사 내용입니다. 이것은 충분히 긴 내용으로 테스트를 위해 작성된 가짜 뉴스 기사입니다.</p>
        <p>두번째 문단입니다. 추가적인 내용을 포함하여 길이 제한을 통과할 수 있도록 합니다.</p>
        </div>
        </body>
        </html>
        """
        with patch('aiohttp.ClientSession.get') as mock_get:
            mock_response = AsyncMock()
            mock_response.text.return_value = html_content
            mock_response.status = 200
            self._install_response(mock_get, mock_response)

            await collector.initialize()
            content = await collector._fetch_full_content("https://news.naver.com/article/123")

            assert "전체 기사 내용입니다" in content
            assert "두번째 문단입니다" in content

    @pytest.mark.asyncio
    async def test_fetch_full_content_error(self, collector):
        """A network failure yields an empty string rather than raising."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            mock_get.side_effect = aiohttp.ClientError("Network error")

            await collector.initialize()
            content = await collector._fetch_full_content("https://invalid-url.com")

            assert content == ""

    def test_build_search_filters(self, collector):
        """Date-range filters are returned as a dictionary."""
        from datetime import date

        filters = collector._build_search_filters(
            start_date=date(2024, 7, 20),
            end_date=date(2024, 7, 23),
        )

        assert isinstance(filters, dict)

    @pytest.mark.asyncio
    async def test_collect_error_recovery(self, collector):
        """An API error on the first request propagates out of ``collect``."""
        # The second queued response is a successful (empty) page; the test
        # expects the first failure to surface instead of being swallowed.
        collector._make_api_request = AsyncMock(side_effect=[
            NaverAPIError("Rate limit exceeded"),
            {"items": [], "total": 0},
        ])

        with pytest.raises(NaverAPIError):
            await collector.collect(keyword="테스트")

    def test_get_collector_stats(self, collector):
        """``get_stats`` exposes the expected counter keys."""
        stats = collector.get_stats()
        assert "requests_made" in stats
        assert "articles_collected" in stats
        assert "api_errors" in stats
        assert "rate_limited_requests" in stats

    @pytest.mark.asyncio
    async def test_concurrent_requests_handling(self, collector, sample_naver_response):
        """Several ``collect`` calls can run concurrently on one collector."""
        import asyncio

        collector._make_api_request = AsyncMock(return_value=sample_naver_response)

        tasks = [
            collector.collect(keyword=f"테스트{i}", limit=10)
            for i in range(3)
        ]
        results = await asyncio.gather(*tasks)

        assert len(results) == 3
        for result in results:
            assert isinstance(result, list)