"""Tests for Daum news collector."""
import pytest
from unittest.mock import Mock, AsyncMock, patch, MagicMock
from datetime import datetime, timezone
import aiohttp
import json
from src.collectors.daum_collector import DaumNewsCollector, DaumAPIError
from src.collectors.base_collector import CollectorError
class TestDaumNewsCollector:
    """Unit tests for ``DaumNewsCollector``.

    Network access is never performed: HTTP-level tests patch
    ``aiohttp.ClientSession.get`` and higher-level tests replace
    ``collector._make_api_request`` with an ``AsyncMock``.

    NOTE(review): several tests call ``collector.initialize()`` and no
    teardown is visible in this file. If ``initialize()`` opens an aiohttp
    session, confirm the collector exposes a close/cleanup coroutine and
    call it (e.g. from a fixture finalizer) to avoid unclosed-session
    warnings.
    """

    # ------------------------------------------------------------------
    # Helpers and fixtures
    # ------------------------------------------------------------------

    @staticmethod
    def _install_response(mock_get, *, status, json_body=None, text_body=None):
        """Wire a fake response into a patched ``ClientSession.get``.

        The patched ``get`` is configured so that
        ``async with session.get(...) as resp`` yields the fake response.

        Args:
            mock_get: The mock produced by ``patch('aiohttp.ClientSession.get')``.
            status: Value exposed as ``resp.status``.
            json_body: If given, value returned by ``await resp.json()``.
            text_body: If given, value returned by ``await resp.text()``.

        Returns:
            The ``AsyncMock`` standing in for the response object.
        """
        response = AsyncMock()
        response.status = status
        if json_body is not None:
            response.json.return_value = json_body
        if text_body is not None:
            response.text.return_value = text_body
        # Configure the async context manager returned by ``get(...)``.
        mock_get.return_value.__aenter__.return_value = response
        # A falsy __aexit__ result means exceptions are not suppressed.
        mock_get.return_value.__aexit__.return_value = False
        return response

    @pytest.fixture
    def collector(self):
        """Create a DaumNewsCollector instance for testing."""
        return DaumNewsCollector(
            api_key="test_api_key"
        )

    @pytest.fixture
    def sample_daum_response(self):
        """Return a sample Daum API payload with two documents."""
        return {
            "documents": [
                {
                    "title": "테스트 <b>뉴스</b> 제목",
                    "contents": "테스트 뉴스 내용입니다.",
                    "url": "https://v.daum.net/v/20240723143000123",
                    "datetime": "2024-07-23T14:30:00.000+09:00",
                    "blogname": "뉴시스",
                    "thumbnail": "https://img1.daumcdn.net/thumb/test.jpg"
                },
                {
                    "title": "두번째 <b>테스트</b> 뉴스",
                    "contents": "두번째 뉴스 내용입니다.",
                    "url": "https://v.daum.net/v/20240723131500789",
                    "datetime": "2024-07-23T13:15:00.000+09:00",
                    "blogname": "연합뉴스",
                    "thumbnail": "https://img1.daumcdn.net/thumb/test2.jpg"
                }
            ],
            "meta": {
                "total_count": 1234,
                "pageable_count": 800,
                "is_end": False
            }
        }

    # ------------------------------------------------------------------
    # Construction and request building
    # ------------------------------------------------------------------

    def test_daum_collector_initialization(self, collector):
        """Test DaumNewsCollector initialization."""
        assert collector.source_name == "daum"
        assert collector.api_key == "test_api_key"
        assert collector.base_url == "https://dapi.kakao.com/v2/search/web"
        assert collector.max_results_per_request == 50

    def test_daum_collector_missing_credentials(self):
        """Test that an empty or missing API key raises ValueError."""
        with pytest.raises(ValueError):
            DaumNewsCollector(api_key="")
        with pytest.raises(ValueError):
            DaumNewsCollector(api_key=None)

    def test_build_api_url(self, collector):
        """Test API URL building."""
        url = collector._build_api_url(
            keyword="테스트",
            page=1,
            size=10,
            sort="recency"
        )
        # The query parameter must always be present; the keyword itself may
        # appear either raw or percent-encoded. These are separate assertions
        # because ``a and b or c`` would let the encoded form pass without
        # ever checking for "query=".
        assert "query=" in url
        assert "테스트" in url or "%ED%85%8C%EC%8A%A4%ED%8A%B8" in url
        assert "page=1" in url
        assert "size=10" in url
        assert "sort=recency" in url
        assert url.startswith("https://dapi.kakao.com/v2/search/web")

    def test_build_headers(self, collector):
        """Test API headers building."""
        headers = collector._build_headers()
        assert headers["Authorization"] == "KakaoAK test_api_key"
        assert "User-Agent" in headers

    # ------------------------------------------------------------------
    # Low-level API requests
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_make_api_request_success(self, collector, sample_daum_response):
        """Test successful API request."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            self._install_response(
                mock_get, status=200, json_body=sample_daum_response
            )
            # Initialize collector session
            await collector.initialize()
            # Test API request
            result = await collector._make_api_request("테스트", page=1, size=10)
            assert result == sample_daum_response
            mock_get.assert_called_once()

    @pytest.mark.asyncio
    async def test_make_api_request_http_error(self, collector):
        """Test that a non-success HTTP status raises DaumAPIError."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            self._install_response(mock_get, status=401, text_body="Unauthorized")
            # Initialize collector session
            await collector.initialize()
            with pytest.raises(DaumAPIError):
                await collector._make_api_request("테스트")

    @pytest.mark.asyncio
    async def test_make_api_request_network_error(self, collector):
        """Test that a transport-level failure raises CollectorError."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            # Mock network error
            mock_get.side_effect = aiohttp.ClientError("Network error")
            # Initialize collector session
            await collector.initialize()
            with pytest.raises(CollectorError):
                await collector._make_api_request("테스트")

    # ------------------------------------------------------------------
    # collect()
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_collect_basic(self, collector, sample_daum_response):
        """Test basic news collection."""
        collector._make_api_request = AsyncMock(return_value=sample_daum_response)
        result = await collector.collect(keyword="테스트", limit=10)
        assert len(result) == 2
        # collect() is expected to return the raw documents (HTML tags intact).
        assert result[0]["title"] == "테스트 <b>뉴스</b> 제목"
        assert result[1]["title"] == "두번째 <b>테스트</b> 뉴스"

    @pytest.mark.asyncio
    async def test_collect_with_pagination(self, collector, sample_daum_response):
        """Test collection with pagination."""
        # Create response with max items to trigger pagination
        full_page_response = {
            "documents": [sample_daum_response["documents"][0]] * 50,  # 50 items (max)
            "meta": {
                "total_count": 1234,
                "pageable_count": 800,
                "is_end": False
            }
        }
        second_page_response = {
            "documents": [sample_daum_response["documents"][1]] * 25,  # 25 items on second page
            "meta": {
                "total_count": 1234,
                "pageable_count": 800,
                "is_end": True
            }
        }
        # Mock multiple API calls for pagination
        collector._make_api_request = AsyncMock(side_effect=[
            full_page_response,   # First page (50 items)
            second_page_response  # Second page (25 items)
        ])
        result = await collector.collect(keyword="테스트", limit=75)
        # Should make multiple API calls due to pagination
        assert collector._make_api_request.call_count == 2
        assert len(result) == 75

    @pytest.mark.asyncio
    async def test_collect_with_date_filter(self, collector, sample_daum_response):
        """Test collection with date filtering."""
        collector._make_api_request = AsyncMock(return_value=sample_daum_response)
        from datetime import date
        result = await collector.collect(
            keyword="테스트",
            start_date=date(2024, 7, 20),
            end_date=date(2024, 7, 23)
        )
        # A date filter can only remove items, never add them, so the result
        # is bounded by the number of documents the API returned.
        assert isinstance(result, list)
        assert len(result) <= len(sample_daum_response["documents"])

    # ------------------------------------------------------------------
    # Parsing helpers
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_parse_news_item(self, collector):
        """Test parsing individual news item."""
        raw_item = {
            "title": "테스트 <b>뉴스</b> 제목",
            "contents": "테스트 뉴스 내용입니다.",
            "url": "https://v.daum.net/v/20240723143000123",
            "datetime": "2024-07-23T14:30:00.000+09:00",
            "blogname": "뉴시스",
            "thumbnail": "https://img1.daumcdn.net/thumb/test.jpg"
        }
        parsed = await collector.parse(raw_item)
        assert parsed["title"] == "테스트 뉴스 제목"  # HTML tags removed
        assert parsed["content"] == "테스트 뉴스 내용입니다."
        assert parsed["url"] == "https://v.daum.net/v/20240723143000123"
        assert parsed["source"] == "daum"
        assert parsed["blogname"] == "뉴시스"
        assert parsed["thumbnail"] == "https://img1.daumcdn.net/thumb/test.jpg"
        assert isinstance(parsed["published_at"], datetime)

    def test_parse_date(self, collector):
        """Test date parsing from Daum format."""
        # Test various date formats
        test_dates = [
            "2024-07-23T14:30:00.000+09:00",
            "2024-01-01T00:00:00.000+09:00",
            "2024-12-31T23:59:59.000+09:00"
        ]
        for date_str in test_dates:
            parsed_date = collector._parse_date(date_str)
            assert isinstance(parsed_date, datetime)
            # Timestamps carry a UTC offset, so the result must be tz-aware.
            assert parsed_date.tzinfo is not None

    def test_parse_invalid_date(self, collector):
        """Test parsing invalid date."""
        invalid_dates = [
            "Invalid date format",
            "",
            None,
            "2024-07-23"  # Wrong format
        ]
        for invalid_date in invalid_dates:
            parsed_date = collector._parse_date(invalid_date)
            # Invalid input must not raise; a datetime fallback (presumably
            # the current time) is expected instead.
            assert isinstance(parsed_date, datetime)

    def test_clean_html_tags(self, collector):
        """Test HTML tag cleaning."""
        test_cases = [
            ("테스트 <b>뉴스</b> 제목", "테스트 뉴스 제목"),
            ("<em>강조된</em> <strong>내용</strong>", "강조된 내용"),
            ("일반 텍스트", "일반 텍스트"),
            ("", "")
        ]
        for html_text, expected in test_cases:
            cleaned = collector._clean_html_tags(html_text)
            assert cleaned == expected

    # ------------------------------------------------------------------
    # collect() variations
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_collect_empty_results(self, collector):
        """Test collection when API returns no results."""
        empty_response = {
            "documents": [],
            "meta": {
                "total_count": 0,
                "pageable_count": 0,
                "is_end": True
            }
        }
        collector._make_api_request = AsyncMock(return_value=empty_response)
        result = await collector.collect(keyword="존재하지않는키워드")
        assert result == []

    @pytest.mark.asyncio
    async def test_collect_with_sort_options(self, collector, sample_daum_response):
        """Test collection with different sort options."""
        collector._make_api_request = AsyncMock(return_value=sample_daum_response)
        # Test recency sort
        await collector.collect(keyword="테스트", sort="recency")
        call_args = collector._make_api_request.call_args
        assert call_args is not None
        # Test accuracy sort
        await collector.collect(keyword="테스트", sort="accuracy")
        call_args = collector._make_api_request.call_args
        assert call_args is not None

    def test_validate_sort_option(self, collector):
        """Test sort option validation."""
        # Valid sort options
        assert collector._validate_sort_option("recency") == "recency"
        assert collector._validate_sort_option("accuracy") == "accuracy"
        # Invalid sort option should default to recency
        assert collector._validate_sort_option("invalid") == "recency"
        assert collector._validate_sort_option("") == "recency"

    @pytest.mark.asyncio
    async def test_rate_limiting_applied(self, collector, sample_daum_response):
        """Test that collection still works with a mocked rate limiter."""
        collector._make_api_request = AsyncMock(return_value=sample_daum_response)
        # Mock rate limiter
        collector.rate_limiter = AsyncMock()
        await collector.collect(keyword="테스트")
        # Whether collect() itself consults the rate limiter is covered by the
        # full-pipeline tests; here we only verify that swapping the limiter
        # for a mock does not stop the request from being issued.
        collector._make_api_request.assert_awaited()

    def test_extract_article_id(self, collector):
        """Test article ID extraction from Daum URLs."""
        test_urls = [
            ("https://v.daum.net/v/20240723143000123", "20240723143000123"),
            ("https://v.daum.net/v/20240723131500789", "20240723131500789"),
            ("https://other-site.com/news", None)
        ]
        for url, expected_id in test_urls:
            article_id = collector._extract_article_id(url)
            assert article_id == expected_id

    # ------------------------------------------------------------------
    # Full-content fetching
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_collect_with_full_content_fetch(self, collector, sample_daum_response):
        """Test collection with full content fetching enabled."""
        collector._make_api_request = AsyncMock(return_value=sample_daum_response)
        collector._fetch_full_content = AsyncMock(return_value="전체 기사 내용입니다.")
        await collector.collect(keyword="테스트", fetch_full_content=True)
        # Should attempt to fetch full content for each article
        assert collector._fetch_full_content.call_count == len(sample_daum_response["documents"])

    @pytest.mark.asyncio
    async def test_fetch_full_content(self, collector):
        """Test fetching full content from article URL."""
        # Built line by line so the fixture text is independent of how this
        # method happens to be indented.
        html_content = (
            "\n"
            "<html>\n"
            "<body>\n"
            "<div class=\"news_view\">\n"
            "<p>전체 기사 내용입니다. 이것은 충분히 긴 내용으로 테스트를 위해 작성된 가짜 뉴스 기사입니다.</p>\n"
            "<p>두번째 문단입니다. 추가적인 내용을 포함하여 길이 제한을 통과할 수 있도록 합니다.</p>\n"
            "</div>\n"
            "</body>\n"
            "</html>\n"
        )
        with patch('aiohttp.ClientSession.get') as mock_get:
            self._install_response(mock_get, status=200, text_body=html_content)
            # Initialize collector session
            await collector.initialize()
            content = await collector._fetch_full_content("https://v.daum.net/v/20240723143000123")
            assert "전체 기사 내용입니다" in content
            assert "두번째 문단입니다" in content

    @pytest.mark.asyncio
    async def test_fetch_full_content_error(self, collector):
        """Test full content fetch with error handling."""
        with patch('aiohttp.ClientSession.get') as mock_get:
            # Mock network error
            mock_get.side_effect = aiohttp.ClientError("Network error")
            # Initialize collector session
            await collector.initialize()
            # Should return empty string on error, not raise exception
            content = await collector._fetch_full_content("https://invalid-url.com")
            assert content == ""

    def test_build_search_filters(self, collector):
        """Test search filter building."""
        from datetime import date
        filters = collector._build_search_filters(
            start_date=date(2024, 7, 20),
            end_date=date(2024, 7, 23)
        )
        # Should build appropriate filters for date range
        assert isinstance(filters, dict)

    # ------------------------------------------------------------------
    # Error propagation, stats, concurrency
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_collect_error_recovery(self, collector):
        """Test that an API error during collection propagates to the caller."""
        # The first request fails. The second (successful) response is queued
        # but is not expected to be consumed: collect() must surface the
        # DaumAPIError rather than swallow it.
        collector._make_api_request = AsyncMock(side_effect=[
            DaumAPIError("Rate limit exceeded"),
            {"documents": [], "meta": {"total_count": 0}}
        ])
        with pytest.raises(DaumAPIError):
            await collector.collect(keyword="테스트")

    def test_get_collector_stats(self, collector):
        """Test collector statistics tracking."""
        stats = collector.get_stats()
        assert "requests_made" in stats
        assert "articles_collected" in stats
        assert "api_errors" in stats
        assert "rate_limited_requests" in stats

    @pytest.mark.asyncio
    async def test_concurrent_requests_handling(self, collector, sample_daum_response):
        """Test that several collect() calls can run concurrently."""
        collector._make_api_request = AsyncMock(return_value=sample_daum_response)
        # Simulate multiple concurrent collection requests
        import asyncio
        tasks = [
            collector.collect(keyword=f"테스트{i}", limit=10)
            for i in range(3)
        ]
        results = await asyncio.gather(*tasks)
        # All should succeed
        assert len(results) == 3
        for result in results:
            assert isinstance(result, list)

    # ------------------------------------------------------------------
    # Daum-specific behaviour
    # ------------------------------------------------------------------

    def test_daum_specific_parsing(self, collector):
        """Test Daum-specific parsing features."""
        raw_item = {
            "title": "주식 <b>시장</b> 분석",
            "contents": "오늘 주식 시장이 급등했습니다.",
            "url": "https://v.daum.net/v/20240723143000123",
            "datetime": "2024-07-23T14:30:00.000+09:00",
            "blogname": "매일경제",
            "thumbnail": "https://img1.daumcdn.net/thumb/C430x430/?fname=https://example.com/image.jpg"
        }
        # Test thumbnail URL processing
        processed_thumbnail = collector._process_thumbnail_url(raw_item["thumbnail"])
        assert "img1.daumcdn.net" in processed_thumbnail
        # Test blogname extraction
        source_info = collector._extract_source_info(raw_item)
        assert source_info["blogname"] == "매일경제"
        assert source_info["source_type"] == "media"

    def test_api_rate_limit_handling(self, collector):
        """Test API rate limit configuration."""
        # Daum API has different rate limits than Naver
        assert collector.requests_per_minute <= 100
        assert collector.concurrent_requests <= 10

    @pytest.mark.asyncio
    async def test_collect_with_category_filter(self, collector, sample_daum_response):
        """Test collection with category filtering."""
        # Mock API request to avoid network calls
        collector._make_api_request = AsyncMock(return_value=sample_daum_response)
        # Daum supports category-based filtering
        result = await collector.collect(
            keyword="경제",
            category="news",
            limit=10
        )
        # Should return mocked results
        assert isinstance(result, list)
        assert len(result) == 2

    def test_url_validation(self, collector):
        """Test URL validation for Daum domains."""
        valid_urls = [
            "https://v.daum.net/v/20240723143000123",
            "https://news.v.daum.net/v/20240723143000123",
            "http://v.daum.net/v/20240723143000123"
        ]
        invalid_urls = [
            "https://naver.com/news/123",
            "https://google.com",
            "not-a-url"
        ]
        for url in valid_urls:
            assert collector._is_valid_daum_url(url) is True
        for url in invalid_urls:
            assert collector._is_valid_daum_url(url) is False