#!/usr/bin/env python3
"""
Web Search Engine - advanced search engine handler.
Supports multiple search engines; builds per-engine URLs and parses results.
"""
import asyncio
import logging
from typing import Dict, List
from urllib.parse import urlencode, urlparse, parse_qs
from bs4 import BeautifulSoup
from datetime import datetime
from web_search_base import (
SearchEngine, SearchRequest, SearchResult, SearchResultType, ContentType
)
from web_search_antibot import EnhancedAntiBotManager
from web_search_playwright import PlaywrightManager
logger = logging.getLogger(__name__)
class AdvancedSearchEngineHandler:
"""고급 검색 엔진 핸들러 - 다중 엔진 지원"""
def __init__(self, antibot_manager: EnhancedAntiBotManager, playwright_manager: PlaywrightManager):
self.antibot = antibot_manager
self.playwright = playwright_manager
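    # Typical usage (a sketch; assumes the two managers are constructed and
    # configured elsewhere, and that `request` is a populated SearchRequest):
    #   handler = AdvancedSearchEngineHandler(antibot_manager, playwright_manager)
    #   results = await handler.search_engine_async(request, SearchEngine.GOOGLE)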
def build_search_url(self, request: SearchRequest, engine: SearchEngine) -> str:
"""검색 엔진별 URL 생성"""
if engine == SearchEngine.GOOGLE:
return self._build_google_url(request)
elif engine == SearchEngine.BING:
return self._build_bing_url(request)
elif engine == SearchEngine.DUCKDUCKGO:
return self._build_duckduckgo_url(request)
elif engine == SearchEngine.GOOGLE_SCHOLAR:
return self._build_google_scholar_url(request)
elif engine == SearchEngine.PUBMED:
return self._build_pubmed_url(request)
elif engine == SearchEngine.ARXIV:
return self._build_arxiv_url(request)
elif engine == SearchEngine.GITHUB:
return self._build_github_url(request)
elif engine == SearchEngine.STACKOVERFLOW:
return self._build_stackoverflow_url(request)
elif engine == SearchEngine.REDDIT:
return self._build_reddit_url(request)
elif engine == SearchEngine.YOUTUBE:
return self._build_youtube_url(request)
elif engine == SearchEngine.GOOGLE_NEWS:
return self._build_google_news_url(request)
elif engine == SearchEngine.GOOGLE_IMAGES:
return self._build_google_images_url(request)
else:
            raise ValueError(f"Unsupported search engine: {engine}")
def _build_google_url(self, request: SearchRequest) -> str:
"""Google 검색 URL"""
base_url = "https://www.google.com/search"
params = {
'q': self._enhance_query(request.query, request),
'num': request.num_results,
'hl': request.language,
'gl': request.region,
'safe': 'active' if request.safe_search else 'off'
}
if request.time_range:
time_map = {'h': 'qdr:h', 'd': 'qdr:d', 'w': 'qdr:w', 'm': 'qdr:m', 'y': 'qdr:y'}
if request.time_range in time_map:
params['tbs'] = time_map[request.time_range]
        # file_type is already appended by _enhance_query above, so it is not added again here
        return f"{base_url}?{urlencode(params)}"
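        # Illustrative output (assuming query="python asyncio", num_results=10,
        # language="en", region="US", safe_search=True, no time/file filters):
        #   https://www.google.com/search?q=python+asyncio&num=10&hl=en&gl=US&safe=active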
def _build_google_scholar_url(self, request: SearchRequest) -> str:
"""Google Scholar 검색 URL"""
base_url = "https://scholar.google.com/scholar"
params = {
'q': request.query,
            'num': min(request.num_results, 20),  # Scholar caps results per page
'hl': request.language,
            'as_vis': '1'  # hide citation-only entries
}
return f"{base_url}?{urlencode(params)}"
def _build_pubmed_url(self, request: SearchRequest) -> str:
"""PubMed 검색 URL"""
base_url = "https://pubmed.ncbi.nlm.nih.gov/"
params = {
'term': request.query,
'size': min(request.num_results, 200),
'format': 'summary'
}
return f"{base_url}?{urlencode(params)}"
def _build_arxiv_url(self, request: SearchRequest) -> str:
"""arXiv 검색 URL"""
base_url = "https://arxiv.org/search/"
params = {
'query': request.query,
'searchtype': 'all',
'size': min(request.num_results, 50),
'order': '-announced_date_first'
}
return f"{base_url}?{urlencode(params)}"
def _build_github_url(self, request: SearchRequest) -> str:
"""GitHub 검색 URL"""
base_url = "https://github.com/search"
params = {
'q': request.query,
'type': 'repositories',
'per_page': min(request.num_results, 100)
}
return f"{base_url}?{urlencode(params)}"
def _build_stackoverflow_url(self, request: SearchRequest) -> str:
"""Stack Overflow 검색 URL"""
base_url = "https://stackoverflow.com/search"
params = {
'q': request.query,
'pagesize': min(request.num_results, 50)
}
return f"{base_url}?{urlencode(params)}"
def _build_reddit_url(self, request: SearchRequest) -> str:
"""Reddit 검색 URL"""
base_url = "https://www.reddit.com/search"
params = {
'q': request.query,
'type': 'link',
'sort': 'relevance',
'limit': min(request.num_results, 100)
}
return f"{base_url}?{urlencode(params)}"
    def _build_youtube_url(self, request: SearchRequest) -> str:
        """Build a YouTube search URL."""
        base_url = "https://www.youtube.com/results"
        params = {'search_query': request.query}
        # 'sp' is a pre-encoded result-filter token; append it verbatim so that
        # urlencode does not escape its '%' characters a second time
        return f"{base_url}?{urlencode(params)}&sp=CAI%253D"
def _build_google_news_url(self, request: SearchRequest) -> str:
"""Google News 검색 URL"""
base_url = "https://news.google.com/search"
params = {
'q': request.query,
'hl': request.language,
'gl': request.region,
'ceid': f"{request.region}:{request.language}"
}
return f"{base_url}?{urlencode(params)}"
def _build_google_images_url(self, request: SearchRequest) -> str:
"""Google Images 검색 URL"""
base_url = "https://www.google.com/search"
params = {
'q': request.query,
            'tbm': 'isch',  # image search vertical
'num': request.num_results,
'hl': request.language,
'safe': 'active' if request.safe_search else 'off'
}
return f"{base_url}?{urlencode(params)}"
def _build_bing_url(self, request: SearchRequest) -> str:
"""Bing 검색 URL"""
base_url = "https://www.bing.com/search"
params = {
'q': self._enhance_query(request.query, request),
'count': request.num_results,
'mkt': f"{request.language}-{request.region}",
'safesearch': 'Strict' if request.safe_search else 'Off'
}
return f"{base_url}?{urlencode(params)}"
    def _build_duckduckgo_url(self, request: SearchRequest) -> str:
        """Build a DuckDuckGo (HTML endpoint) search URL."""
        base_url = "https://duckduckgo.com/html"
        params = {
            'q': request.query,
            # 'kl' is region-language, e.g. "us-en", "kr-kr"
            'kl': f"{request.region.lower()}-{request.language.lower()}",
            # 'kp' controls safe search: 1 = strict, -2 = off
            'kp': '1' if request.safe_search else '-2'
        }
        return f"{base_url}?{urlencode(params)}"
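        # Illustrative output (assuming query="machine learning", region="US",
        # language="en", safe_search=True):
        #   https://duckduckgo.com/html?q=machine+learning&kl=us-en&kp=1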
def _enhance_query(self, query: str, request: SearchRequest) -> str:
"""검색 쿼리 향상"""
enhanced_query = query
        # Restrict to specific domains
if request.domain_filter:
domain_terms = " OR ".join([f"site:{domain}" for domain in request.domain_filter])
enhanced_query += f" ({domain_terms})"
        # Exclude unwanted domains
if request.exclude_domains:
exclude_terms = " ".join([f"-site:{domain}" for domain in request.exclude_domains])
enhanced_query += f" {exclude_terms}"
        # Restrict to a file type
if request.file_type:
enhanced_query += f" filetype:{request.file_type}"
return enhanced_query
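        # Example (illustrative values): query="vector database",
        # domain_filter=["arxiv.org", "acm.org"], exclude_domains=["pinterest.com"],
        # file_type="pdf" yields:
        #   vector database (site:arxiv.org OR site:acm.org) -site:pinterest.com filetype:pdf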
async def search_engine_async(self, request: SearchRequest, engine: SearchEngine) -> List[SearchResult]:
"""비동기 검색 실행"""
try:
url = self.build_search_url(request, engine)
            # Decide whether to use Playwright
if request.use_playwright or self._requires_playwright(engine):
return await self._search_with_playwright(url, engine, request)
else:
return await self._search_with_requests(url, engine, request)
except Exception as e:
            logger.error(f"{engine.value} search failed: {e}")
return []
def _requires_playwright(self, engine: SearchEngine) -> bool:
"""Playwright 필요 여부 판단"""
        # Engines that rely heavily on JavaScript rendering
playwright_engines = {
SearchEngine.GOOGLE_SCHOLAR,
SearchEngine.GITHUB,
SearchEngine.YOUTUBE,
SearchEngine.REDDIT
}
return engine in playwright_engines
async def _search_with_playwright(self, url: str, engine: SearchEngine, request: SearchRequest) -> List[SearchResult]:
"""Playwright로 검색"""
if not self.playwright.is_initialized:
await self.playwright.initialize(headless=request.headless, stealth=request.stealth_mode)
try:
selectors = self._get_result_selectors(engine)
html_content, metadata = await self.playwright.search_with_playwright(url, selectors)
            # Parse results
results = self._parse_search_results(html_content, engine, request)
            # Update the success rate
domain = self.antibot._get_engine_domain(engine)
self.antibot.update_success_rate(engine, domain, len(results) > 0)
return results
except Exception as e:
            logger.error(f"Playwright search failed for {engine.value}: {e}")
domain = self.antibot._get_engine_domain(engine)
self.antibot.update_success_rate(engine, domain, False)
return []
async def _search_with_requests(self, url: str, engine: SearchEngine, request: SearchRequest) -> List[SearchResult]:
"""Requests로 검색"""
try:
            # Adaptive delay
domain = self.antibot._get_engine_domain(engine)
delay = self.antibot.calculate_adaptive_delay(engine, domain)
await asyncio.sleep(delay)
            # Create the session and run the blocking requests call in a worker
            # thread so it does not stall the event loop
            session = self.antibot.get_session(engine)
            response = await asyncio.to_thread(session.get, url, timeout=30)
            response.raise_for_status()
            # Parse results
results = self._parse_search_results(response.text, engine, request)
            # Update the success rate
self.antibot.update_success_rate(engine, domain, len(results) > 0)
return results
except Exception as e:
            logger.error(f"Requests search failed for {engine.value}: {e}")
domain = self.antibot._get_engine_domain(engine)
self.antibot.update_success_rate(engine, domain, False)
return []
def _get_result_selectors(self, engine: SearchEngine) -> Dict[str, str]:
"""검색 엔진별 CSS 셀렉터"""
selectors = {
SearchEngine.GOOGLE: {
'results': 'div.g',
'title': 'h3',
'url': 'a',
'snippet': 'span.aCOpRe, div.VwiC3b'
},
SearchEngine.BING: {
'results': 'li.b_algo',
'title': 'h2',
'url': 'a',
'snippet': 'p'
},
SearchEngine.DUCKDUCKGO: {
'results': 'div.result',
'title': 'a.result__a',
'url': 'a.result__a',
'snippet': 'a.result__snippet'
},
SearchEngine.GITHUB: {
'results': 'div[data-testid="results-list"] > div',
'title': 'h3 a',
'url': 'h3 a',
'snippet': 'p'
}
}
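        # Engines without a dedicated entry (e.g. Scholar, PubMed, arXiv) fall
        # back to the Google selectors; these mirror current result-page markup
        # and typically need updating when an engine changes its HTML.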
return selectors.get(engine, selectors[SearchEngine.GOOGLE])
def _parse_search_results(self, html: str, engine: SearchEngine, request: SearchRequest) -> List[SearchResult]:
"""검색 결과 파싱"""
soup = BeautifulSoup(html, 'html.parser')
results = []
selectors = self._get_result_selectors(engine)
        # Locate result elements
result_elements = soup.select(selectors['results'])
for i, element in enumerate(result_elements[:request.num_results]):
try:
                # Extract the title
title_elem = element.select_one(selectors['title'])
title = title_elem.get_text(strip=True) if title_elem else "No Title"
                # Extract the URL
url_elem = element.select_one(selectors['url'])
url = url_elem.get('href') if url_elem else ""
                # Extract the snippet
snippet_elem = element.select_one(selectors['snippet'])
snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
                # Determine the result type
result_type = self._determine_result_type(engine, element)
if title and url:
result = SearchResult(
title=title,
url=self._clean_url(url, engine),
snippet=snippet,
result_type=result_type,
ranking=i + 1,
relevance_score=self._calculate_relevance_score(title, snippet, request.query, i),
timestamp=datetime.now()
)
results.append(result)
except Exception as e:
                logger.warning(f"Failed to parse a {engine.value} result: {e}")
continue
return results
def _determine_result_type(self, engine: SearchEngine, element) -> SearchResultType:
"""결과 타입 결정"""
if engine == SearchEngine.GOOGLE_SCHOLAR:
return SearchResultType.ACADEMIC
elif engine == SearchEngine.GITHUB:
return SearchResultType.CODE
elif engine == SearchEngine.YOUTUBE:
return SearchResultType.VIDEO
elif engine == SearchEngine.GOOGLE_NEWS:
return SearchResultType.NEWS
elif engine == SearchEngine.GOOGLE_IMAGES:
return SearchResultType.IMAGE
elif engine in [SearchEngine.REDDIT, SearchEngine.TWITTER]:
return SearchResultType.SOCIAL
else:
return SearchResultType.ORGANIC
def _clean_url(self, url: str, engine: SearchEngine) -> str:
"""URL 정리"""
if not url:
return ""
        # Unwrap Google redirect URLs
if engine == SearchEngine.GOOGLE and '/url?q=' in url:
try:
parsed = urlparse(url)
query_params = parse_qs(parsed.query)
if 'q' in query_params:
return query_params['q'][0]
            except Exception:
                pass
        # Convert relative URLs to absolute ones
if url.startswith('/'):
base_urls = {
SearchEngine.GOOGLE: 'https://www.google.com',
SearchEngine.BING: 'https://www.bing.com',
SearchEngine.GITHUB: 'https://github.com',
SearchEngine.REDDIT: 'https://www.reddit.com'
}
base_url = base_urls.get(engine, 'https://www.google.com')
url = base_url + url
return url
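        # Example of the redirect unwrapping above: a Google result URL such as
        #   /url?q=https://example.com/page&sa=U&ved=abc
        # is returned as https://example.com/page.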
def _calculate_relevance_score(self, title: str, snippet: str, query: str, position: int) -> float:
"""관련성 점수 계산"""
        base_score = 1.0 - (position * 0.05)  # rank-based base score
        # Query-term matches in the title
title_lower = title.lower()
query_lower = query.lower()
query_words = query_lower.split()
title_matches = sum(1 for word in query_words if word in title_lower)
title_score = title_matches / len(query_words) if query_words else 0
        # Query-term matches in the snippet
snippet_lower = snippet.lower()
snippet_matches = sum(1 for word in query_words if word in snippet_lower)
snippet_score = snippet_matches / len(query_words) if query_words else 0
        # Weighted final score
relevance_score = base_score * 0.5 + title_score * 0.3 + snippet_score * 0.2
return min(max(relevance_score, 0.0), 1.0)
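        # Worked example: query="python asyncio", title="Python asyncio tutorial",
        # snippet="Learn asyncio in Python", position=0:
        #   base_score = 1.0, title_score = 2/2, snippet_score = 2/2
        #   relevance  = 1.0*0.5 + 1.0*0.3 + 1.0*0.2 = 1.0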