#!/usr/bin/env python3
"""
Web Search Anti-Bot - 고도화된 안티봇 우회 관리자
지능형 봇 탐지 우회와 적응형 요청 관리를 제공합니다.
"""
import time
import random
import logging
from typing import Dict, List
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from web_search_base import SearchEngine
logger = logging.getLogger(__name__)
class EnhancedAntiBotManager:
    """Advanced anti-bot evasion manager.

    Provides intelligent bot-detection evasion and adaptive request
    management: per-platform User-Agent rotation, search-engine-specific
    HTTP headers, adaptive request delays driven by observed success
    rates, and per-engine ``requests.Session`` objects with tuned retry
    strategies.
    """

    def __init__(self):
        self.user_agents = self._initialize_user_agents()
        self.proxies = []
        self.request_history = []
        self.success_rates = {}       # "engine:domain" -> EMA of request success
        self.last_request_times = {}  # "engine:domain" -> timestamp of last request
        # Intelligent delay system
        self.base_delays = [0.5, 1.0, 1.5, 2.0, 3.0, 5.0]
        self.adaptive_delays = {}
        # Session management
        self.sessions = {}         # engine value -> cached requests.Session
        self.session_cookies = {}  # engine value -> cookies restored into new sessions

    def _initialize_user_agents(self) -> Dict[str, List[str]]:
        """Return the per-platform User-Agent pools used for rotation."""
        return {
            'desktop_chrome': [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
            ],
            'desktop_firefox': [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/119.0',
                'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0'
            ],
            'mobile': [
                'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
                'Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36'
            ]
        }

    def get_optimized_headers(self, engine: "SearchEngine", platform: str = 'desktop_chrome') -> Dict[str, str]:
        """Build request headers optimized for the given search engine.

        Args:
            engine: Target search engine; selects engine-specific extras.
            platform: Key into the User-Agent pools; falls back to
                ``'desktop_chrome'`` for unknown platforms.

        Returns:
            A fresh header dict with a randomly chosen User-Agent.
        """
        base_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'cross-site',
            'Cache-Control': 'max-age=0',
            'DNT': '1'
        }
        # Pick a User-Agent from the requested pool (default pool on miss).
        if platform in self.user_agents:
            user_agent = random.choice(self.user_agents[platform])
        else:
            user_agent = random.choice(self.user_agents['desktop_chrome'])
        base_headers['User-Agent'] = user_agent
        # Engine-specific header tweaks.
        if engine == SearchEngine.GOOGLE:
            base_headers.update({
                'Sec-Fetch-Site': 'none',
                'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
                'Sec-Ch-Ua-Mobile': '?0',
                'Sec-Ch-Ua-Platform': '"Windows"'
            })
        elif engine == SearchEngine.BING:
            base_headers.update({
                'Referer': 'https://www.bing.com/',
                'X-Edge-Shopping-Flag': '1'
            })
        elif engine == SearchEngine.DUCKDUCKGO:
            base_headers.update({
                'Referer': 'https://duckduckgo.com/',
                'X-Requested-With': 'XMLHttpRequest'
            })
        return base_headers

    def calculate_adaptive_delay(self, engine: "SearchEngine", domain: str) -> float:
        """Compute an adaptive delay (seconds) before the next request.

        Combines a random base delay, a success-rate adjustment, a
        penalty for closely spaced requests, and random jitter.
        The result is clamped to a 0.1 s floor.
        """
        key = f"{engine.value}:{domain}"
        # Base delay
        base_delay = random.choice(self.base_delays)
        # Success-rate-based adjustment
        if key in self.success_rates:
            success_rate = self.success_rates[key]
            if success_rate < 0.7:  # low success rate -> back off harder
                base_delay *= 2.0
            elif success_rate > 0.9:  # high success rate -> speed up
                base_delay *= 0.7
        # Penalize requests made too close together
        if key in self.last_request_times:
            elapsed = time.time() - self.last_request_times[key]
            if elapsed < 5:  # recent request -> add extra delay
                base_delay += random.uniform(2, 5)
        # Random jitter to avoid a detectable cadence
        jitter = random.uniform(-0.3, 0.5) * base_delay
        final_delay = max(0.1, base_delay + jitter)
        return final_delay

    def update_success_rate(self, engine: "SearchEngine", domain: str, success: bool):
        """Record a request outcome and refresh the timing bookkeeping.

        The success rate is tracked as an exponential moving average
        (alpha = 0.1) per ``engine:domain`` key.
        """
        key = f"{engine.value}:{domain}"
        if key not in self.success_rates:
            self.success_rates[key] = 1.0 if success else 0.0
        else:
            # Exponential moving average update
            alpha = 0.1
            current_rate = self.success_rates[key]
            new_value = 1.0 if success else 0.0
            self.success_rates[key] = alpha * new_value + (1 - alpha) * current_rate
        self.last_request_times[key] = time.time()

    def get_session(self, engine: "SearchEngine") -> "requests.Session":
        """Return a cached, engine-optimized ``requests.Session``.

        The session is created on first use with an engine-specific
        retry strategy and any previously stored cookies; headers are
        refreshed on every call so each request gets a new User-Agent.
        """
        session_key = engine.value
        if session_key not in self.sessions:
            session = requests.Session()
            # Retry strategy (tuned per engine; Google also retries on 403)
            if engine == SearchEngine.GOOGLE:
                retry_strategy = Retry(
                    total=3,
                    backoff_factor=2,
                    status_forcelist=[429, 500, 502, 503, 504, 403],
                    respect_retry_after_header=True
                )
            else:
                retry_strategy = Retry(
                    total=2,
                    backoff_factor=1.5,
                    status_forcelist=[429, 500, 502, 503, 504],
                )
            adapter = HTTPAdapter(max_retries=retry_strategy, pool_maxsize=10)
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            # Restore any cookies saved for this engine
            if session_key in self.session_cookies:
                session.cookies.update(self.session_cookies[session_key])
            self.sessions[session_key] = session
        # Refresh headers on every call (new User-Agent per request)
        headers = self.get_optimized_headers(engine)
        self.sessions[session_key].headers.update(headers)
        return self.sessions[session_key]

    def _get_engine_domain(self, engine: "SearchEngine") -> str:
        """Return the canonical domain for a search engine."""
        domain_map = {
            SearchEngine.GOOGLE: 'google.com',
            SearchEngine.BING: 'bing.com',
            SearchEngine.DUCKDUCKGO: 'duckduckgo.com',
            SearchEngine.YANDEX: 'yandex.com',
            SearchEngine.BAIDU: 'baidu.com'
        }
        return domain_map.get(engine, 'unknown.com')