
MCP Localization Project

by Skynotdie
web_search_engine.py (17.1 kB)
#!/usr/bin/env python3
"""
Web Search Engine - advanced search engine handler.

Supports multiple search engines and is responsible for URL construction
and result parsing.
"""

import asyncio
import logging
from typing import Dict, List
from urllib.parse import urlencode, urlparse, parse_qs
from bs4 import BeautifulSoup
from datetime import datetime

from web_search_base import (
    SearchEngine, SearchRequest, SearchResult, SearchResultType, ContentType
)
from web_search_antibot import EnhancedAntiBotManager
from web_search_playwright import PlaywrightManager

logger = logging.getLogger(__name__)


class AdvancedSearchEngineHandler:
    """Advanced search engine handler with multi-engine support"""

    def __init__(self, antibot_manager: EnhancedAntiBotManager,
                 playwright_manager: PlaywrightManager):
        self.antibot = antibot_manager
        self.playwright = playwright_manager

    def build_search_url(self, request: SearchRequest, engine: SearchEngine) -> str:
        """Build the search URL for the given engine"""
        if engine == SearchEngine.GOOGLE:
            return self._build_google_url(request)
        elif engine == SearchEngine.BING:
            return self._build_bing_url(request)
        elif engine == SearchEngine.DUCKDUCKGO:
            return self._build_duckduckgo_url(request)
        elif engine == SearchEngine.GOOGLE_SCHOLAR:
            return self._build_google_scholar_url(request)
        elif engine == SearchEngine.PUBMED:
            return self._build_pubmed_url(request)
        elif engine == SearchEngine.ARXIV:
            return self._build_arxiv_url(request)
        elif engine == SearchEngine.GITHUB:
            return self._build_github_url(request)
        elif engine == SearchEngine.STACKOVERFLOW:
            return self._build_stackoverflow_url(request)
        elif engine == SearchEngine.REDDIT:
            return self._build_reddit_url(request)
        elif engine == SearchEngine.YOUTUBE:
            return self._build_youtube_url(request)
        elif engine == SearchEngine.GOOGLE_NEWS:
            return self._build_google_news_url(request)
        elif engine == SearchEngine.GOOGLE_IMAGES:
            return self._build_google_images_url(request)
        else:
            raise ValueError(f"Unsupported search engine: {engine}")

    def _build_google_url(self, request: SearchRequest) -> str:
        """Google search URL"""
        base_url = "https://www.google.com/search"
        params = {
            'q': self._enhance_query(request.query, request),
            'num': request.num_results,
            'hl': request.language,
            'gl': request.region,
            'safe': 'active' if request.safe_search else 'off'
        }

        if request.time_range:
            time_map = {'h': 'qdr:h', 'd': 'qdr:d', 'w': 'qdr:w', 'm': 'qdr:m', 'y': 'qdr:y'}
            if request.time_range in time_map:
                params['tbs'] = time_map[request.time_range]

        if request.file_type:
            params['q'] += f' filetype:{request.file_type}'

        return f"{base_url}?{urlencode(params)}"

    def _build_google_scholar_url(self, request: SearchRequest) -> str:
        """Google Scholar search URL"""
        base_url = "https://scholar.google.com/scholar"
        params = {
            'q': request.query,
            'num': min(request.num_results, 20),  # Scholar limit
            'hl': request.language,
            'as_vis': '1'  # include citations
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_pubmed_url(self, request: SearchRequest) -> str:
        """PubMed search URL"""
        base_url = "https://pubmed.ncbi.nlm.nih.gov/"
        params = {
            'term': request.query,
            'size': min(request.num_results, 200),
            'format': 'summary'
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_arxiv_url(self, request: SearchRequest) -> str:
        """arXiv search URL"""
        base_url = "https://arxiv.org/search/"
        params = {
            'query': request.query,
            'searchtype': 'all',
            'size': min(request.num_results, 50),
            'order': '-announced_date_first'
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_github_url(self, request: SearchRequest) -> str:
        """GitHub search URL"""
        base_url = "https://github.com/search"
        params = {
            'q': request.query,
            'type': 'repositories',
            'per_page': min(request.num_results, 100)
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_stackoverflow_url(self, request: SearchRequest) -> str:
        """Stack Overflow search URL"""
        base_url = "https://stackoverflow.com/search"
        params = {
            'q': request.query,
            'pagesize': min(request.num_results, 50)
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_reddit_url(self, request: SearchRequest) -> str:
        """Reddit search URL"""
        base_url = "https://www.reddit.com/search"
        params = {
            'q': request.query,
            'type': 'link',
            'sort': 'relevance',
            'limit': min(request.num_results, 100)
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_youtube_url(self, request: SearchRequest) -> str:
        """YouTube search URL"""
        base_url = "https://www.youtube.com/results"
        params = {
            'search_query': request.query,
            'sp': 'CAI%253D'  # sort by relevance
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_google_news_url(self, request: SearchRequest) -> str:
        """Google News search URL"""
        base_url = "https://news.google.com/search"
        params = {
            'q': request.query,
            'hl': request.language,
            'gl': request.region,
            'ceid': f"{request.region}:{request.language}"
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_google_images_url(self, request: SearchRequest) -> str:
        """Google Images search URL"""
        base_url = "https://www.google.com/search"
        params = {
            'q': request.query,
            'tbm': 'isch',  # image search
            'num': request.num_results,
            'hl': request.language,
            'safe': 'active' if request.safe_search else 'off'
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_bing_url(self, request: SearchRequest) -> str:
        """Bing search URL"""
        base_url = "https://www.bing.com/search"
        params = {
            'q': self._enhance_query(request.query, request),
            'count': request.num_results,
            'mkt': f"{request.language}-{request.region}",
            'safesearch': 'Strict' if request.safe_search else 'Off'
        }
        return f"{base_url}?{urlencode(params)}"

    def _build_duckduckgo_url(self, request: SearchRequest) -> str:
        """DuckDuckGo search URL"""
        base_url = "https://duckduckgo.com/html"
        params = {
            'q': request.query,
            'kl': f"{request.language}-{request.region.lower()}",
            'safe': 'strict' if request.safe_search else 'off'
        }
        return f"{base_url}?{urlencode(params)}"

    def _enhance_query(self, query: str, request: SearchRequest) -> str:
        """Enhance the search query with search operators"""
        enhanced_query = query

        # Restrict to specific domains
        if request.domain_filter:
            domain_terms = " OR ".join([f"site:{domain}" for domain in request.domain_filter])
            enhanced_query += f" ({domain_terms})"

        # Exclude domains
        if request.exclude_domains:
            exclude_terms = " ".join([f"-site:{domain}" for domain in request.exclude_domains])
            enhanced_query += f" {exclude_terms}"

        # File type
        if request.file_type:
            enhanced_query += f" filetype:{request.file_type}"

        return enhanced_query

    async def search_engine_async(self, request: SearchRequest,
                                  engine: SearchEngine) -> List[SearchResult]:
        """Run the search asynchronously"""
        try:
            url = self.build_search_url(request, engine)

            # Decide whether to use Playwright
            if request.use_playwright or self._requires_playwright(engine):
                return await self._search_with_playwright(url, engine, request)
            else:
                return await self._search_with_requests(url, engine, request)

        except Exception as e:
            logger.error(f"{engine.value} search failed: {e}")
            return []

    def _requires_playwright(self, engine: SearchEngine) -> bool:
        """Decide whether Playwright is required"""
        # Sites that rely heavily on JavaScript
        playwright_engines = {
            SearchEngine.GOOGLE_SCHOLAR,
            SearchEngine.GITHUB,
            SearchEngine.YOUTUBE,
            SearchEngine.REDDIT
        }
        return engine in playwright_engines

    async def _search_with_playwright(self, url: str, engine: SearchEngine,
                                      request: SearchRequest) -> List[SearchResult]:
        """Search using Playwright"""
        if not self.playwright.is_initialized:
            await self.playwright.initialize(headless=request.headless,
                                             stealth=request.stealth_mode)

        try:
            selectors = self._get_result_selectors(engine)
            html_content, metadata = await self.playwright.search_with_playwright(url, selectors)

            # Parse results
            results = self._parse_search_results(html_content, engine, request)

            # Update success rate
            domain = self.antibot._get_engine_domain(engine)
            self.antibot.update_success_rate(engine, domain, len(results) > 0)

            return results

        except Exception as e:
            logger.error(f"Playwright search failed for {engine.value}: {e}")
            domain = self.antibot._get_engine_domain(engine)
            self.antibot.update_success_rate(engine, domain, False)
            return []

    async def _search_with_requests(self, url: str, engine: SearchEngine,
                                    request: SearchRequest) -> List[SearchResult]:
        """Search using requests"""
        try:
            # Adaptive delay
            domain = self.antibot._get_engine_domain(engine)
            delay = self.antibot.calculate_adaptive_delay(engine, domain)
            await asyncio.sleep(delay)

            # Session and request
            session = self.antibot.get_session(engine)
            response = session.get(url, timeout=30)
            response.raise_for_status()

            # Parse results
            results = self._parse_search_results(response.text, engine, request)

            # Update success rate
            self.antibot.update_success_rate(engine, domain, len(results) > 0)

            return results

        except Exception as e:
            logger.error(f"Requests search failed for {engine.value}: {e}")
            domain = self.antibot._get_engine_domain(engine)
            self.antibot.update_success_rate(engine, domain, False)
            return []

    def _get_result_selectors(self, engine: SearchEngine) -> Dict[str, str]:
        """CSS selectors per search engine"""
        selectors = {
            SearchEngine.GOOGLE: {
                'results': 'div.g',
                'title': 'h3',
                'url': 'a',
                'snippet': 'span.aCOpRe, div.VwiC3b'
            },
            SearchEngine.BING: {
                'results': 'li.b_algo',
                'title': 'h2',
                'url': 'a',
                'snippet': 'p'
            },
            SearchEngine.DUCKDUCKGO: {
                'results': 'div.result',
                'title': 'a.result__a',
                'url': 'a.result__a',
                'snippet': 'a.result__snippet'
            },
            SearchEngine.GITHUB: {
                'results': 'div[data-testid="results-list"] > div',
                'title': 'h3 a',
                'url': 'h3 a',
                'snippet': 'p'
            }
        }
        return selectors.get(engine, selectors[SearchEngine.GOOGLE])

    def _parse_search_results(self, html: str, engine: SearchEngine,
                              request: SearchRequest) -> List[SearchResult]:
        """Parse search results"""
        soup = BeautifulSoup(html, 'html.parser')
        results = []
        selectors = self._get_result_selectors(engine)

        # Find result elements
        result_elements = soup.select(selectors['results'])

        for i, element in enumerate(result_elements[:request.num_results]):
            try:
                # Extract title
                title_elem = element.select_one(selectors['title'])
                title = title_elem.get_text(strip=True) if title_elem else "No Title"

                # Extract URL
                url_elem = element.select_one(selectors['url'])
                url = url_elem.get('href') if url_elem else ""

                # Extract snippet
                snippet_elem = element.select_one(selectors['snippet'])
                snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""

                # Determine result type
                result_type = self._determine_result_type(engine, element)

                if title and url:
                    result = SearchResult(
                        title=title,
                        url=self._clean_url(url, engine),
                        snippet=snippet,
                        result_type=result_type,
                        ranking=i + 1,
                        relevance_score=self._calculate_relevance_score(title, snippet, request.query, i),
                        timestamp=datetime.now()
                    )
                    results.append(result)

            except Exception as e:
                logger.warning(f"Failed to parse a result from {engine.value}: {e}")
                continue

        return results

    def _determine_result_type(self, engine: SearchEngine, element) -> SearchResultType:
        """Determine the result type"""
        if engine == SearchEngine.GOOGLE_SCHOLAR:
            return SearchResultType.ACADEMIC
        elif engine == SearchEngine.GITHUB:
            return SearchResultType.CODE
        elif engine == SearchEngine.YOUTUBE:
            return SearchResultType.VIDEO
        elif engine == SearchEngine.GOOGLE_NEWS:
            return SearchResultType.NEWS
        elif engine == SearchEngine.GOOGLE_IMAGES:
            return SearchResultType.IMAGE
        elif engine in [SearchEngine.REDDIT, SearchEngine.TWITTER]:
            return SearchResultType.SOCIAL
        else:
            return SearchResultType.ORGANIC

    def _clean_url(self, url: str, engine: SearchEngine) -> str:
        """Clean up the URL"""
        if not url:
            return ""

        # Handle Google redirect URLs
        if engine == SearchEngine.GOOGLE and '/url?q=' in url:
            try:
                parsed = urlparse(url)
                query_params = parse_qs(parsed.query)
                if 'q' in query_params:
                    return query_params['q'][0]
            except Exception:
                pass

        # Convert relative URLs to absolute URLs
        if url.startswith('/'):
            base_urls = {
                SearchEngine.GOOGLE: 'https://www.google.com',
                SearchEngine.BING: 'https://www.bing.com',
                SearchEngine.GITHUB: 'https://github.com',
                SearchEngine.REDDIT: 'https://www.reddit.com'
            }
            base_url = base_urls.get(engine, 'https://www.google.com')
            url = base_url + url

        return url

    def _calculate_relevance_score(self, title: str, snippet: str, query: str, position: int) -> float:
        """Calculate a relevance score"""
        base_score = 1.0 - (position * 0.05)  # base score from ranking position

        # Query matches in the title
        title_lower = title.lower()
        query_lower = query.lower()
        query_words = query_lower.split()

        title_matches = sum(1 for word in query_words if word in title_lower)
        title_score = title_matches / len(query_words) if query_words else 0

        # Query matches in the snippet
        snippet_lower = snippet.lower()
        snippet_matches = sum(1 for word in query_words if word in snippet_lower)
        snippet_score = snippet_matches / len(query_words) if query_words else 0

        # Final weighted score
        relevance_score = base_score * 0.5 + title_score * 0.3 + snippet_score * 0.2

        return min(max(relevance_score, 0.0), 1.0)
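
A minimal usage sketch of the handler follows. It assumes that SearchRequest is a keyword-constructible dataclass exposing the fields the handler reads (query, num_results, language, region, safe_search, and so on) and that EnhancedAntiBotManager and PlaywrightManager can be instantiated without arguments; those sibling modules are not shown on this page, so treat the constructor calls as placeholders rather than the project's documented API.

#!/usr/bin/env python3
# Usage sketch for AdvancedSearchEngineHandler (assumptions noted above:
# SearchRequest field names and the no-argument manager constructors are
# inferred, not confirmed by this file).
import asyncio

from web_search_base import SearchEngine, SearchRequest
from web_search_antibot import EnhancedAntiBotManager
from web_search_playwright import PlaywrightManager
from web_search_engine import AdvancedSearchEngineHandler


async def main() -> None:
    handler = AdvancedSearchEngineHandler(EnhancedAntiBotManager(), PlaywrightManager())

    request = SearchRequest(
        query="model context protocol",
        num_results=10,
        language="en",
        region="US",
        safe_search=True,
    )

    # build_search_url only formats the query string for the chosen engine;
    # search_engine_async performs the request (requests or Playwright,
    # depending on the engine) and returns parsed SearchResult objects.
    print(handler.build_search_url(request, SearchEngine.DUCKDUCKGO))

    results = await handler.search_engine_async(request, SearchEngine.DUCKDUCKGO)
    for r in results:
        print(r.ranking, round(r.relevance_score, 2), r.title, r.url)


if __name__ == "__main__":
    asyncio.run(main())

DuckDuckGo is used here because it is not in the _requires_playwright set, so the sketch exercises the plain requests path; switching to SearchEngine.GITHUB or SearchEngine.YOUTUBE would route through Playwright instead.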

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Skynotdie/mky'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.