#!/usr/bin/env python3
"""
Web Search Base - 기본 데이터 구조 및 열거형
웹 검색 MCP의 기본 타입 정의와 데이터 구조를 포함합니다.
"""
import asyncio
import aiohttp
import json
import time
import random
import hashlib
import sqlite3
import re
import logging
import subprocess
import tempfile
import threading
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union, Any, Callable
from dataclasses import dataclass, asdict, field
from enum import Enum
from urllib.parse import urlencode, quote_plus, urlparse, parse_qs
from bs4 import BeautifulSoup
import concurrent.futures
# from fake_useragent import UserAgent # 선택적 의존성
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Logging setup: configure the root logger (INFO level, timestamped format)
# at import time, then create the logger used by this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class SearchEngine(Enum):
    """Extended set of search engine identifiers.

    Every member's value is the lowercase string name of the engine.
    """

    # General-purpose search engines
    GOOGLE = "google"
    BING = "bing"
    DUCKDUCKGO = "duckduckgo"
    YANDEX = "yandex"
    BAIDU = "baidu"

    # Academic search engines
    GOOGLE_SCHOLAR = "google_scholar"
    PUBMED = "pubmed"
    ARXIV = "arxiv"
    IEEE_XPLORE = "ieee_xplore"

    # Specialized search engines
    GITHUB = "github"
    STACKOVERFLOW = "stackoverflow"
    REDDIT = "reddit"
    YOUTUBE = "youtube"
    TWITTER = "twitter"

    # News search
    GOOGLE_NEWS = "google_news"
    BING_NEWS = "bing_news"

    # Image / video search
    GOOGLE_IMAGES = "google_images"
    BING_IMAGES = "bing_images"
    UNSPLASH = "unsplash"
class SearchResultType(Enum):
    """Category of an individual search result.

    Every member's value is the lowercase string form of its name.
    """

    ORGANIC = "organic"
    FEATURED = "featured"
    IMAGE = "image"
    VIDEO = "video"
    NEWS = "news"
    ACADEMIC = "academic"
    CODE = "code"
    SOCIAL = "social"
    KNOWLEDGE = "knowledge"
    DEFINITION = "definition"
    FAQ = "faq"
    REVIEW = "review"
class ContentType(Enum):
    """Kind of content a result or request refers to.

    Every member's value is the lowercase string form of its name.
    """

    TEXT = "text"
    HTML = "html"
    JSON = "json"
    XML = "xml"
    PDF = "pdf"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    CODE = "code"
    MARKDOWN = "markdown"
class QualityScore(Enum):
    """Quality rating on an integer scale from 5 (EXCELLENT) down to 1 (SPAM)."""

    EXCELLENT = 5
    GOOD = 4
    AVERAGE = 3
    POOR = 2
    SPAM = 1
@dataclass
class SearchResult:
    """A single search hit together with its scores and metadata.

    ``title``, ``url``, ``snippet``, ``result_type`` and ``ranking`` are
    required; every other field has a default.  When ``domain`` is not
    supplied it is derived from ``url`` in ``__post_init__``.
    """

    title: str
    url: str
    snippet: str
    result_type: SearchResultType
    ranking: int

    # AI quality assessment
    relevance_score: float = 0.0
    quality_score: QualityScore = QualityScore.AVERAGE
    authority_score: float = 0.0
    freshness_score: float = 0.0

    # Content analysis
    extracted_content: Optional[str] = None
    content_summary: Optional[str] = None
    content_type: ContentType = ContentType.TEXT
    language: str = "unknown"

    # Metadata
    domain: Optional[str] = None
    author: Optional[str] = None
    publish_date: Optional[datetime] = None
    last_modified: Optional[datetime] = None
    tags: List[str] = field(default_factory=list)

    # Technical information
    response_time: float = 0.0
    content_length: int = 0
    status_code: int = 200
    # Naive local time captured when the instance is created.
    timestamp: datetime = field(default_factory=datetime.now)

    def __post_init__(self):
        """Fill in ``domain`` from ``url`` when the caller did not provide one.

        The network location of the parsed URL is used.  If the URL cannot
        be parsed, ``domain`` falls back to ``"unknown"``.
        """
        if self.domain is None and self.url:
            try:
                self.domain = urlparse(self.url).netloc
            except (ValueError, TypeError, AttributeError):
                # Best-effort: urlparse raises ValueError for malformed URLs
                # (e.g. an unbalanced IPv6 bracket); TypeError/AttributeError
                # cover a non-string ``url``.  Unlike a bare ``except`` this
                # lets KeyboardInterrupt / SystemExit propagate.
                self.domain = "unknown"

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation of this result.

        Enum fields are replaced by their ``.value`` and datetime fields by
        their ISO-8601 string (``None`` when the optional dates are unset).

        Returns:
            A dictionary keyed by field name, in field declaration order.
        """
        return {
            'title': self.title,
            'url': self.url,
            'snippet': self.snippet,
            'result_type': self.result_type.value,
            'ranking': self.ranking,
            'relevance_score': self.relevance_score,
            'quality_score': self.quality_score.value,
            'authority_score': self.authority_score,
            'freshness_score': self.freshness_score,
            'extracted_content': self.extracted_content,
            'content_summary': self.content_summary,
            # Tolerate a caller having set content_type to None.
            'content_type': self.content_type.value if self.content_type else None,
            'language': self.language,
            'domain': self.domain,
            'author': self.author,
            'publish_date': self.publish_date.isoformat() if self.publish_date else None,
            'last_modified': self.last_modified.isoformat() if self.last_modified else None,
            'tags': self.tags,
            'response_time': self.response_time,
            'content_length': self.content_length,
            'status_code': self.status_code,
            'timestamp': self.timestamp.isoformat(),
        }
@dataclass
class SearchRequest:
    """Parameters describing one search request.

    Only ``query`` is required; every other field has a default.
    """

    query: str
    engines: List[SearchEngine] = field(default_factory=lambda: [SearchEngine.GOOGLE])

    # Search settings
    num_results: int = 10
    safe_search: bool = True
    language: str = "ko"
    region: str = "KR"
    time_range: Optional[str] = None  # "h", "d", "w", "m", "y"

    # Advanced filtering
    domain_filter: List[str] = field(default_factory=list)  # only these domains
    exclude_domains: List[str] = field(default_factory=list)  # domains to exclude
    file_type: Optional[str] = None  # pdf, doc, ppt, etc.
    content_type: ContentType = ContentType.TEXT

    # AI optimization settings
    extract_content: bool = True
    summarize_content: bool = True
    quality_filter: bool = True
    deduplicate: bool = True
    max_content_length: int = 2000

    # Playwright settings
    use_playwright: bool = False
    headless: bool = True
    stealth_mode: bool = True

    def get_cache_key(self) -> str:
        """Build a cache key for this request.

        The key is the SHA-256 hex digest of a colon-separated string made
        from the query, the engines (in the order given), the result count,
        language, region, time range, the sorted domain filters, file type,
        content type and the content-processing flags.

        ``safe_search``, ``deduplicate`` and ``max_content_length`` are
        included as well so that requests differing only in those settings
        do not share a cache entry (they presumably alter the returned
        results).  They are appended after the original fields.
        The Playwright-related fields are not part of the key.

        Returns:
            A 64-character hexadecimal digest string.
        """
        engines_str = ",".join(e.value for e in self.engines)
        # Sorted so the order in which filters were listed does not matter.
        domains_str = ",".join(sorted(self.domain_filter))
        exclude_str = ",".join(sorted(self.exclude_domains))
        key_data = (
            f"{self.query}:{engines_str}:{self.num_results}:"
            f"{self.language}:{self.region}:{self.time_range}:"
            f"{domains_str}:{exclude_str}:{self.file_type}:"
            f"{self.content_type.value}:{self.extract_content}:"
            f"{self.summarize_content}:{self.quality_filter}:"
            f"{self.safe_search}:{self.deduplicate}:{self.max_content_length}"
        )
        return hashlib.sha256(key_data.encode()).hexdigest()
@dataclass
class SearchResponse:
    """The outcome of a search: the results plus aggregate statistics.

    ``avg_relevance_score`` and ``avg_quality_score`` are recomputed from
    ``results`` in ``__post_init__`` whenever ``results`` is non-empty, so
    values passed for them to the constructor are overwritten in that case.
    """

    query: str
    results: List[SearchResult]

    # Statistics
    total_results: int
    search_time: float
    engines_used: List[SearchEngine]
    cached: bool = False

    # Quality analysis
    avg_relevance_score: float = 0.0
    avg_quality_score: float = 0.0
    duplicate_count: int = 0

    # AI optimization output
    token_count: int = 0
    compressed_results: List[Dict] = field(default_factory=list)
    summary: Optional[str] = None

    def __post_init__(self):
        """Compute the average relevance and quality over ``results``.

        With an empty ``results`` list the averages keep whatever values
        they were initialised with (0.0 by default), which also avoids a
        division by zero.
        """
        if not self.results:
            return
        count = len(self.results)
        self.avg_relevance_score = sum(r.relevance_score for r in self.results) / count
        # quality_score is an enum; its integer value is what gets averaged.
        self.avg_quality_score = sum(r.quality_score.value for r in self.results) / count

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation of this response.

        Each result is converted with its own ``to_dict`` and each engine
        is replaced by its ``.value``.

        Returns:
            A dictionary keyed by field name, in field declaration order.
        """
        return {
            'query': self.query,
            'results': [result.to_dict() for result in self.results],
            'total_results': self.total_results,
            'search_time': self.search_time,
            'engines_used': [engine.value for engine in self.engines_used],
            'cached': self.cached,
            'avg_relevance_score': self.avg_relevance_score,
            'avg_quality_score': self.avg_quality_score,
            'duplicate_count': self.duplicate_count,
            'token_count': self.token_count,
            'compressed_results': self.compressed_results,
            'summary': self.summary,
        }