
MCP Localization Project

by Skynotdie
web_search_base.py (7.95 kB)
#!/usr/bin/env python3
"""
Web Search Base - core data structures and enums

Contains the base type definitions and data structures for the web search MCP.
"""

import asyncio
import aiohttp
import json
import time
import random
import hashlib
import sqlite3
import re
import logging
import subprocess
import tempfile
import threading
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union, Any, Callable
from dataclasses import dataclass, asdict, field
from enum import Enum
from urllib.parse import urlencode, quote_plus, urlparse, parse_qs
from bs4 import BeautifulSoup
import concurrent.futures
# from fake_useragent import UserAgent  # optional dependency
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class SearchEngine(Enum):
    """Extended search engine types"""
    # General search engines
    GOOGLE = "google"
    BING = "bing"
    DUCKDUCKGO = "duckduckgo"
    YANDEX = "yandex"
    BAIDU = "baidu"

    # Academic search engines
    GOOGLE_SCHOLAR = "google_scholar"
    PUBMED = "pubmed"
    ARXIV = "arxiv"
    IEEE_XPLORE = "ieee_xplore"

    # Specialized search engines
    GITHUB = "github"
    STACKOVERFLOW = "stackoverflow"
    REDDIT = "reddit"
    YOUTUBE = "youtube"
    TWITTER = "twitter"

    # News search
    GOOGLE_NEWS = "google_news"
    BING_NEWS = "bing_news"

    # Image/video search
    GOOGLE_IMAGES = "google_images"
    BING_IMAGES = "bing_images"
    UNSPLASH = "unsplash"


class SearchResultType(Enum):
    """Search result types"""
    ORGANIC = "organic"
    FEATURED = "featured"
    IMAGE = "image"
    VIDEO = "video"
    NEWS = "news"
    ACADEMIC = "academic"
    CODE = "code"
    SOCIAL = "social"
    KNOWLEDGE = "knowledge"
    DEFINITION = "definition"
    FAQ = "faq"
    REVIEW = "review"


class ContentType(Enum):
    """Content types"""
    TEXT = "text"
    HTML = "html"
    JSON = "json"
    XML = "xml"
    PDF = "pdf"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    CODE = "code"
    MARKDOWN = "markdown"


class QualityScore(Enum):
    """Quality scores"""
    EXCELLENT = 5
    GOOD = 4
    AVERAGE = 3
    POOR = 2
    SPAM = 1


@dataclass
class SearchResult:
    """Extended search result"""
    title: str
    url: str
    snippet: str
    result_type: SearchResultType
    ranking: int

    # AI quality assessment
    relevance_score: float = 0.0
    quality_score: QualityScore = QualityScore.AVERAGE
    authority_score: float = 0.0
    freshness_score: float = 0.0

    # Content analysis
    extracted_content: Optional[str] = None
    content_summary: Optional[str] = None
    content_type: ContentType = ContentType.TEXT
    language: str = "unknown"

    # Metadata
    domain: Optional[str] = None
    author: Optional[str] = None
    publish_date: Optional[datetime] = None
    last_modified: Optional[datetime] = None
    tags: List[str] = field(default_factory=list)

    # Technical information
    response_time: float = 0.0
    content_length: int = 0
    status_code: int = 200
    timestamp: datetime = field(default_factory=datetime.now)

    def __post_init__(self):
        """Post-initialization processing"""
        if self.domain is None and self.url:
            try:
                self.domain = urlparse(self.url).netloc
            except Exception:
                self.domain = "unknown"

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary"""
        return {
            'title': self.title,
            'url': self.url,
            'snippet': self.snippet,
            'result_type': self.result_type.value,
            'ranking': self.ranking,
            'relevance_score': self.relevance_score,
            'quality_score': self.quality_score.value,
            'authority_score': self.authority_score,
            'freshness_score': self.freshness_score,
            'extracted_content': self.extracted_content,
            'content_summary': self.content_summary,
            'content_type': self.content_type.value if self.content_type else None,
            'language': self.language,
            'domain': self.domain,
            'author': self.author,
            'publish_date': self.publish_date.isoformat() if self.publish_date else None,
            'last_modified': self.last_modified.isoformat() if self.last_modified else None,
            'tags': self.tags,
            'response_time': self.response_time,
            'content_length': self.content_length,
            'status_code': self.status_code,
            'timestamp': self.timestamp.isoformat()
        }


@dataclass
class SearchRequest:
    """Extended search request"""
    query: str
    engines: List[SearchEngine] = field(default_factory=lambda: [SearchEngine.GOOGLE])

    # Search settings
    num_results: int = 10
    safe_search: bool = True
    language: str = "ko"
    region: str = "KR"
    time_range: Optional[str] = None  # "h", "d", "w", "m", "y"

    # Advanced filtering
    domain_filter: List[str] = field(default_factory=list)    # only these domains
    exclude_domains: List[str] = field(default_factory=list)  # domains to exclude
    file_type: Optional[str] = None  # pdf, doc, ppt, etc.
    content_type: ContentType = ContentType.TEXT

    # AI optimization settings
    extract_content: bool = True
    summarize_content: bool = True
    quality_filter: bool = True
    deduplicate: bool = True
    max_content_length: int = 2000

    # Playwright settings
    use_playwright: bool = False
    headless: bool = True
    stealth_mode: bool = True

    def get_cache_key(self) -> str:
        """Generate a cache key"""
        engines_str = ",".join([e.value for e in self.engines])
        domains_str = ",".join(sorted(self.domain_filter))
        exclude_str = ",".join(sorted(self.exclude_domains))
        key_data = (
            f"{self.query}:{engines_str}:{self.num_results}:"
            f"{self.language}:{self.region}:{self.time_range}:"
            f"{domains_str}:{exclude_str}:{self.file_type}:"
            f"{self.content_type.value}:{self.extract_content}:"
            f"{self.summarize_content}:{self.quality_filter}"
        )
        return hashlib.sha256(key_data.encode()).hexdigest()


@dataclass
class SearchResponse:
    """Extended search response"""
    query: str
    results: List[SearchResult]

    # Statistics
    total_results: int
    search_time: float
    engines_used: List[SearchEngine]
    cached: bool = False

    # Quality analysis
    avg_relevance_score: float = 0.0
    avg_quality_score: float = 0.0
    duplicate_count: int = 0

    # AI optimization results
    token_count: int = 0
    compressed_results: List[Dict] = field(default_factory=list)
    summary: Optional[str] = None

    def __post_init__(self):
        """Post-initialization processing"""
        if self.results:
            self.avg_relevance_score = sum(r.relevance_score for r in self.results) / len(self.results)
            self.avg_quality_score = sum(r.quality_score.value for r in self.results) / len(self.results)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary"""
        return {
            'query': self.query,
            'results': [result.to_dict() for result in self.results],
            'total_results': self.total_results,
            'search_time': self.search_time,
            'engines_used': [e.value for e in self.engines_used],
            'cached': self.cached,
            'avg_relevance_score': self.avg_relevance_score,
            'avg_quality_score': self.avg_quality_score,
            'duplicate_count': self.duplicate_count,
            'token_count': self.token_count,
            'compressed_results': self.compressed_results,
            'summary': self.summary
        }
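
For reference, a minimal usage sketch of these data structures. The query, URL, and score values below are illustrative only and are not part of web_search_base.py; it assumes the module can be imported under its file name.

# Illustrative usage of the web_search_base data structures (hypothetical values)
import json
from web_search_base import (
    SearchEngine, SearchRequest, SearchResult, SearchResultType, SearchResponse
)

request = SearchRequest(
    query="python asyncio tutorial",
    engines=[SearchEngine.GOOGLE, SearchEngine.DUCKDUCKGO],
    num_results=5,
    time_range="m",
)
print(request.get_cache_key())  # deterministic SHA-256 key for caching

result = SearchResult(
    title="asyncio documentation",
    url="https://docs.python.org/3/library/asyncio.html",
    snippet="asyncio is a library to write concurrent code...",
    result_type=SearchResultType.ORGANIC,
    ranking=1,
    relevance_score=0.92,
)
print(result.domain)  # "docs.python.org", derived in __post_init__

response = SearchResponse(
    query=request.query,
    results=[result],
    total_results=1,
    search_time=0.4,
    engines_used=request.engines,
)
print(response.avg_relevance_score)  # 0.92, averaged in __post_init__
print(json.dumps(response.to_dict(), ensure_ascii=False)[:200])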


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Skynotdie/mky'
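
The same lookup can be scripted; here is a minimal Python sketch using requests, assuming the endpoint returns JSON as the curl example suggests.

# Fetch this server's metadata from the Glama MCP directory API
# (assumes a JSON response body)
import requests

resp = requests.get("https://glama.ai/api/mcp/v1/servers/Skynotdie/mky", timeout=10)
resp.raise_for_status()
print(resp.json())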

If you have feedback or need assistance with the MCP directory API, please join our Discord server.