Skip to main content
Glama
Skynotdie

MCP Localization Project

by Skynotdie
web_search_analyzer.py (10.3 kB)
#!/usr/bin/env python3
"""Web Search Content Analyzer - AI-based content analyzer.

Performs quality evaluation, domain-authority scoring, spam detection,
and summary generation for web search results.
"""
import logging
import re
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from web_search_base import QualityScore

logger = logging.getLogger(__name__)


class ContentAnalyzer:
    """AI-based content analyzer for web search results."""

    def __init__(self):
        # Static lookup tables, built once per instance.
        self.quality_indicators = self._load_quality_indicators()
        self.authority_domains = self._load_authority_domains()
        self.spam_patterns = self._load_spam_patterns()

    def _load_quality_indicators(self) -> Dict[str, float]:
        """Return per-signal weights used by analyze_content_quality().

        NOTE(review): 'citation_count' is declared here but never consumed
        by the current scoring code — confirm whether it is planned or dead.
        """
        return {
            'content_length': 0.15,
            'paragraph_count': 0.10,
            'heading_structure': 0.15,
            'external_links': 0.10,
            'image_count': 0.05,
            'code_blocks': 0.10,
            'citation_count': 0.15,
            'readability': 0.20
        }

    def _load_authority_domains(self) -> Dict[str, float]:
        """Return the authority score table.

        Keys starting with '.' are TLD suffixes matched via endswith();
        all other keys are matched exactly or as substrings (subdomains).
        """
        return {
            # Academic / research
            'scholar.google.com': 1.0,
            'pubmed.ncbi.nlm.nih.gov': 1.0,
            'arxiv.org': 0.95,
            'ieee.org': 0.95,
            'acm.org': 0.9,
            # Tech / development
            'github.com': 0.9,
            'stackoverflow.com': 0.85,
            'developer.mozilla.org': 0.9,
            'docs.python.org': 0.9,
            # News / media
            'reuters.com': 0.9,
            'bbc.com': 0.85,
            'cnn.com': 0.8,
            # General reference
            'wikipedia.org': 0.85,
            'britannica.com': 0.9,
            # Government / public institutions (TLD suffixes)
            '.gov': 0.95,
            '.edu': 0.9,
            '.ac.kr': 0.9
        }

    def _load_spam_patterns(self) -> List[str]:
        """Return regex patterns (matched case-insensitively via lowercased text)."""
        return [
            r'click here for',
            r'limited time offer',
            r'free download now',
            r'增值税发票',  # Chinese-language spam
            r'виагра',  # Russian-language spam
            r'!{2,}',  # excessive exclamation marks
            r'\${3,}',  # excessive dollar signs
        ]

    def analyze_content_quality(self, content: str, url: str,
                                metadata: Optional[Dict] = None) -> Tuple[QualityScore, float, Dict]:
        """Analyze content quality.

        Args:
            content: Raw page content (HTML or plain text).
            url: Source URL; its domain feeds the authority score.
            metadata: Currently unused; kept for interface compatibility.

        Returns:
            Tuple of (quality grade, normalized score in [0, 1],
            per-signal analysis dict).
        """
        if not content:
            return QualityScore.POOR, 0.1, {'error': 'No content'}

        analysis = {}
        total_score = 0.0

        # 1. Content-length analysis (stepwise buckets).
        content_length = len(content)
        if content_length < 100:
            length_score = 0.2
        elif content_length < 500:
            length_score = 0.5
        elif content_length < 2000:
            length_score = 0.8
        else:
            length_score = 1.0
        analysis['content_length'] = content_length
        total_score += length_score * self.quality_indicators['content_length']

        # 2. Structure analysis (HTML-based).
        try:
            soup = BeautifulSoup(content, 'html.parser')

            # Heading structure.
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            heading_score = min(len(headings) / 5, 1.0) if headings else 0.1
            analysis['heading_count'] = len(headings)
            total_score += heading_score * self.quality_indicators['heading_structure']

            # Paragraph count.
            paragraphs = soup.find_all('p')
            paragraph_score = min(len(paragraphs) / 10, 1.0) if paragraphs else 0.1
            analysis['paragraph_count'] = len(paragraphs)
            total_score += paragraph_score * self.quality_indicators['paragraph_count']

            # External links (any href containing "http").
            links = soup.find_all('a', href=True)
            external_links = [link for link in links if 'http' in link.get('href', '')]
            link_score = min(len(external_links) / 20, 1.0)
            analysis['external_links'] = len(external_links)
            total_score += link_score * self.quality_indicators['external_links']

            # Images.
            images = soup.find_all('img')
            image_score = min(len(images) / 10, 1.0)
            analysis['image_count'] = len(images)
            total_score += image_score * self.quality_indicators['image_count']

            # Code blocks (signal for technical documents).
            code_blocks = soup.find_all(['pre', 'code'])
            code_score = min(len(code_blocks) / 5, 1.0) if code_blocks else 0.0
            analysis['code_blocks'] = len(code_blocks)
            total_score += code_score * self.quality_indicators['code_blocks']
        except Exception as e:
            logger.warning(f"구조 분석 실패: {e}")
            # Fallback: plain-text line-count heuristic when HTML parsing fails.
            text_lines = content.split('\n')
            analysis['line_count'] = len(text_lines)
            total_score += min(len(text_lines) / 50, 0.8) * 0.3

        # 3. Domain authority (flat 0.2 weight, outside quality_indicators).
        domain = urlparse(url).netloc.lower()
        authority_score = self._calculate_authority_score(domain)
        analysis['authority_score'] = authority_score
        total_score += authority_score * 0.2

        # 4. Spam detection: scale the accumulated score down by the spam score.
        # NOTE(review): applied BEFORE the readability bonus, so readability is
        # never spam-penalized — preserved as-is to keep scoring stable.
        spam_score = self._detect_spam(content)
        analysis['spam_score'] = spam_score
        total_score *= (1.0 - spam_score)

        # 5. Language quality (simple heuristic).
        readability_score = self._calculate_readability(content)
        analysis['readability_score'] = readability_score
        total_score += readability_score * self.quality_indicators['readability']

        # Clamp final score to [0, 1].
        final_score = min(max(total_score, 0.0), 1.0)

        # Map the score onto a quality grade.
        if final_score >= 0.8:
            quality = QualityScore.EXCELLENT
        elif final_score >= 0.6:
            quality = QualityScore.GOOD
        elif final_score >= 0.4:
            quality = QualityScore.AVERAGE
        elif final_score >= 0.2:
            quality = QualityScore.POOR
        else:
            quality = QualityScore.SPAM

        return quality, final_score, analysis

    def _calculate_authority_score(self, domain: str) -> float:
        """Score a domain against the authority table.

        Matching order: exact match, then TLD suffix ('.gov' style keys),
        then substring (subdomain) match at a 20% discount. Unknown domains
        get a neutral 0.5.
        """
        domain = domain.lower()

        # Exact match.
        if domain in self.authority_domains:
            return self.authority_domains[domain]

        # TLD-suffix match ('.gov', '.edu', '.ac.kr').
        for auth_domain, score in self.authority_domains.items():
            if auth_domain.startswith('.') and domain.endswith(auth_domain):
                return score

        # Subdomain (substring) match, slightly discounted.
        for auth_domain, score in self.authority_domains.items():
            if not auth_domain.startswith('.') and auth_domain in domain:
                return score * 0.8

        # Default for unrecognized domains.
        return 0.5

    def _detect_spam(self, content: str) -> float:
        """Return a spam score in [0, 1]: one point per tripped indicator, /10."""
        spam_indicators = 0
        content_lower = content.lower()

        # Known spam phrases / character runs.
        for pattern in self.spam_patterns:
            if re.search(pattern, content_lower):
                spam_indicators += 1

        # Heuristic density checks on the original (case-preserving) text.
        if content.count('!') > len(content) * 0.02:  # excessive exclamation marks
            spam_indicators += 1
        if content.count('$') > len(content) * 0.01:  # excessive dollar signs
            spam_indicators += 1
        if len(re.findall(r'[A-Z]{3,}', content)) > 10:  # excessive all-caps runs
            spam_indicators += 1

        # Normalize: 10+ indicators saturate at 1.0.
        spam_score = min(spam_indicators / 10, 1.0)
        return spam_score

    def _calculate_readability(self, content: str) -> float:
        """Readability score in [0, 1] (simplified Flesch-style heuristic).

        Penalizes long average sentence length and a high ratio of
        "complex" words (longer than 5 characters).
        """
        if not content:
            return 0.0

        # Sentence count (split on terminal punctuation, drop empties).
        sentences = re.split(r'[.!?]+', content)
        sentence_count = len([s for s in sentences if s.strip()])
        if sentence_count == 0:
            return 0.0

        # Word count.
        words = content.split()
        word_count = len(words)
        if word_count == 0:
            return 0.0

        # Average sentence length in words.
        avg_sentence_length = word_count / sentence_count

        # Ratio of complex words (> 5 characters).
        complex_words = [word for word in words if len(word) > 5]
        complex_ratio = len(complex_words) / word_count

        # Flesch Reading Ease variant, clamped to [0, 1].
        readability = 1.0 - (avg_sentence_length / 30) * 0.4 - complex_ratio * 0.6
        return max(0.0, min(readability, 1.0))

    def generate_summary(self, content: str, max_length: int = 200) -> str:
        """Build a sentence-based extractive summary of at most max_length chars.

        Short content is returned unchanged. HTML is stripped first; the
        first sentence is always included, then further sentences while
        they fit, with an ellipsis on truncation.
        """
        if not content or len(content) < max_length:
            return content

        # Strip HTML tags; fall back to raw text on parser failure.
        # Fixed: was a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt.
        try:
            soup = BeautifulSoup(content, 'html.parser')
            text = soup.get_text()
        except Exception:
            text = content

        # Split into trimmed, non-empty sentences.
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        if not sentences:
            return text[:max_length] + "..."

        # Always start with the first sentence.
        summary = sentences[0]

        # Append further sentences while they fit within max_length.
        for sentence in sentences[1:]:
            if len(summary) + len(sentence) + 2 <= max_length:
                summary += ". " + sentence
            else:
                break

        # Hard-truncate with an ellipsis if still too long.
        if len(summary) >= max_length:
            summary = summary[:max_length-3] + "..."

        return summary

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Skynotdie/mky'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.