#!/usr/bin/env python3
"""
Web Search Content Analyzer - AI 기반 콘텐츠 분석기
검색 결과의 품질 평가, 권위성 점수, 스팸 탐지 등을 수행합니다.
"""
import re
import logging
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from web_search_base import QualityScore
logger = logging.getLogger(__name__)


class ContentAnalyzer:
    """AI-based content analyzer."""

    def __init__(self):
        self.quality_indicators = self._load_quality_indicators()
        self.authority_domains = self._load_authority_domains()
        self.spam_patterns = self._load_spam_patterns()

    def _load_quality_indicators(self) -> Dict[str, float]:
        """Quality-indicator weights; the values sum to 1.0."""
        return {
            'content_length': 0.15,
            'paragraph_count': 0.10,
            'heading_structure': 0.15,
            'external_links': 0.10,
            'image_count': 0.05,
            'code_blocks': 0.10,
            # NOTE: 'citation_count' is weighted here but not yet scored
            # in analyze_content_quality().
            'citation_count': 0.15,
            'readability': 0.20
        }

    def _load_authority_domains(self) -> Dict[str, float]:
        """Authority scores for known domains; keys starting with '.'
        are TLD suffixes matched with endswith()."""
        return {
            # Academic / research
            'scholar.google.com': 1.0,
            'pubmed.ncbi.nlm.nih.gov': 1.0,
            'arxiv.org': 0.95,
            'ieee.org': 0.95,
            'acm.org': 0.9,
            # Tech / development
            'github.com': 0.9,
            'stackoverflow.com': 0.85,
            'developer.mozilla.org': 0.9,
            'docs.python.org': 0.9,
            # News / media
            'reuters.com': 0.9,
            'bbc.com': 0.85,
            'cnn.com': 0.8,
            # General reference
            'wikipedia.org': 0.85,
            'britannica.com': 0.9,
            # Government / public institutions
            '.gov': 0.95,
            '.edu': 0.9,
            '.ac.kr': 0.9
        }

    def _load_spam_patterns(self) -> List[str]:
        """Regex patterns that flag likely spam."""
        return [
            r'click here for',
            r'limited time offer',
            r'free download now',
            r'增值税发票',  # Chinese-language spam ("VAT invoice")
            r'виагра',  # Russian-language spam ("viagra")
            r'!{2,}',  # runs of exclamation marks
            r'\${3,}',  # runs of dollar signs
        ]

    def analyze_content_quality(self, content: str, url: str,
                                metadata: Optional[Dict] = None) -> Tuple[QualityScore, float, Dict]:
        """Analyze content quality; returns (grade, score, per-factor details)."""
        if not content:
            return QualityScore.POOR, 0.1, {'error': 'No content'}
        analysis = {}
        total_score = 0.0

        # 1. Content length
        content_length = len(content)
        if content_length < 100:
            length_score = 0.2
        elif content_length < 500:
            length_score = 0.5
        elif content_length < 2000:
            length_score = 0.8
        else:
            length_score = 1.0
        analysis['content_length'] = content_length
        total_score += length_score * self.quality_indicators['content_length']

        # 2. Structure analysis (HTML-based)
        try:
            soup = BeautifulSoup(content, 'html.parser')
            # Heading structure
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            heading_score = min(len(headings) / 5, 1.0) if headings else 0.1
            analysis['heading_count'] = len(headings)
            total_score += heading_score * self.quality_indicators['heading_structure']
            # Paragraph count
            paragraphs = soup.find_all('p')
            paragraph_score = min(len(paragraphs) / 10, 1.0) if paragraphs else 0.1
            analysis['paragraph_count'] = len(paragraphs)
            total_score += paragraph_score * self.quality_indicators['paragraph_count']
            # External links
            links = soup.find_all('a', href=True)
            external_links = [link for link in links if 'http' in link.get('href', '')]
            link_score = min(len(external_links) / 20, 1.0)
            analysis['external_links'] = len(external_links)
            total_score += link_score * self.quality_indicators['external_links']
            # Images
            images = soup.find_all('img')
            image_score = min(len(images) / 10, 1.0)
            analysis['image_count'] = len(images)
            total_score += image_score * self.quality_indicators['image_count']
            # Code blocks (for technical documents)
            code_blocks = soup.find_all(['pre', 'code'])
            code_score = min(len(code_blocks) / 5, 1.0) if code_blocks else 0.0
            analysis['code_blocks'] = len(code_blocks)
            total_score += code_score * self.quality_indicators['code_blocks']
        except Exception as e:
            logger.warning(f"Structure analysis failed: {e}")
            # Fall back to plain-text analysis when HTML parsing fails
            text_lines = content.split('\n')
            analysis['line_count'] = len(text_lines)
            total_score += min(len(text_lines) / 50, 0.8) * 0.3

        # 3. Domain authority score
        domain = urlparse(url).netloc.lower()
        authority_score = self._calculate_authority_score(domain)
        analysis['authority_score'] = authority_score
        total_score += authority_score * 0.2

        # 4. Spam detection (penalizes the running total proportionally)
        spam_score = self._detect_spam(content)
        analysis['spam_score'] = spam_score
        total_score *= (1.0 - spam_score)

        # 5. Language quality (simple heuristic)
        readability_score = self._calculate_readability(content)
        analysis['readability_score'] = readability_score
        total_score += readability_score * self.quality_indicators['readability']

        # Clamp the total into [0.0, 1.0]
        final_score = min(max(total_score, 0.0), 1.0)

        # Map the score onto a quality grade
        if final_score >= 0.8:
            quality = QualityScore.EXCELLENT
        elif final_score >= 0.6:
            quality = QualityScore.GOOD
        elif final_score >= 0.4:
            quality = QualityScore.AVERAGE
        elif final_score >= 0.2:
            quality = QualityScore.POOR
        else:
            quality = QualityScore.SPAM
        return quality, final_score, analysis
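
    # Worked example (hypothetical numbers): a 2,500-char page with 5 headings,
    # 10 paragraphs, 10 external links, 2 images, no code blocks, a .gov domain,
    # readability 0.7, and no spam hits scores
    #   0.15 + 0.15 + 0.10 + 0.05 + 0.01 + 0.19 + 0.14 = 0.79 -> QualityScore.GOOD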

    def _calculate_authority_score(self, domain: str) -> float:
        """Calculate a domain-authority score."""
        domain = domain.lower()
        # Exact match
        if domain in self.authority_domains:
            return self.authority_domains[domain]
        # Partial match (TLD suffixes such as '.gov')
        for auth_domain, score in self.authority_domains.items():
            if auth_domain.startswith('.') and domain.endswith(auth_domain):
                return score
        # Subdomain match
        for auth_domain, score in self.authority_domains.items():
            if not auth_domain.startswith('.') and auth_domain in domain:
                return score * 0.8  # Slight discount for subdomains
        # Default score for unknown domains
        return 0.5
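
    # Matching precedence, following the three checks above (illustrative domains):
    #   'arxiv.org'       -> exact match             -> 0.95
    #   'news.mit.edu'    -> TLD suffix '.edu'       -> 0.90
    #   'gist.github.com' -> substring 'github.com'  -> 0.9 * 0.8 = 0.72
    #   'example.net'     -> unknown                 -> 0.50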

    def _detect_spam(self, content: str) -> float:
        """Detect spam and return a score in [0, 1]."""
        spam_indicators = 0
        content_lower = content.lower()
        for pattern in self.spam_patterns:
            if re.search(pattern, content_lower):
                spam_indicators += 1
        # Additional spam heuristics
        if content.count('!') > len(content) * 0.02:  # Excessive exclamation marks
            spam_indicators += 1
        if content.count('$') > len(content) * 0.01:  # Excessive dollar signs
            spam_indicators += 1
        if len(re.findall(r'[A-Z]{3,}', content)) > 10:  # Excessive capitalization
            spam_indicators += 1
        # Normalize to a 0-1 spam score
        spam_score = min(spam_indicators / 10, 1.0)
        return spam_score
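
    # Example: a sufficiently long page containing "limited time offer" and "!!!"
    # trips two of the regex patterns; with no other indicators,
    # spam_score = 2 / 10 = 0.2, which scales the running total by (1.0 - 0.2)
    # in analyze_content_quality().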

    def _calculate_readability(self, content: str) -> float:
        """Calculate a readability score (simple heuristic)."""
        if not content:
            return 0.0
        # Count sentences
        sentences = re.split(r'[.!?]+', content)
        sentence_count = len([s for s in sentences if s.strip()])
        if sentence_count == 0:
            return 0.0
        # Count words
        words = content.split()
        word_count = len(words)
        if word_count == 0:
            return 0.0
        # Average sentence length
        avg_sentence_length = word_count / sentence_count
        # Ratio of complex words (longer than five characters)
        complex_words = [word for word in words if len(word) > 5]
        complex_ratio = len(complex_words) / word_count
        # Readability score (a rough Flesch Reading Ease variant)
        readability = 1.0 - (avg_sentence_length / 30) * 0.4 - complex_ratio * 0.6
        return max(0.0, min(readability, 1.0))
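
    # Worked example: 150 words across 10 sentences gives avg_sentence_length = 15;
    # if 45 of the words exceed five characters, complex_ratio = 0.3, so
    # readability = 1.0 - (15 / 30) * 0.4 - 0.3 * 0.6 = 1.0 - 0.2 - 0.18 = 0.62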

    def generate_summary(self, content: str, max_length: int = 200) -> str:
        """Generate a content summary."""
        if not content or len(content) < max_length:
            return content
        # Strip HTML tags
        try:
            soup = BeautifulSoup(content, 'html.parser')
            text = soup.get_text()
        except Exception:
            text = content
        # Split into sentences
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return text[:max_length] + "..."
        # Lead with the first sentence
        summary = sentences[0]
        # Append further sentences while they fit within the limit
        for sentence in sentences[1:]:
            if len(summary) + len(sentence) + 2 <= max_length:
                summary += ". " + sentence
            else:
                break
        if len(summary) >= max_length:
            summary = summary[:max_length - 3] + "..."
        return summary
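

if __name__ == '__main__':
    # Minimal usage sketch; the HTML snippet and URL below are made-up inputs,
    # and QualityScore comes from web_search_base as imported above.
    logging.basicConfig(level=logging.INFO)
    sample_html = (
        "<h1>Asyncio Basics</h1>"
        "<p>Python's asyncio library runs coroutines on an event loop.</p>"
        "<p>Use asyncio.run() to drive a top-level coroutine.</p>"
        "<pre>import asyncio\nasyncio.run(main())</pre>"
    )
    analyzer = ContentAnalyzer()
    quality, score, details = analyzer.analyze_content_quality(
        sample_html, url='https://docs.python.org/3/library/asyncio.html'
    )
    print(f"quality={quality}, score={score:.2f}, details={details}")
    print(analyzer.generate_summary(sample_html, max_length=120))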