"""News preprocessing utilities."""
import logging
import re
from collections import Counter
from datetime import datetime
from typing import Any, Dict, Optional

from bs4 import BeautifulSoup
class NewsPreprocessor:
    """News content preprocessing and normalization.

    Cleans raw (possibly HTML) Korean news articles: strips markup and ads,
    standardizes quote characters, normalizes whitespace, and extracts
    lightweight metadata (reporter, date, company mentions), summaries,
    keywords, and redacts phone numbers.
    """

    def __init__(self):
        """Initialize logger and pre-compile all regex patterns.

        Patterns are compiled once here because every method may run per
        document; recompiling in the hot path would be wasteful.
        """
        self.logger = logging.getLogger("preprocessor")
        # Advertisement markers commonly found in Korean news copy.
        self._ad_patterns = [
            re.compile(r'\[광고\].*?(?=\n|$)', re.IGNORECASE),
            re.compile(r'※.*?광고.*?(?=\n|$)', re.IGNORECASE),
            re.compile(r'▲.*?광고.*?(?=\n|$)', re.IGNORECASE),
            re.compile(r'<광고>.*?</광고>', re.IGNORECASE | re.DOTALL),
        ]
        # Map typographic quote variants onto plain ASCII quotes.
        self._quote_patterns = [
            (re.compile(r'[""„«»]'), '"'),  # double-quote variants -> "
            (re.compile(r'[''`]'), "'"),    # single-quote variants -> '
        ]
        # Horizontal whitespace only ([^\S\n] = whitespace except newline):
        # newlines are handled separately in normalize_text() so paragraph
        # breaks survive normalization.
        self._whitespace_pattern = re.compile(r'[^\S\n]+')
        self._reporter_pattern = re.compile(r'([가-힣]+)\s*기자')
        self._date_pattern = re.compile(r'(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일')
        self._company_pattern = re.compile(r'([가-힣]+(?:전자|그룹|회사|기업|코퍼레이션))')

    def process(self, text: str) -> str:
        """Full preprocessing pipeline.

        Order matters: HTML is stripped first so the text-level ad/quote/
        whitespace passes see plain text.

        Args:
            text: Raw text content (may contain HTML).

        Returns:
            Processed and cleaned text; "" for falsy input.
        """
        if not text:
            return ""
        processed = self.clean_html(text)
        processed = self.remove_ads_and_promotions(processed)
        processed = self.standardize_quotes(processed)
        processed = self.normalize_text(processed)
        return processed.strip()

    def clean_html(self, html_content: str) -> str:
        """Clean HTML tags and extract text content.

        Args:
            html_content: Raw HTML content.

        Returns:
            Cleaned text content; "" for falsy input.
        """
        if not html_content:
            return ""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Drop non-content elements entirely.
            for element in soup(["script", "style", "meta", "link", "noscript"]):
                element.decompose()
            # Drop containers whose class name suggests advertising.
            ad_classes = ['ad', 'advertisement', 'banner', 'sponsor', 'promotion']
            for ad_class in ad_classes:
                for element in soup.find_all(class_=re.compile(ad_class, re.I)):
                    element.decompose()
            return soup.get_text(separator=' ', strip=True)
        except Exception as e:
            # Deliberate best-effort boundary: log and degrade to a naive
            # tag-stripping regex rather than failing the whole pipeline.
            self.logger.warning(f"HTML cleaning failed: {e}")
            return re.sub(r'<[^>]+>', ' ', html_content)

    def normalize_text(self, text: str) -> str:
        """Normalize whitespace while preserving paragraph breaks.

        Fix: the "remove excessive line breaks" rule previously ran *after*
        a blanket `\\s+` -> ' ' substitution had already deleted every
        newline, so it was dead code. Line breaks are now collapsed first,
        and only horizontal whitespace is squeezed to single spaces.

        Args:
            text: Input text.

        Returns:
            Normalized text; "" for falsy input.
        """
        if not text:
            return ""
        # Collapse runs of 3+ (possibly whitespace-padded) line breaks down
        # to a single blank line, keeping paragraph structure.
        normalized = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        # Squeeze runs of spaces/tabs (newlines excluded by the pattern).
        normalized = self._whitespace_pattern.sub(' ', normalized)
        # Trim stray spaces hugging the surviving line breaks.
        normalized = re.sub(r' ?\n ?', '\n', normalized)
        return normalized.strip()

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Extract metadata from news text.

        Args:
            text: News text content.

        Returns:
            Dict with keys:
              - "date": ISO-8601 string or None.
              - "reporter": reporter name (str) or None.
              - "company": list of up to 5 company names, most-mentioned
                first, or None if no company matched.
        """
        metadata: Dict[str, Any] = {
            "date": None,
            "reporter": None,
            "company": None,
        }
        if not text:
            return metadata
        # Reporter: "<name> 기자".
        reporter_match = self._reporter_pattern.search(text)
        if reporter_match:
            metadata["reporter"] = reporter_match.group(1)
        # Date: "YYYY년 M월 D일".
        date_match = self._date_pattern.search(text)
        if date_match:
            year, month, day = date_match.groups()
            try:
                metadata["date"] = datetime(int(year), int(month), int(day)).isoformat()
            except ValueError:
                pass  # Invalid calendar date (e.g. 13월) — leave as None.
        # Companies: top 5 by mention count. (Previously list(set(...))[:5],
        # which returned 5 names in arbitrary, nondeterministic set order.)
        company_matches = self._company_pattern.findall(text)
        if company_matches:
            metadata["company"] = [
                name for name, _ in Counter(company_matches).most_common(5)
            ]
        return metadata

    def remove_ads_and_promotions(self, text: str) -> str:
        """Remove advertisements and promotional content.

        Args:
            text: Input text.

        Returns:
            Text with ad/promo lines removed; "" for falsy input.
        """
        if not text:
            return ""
        cleaned = text
        # Pre-compiled ad markers.
        for pattern in self._ad_patterns:
            cleaned = pattern.sub('', cleaned)
        # Common promotional phrases (re module caches these compilations).
        promo_patterns = [
            r'지금\s+구매하세요.*?(?=\n|$)',
            r'할인.*?이벤트.*?(?=\n|$)',
            r'▶.*?바로가기.*?(?=\n|$)',
            r'클릭.*?더보기.*?(?=\n|$)',
        ]
        for pattern in promo_patterns:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        return cleaned

    def standardize_quotes(self, text: str) -> str:
        """Standardize various quote marks to standard ASCII quotes.

        Args:
            text: Input text.

        Returns:
            Text with standardized quotes; "" for falsy input.
        """
        if not text:
            return ""
        standardized = text
        for pattern, replacement in self._quote_patterns:
            standardized = pattern.sub(replacement, standardized)
        return standardized

    def extract_summary(self, text: str, max_sentences: int = 3) -> str:
        """Extract a leading-sentences summary from news text.

        Args:
            text: Full news text.
            max_sentences: Maximum number of sentences in the summary.

        Returns:
            Summary text ending in a period; "" if no sentences found.
        """
        if not text:
            return ""
        # Split on Korean/English/CJK sentence terminators.
        sentences = [s.strip() for s in re.split(r'[.!?。!?]', text) if s.strip()]
        if not sentences:
            return ""
        # The terminators were consumed by the split and each sentence is
        # stripped, so the joined summary always needs a closing period
        # (the old endswith() check could never be true).
        return '. '.join(sentences[:max_sentences]) + '.'

    def extract_keywords(self, text: str, max_keywords: int = 10) -> list:
        """Extract keywords from news text by simple frequency counting.

        Args:
            text: Input text.
            max_keywords: Maximum number of keywords to extract.

        Returns:
            List of keywords, most frequent first (ties keep first-seen
            order); [] for falsy input.
        """
        if not text:
            return []
        # Korean words (2+ chars), English words (3+ chars), and numbers.
        words = re.findall(r'[가-힣]{2,}|[a-zA-Z]{3,}|[0-9]+', text.lower())
        stop_words = {
            '그리고', '하지만', '그러나', '또한', '따라서', '때문에',
            '이번', '오늘', '어제', '내일', '지금', '현재', '당시',
            'and', 'but', 'the', 'for', 'with', 'this', 'that'
        }
        # len >= 2 only affects numbers: the word patterns already require
        # 2+/3+ characters; single digits are dropped.
        counts = Counter(
            word for word in words
            if word not in stop_words and len(word) >= 2
        )
        return [word for word, _ in counts.most_common(max_keywords)]

    def clean_phone_numbers(self, text: str) -> str:
        """Replace phone numbers in text with a redaction marker.

        Args:
            text: Input text.

        Returns:
            Text with phone numbers replaced by '[연락처]'; "" for falsy
            input.
        """
        if not text:
            return ""
        # Korean phone number shapes. NOTE: the former separate
        # 010-1234-5678 pattern was fully subsumed by the first pattern
        # and has been dropped — behavior is unchanged.
        phone_patterns = [
            r'\d{2,3}-\d{3,4}-\d{4}',        # 02-1234-5678 / 010-1234-5678
            r'\(\d{2,3}\)\s*\d{3,4}-\d{4}',  # (02) 1234-5678
        ]
        cleaned = text
        for pattern in phone_patterns:
            cleaned = re.sub(pattern, '[연락처]', cleaned)
        return cleaned