"""News preprocessing utilities."""
import logging
import re
from collections import Counter
from datetime import datetime
from typing import Any, Dict, Optional

from bs4 import BeautifulSoup
class NewsPreprocessor:
    """News content preprocessing and normalization.

    Cleans raw (possibly HTML) Korean news articles: strips markup and ads,
    standardizes quote characters, normalizes whitespace, and extracts
    lightweight metadata (reporter, date, company mentions), summaries,
    keywords, and redacts phone numbers.
    """

    def __init__(self):
        """Initialize logger and pre-compile all regex patterns.

        Patterns are compiled once here because every method may run per
        document; recompiling in the hot path would be wasteful.
        """
        self.logger = logging.getLogger("preprocessor")
        # Advertisement markers commonly found in Korean news copy.
        self._ad_patterns = [
            re.compile(r'\[광고\].*?(?=\n|$)', re.IGNORECASE),
            re.compile(r'※.*?광고.*?(?=\n|$)', re.IGNORECASE),
            re.compile(r'▲.*?광고.*?(?=\n|$)', re.IGNORECASE),
            re.compile(r'<광고>.*?</광고>', re.IGNORECASE | re.DOTALL),
        ]
        # Map typographic quote variants onto plain ASCII quotes.
        self._quote_patterns = [
            (re.compile(r'[""„«»]'), '"'),  # double-quote variants -> "
            (re.compile(r'[''`]'), "'"),    # single-quote variants -> '
        ]
        # Horizontal whitespace only ([^\S\n] = whitespace except newline):
        # newlines are handled separately in normalize_text() so paragraph
        # breaks survive normalization.
        self._whitespace_pattern = re.compile(r'[^\S\n]+')
        self._reporter_pattern = re.compile(r'([가-힣]+)\s*기자')
        self._date_pattern = re.compile(r'(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일')
        self._company_pattern = re.compile(r'([가-힣]+(?:전자|그룹|회사|기업|코퍼레이션))')

    def process(self, text: str) -> str:
        """Full preprocessing pipeline.

        Order matters: HTML is stripped first so the text-level ad/quote/
        whitespace passes see plain text.

        Args:
            text: Raw text content (may contain HTML).

        Returns:
            Processed and cleaned text; "" for falsy input.
        """
        if not text:
            return ""
        processed = self.clean_html(text)
        processed = self.remove_ads_and_promotions(processed)
        processed = self.standardize_quotes(processed)
        processed = self.normalize_text(processed)
        return processed.strip()

    def clean_html(self, html_content: str) -> str:
        """Clean HTML tags and extract text content.

        Args:
            html_content: Raw HTML content.

        Returns:
            Cleaned text content; "" for falsy input.
        """
        if not html_content:
            return ""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Drop non-content elements entirely.
            for element in soup(["script", "style", "meta", "link", "noscript"]):
                element.decompose()
            # Drop containers whose class name suggests advertising.
            ad_classes = ['ad', 'advertisement', 'banner', 'sponsor', 'promotion']
            for ad_class in ad_classes:
                for element in soup.find_all(class_=re.compile(ad_class, re.I)):
                    element.decompose()
            return soup.get_text(separator=' ', strip=True)
        except Exception as e:
            # Deliberate best-effort boundary: log and degrade to a naive
            # tag-stripping regex rather than failing the whole pipeline.
            self.logger.warning(f"HTML cleaning failed: {e}")
            return re.sub(r'<[^>]+>', ' ', html_content)

    def normalize_text(self, text: str) -> str:
        """Normalize whitespace while preserving paragraph breaks.

        Fix: the "remove excessive line breaks" rule previously ran *after*
        a blanket `\\s+` -> ' ' substitution had already deleted every
        newline, so it was dead code. Line breaks are now collapsed first,
        and only horizontal whitespace is squeezed to single spaces.

        Args:
            text: Input text.

        Returns:
            Normalized text; "" for falsy input.
        """
        if not text:
            return ""
        # Collapse runs of 3+ (possibly whitespace-padded) line breaks down
        # to a single blank line, keeping paragraph structure.
        normalized = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        # Squeeze runs of spaces/tabs (newlines excluded by the pattern).
        normalized = self._whitespace_pattern.sub(' ', normalized)
        # Trim stray spaces hugging the surviving line breaks.
        normalized = re.sub(r' ?\n ?', '\n', normalized)
        return normalized.strip()

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Extract metadata from news text.

        Args:
            text: News text content.

        Returns:
            Dict with keys:
              - "date": ISO-8601 string or None.
              - "reporter": reporter name (str) or None.
              - "company": list of up to 5 company names, most-mentioned
                first, or None if no company matched.
        """
        metadata: Dict[str, Any] = {
            "date": None,
            "reporter": None,
            "company": None,
        }
        if not text:
            return metadata
        # Reporter: "<name> 기자".
        reporter_match = self._reporter_pattern.search(text)
        if reporter_match:
            metadata["reporter"] = reporter_match.group(1)
        # Date: "YYYY년 M월 D일".
        date_match = self._date_pattern.search(text)
        if date_match:
            year, month, day = date_match.groups()
            try:
                metadata["date"] = datetime(int(year), int(month), int(day)).isoformat()
            except ValueError:
                pass  # Invalid calendar date (e.g. 13월) — leave as None.
        # Companies: top 5 by mention count. (Previously list(set(...))[:5],
        # which returned 5 names in arbitrary, nondeterministic set order.)
        company_matches = self._company_pattern.findall(text)
        if company_matches:
            metadata["company"] = [
                name for name, _ in Counter(company_matches).most_common(5)
            ]
        return metadata

    def remove_ads_and_promotions(self, text: str) -> str:
        """Remove advertisements and promotional content.

        Args:
            text: Input text.

        Returns:
            Text with ad/promo lines removed; "" for falsy input.
        """
        if not text:
            return ""
        cleaned = text
        # Pre-compiled ad markers.
        for pattern in self._ad_patterns:
            cleaned = pattern.sub('', cleaned)
        # Common promotional phrases (re module caches these compilations).
        promo_patterns = [
            r'지금\s+구매하세요.*?(?=\n|$)',
            r'할인.*?이벤트.*?(?=\n|$)',
            r'▶.*?바로가기.*?(?=\n|$)',
            r'클릭.*?더보기.*?(?=\n|$)',
        ]
        for pattern in promo_patterns:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        return cleaned

    def standardize_quotes(self, text: str) -> str:
        """Standardize various quote marks to standard ASCII quotes.

        Args:
            text: Input text.

        Returns:
            Text with standardized quotes; "" for falsy input.
        """
        if not text:
            return ""
        standardized = text
        for pattern, replacement in self._quote_patterns:
            standardized = pattern.sub(replacement, standardized)
        return standardized

    def extract_summary(self, text: str, max_sentences: int = 3) -> str:
        """Extract a leading-sentences summary from news text.

        Args:
            text: Full news text.
            max_sentences: Maximum number of sentences in the summary.

        Returns:
            Summary text ending in a period; "" if no sentences found.
        """
        if not text:
            return ""
        # Split on Korean/English/CJK sentence terminators.
        sentences = [s.strip() for s in re.split(r'[.!?。!?]', text) if s.strip()]
        if not sentences:
            return ""
        # The terminators were consumed by the split and each sentence is
        # stripped, so the joined summary always needs a closing period
        # (the old endswith() check could never be true).
        return '. '.join(sentences[:max_sentences]) + '.'

    def extract_keywords(self, text: str, max_keywords: int = 10) -> list:
        """Extract keywords from news text by simple frequency counting.

        Args:
            text: Input text.
            max_keywords: Maximum number of keywords to extract.

        Returns:
            List of keywords, most frequent first (ties keep first-seen
            order); [] for falsy input.
        """
        if not text:
            return []
        # Korean words (2+ chars), English words (3+ chars), and numbers.
        words = re.findall(r'[가-힣]{2,}|[a-zA-Z]{3,}|[0-9]+', text.lower())
        stop_words = {
            '그리고', '하지만', '그러나', '또한', '따라서', '때문에',
            '이번', '오늘', '어제', '내일', '지금', '현재', '당시',
            'and', 'but', 'the', 'for', 'with', 'this', 'that'
        }
        # len >= 2 only affects numbers: the word patterns already require
        # 2+/3+ characters; single digits are dropped.
        counts = Counter(
            word for word in words
            if word not in stop_words and len(word) >= 2
        )
        return [word for word, _ in counts.most_common(max_keywords)]

    def clean_phone_numbers(self, text: str) -> str:
        """Replace phone numbers in text with a redaction marker.

        Args:
            text: Input text.

        Returns:
            Text with phone numbers replaced by '[연락처]'; "" for falsy
            input.
        """
        if not text:
            return ""
        # Korean phone number shapes. NOTE: the former separate
        # 010-1234-5678 pattern was fully subsumed by the first pattern
        # and has been dropped — behavior is unchanged.
        phone_patterns = [
            r'\d{2,3}-\d{3,4}-\d{4}',        # 02-1234-5678 / 010-1234-5678
            r'\(\d{2,3}\)\s*\d{3,4}-\d{4}',  # (02) 1234-5678
        ]
        cleaned = text
        for pattern in phone_patterns:
            cleaned = re.sub(pattern, '[연락처]', cleaned)
        return cleaned