"""
Content Cleaner Module for Local Search MCP
Provides deduplication, boilerplate removal, and size filtering for chunks.
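
Typical usage (a minimal sketch):

    cleaner = ContentCleaner()
    cleaned_chunks, stats = cleaner.clean_chunks(chunks)
    print(f"Kept {stats.total_output} of {stats.total_input} chunks")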
"""
import hashlib
import re
import sys
from typing import List, Set, Tuple
from dataclasses import dataclass
from difflib import SequenceMatcher

from tqdm import tqdm
from langchain_core.documents import Document


@dataclass
class CleaningStats:
"""Statistics from content cleaning process."""
total_input: int
exact_duplicates_removed: int
near_duplicates_removed: int
boilerplate_removed: int
too_small_removed: int
total_output: int

    @property
    def uniqueness_ratio(self) -> float:
        """Fraction of input chunks that survived cleaning (0-1); 1.0 for empty input."""
        if self.total_input == 0:
            return 1.0
        return self.total_output / self.total_input


class ContentCleaner:
"""
Cleans content by removing duplicates, boilerplate, and too-small chunks.
"""
# Configuration
MIN_CHUNK_SIZE = 100 # Minimum characters per chunk
NEAR_DUPLICATE_THRESHOLD = 0.95 # 95% similarity threshold
BOILERPLATE_MIN_OCCURRENCES = 3 # Minimum occurrences to be considered boilerplate

    def __init__(self):
self.seen_hashes: Set[str] = set()
self.seen_contents: List[str] = []
self.boilerplate_patterns: List[str] = []

    def clean_chunks(
self,
chunks: List[Document],
detect_boilerplate: bool = True
) -> Tuple[List[Document], CleaningStats]:
"""
Clean a list of chunks by removing duplicates and low-quality content.
Args:
chunks: List of Document chunks
detect_boilerplate: Whether to detect and remove boilerplate patterns
Returns:
Tuple of (cleaned_chunks, cleaning_stats)
"""
stats = CleaningStats(
total_input=len(chunks),
exact_duplicates_removed=0,
near_duplicates_removed=0,
boilerplate_removed=0,
too_small_removed=0,
total_output=0
)
# Reset state
self.seen_hashes.clear()
self.seen_contents.clear()
# Detect boilerplate patterns if requested
if detect_boilerplate:
self.boilerplate_patterns = self._detect_boilerplate_patterns(chunks)
cleaned_chunks = []
for chunk in tqdm(chunks, desc="Cleaning chunks", file=sys.stderr, unit="chunk"):
content = chunk.page_content.strip()
# Skip empty chunks
if not content:
stats.too_small_removed += 1
continue
# Check minimum size
if len(content) < self.MIN_CHUNK_SIZE:
stats.too_small_removed += 1
continue
# Check for exact duplicates (MD5 hash)
content_hash = self._hash_content(content)
if content_hash in self.seen_hashes:
stats.exact_duplicates_removed += 1
continue
# Check for near duplicates
if self._is_near_duplicate(content):
stats.near_duplicates_removed += 1
continue
# Check for boilerplate
if detect_boilerplate and self._is_boilerplate(content):
stats.boilerplate_removed += 1
continue
# Chunk passed all checks
self.seen_hashes.add(content_hash)
self.seen_contents.append(content)
cleaned_chunks.append(chunk)
stats.total_output = len(cleaned_chunks)
return cleaned_chunks, stats

    def _hash_content(self, content: str) -> str:
"""Generate MD5 hash of content for exact duplicate detection."""
return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _is_near_duplicate(self, content: str) -> bool:
        """
        Check whether content is a near duplicate of previously seen content.

        Uses SequenceMatcher with `content` cached as the second sequence (b),
        short-circuiting via the cheap ratio upper bounds before calling ratio().
        """
        # Compare only against the 100 most recently kept contents so this pass
        # stays roughly linear rather than O(n²); distant duplicates may slip through.
        recent_contents = self.seen_contents[-100:]
# Initialize matcher with content as the second sequence (b)
# This is more efficient as SequenceMatcher analyzes 'b' once
matcher = SequenceMatcher(None, b=content)
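        # Illustrative: SequenceMatcher(None, "the quick brown fox",
        # "the quick brown fix").ratio() == 2 * 18 / 38 ≈ 0.947, just under the
        # 0.95 threshold, so one edited character in ~20 is not flagged.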
for seen_content in recent_contents:
matcher.set_seq1(seen_content)
# Optimization: Check upper bounds first before expensive ratio()
if matcher.real_quick_ratio() < self.NEAR_DUPLICATE_THRESHOLD:
continue
if matcher.quick_ratio() < self.NEAR_DUPLICATE_THRESHOLD:
continue
if matcher.ratio() >= self.NEAR_DUPLICATE_THRESHOLD:
return True
return False

    def _detect_boilerplate_patterns(self, chunks: List[Document]) -> List[str]:
"""
Detect boilerplate patterns by finding frequently repeated content.
Returns:
List of boilerplate pattern strings
"""
# Count line occurrences
line_counts = {}
for chunk in chunks:
lines = chunk.page_content.split('\n')
for line in lines:
line = line.strip()
if len(line) > 20: # Only consider substantial lines
line_counts[line] = line_counts.get(line, 0) + 1
# Find patterns that occur frequently
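        # Illustrative: a footer line such as "Copyright 2024 Example Corp. All
        # rights reserved." repeated in 3 or more chunks clears
        # BOILERPLATE_MIN_OCCURRENCES and is collected below.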
boilerplate = []
for line, count in line_counts.items():
if count >= self.BOILERPLATE_MIN_OCCURRENCES:
boilerplate.append(line)
return boilerplate

    def _is_boilerplate(self, content: str) -> bool:
"""
Check if content matches boilerplate patterns.
"""
if not self.boilerplate_patterns:
return False
        # Treat the chunk as boilerplate only when nearly all of its lines match
lines = [line.strip() for line in content.split('\n') if line.strip()]
if not lines:
return False
boilerplate_lines = sum(
1 for line in lines
if any(pattern in line for pattern in self.boilerplate_patterns)
)
# Consider it boilerplate if >80% of lines match patterns
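        # Worked example: 4 matching lines out of 5 gives 0.8, which is not
        # strictly greater than 0.8, so the chunk is kept; 5 of 5 is removed.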
return (boilerplate_lines / len(lines)) > 0.8

    def remove_common_boilerplate(self, text: str) -> str:
"""
Remove common boilerplate patterns from text.
This includes:
- Copyright notices
- License headers
- Common footers
- Navigation elements
"""
        # Remove copyright notices ("Copyright 2024 ...", "Copyright © 2024 ...",
        # "Copyright (c) 2024 ...")
        text = re.sub(
            r'Copyright\s+(?:©|\(c\))?\s*\d{4}.*?(?=\n\n|\Z)',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL
        )
# Remove license headers (common patterns)
text = re.sub(
r'Licensed under.*?(?=\n\n|\Z)',
'',
text,
flags=re.IGNORECASE | re.DOTALL
)
# Remove common navigation patterns
text = re.sub(
r'^\s*(?:Home|Back|Next|Previous|Table of Contents)\s*$',
'',
text,
flags=re.MULTILINE | re.IGNORECASE
)
# Remove excessive newlines
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()

    def reset(self):
"""Reset cleaner state (seen hashes and contents)."""
self.seen_hashes.clear()
self.seen_contents.clear()
self.boilerplate_patterns.clear()
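

if __name__ == "__main__":
    # Minimal smoke test / usage sketch; the sample texts below are illustrative.
    # Requires the third-party imports above (tqdm, langchain-core).
    sample = (
        "This is a unique paragraph about content cleaning with enough "
        "characters to pass the minimum size filter."
    )
    docs = [
        Document(page_content=sample),
        Document(page_content=sample),        # exact duplicate, should be dropped
        Document(page_content="Too short."),  # under MIN_CHUNK_SIZE, dropped
    ]

    cleaner = ContentCleaner()
    cleaned, stats = cleaner.clean_chunks(docs, detect_boilerplate=False)
    print(f"Kept {stats.total_output} of {stats.total_input} chunks "
          f"(uniqueness ratio {stats.uniqueness_ratio:.2f})")
    print(f"Exact duplicates removed: {stats.exact_duplicates_removed}")
    print(f"Too-small chunks removed: {stats.too_small_removed}")

    # Standalone boilerplate stripping on raw text
    noisy = "Copyright 2024 Example Corp.\n\nActual content survives."
    print(cleaner.remove_common_boilerplate(noisy))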