text_processing.py
"""Text processing utilities for Writer MCP.""" import re from typing import List, Optional, Set from collections import Counter from ..utils.logger import get_logger logger = get_logger(__name__) def clean_text(text: str) -> str: """Clean and normalize text. Args: text: Text to clean Returns: Cleaned text """ if not isinstance(text, str): return "" # Remove extra whitespace text = re.sub(r'\s+', ' ', text) # Remove leading/trailing whitespace text = text.strip() return text def extract_keywords(text: str, min_length: int = 3, max_keywords: int = 20) -> List[str]: """Extract keywords from text. Args: text: Text to extract keywords from min_length: Minimum keyword length max_keywords: Maximum number of keywords to return Returns: List of extracted keywords """ if not isinstance(text, str) or not text.strip(): return [] # Convert to lowercase text = text.lower() # Remove punctuation and split into words words = re.findall(r'\b[a-zA-Z]+\b', text) # Filter by length and remove common stop words stop_words = { 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'her', 'its', 'our', 'their' } filtered_words = [ word for word in words if len(word) >= min_length and word not in stop_words ] # Count word frequency word_counts = Counter(filtered_words) # Return most common words keywords = [word for word, _ in word_counts.most_common(max_keywords)] return keywords def generate_summary(text: str, max_sentences: int = 3) -> str: """Generate a simple extractive summary of text. Args: text: Text to summarize max_sentences: Maximum number of sentences in summary Returns: Generated summary """ if not isinstance(text, str) or not text.strip(): return "" # Split into sentences sentences = re.split(r'[.!?]+', text) sentences = [s.strip() for s in sentences if s.strip()] if len(sentences) <= max_sentences: return text # Simple scoring based on sentence length and position scored_sentences = [] for i, sentence in enumerate(sentences): # Score based on length (prefer medium-length sentences) length_score = min(len(sentence.split()) / 20.0, 1.0) # Score based on position (prefer earlier sentences) position_score = 1.0 - (i / len(sentences)) total_score = length_score * 0.7 + position_score * 0.3 scored_sentences.append((sentence, total_score)) # Sort by score and take top sentences scored_sentences.sort(key=lambda x: x[1], reverse=True) top_sentences = [s[0] for s in scored_sentences[:max_sentences]] # Maintain original order summary_sentences = [] for sentence in sentences: if sentence in top_sentences: summary_sentences.append(sentence) if len(summary_sentences) >= max_sentences: break return '. '.join(summary_sentences) + '.' def calculate_text_similarity(text1: str, text2: str) -> float: """Calculate similarity between two texts using Jaccard similarity. 
def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate similarity between two texts using Jaccard similarity.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if not isinstance(text1, str) or not isinstance(text2, str):
        return 0.0

    # Extract keywords from both texts
    keywords1 = set(extract_keywords(text1))
    keywords2 = set(extract_keywords(text2))

    if not keywords1 and not keywords2:
        return 1.0 if text1.strip() == text2.strip() else 0.0

    if not keywords1 or not keywords2:
        return 0.0

    # Jaccard similarity: |intersection| / |union|
    intersection = len(keywords1.intersection(keywords2))
    union = len(keywords1.union(keywords2))

    return intersection / union if union > 0 else 0.0


def extract_character_mentions(text: str, character_names: List[str]) -> List[str]:
    """Extract mentions of character names from text.

    Args:
        text: Text to search
        character_names: List of character names to look for

    Returns:
        List of mentioned character names
    """
    if not isinstance(text, str) or not character_names:
        return []

    mentioned = []
    text_lower = text.lower()

    for name in character_names:
        if isinstance(name, str) and name.strip():
            # Whole-word match so that e.g. "Ann" does not match inside "Anna"
            pattern = r'\b' + re.escape(name.lower()) + r'\b'
            if re.search(pattern, text_lower):
                mentioned.append(name)

    return mentioned


def tokenize_text(text: str) -> List[str]:
    """Tokenize text into words.

    Args:
        text: Text to tokenize

    Returns:
        List of tokens
    """
    if not isinstance(text, str):
        return []

    # Extract words (letters only)
    tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())

    return tokens


def calculate_readability_score(text: str) -> float:
    """Calculate a simple readability score for text.

    Args:
        text: Text to analyze

    Returns:
        Readability score (higher = more readable)
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0

    # Split into sentences and words
    sentences = re.split(r'[.!?]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    words = tokenize_text(text)

    if not sentences or not words:
        return 0.0

    # Calculate metrics
    avg_sentence_length = len(words) / len(sentences)
    avg_word_length = sum(len(word) for word in words) / len(words)

    # Simple readability score: shorter sentences and shorter words score
    # higher. Each component is clamped to the 0-1 range.
    sentence_score = min(1.0, max(0.0, 1 - (avg_sentence_length - 10) / 20))
    word_score = min(1.0, max(0.0, 1 - (avg_word_length - 4) / 6))

    return (sentence_score + word_score) / 2


def find_text_patterns(text: str, patterns: List[str]) -> List[Tuple[str, List[str]]]:
    """Find specific patterns in text.

    Args:
        text: Text to search
        patterns: List of regex patterns to find

    Returns:
        List of (pattern, matches) tuples
    """
    if not isinstance(text, str) or not patterns:
        return []

    results = []
    for pattern in patterns:
        try:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                results.append((pattern, matches))
        except re.error as e:
            logger.warning(f"Invalid regex pattern '{pattern}': {e}")

    return results


def normalize_whitespace(text: str) -> str:
    """Normalize whitespace in text.

    Args:
        text: Text to normalize

    Returns:
        Text with normalized whitespace
    """
    if not isinstance(text, str):
        return ""

    # Replace runs of whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove leading/trailing whitespace
    text = text.strip()

    return text
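
# Illustrative usage (examples added for clarity, not from the original
# module): similarity is computed over keyword *sets*, so stop words and
# repetition do not affect the score, and name matching is whole-word.
#
#   >>> calculate_text_similarity("the dragon sleeps", "a dragon sleeps soundly")
#   0.6666666666666666
#   >>> extract_character_mentions("Ann met Anna at noon.", ["Ann", "Anna", "Bob"])
#   ['Ann', 'Anna']
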
def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to maximum length.

    Args:
        text: Text to truncate
        max_length: Maximum length
        suffix: Suffix to add if truncated

    Returns:
        Truncated text
    """
    if not isinstance(text, str):
        return ""

    if len(text) <= max_length:
        return text

    # Try to truncate at a word boundary
    truncated = text[:max_length - len(suffix)]
    last_space = truncated.rfind(' ')

    # Back up to the boundary only if it is reasonably close to the cut point
    if last_space > max_length * 0.7:
        truncated = truncated[:last_space]

    return truncated + suffix


def extract_quoted_text(text: str) -> List[str]:
    """Extract quoted text from a string.

    Args:
        text: Text to search

    Returns:
        List of quoted strings
    """
    if not isinstance(text, str):
        return []

    # Find text in double quotes
    double_quoted = re.findall(r'"([^"]+)"', text)

    # Find text in single quotes
    single_quoted = re.findall(r"'([^']+)'", text)

    return double_quoted + single_quoted
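
# Illustrative usage (examples added for clarity, not from the original
# module): truncation backs up to a word boundary only when one falls within
# the last 30% of the allowed length, and quote extraction returns
# double-quoted matches before single-quoted ones.
#
#   >>> truncate_text("The quick brown fox jumps over the lazy dog", 20)
#   'The quick brown...'
#   >>> extract_quoted_text('She said "hello" and then \'goodbye\'.')
#   ['hello', 'goodbye']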
