"""Text profile tool for generating text analytics.
This tool analyzes text and returns various linguistic and statistical features.
"""
import re
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from mcp_server.schemas import TextProfile

# Regular expression for extracting words
_WORD_RE = re.compile(r"[A-Za-z]+")
# VADER sentiment analyzer (initialized once)
_analyzer = SentimentIntensityAnalyzer()


def _read_doc(doc_id: str) -> str | None:
    """Read a document from the corpus by its ID.

    Args:
        doc_id: Document identifier (filename)

    Returns:
        str | None: Document text or None if not found
    """
    # NOTE: "corpus/" is an assumed location; point this at the project's real document directory.
    path = Path("corpus") / doc_id
    try:
        return path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        # Missing/unreadable file, or an argument that is not a valid path.
        return None


def _tokenize(text: str) -> list[str]:
    """Tokenize text into words.

    Args:
        text: Input text

    Returns:
        list[str]: List of lowercase word tokens
    """
    # Extract alphabetic tokens with _WORD_RE and lowercase them.
    return [word.lower() for word in _WORD_RE.findall(text)]
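
# Illustrative example of the helper above (not part of the tool's API):
#   _tokenize("Hello, world! 42") -> ["hello", "world"]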


def _flesch_reading_ease(text: str) -> float:
    """Calculate Flesch Reading Ease score.

    Formula: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)

    Args:
        text: Input text

    Returns:
        float: Flesch Reading Ease score (higher = easier to read)
    """
    words = _tokenize(text)
    if not words:
        return 0.0
    # Sentences are approximated by splitting on ., ! and ? terminators.
    sentences = [s for s in re.split(r"[.!?]+", text) if s.strip()]
    sentence_count = max(len(sentences), 1)
    # Syllables are approximated as groups of consecutive vowels (at least one per word).
    syllables = sum(max(len(re.findall(r"[aeiouy]+", word)), 1) for word in words)
    return 206.835 - 1.015 * (len(words) / sentence_count) - 84.6 * (syllables / len(words))
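
# Worked example with the approximations above (illustrative): for
# "The cat sat on the mat." there are 6 words, 1 sentence and 6 vowel groups,
# so the score is 206.835 - 1.015 * (6 / 1) - 84.6 * (6 / 6) = 116.145.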


def _top_terms(text: str, n_top: int = 10) -> list[str]:
    """Extract top n-grams from text using TF-IDF.

    Args:
        text: Input text
        n_top: Number of top terms to return

    Returns:
        list[str]: Top n-grams by score
    """
    # Assumed settings: unigrams+bigrams and English stop words; max_df=1.0 avoids errors on a single document.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_df=1.0)
    try:
        scores = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError:
        # Raised when the text yields no terms (empty input or stop words only).
        return []
    terms = vectorizer.get_feature_names_out()
    ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:n_top]]


def text_profile(text_or_doc_id: str) -> TextProfile:
"""Generate a text profile for a document or raw text.
Args:
text_or_doc_id: Either raw text or a document ID to analyze
Returns:
TextProfile: Complete text analytics profile
"""
    # 1. Try to read the argument as a document ID; fall back to treating it as raw text.
text = _read_doc(text_or_doc_id) or text_or_doc_id
# 2. Count characters
chars = len(text)
# 3. Tokenize and count tokens
tokens = _tokenize(text)
token_count = len(tokens)
    # 4. Type-token ratio (lexical diversity): unique tokens / total tokens.
    ttr = len(set(tokens)) / token_count if token_count else 0.0
# 5. Calculate readability score
readability = _flesch_reading_ease(text)
    # 6. Sentiment: VADER compound polarity score in [-1, 1].
    sentiment = _analyzer.polarity_scores(text)["compound"]
# 7. Extract top terms and keywords
ngrams = _top_terms(text, n_top=10)
keywords = ngrams[:10]
# 8. Return TextProfile with all fields
return TextProfile(
char_count=chars,
token_count=token_count,
type_token_ratio=round(ttr, 4),
top_ngrams=ngrams,
readability_flesch=round(readability, 2),
sentiment=round(sentiment, 4),
keywords=keywords,
)
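

if __name__ == "__main__":
    # Illustrative smoke test (an assumption, not part of the MCP tool interface):
    # profile a short piece of raw text and print the resulting TextProfile.
    sample_text = (
        "Text analytics turns raw documents into numbers. "
        "This tiny sample exercises tokenization, readability, sentiment, and TF-IDF terms."
    )
    print(text_profile(sample_text))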