Threat Intelligence MCP Server

clustering.py•5.61 KiB

"""News article clustering using Jaccard similarity. Groups news articles by content similarity to reduce noise and identify story clusters. """ import logging import re logger = logging.getLogger("world-intel-mcp.analysis.clustering") _STOPWORDS = frozenset({ "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "shall", "in", "on", "at", "to", "for", "of", "and", "or", "but", "not", "no", "nor", "with", "from", "by", "it", "its", "that", "this", "these", "those", "he", "she", "they", "we", "you", "i", "me", "my", "his", "her", "our", "your", "their", "what", "which", "who", "whom", "how", "when", "where", "why", "if", "then", "than", "so", "as", "up", "out", "about", "into", "over", "after", "before", "between", "under", "again", "also", "just", "more", "most", "other", "some", "such", "very", "new", "said", "says", "one", "two", "first", }) def _tokenize(text: str) -> set[str]: """Extract meaningful word tokens from text.""" words = re.findall(r'[a-z]{3,}', text.lower()) return {w for w in words if w not in _STOPWORDS} def jaccard_similarity(set_a: set, set_b: set) -> float: """Compute Jaccard similarity between two sets.""" if not set_a or not set_b: return 0.0 intersection = len(set_a & set_b) union = len(set_a | set_b) return intersection / union if union > 0 else 0.0 def cluster_articles( articles: list[dict], similarity_threshold: float = 0.3, title_field: str = "title", ) -> list[dict]: """Cluster articles by title similarity. Args: articles: List of article dicts. similarity_threshold: Minimum Jaccard similarity to merge. title_field: Key containing the article title. Returns: List of cluster dicts with representative article and member count. """ if not articles: return [] # Tokenize all titles tokenized = [] for article in articles: title = article.get(title_field, "") or "" tokens = _tokenize(title) tokenized.append(tokens) # Greedy single-linkage clustering assigned = [False] * len(articles) clusters: list[dict] = [] for i in range(len(articles)): if assigned[i]: continue cluster_members = [i] assigned[i] = True for j in range(i + 1, len(articles)): if assigned[j]: continue # Check similarity against cluster representative (first member) sim = jaccard_similarity(tokenized[i], tokenized[j]) if sim >= similarity_threshold: cluster_members.append(j) assigned[j] = True # Build cluster output representative = articles[cluster_members[0]] clusters.append({ "representative": representative, "member_count": len(cluster_members), "member_indices": cluster_members, }) # Sort by member count descending clusters.sort(key=lambda c: c["member_count"], reverse=True) return clusters # --------------------------------------------------------------------------- # MCP-facing async wrapper # --------------------------------------------------------------------------- async def fetch_news_clusters( fetcher, category: str | None = None, limit: int = 100, threshold: float = 0.25, ) -> dict: """Fetch recent news and cluster by topic similarity. Args: fetcher: HTTP fetcher instance. category: Optional RSS feed category filter. limit: Max news items to fetch. threshold: Jaccard similarity threshold (0.0 - 1.0). Returns: Dict with clusters[], cluster_count, total_items, singleton_count, source. """ from datetime import datetime, timezone from ..sources import news feed_data = await news.fetch_news_feed(fetcher, category=category, limit=limit) items = feed_data.get("items", []) raw_clusters = cluster_articles(items, similarity_threshold=threshold) out_clusters = [] for c in raw_clusters: if c["member_count"] <= 1: continue rep = c["representative"] # Collect keywords from all member titles all_tokens: set[str] = set() member_items = [] for idx in c["member_indices"]: art = items[idx] all_tokens |= _tokenize(art.get("title", "")) member_items.append({ "title": art.get("title", ""), "source": art.get("source_name", art.get("source", "")), "link": art.get("link", ""), }) # Top keywords by frequency word_freq: dict[str, int] = {} for idx in c["member_indices"]: for w in _tokenize(items[idx].get("title", "")): word_freq[w] = word_freq.get(w, 0) + 1 top_kw = sorted(word_freq, key=word_freq.get, reverse=True)[:8] # type: ignore[arg-type] out_clusters.append({ "headline": rep.get("title", ""), "size": c["member_count"], "keywords": top_kw, "sources": list({m["source"] for m in member_items if m["source"]}), "items": member_items[:5], }) singletons = sum(1 for c in raw_clusters if c["member_count"] == 1) return { "clusters": out_clusters, "cluster_count": len(out_clusters), "total_items": len(items), "singleton_count": singletons, "threshold": threshold, "source": "jaccard-clustering", "timestamp": datetime.now(timezone.utc).isoformat(), }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/marc-shade/threat-intel-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

clustering.py•5.61 KiB