MCP Gateway

autotag.rs•18.3 KiB

//! Automatic keyword extraction for MCP tool descriptions.
//!
//! Enriches tool descriptions from MCP backends with a `[keywords: ...]` suffix
//! so they participate in the same keyword-matching pipeline as capability tools.
//!
//! # Design
//!
//! - Idempotent: descriptions that already contain `[keywords:` are returned as-is.
//! - Deterministic: given the same input, always produces the same output.
//! - Zero-allocation fast-path: returns the original string unchanged when no
//!   keywords are extracted.
//! - Top-7 keywords, preferring longer (more specific) words, deduplicated.
//!
//! # Example
//!
//! ```rust
//! use mcp_gateway::autotag::enrich_description;
//!
//! let result = enrich_description("Reads a file from the local filesystem.");
//! assert!(result.contains("[keywords:"));
//! assert!(result.contains("file") || result.contains("filesystem") || result.contains("reads"));
//!
//! // Already-tagged descriptions are returned unchanged.
//! let tagged = "A tool. [keywords: file, read]";
//! assert_eq!(enrich_description(tagged), tagged);
//! ```

// ============================================================================
// Public API
// ============================================================================

/// Enrich a tool description with auto-extracted keyword tags.
///
/// If the description already contains a `[keywords: ...]` section, it is
/// returned unchanged. Otherwise, meaningful words are extracted and appended
/// as `[keywords: tag1, tag2, ...]`.
///
/// Returns the original description (owned) when no keywords can be extracted.
///
/// # Examples
///
/// ```rust
/// use mcp_gateway::autotag::enrich_description;
///
/// // Appends keywords
/// let result = enrich_description("Fetches weather data for a city.");
/// assert!(result.contains("[keywords:"));
///
/// // Idempotent — already-tagged descriptions pass through unchanged
/// let tagged = "My tool. [keywords: weather, fetch]";
/// assert_eq!(enrich_description(tagged), tagged);
///
/// // Empty description
/// assert_eq!(enrich_description(""), "");
/// ```
#[must_use]
pub fn enrich_description(description: &str) -> String {
    if description.contains("[keywords:") {
        return description.to_string();
    }

    let tags = extract_keywords(description);
    if tags.is_empty() {
        return description.to_string();
    }

    format!("{} [keywords: {}]", description.trim_end(), tags.join(", "))
}

// ============================================================================
// Keyword extraction internals
// ============================================================================

use std::cmp::Reverse;

/// Maximum number of keywords appended per description.
const MAX_KEYWORDS: usize = 7;

/// Minimum word length to consider (inclusive).
const MIN_WORD_LEN: usize = 3;

/// Extract up to [`MAX_KEYWORDS`] meaningful keywords from `text`.
///
/// Pipeline:
/// 1. Tokenise on non-alphabetic boundaries (hyphens treated as word separators)
/// 2. Lowercase
/// 3. Filter stopwords and tool-generic words
/// 4. Filter words shorter than [`MIN_WORD_LEN`]
/// 5. Deduplicate, preserving first-occurrence order
/// 6. Sort stable by descending length (longer = more specific) within the deduped set
/// 7. Truncate to [`MAX_KEYWORDS`]
fn extract_keywords(text: &str) -> Vec<String> {
    let mut seen = std::collections::HashSet::new();
    let mut words: Vec<String> = Vec::new();

    for raw_word in tokenize(text) {
        let word = raw_word.to_lowercase();
        if word.len() < MIN_WORD_LEN {
            continue;
        }
        if is_stopword(&word) {
            continue;
        }
        if seen.insert(word.clone()) {
            words.push(word);
        }
    }

    // Stable sort descending by length — longer words are more specific.
    words.sort_by_key(|w| Reverse(w.len()));
    words.truncate(MAX_KEYWORDS);
    words
}

/// Tokenise `text` by splitting on any non-ASCII-alphabetic character.
///
/// Hyphens and underscores are treated as separators, so `"entity-discovery"`
/// yields `["entity", "discovery"]`.
fn tokenize(text: &str) -> impl Iterator<Item = &str> {
    text.split(|c: char| !c.is_ascii_alphabetic())
        .filter(|s| !s.is_empty())
}

/// Return `true` if `word` (already lowercase) is a stopword or tool-generic term
/// that provides no search value.
fn is_stopword(word: &str) -> bool {
    STOPWORDS.binary_search(&word).is_ok()
}

// ============================================================================
// Stopword table (sorted — enables O(log n) binary_search)
// ============================================================================

/// Combined English stopwords and tool-generic terms.
///
/// MUST remain sorted (ASCII lexicographic) for `binary_search` to be correct.
/// Run `src/autotag/verify_sorted` or the test below to validate.
const STOPWORDS: &[&str] = &[
    "about",
    "above",
    "after",
    "again",
    "against",
    "all",
    "allow",
    "allows",
    "also",
    "am",
    "and",
    "annotations",
    "any",
    "are",
    "as",
    "at",
    "available",
    "based",
    "be",
    "because",
    "been",
    "before",
    "being",
    "below",
    "between",
    "both",
    "but",
    "by",
    "can",
    "current",
    "data",
    "default",
    "did",
    "do",
    "does",
    "down",
    "during",
    "each",
    "enable",
    "enables",
    "etc",
    "every",
    "examples",
    "few",
    "for",
    "from",
    "function",
    "functions",
    "further",
    "get",
    "gets",
    "given",
    "had",
    "has",
    "have",
    "here",
    "how",
    "if",
    "in",
    "include",
    "includes",
    "including",
    "information",
    "input",
    "into",
    "is",
    "it",
    "its",
    "just",
    "may",
    "method",
    "methods",
    "might",
    "more",
    "most",
    "multiple",
    "must",
    "new",
    "no",
    "not",
    "note",
    "of",
    "off",
    "on",
    "once",
    "one",
    "only",
    "optional",
    "or",
    "other",
    "out",
    "output",
    "over",
    "own",
    "parameter",
    "parameters",
    "per",
    "provide",
    "provides",
    "required",
    "result",
    "results",
    "return",
    "returns",
    "same",
    "see",
    "set",
    "sets",
    "shall",
    "should",
    "single",
    "so",
    "some",
    "specific",
    "specified",
    "such",
    "support",
    "supports",
    "than",
    "that",
    "the",
    "then",
    "there",
    "these",
    "this",
    "those",
    "through",
    "too",
    "tool",
    "tools",
    "two",
    "type",
    "types",
    "under",
    "until",
    "up",
    "use",
    "used",
    "uses",
    "using",
    "value",
    "values",
    "various",
    "very",
    "via",
    "was",
    "well",
    "were",
    "what",
    "when",
    "where",
    "which",
    "while",
    "who",
    "whom",
    "will",
    "with",
    "would",
    "you",
];

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    // ── enrich_description ──────────────────────────────────────────────

    #[test]
    fn enrich_description_appends_keywords_to_plain_description() {
        // GIVEN: a plain description with no keyword tags
        // WHEN: enriched
        // THEN: a [keywords: ...] suffix is appended
        let result = enrich_description("Fetches weather forecasts from an external service.");
        assert!(
            result.contains("[keywords:"),
            "Expected keywords suffix, got: {result}"
        );
    }

    #[test]
    fn enrich_description_is_idempotent_when_already_tagged() {
        // GIVEN: a description that already has [keywords: ...]
        let input = "Searches the web. [keywords: search, web]";
        // WHEN / THEN: returned unchanged
        assert_eq!(enrich_description(input), input);
    }

    #[test]
    fn enrich_description_returns_unchanged_when_no_keywords_extracted() {
        // GIVEN: a description consisting only of stopwords
        let input = "use the tool to get data";
        // WHEN: enriched
        let result = enrich_description(input);
        // THEN: no keywords suffix is added (all words filtered)
        assert!(!result.contains("[keywords:"), "Got: {result}");
    }

    #[test]
    fn enrich_description_returns_empty_string_unchanged() {
        assert_eq!(enrich_description(""), "");
    }

    #[test]
    fn enrich_description_trims_trailing_whitespace_before_appending() {
        // GIVEN: description with trailing whitespace
        let result = enrich_description("Manages calendar events.   ");
        // THEN: no double-space before [keywords:
        assert!(
            !result.contains("   [keywords:"),
            "Unexpected whitespace before tag: {result}"
        );
        assert!(result.contains(" [keywords:"));
    }

    #[test]
    fn enrich_description_preserves_original_description_text() {
        let desc = "Resolves DNS hostnames into IP addresses.";
        let result = enrich_description(desc);
        assert!(
            result.starts_with(desc),
            "Original text not preserved. Got: {result}"
        );
    }

    #[test]
    fn enrich_description_keywords_are_comma_separated() {
        let result = enrich_description("Authenticates users with OAuth tokens.");
        if result.contains("[keywords:") {
            let kw_part = result
                .split("[keywords:")
                .nth(1)
                .unwrap()
                .trim_end_matches(']');
            // Keyword list should contain commas separating multiple words
            // (at least one comma when more than one keyword exists)
            let keywords: Vec<&str> = kw_part.split(',').map(str::trim).collect();
            for kw in &keywords {
                assert!(!kw.is_empty(), "Empty keyword in list");
            }
        }
    }

    // ── extract_keywords ────────────────────────────────────────────────

    #[test]
    fn extract_keywords_filters_stopwords() {
        // GIVEN: text containing only stopwords
        let keywords = extract_keywords("use the tool to get data from it");
        // THEN: empty (all filtered)
        assert!(
            keywords.is_empty(),
            "Expected empty keywords, got: {keywords:?}"
        );
    }

    #[test]
    fn extract_keywords_filters_words_shorter_than_min_len() {
        // GIVEN: text with very short words that aren't stopwords
        let keywords = extract_keywords("an is a of to go");
        assert!(keywords.is_empty(), "Got: {keywords:?}");
    }

    #[test]
    fn extract_keywords_deduplicates_repeated_words() {
        // GIVEN: "calendar calendar calendar"
        let keywords = extract_keywords("calendar events calendar schedule calendar");
        let calendar_count = keywords.iter().filter(|k| k.as_str() == "calendar").count();
        assert_eq!(calendar_count, 1, "Duplicate 'calendar' found: {keywords:?}");
    }

    #[test]
    fn extract_keywords_limits_to_max_keywords() {
        // GIVEN: a long description with many distinct keywords
        let desc = "authenticates resolves transforms aggregates monitors publishes \
                    archives broadcasts encrypts compresses validates normalizes";
        let keywords = extract_keywords(desc);
        assert!(
            keywords.len() <= MAX_KEYWORDS,
            "Got {} keywords, expected ≤{MAX_KEYWORDS}: {keywords:?}",
            keywords.len()
        );
    }

    #[test]
    fn extract_keywords_prefers_longer_words() {
        // GIVEN: mix of short and long meaningful words
        let keywords = extract_keywords("authenticates resolves dns ip");
        // 'authenticates' (13 chars) should be in result before short ones
        if keywords.len() >= 2 {
            assert!(
                keywords[0].len() >= keywords[1].len(),
                "Expected descending length order, got: {keywords:?}"
            );
        }
    }

    #[test]
    fn extract_keywords_splits_hyphenated_words() {
        // GIVEN: hyphenated word "entity-discovery"
        let keywords = extract_keywords("entity-discovery workflow");
        assert!(
            keywords.contains(&"entity".to_string())
                || keywords.contains(&"discovery".to_string()),
            "Hyphen not split: {keywords:?}"
        );
    }

    #[test]
    fn extract_keywords_splits_underscore_separated_words() {
        // GIVEN: underscore-separated tokens (common in tool names embedded in descriptions)
        let keywords = extract_keywords("oauth_token refresh_token");
        assert!(
            keywords.contains(&"oauth".to_string())
                || keywords.contains(&"token".to_string())
                || keywords.contains(&"refresh".to_string()),
            "Underscore not split: {keywords:?}"
        );
    }

    #[test]
    fn extract_keywords_lowercases_all_output() {
        let keywords = extract_keywords("Authenticates OAuth Calendar");
        for kw in &keywords {
            assert_eq!(kw.as_str(), kw.to_lowercase(), "Non-lowercase keyword: {kw}");
        }
    }

    #[test]
    fn extract_keywords_from_realistic_tool_description() {
        let desc = "Search for code repositories on GitHub using keyword queries. \
                    Returns repository names, descriptions, star counts, and topics.";
        let keywords = extract_keywords(desc);
        // Should pick up domain-relevant words like "repositories", "github", "keyword",
        // "repository", "descriptions", "topics" — not "search", "returns"
        assert!(
            !keywords.is_empty(),
            "Expected keywords from realistic description"
        );
        assert!(keywords.len() <= MAX_KEYWORDS);
        // "data" is a stopword — must not appear
        assert!(!keywords.contains(&"data".to_string()));
        // "returns" is a stopword — must not appear
        assert!(!keywords.contains(&"returns".to_string()));
    }

    // ── tokenize ────────────────────────────────────────────────────────

    #[test]
    fn tokenize_splits_on_spaces() {
        let tokens: Vec<&str> = tokenize("hello world rust").collect();
        assert_eq!(tokens, ["hello", "world", "rust"]);
    }

    #[test]
    fn tokenize_splits_on_punctuation() {
        let tokens: Vec<&str> = tokenize("reads, writes, and deletes.").collect();
        assert!(tokens.contains(&"reads"));
        assert!(tokens.contains(&"writes"));
        assert!(tokens.contains(&"deletes"));
        // "and" is a stopword but tokenization itself should include it
        assert!(tokens.contains(&"and"));
    }

    #[test]
    fn tokenize_splits_on_hyphens() {
        let tokens: Vec<&str> = tokenize("entity-discovery").collect();
        assert_eq!(tokens, ["entity", "discovery"]);
    }

    #[test]
    fn tokenize_produces_no_empty_tokens() {
        for token in tokenize("  hello   world  ") {
            assert!(!token.is_empty());
        }
    }

    // ── is_stopword ─────────────────────────────────────────────────────

    #[test]
    fn stopwords_list_is_sorted() {
        // INVARIANT: STOPWORDS must be sorted for binary_search correctness
        let is_sorted = STOPWORDS.windows(2).all(|w| w[0] <= w[1]);
        assert!(
            is_sorted,
            "STOPWORDS is not sorted — binary_search will produce incorrect results"
        );
    }

    #[test]
    fn stopwords_contains_common_english_words() {
        for word in &["the", "and", "for", "with", "from", "this", "that"] {
            assert!(is_stopword(word), "'{word}' should be a stopword");
        }
    }

    #[test]
    fn stopwords_contains_tool_generic_words() {
        for word in &["tool", "tools", "returns", "parameter", "parameters", "data"] {
            assert!(is_stopword(word), "'{word}' should be a tool-generic stopword");
        }
    }

    #[test]
    fn stopwords_does_not_contain_domain_keywords() {
        for word in &[
            "calendar",
            "oauth",
            "github",
            "kubernetes",
            "bitcoin",
            "forecast",
        ] {
            assert!(!is_stopword(word), "'{word}' should NOT be a stopword");
        }
    }

    // ── edge cases ──────────────────────────────────────────────────────

    #[test]
    fn enrich_description_handles_description_with_only_numbers() {
        let result = enrich_description("42 100 3.14");
        // Numbers don't contain ASCII alphabetics, so no keywords — returns unchanged
        assert_eq!(result, "42 100 3.14");
    }

    #[test]
    fn enrich_description_handles_single_meaningful_word() {
        let result = enrich_description("Authenticates.");
        assert!(
            result.contains("[keywords: authenticates]"),
            "Got: {result}"
        );
    }

    #[test]
    fn enrich_description_handles_unicode_text() {
        // Non-ASCII chars are split boundaries; the ASCII portions should tokenize fine
        let result = enrich_description("Manages calendar entries (créer).");
        // "manages", "calendar", "entries" should be extracted; "cr" + "er" filtered by len
        assert!(result.contains("[keywords:"));
        let kws_part = result.split("[keywords:").nth(1).unwrap_or("");
        assert!(kws_part.contains("calendar") || kws_part.contains("manages") || kws_part.contains("entries"));
    }

    #[test]
    fn enrich_description_max_keywords_from_dense_description() {
        // A very long description should still produce at most MAX_KEYWORDS tags
        let desc = "Subscribes authenticates resolves normalizes aggregates \
                    broadcasts archives encrypts compresses validates transforms \
                    schedules deploys monitors analyzes generates publishes";
        let result = enrich_description(desc);
        if let Some(kw_part) = result.split("[keywords:").nth(1) {
            let count = kw_part.trim_end_matches(']').split(',').count();
            assert!(
                count <= MAX_KEYWORDS,
                "Got {count} keywords, max is {MAX_KEYWORDS}"
            );
        }
    }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/MikkoParkkola/mcp-gateway'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

autotag.rs•18.3 KiB

//! Automatic keyword extraction for MCP tool descriptions.
//!
//! Enriches tool descriptions from MCP backends with a `[keywords: ...]` suffix
//! so they participate in the same keyword-matching pipeline as capability tools.
//!
//! # Design
//!
//! - Idempotent: descriptions that already contain `[keywords:` are returned as-is.
//! - Deterministic: given the same input, always produces the same output.
//! - Zero-allocation fast-path: returns the original string unchanged when no
//!   keywords are extracted.
//! - Top-7 keywords, preferring longer (more specific) words, deduplicated.
//!
//! # Example
//!
//! ```rust
//! use mcp_gateway::autotag::enrich_description;
//!
//! let result = enrich_description("Reads a file from the local filesystem.");
//! assert!(result.contains("[keywords:"));
//! assert!(result.contains("file") || result.contains("filesystem") || result.contains("reads"));
//!
//! // Already-tagged descriptions are returned unchanged.
//! let tagged = "A tool. [keywords: file, read]";
//! assert_eq!(enrich_description(tagged), tagged);
//! ```

// ============================================================================
// Public API
// ============================================================================

/// Enrich a tool description with auto-extracted keyword tags.
///
/// If the description already contains a `[keywords: ...]` section, it is
/// returned unchanged. Otherwise, meaningful words are extracted and appended
/// as `[keywords: tag1, tag2, ...]`.
///
/// Returns the original description (owned) when no keywords can be extracted.
///
/// # Examples
///
/// ```rust
/// use mcp_gateway::autotag::enrich_description;
///
/// // Appends keywords
/// let result = enrich_description("Fetches weather data for a city.");
/// assert!(result.contains("[keywords:"));
///
/// // Idempotent — already-tagged descriptions pass through unchanged
/// let tagged = "My tool. [keywords: weather, fetch]";
/// assert_eq!(enrich_description(tagged), tagged);
///
/// // Empty description
/// assert_eq!(enrich_description(""), "");
/// ```
#[must_use]
pub fn enrich_description(description: &str) -> String {
    if description.contains("[keywords:") {
        return description.to_string();
    }

    let tags = extract_keywords(description);
    if tags.is_empty() {
        return description.to_string();
    }

    format!("{} [keywords: {}]", description.trim_end(), tags.join(", "))
}

// ============================================================================
// Keyword extraction internals
// ============================================================================

use std::cmp::Reverse;

/// Maximum number of keywords appended per description.
const MAX_KEYWORDS: usize = 7;

/// Minimum word length to consider (inclusive).
const MIN_WORD_LEN: usize = 3;

/// Extract up to [`MAX_KEYWORDS`] meaningful keywords from `text`.
///
/// Pipeline:
/// 1. Tokenise on non-alphabetic boundaries (hyphens treated as word separators)
/// 2. Lowercase
/// 3. Filter stopwords and tool-generic words
/// 4. Filter words shorter than [`MIN_WORD_LEN`]
/// 5. Deduplicate, preserving first-occurrence order
/// 6. Sort stable by descending length (longer = more specific) within the deduped set
/// 7. Truncate to [`MAX_KEYWORDS`]
fn extract_keywords(text: &str) -> Vec<String> {
    let mut seen = std::collections::HashSet::new();
    let mut words: Vec<String> = Vec::new();

    for raw_word in tokenize(text) {
        let word = raw_word.to_lowercase();
        if word.len() < MIN_WORD_LEN {
            continue;
        }
        if is_stopword(&word) {
            continue;
        }
        if seen.insert(word.clone()) {
            words.push(word);
        }
    }

    // Stable sort descending by length — longer words are more specific.
    words.sort_by_key(|w| Reverse(w.len()));
    words.truncate(MAX_KEYWORDS);
    words
}

/// Tokenise `text` by splitting on any non-ASCII-alphabetic character.
///
/// Hyphens and underscores are treated as separators, so `"entity-discovery"`
/// yields `["entity", "discovery"]`.
fn tokenize(text: &str) -> impl Iterator<Item = &str> {
    text.split(|c: char| !c.is_ascii_alphabetic())
        .filter(|s| !s.is_empty())
}

/// Return `true` if `word` (already lowercase) is a stopword or tool-generic term
/// that provides no search value.
fn is_stopword(word: &str) -> bool {
    STOPWORDS.binary_search(&word).is_ok()
}

// ============================================================================
// Stopword table (sorted — enables O(log n) binary_search)
// ============================================================================

/// Combined English stopwords and tool-generic terms.
///
/// MUST remain sorted (ASCII lexicographic) for `binary_search` to be correct.
/// Run `src/autotag/verify_sorted` or the test below to validate.
const STOPWORDS: &[&str] = &[
    "about",
    "above",
    "after",
    "again",
    "against",
    "all",
    "allow",
    "allows",
    "also",
    "am",
    "and",
    "annotations",
    "any",
    "are",
    "as",
    "at",
    "available",
    "based",
    "be",
    "because",
    "been",
    "before",
    "being",
    "below",
    "between",
    "both",
    "but",
    "by",
    "can",
    "current",
    "data",
    "default",
    "did",
    "do",
    "does",
    "down",
    "during",
    "each",
    "enable",
    "enables",
    "etc",
    "every",
    "examples",
    "few",
    "for",
    "from",
    "function",
    "functions",
    "further",
    "get",
    "gets",
    "given",
    "had",
    "has",
    "have",
    "here",
    "how",
    "if",
    "in",
    "include",
    "includes",
    "including",
    "information",
    "input",
    "into",
    "is",
    "it",
    "its",
    "just",
    "may",
    "method",
    "methods",
    "might",
    "more",
    "most",
    "multiple",
    "must",
    "new",
    "no",
    "not",
    "note",
    "of",
    "off",
    "on",
    "once",
    "one",
    "only",
    "optional",
    "or",
    "other",
    "out",
    "output",
    "over",
    "own",
    "parameter",
    "parameters",
    "per",
    "provide",
    "provides",
    "required",
    "result",
    "results",
    "return",
    "returns",
    "same",
    "see",
    "set",
    "sets",
    "shall",
    "should",
    "single",
    "so",
    "some",
    "specific",
    "specified",
    "such",
    "support",
    "supports",
    "than",
    "that",
    "the",
    "then",
    "there",
    "these",
    "this",
    "those",
    "through",
    "too",
    "tool",
    "tools",
    "two",
    "type",
    "types",
    "under",
    "until",
    "up",
    "use",
    "used",
    "uses",
    "using",
    "value",
    "values",
    "various",
    "very",
    "via",
    "was",
    "well",
    "were",
    "what",
    "when",
    "where",
    "which",
    "while",
    "who",
    "whom",
    "will",
    "with",
    "would",
    "you",
];

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    // ── enrich_description ──────────────────────────────────────────────

    #[test]
    fn enrich_description_appends_keywords_to_plain_description() {
        // GIVEN: a plain description with no keyword tags
        // WHEN: enriched
        // THEN: a [keywords: ...] suffix is appended
        let result = enrich_description("Fetches weather forecasts from an external service.");
        assert!(
            result.contains("[keywords:"),
            "Expected keywords suffix, got: {result}"
        );
    }

    #[test]
    fn enrich_description_is_idempotent_when_already_tagged() {
        // GIVEN: a description that already has [keywords: ...]
        let input = "Searches the web. [keywords: search, web]";
        // WHEN / THEN: returned unchanged
        assert_eq!(enrich_description(input), input);
    }

    #[test]
    fn enrich_description_returns_unchanged_when_no_keywords_extracted() {
        // GIVEN: a description consisting only of stopwords
        let input = "use the tool to get data";
        // WHEN: enriched
        let result = enrich_description(input);
        // THEN: no keywords suffix is added (all words filtered)
        assert!(!result.contains("[keywords:"), "Got: {result}");
    }

    #[test]
    fn enrich_description_returns_empty_string_unchanged() {
        assert_eq!(enrich_description(""), "");
    }

    #[test]
    fn enrich_description_trims_trailing_whitespace_before_appending() {
        // GIVEN: description with trailing whitespace
        let result = enrich_description("Manages calendar events.   ");
        // THEN: no double-space before [keywords:
        assert!(
            !result.contains("   [keywords:"),
            "Unexpected whitespace before tag: {result}"
        );
        assert!(result.contains(" [keywords:"));
    }

    #[test]
    fn enrich_description_preserves_original_description_text() {
        let desc = "Resolves DNS hostnames into IP addresses.";
        let result = enrich_description(desc);
        assert!(
            result.starts_with(desc),
            "Original text not preserved. Got: {result}"
        );
    }

    #[test]
    fn enrich_description_keywords_are_comma_separated() {
        let result = enrich_description("Authenticates users with OAuth tokens.");
        if result.contains("[keywords:") {
            let kw_part = result
                .split("[keywords:")
                .nth(1)
                .unwrap()
                .trim_end_matches(']');
            // Keyword list should contain commas separating multiple words
            // (at least one comma when more than one keyword exists)
            let keywords: Vec<&str> = kw_part.split(',').map(str::trim).collect();
            for kw in &keywords {
                assert!(!kw.is_empty(), "Empty keyword in list");
            }
        }
    }

    // ── extract_keywords ────────────────────────────────────────────────

    #[test]
    fn extract_keywords_filters_stopwords() {
        // GIVEN: text containing only stopwords
        let keywords = extract_keywords("use the tool to get data from it");
        // THEN: empty (all filtered)
        assert!(
            keywords.is_empty(),
            "Expected empty keywords, got: {keywords:?}"
        );
    }

    #[test]
    fn extract_keywords_filters_words_shorter_than_min_len() {
        // GIVEN: text with very short words that aren't stopwords
        let keywords = extract_keywords("an is a of to go");
        assert!(keywords.is_empty(), "Got: {keywords:?}");
    }

    #[test]
    fn extract_keywords_deduplicates_repeated_words() {
        // GIVEN: "calendar calendar calendar"
        let keywords = extract_keywords("calendar events calendar schedule calendar");
        let calendar_count = keywords.iter().filter(|k| k.as_str() == "calendar").count();
        assert_eq!(calendar_count, 1, "Duplicate 'calendar' found: {keywords:?}");
    }

    #[test]
    fn extract_keywords_limits_to_max_keywords() {
        // GIVEN: a long description with many distinct keywords
        let desc = "authenticates resolves transforms aggregates monitors publishes \
                    archives broadcasts encrypts compresses validates normalizes";
        let keywords = extract_keywords(desc);
        assert!(
            keywords.len() <= MAX_KEYWORDS,
            "Got {} keywords, expected ≤{MAX_KEYWORDS}: {keywords:?}",
            keywords.len()
        );
    }

    #[test]
    fn extract_keywords_prefers_longer_words() {
        // GIVEN: mix of short and long meaningful words
        let keywords = extract_keywords("authenticates resolves dns ip");
        // 'authenticates' (13 chars) should be in result before short ones
        if keywords.len() >= 2 {
            assert!(
                keywords[0].len() >= keywords[1].len(),
                "Expected descending length order, got: {keywords:?}"
            );
        }
    }

    #[test]
    fn extract_keywords_splits_hyphenated_words() {
        // GIVEN: hyphenated word "entity-discovery"
        let keywords = extract_keywords("entity-discovery workflow");
        assert!(
            keywords.contains(&"entity".to_string())
                || keywords.contains(&"discovery".to_string()),
            "Hyphen not split: {keywords:?}"
        );
    }

    #[test]
    fn extract_keywords_splits_underscore_separated_words() {
        // GIVEN: underscore-separated tokens (common in tool names embedded in descriptions)
        let keywords = extract_keywords("oauth_token refresh_token");
        assert!(
            keywords.contains(&"oauth".to_string())
                || keywords.contains(&"token".to_string())
                || keywords.contains(&"refresh".to_string()),
            "Underscore not split: {keywords:?}"
        );
    }

    #[test]
    fn extract_keywords_lowercases_all_output() {
        let keywords = extract_keywords("Authenticates OAuth Calendar");
        for kw in &keywords {
            assert_eq!(kw.as_str(), kw.to_lowercase(), "Non-lowercase keyword: {kw}");
        }
    }

    #[test]
    fn extract_keywords_from_realistic_tool_description() {
        let desc = "Search for code repositories on GitHub using keyword queries. \
                    Returns repository names, descriptions, star counts, and topics.";
        let keywords = extract_keywords(desc);
        // Should pick up domain-relevant words like "repositories", "github", "keyword",
        // "repository", "descriptions", "topics" — not "search", "returns"
        assert!(
            !keywords.is_empty(),
            "Expected keywords from realistic description"
        );
        assert!(keywords.len() <= MAX_KEYWORDS);
        // "data" is a stopword — must not appear
        assert!(!keywords.contains(&"data".to_string()));
        // "returns" is a stopword — must not appear
        assert!(!keywords.contains(&"returns".to_string()));
    }

    // ── tokenize ────────────────────────────────────────────────────────

    #[test]
    fn tokenize_splits_on_spaces() {
        let tokens: Vec<&str> = tokenize("hello world rust").collect();
        assert_eq!(tokens, ["hello", "world", "rust"]);
    }

    #[test]
    fn tokenize_splits_on_punctuation() {
        let tokens: Vec<&str> = tokenize("reads, writes, and deletes.").collect();
        assert!(tokens.contains(&"reads"));
        assert!(tokens.contains(&"writes"));
        assert!(tokens.contains(&"deletes"));
        // "and" is a stopword but tokenization itself should include it
        assert!(tokens.contains(&"and"));
    }

    #[test]
    fn tokenize_splits_on_hyphens() {
        let tokens: Vec<&str> = tokenize("entity-discovery").collect();
        assert_eq!(tokens, ["entity", "discovery"]);
    }

    #[test]
    fn tokenize_produces_no_empty_tokens() {
        for token in tokenize("  hello   world  ") {
            assert!(!token.is_empty());
        }
    }

    // ── is_stopword ─────────────────────────────────────────────────────

    #[test]
    fn stopwords_list_is_sorted() {
        // INVARIANT: STOPWORDS must be sorted for binary_search correctness
        let is_sorted = STOPWORDS.windows(2).all(|w| w[0] <= w[1]);
        assert!(
            is_sorted,
            "STOPWORDS is not sorted — binary_search will produce incorrect results"
        );
    }

    #[test]
    fn stopwords_contains_common_english_words() {
        for word in &["the", "and", "for", "with", "from", "this", "that"] {
            assert!(is_stopword(word), "'{word}' should be a stopword");
        }
    }

    #[test]
    fn stopwords_contains_tool_generic_words() {
        for word in &["tool", "tools", "returns", "parameter", "parameters", "data"] {
            assert!(is_stopword(word), "'{word}' should be a tool-generic stopword");
        }
    }

    #[test]
    fn stopwords_does_not_contain_domain_keywords() {
        for word in &[
            "calendar",
            "oauth",
            "github",
            "kubernetes",
            "bitcoin",
            "forecast",
        ] {
            assert!(!is_stopword(word), "'{word}' should NOT be a stopword");
        }
    }

    // ── edge cases ──────────────────────────────────────────────────────

    #[test]
    fn enrich_description_handles_description_with_only_numbers() {
        let result = enrich_description("42 100 3.14");
        // Numbers don't contain ASCII alphabetics, so no keywords — returns unchanged
        assert_eq!(result, "42 100 3.14");
    }

    #[test]
    fn enrich_description_handles_single_meaningful_word() {
        let result = enrich_description("Authenticates.");
        assert!(
            result.contains("[keywords: authenticates]"),
            "Got: {result}"
        );
    }

    #[test]
    fn enrich_description_handles_unicode_text() {
        // Non-ASCII chars are split boundaries; the ASCII portions should tokenize fine
        let result = enrich_description("Manages calendar entries (créer).");
        // "manages", "calendar", "entries" should be extracted; "cr" + "er" filtered by len
        assert!(result.contains("[keywords:"));
        let kws_part = result.split("[keywords:").nth(1).unwrap_or("");
        assert!(kws_part.contains("calendar") || kws_part.contains("manages") || kws_part.contains("entries"));
    }

    #[test]
    fn enrich_description_max_keywords_from_dense_description() {
        // A very long description should still produce at most MAX_KEYWORDS tags
        let desc = "Subscribes authenticates resolves normalizes aggregates \
                    broadcasts archives encrypts compresses validates transforms \
                    schedules deploys monitors analyzes generates publishes";
        let result = enrich_description(desc);
        if let Some(kw_part) = result.split("[keywords:").nth(1) {
            let count = kw_part.trim_end_matches(']').split(',').count();
            assert!(
                count <= MAX_KEYWORDS,
                "Got {count} keywords, max is {MAX_KEYWORDS}"
            );
        }
    }
}