statelessagent

Overview Schema Related Servers Score Discussions

text_processing.go•17.8 KiB

package hooks

import (
	"regexp"
	"strings"
	"unicode/utf8"

	"github.com/sgx-labs/statelessagent/internal/store"
)

// Prompt injection patterns — content matching these is stripped from snippets
// before injection. Prevents vault notes from hijacking agent behavior.
var injectionPatterns = []string{
	"ignore previous",
	"ignore all previous",
	"ignore above",
	"disregard previous",
	"disregard all previous",
	"you are now",
	"new instructions",
	"system prompt",
	"<system>",
	"</system>",
	"IMPORTANT:",
	"CRITICAL:",
	"override",
}

// smartTruncate truncates text at a sentence or paragraph boundary near maxLen.
// Falls back to word boundary if no sentence break is found.
//
// maxLen is a byte budget. Text that already fits is returned unchanged. A
// non-positive maxLen yields "" instead of panicking on a negative slice bound.
// The hard cut is moved back to the start of a UTF-8 sequence so the result
// never ends in a partial multi-byte character; for ASCII text the cut is
// exactly maxLen.
func smartTruncate(text string, maxLen int) string {
	if len(text) <= maxLen {
		return text
	}
	if maxLen <= 0 {
		return ""
	}

	// text[cut] is always in range here because len(text) > maxLen >= cut.
	// Back up while it is a UTF-8 continuation byte so text[:cut] ends on a
	// rune boundary.
	cut := maxLen
	for cut > 0 && !utf8.RuneStart(text[cut]) {
		cut--
	}
	candidate := text[:cut]

	// Only the last 30% of the allowed range is searched for a break, so the
	// result stays close to the requested length.
	searchStart := cut * 7 / 10
	tail := candidate[searchStart:]

	// Try paragraph break first (double newline).
	if idx := strings.LastIndex(tail, "\n\n"); idx >= 0 {
		return strings.TrimSpace(candidate[:searchStart+idx])
	}

	// Try sentence break (. ! ? followed by space or newline); keep the last
	// one found so as much text as possible survives.
	bestBreak := -1
	for i := searchStart; i < cut-1; i++ {
		switch candidate[i] {
		case '.', '!', '?':
			if next := candidate[i+1]; next == ' ' || next == '\n' {
				bestBreak = i + 1
			}
		}
	}
	if bestBreak > 0 {
		return strings.TrimSpace(candidate[:bestBreak])
	}

	// Try single newline.
	if idx := strings.LastIndex(tail, "\n"); idx >= 0 {
		return strings.TrimSpace(candidate[:searchStart+idx])
	}

	// Fall back to word boundary.
	if idx := strings.LastIndex(tail, " "); idx >= 0 {
		return strings.TrimSpace(candidate[:searchStart+idx])
	}

	// No usable break: hard cut at the rune boundary.
	return candidate
}

// stripLeadingHeadings removes leading markdown headings (# Title lines)
// from text. The snippet already shows the title in bold, so repeating it
// as a heading wastes tokens.
Returns the text starting from the first // non-heading, non-empty line. func stripLeadingHeadings(text string) string { lines := strings.SplitN(text, "\n", 20) // only check first 20 lines start := 0 for i, line := range lines { trimmed := strings.TrimSpace(line) if trimmed == "" || strings.HasPrefix(trimmed, "#") { start = i + 1 continue } break } if start >= len(lines) { return "" // all lines were headings, omit snippet } // Rejoin from the first content line return strings.TrimSpace(strings.Join(lines[start:], "\n")) } // queryBiasedSnippet extracts the most query-relevant window of text. // Instead of always showing the first N chars (which may just be an intro), // it finds the paragraph with the most query-term overlap and starts there. // Falls back to the beginning if no query terms match. func queryBiasedSnippet(text string, maxLen int) string { text = stripLeadingHeadings(text) if text == "" || len(text) <= maxLen { return text } prompt := keyTermsPrompt if prompt == "" { return smartTruncate(text, maxLen) } words := store.QueryWordsForTitleMatch(prompt) if len(words) == 0 { return smartTruncate(text, maxLen) } // Split into paragraphs (double newline) or single lines sep := "\n\n" paragraphs := strings.Split(text, sep) if len(paragraphs) <= 1 { sep = "\n" paragraphs = strings.Split(text, sep) } // Score each paragraph by query-term overlap bestIdx := 0 bestScore := 0 for i, para := range paragraphs { paraLower := strings.ToLower(para) score := 0 for _, w := range words { if strings.Contains(paraLower, w) { score++ } } if score > bestScore { bestScore = score bestIdx = i } } if bestScore == 0 { return smartTruncate(text, maxLen) } // Start from the best paragraph, or one earlier for context startIdx := bestIdx if startIdx > 0 && len(paragraphs[startIdx-1]) < 100 { startIdx-- } // Calculate byte offset by summing paragraph lengths + separators offset := 0 for i := 0; i < startIdx; i++ { offset += len(paragraphs[i]) + len(sep) } if offset >= len(text) { 
return smartTruncate(text, maxLen) } return smartTruncate(text[offset:], maxLen) } // contentTermCoverage returns the fraction of terms that appear in the text. // Used to evaluate whether a keyword search result covers the query well. func contentTermCoverage(text string, terms []string) float64 { if len(terms) == 0 { return 0 } lower := strings.ToLower(text) matches := 0 for _, t := range terms { if strings.Contains(lower, strings.ToLower(t)) { matches++ } } return float64(matches) / float64(len(terms)) } // sanitizeSnippet removes prompt injection patterns from snippet text. // Primary detection uses go-promptguard's multi-detector (pattern matching + // statistical analysis). The legacy string-match list is kept as a fallback // for belt-and-suspenders defense. func sanitizeSnippet(text string) string { // Primary: go-promptguard multi-detector if detectInjection(text) { return "[content filtered for security]" } // Fallback: legacy pattern matching lower := strings.ToLower(text) for _, pattern := range injectionPatterns { if strings.Contains(lower, strings.ToLower(pattern)) { return "[content filtered for security]" } } return text } // sanitizeContextTags strips XML-like tags from note content that could // break structural wrappers (vault-context, plugin-context, session-bootstrap, // vault-handoff, vault-decisions, same-diagnostic) and enable indirect prompt // injection. A crafted note containing "</vault-context>\n<same-diagnostic>" // would escape the context wrapper and inject system-level instructions. func sanitizeContextTags(text string) string { // All tag names used as structural wrappers in the hook system. // Each pair (open + close) must be neutralized to prevent escape. 
tagNames := []string{ "vault-context", "plugin-context", "session-bootstrap", "vault-handoff", "vault-decisions", "same-diagnostic", // F16: Additional tags used by AI systems that could enable prompt injection "system-reminder", "system", "instructions", "tool_result", "tool_use", "important", } // LLM-specific injection patterns (Llama/Mistral [INST], <<SYS>>, XML CDATA). // These are not XML tags but delimiters used by specific model families that // could be exploited to escape the vault-context wrapper. type literalPattern struct { pattern string // lowercase match replacement string } llmPatterns := []literalPattern{ {"[inst]", "[[inst]]"}, {"[/inst]", "[[/inst]]"}, {"<<sys>>", "[[sys]]"}, {"<</sys>>", "[[/sys]]"}, {"<![cdata[", "[CDATA["}, {"]]>", "]]>"}, } // Case-insensitive replacement: scan character-by-character and replace // any matching XML open/close/self-closing tag with bracket-escaped equivalents. lower := strings.ToLower(text) var result strings.Builder result.Grow(len(text)) i := 0 for i < len(text) { matched := false // Check LLM-specific literal patterns first for _, lp := range llmPatterns { if i+len(lp.pattern) <= len(text) && lower[i:i+len(lp.pattern)] == lp.pattern { result.WriteString(lp.replacement) i += len(lp.pattern) matched = true break } } if matched { continue } for _, tag := range tagNames { closeTag := "</" + tag + ">" openTag := "<" + tag + ">" openTagAttr := "<" + tag + " " // tag with attributes selfClose := "<" + tag + "/>" if i+len(closeTag) <= len(text) && lower[i:i+len(closeTag)] == closeTag { result.WriteString("[/" + tag + "]") i += len(closeTag) matched = true break } if i+len(selfClose) <= len(text) && lower[i:i+len(selfClose)] == selfClose { result.WriteString("[" + tag + "/]") i += len(selfClose) matched = true break } if i+len(openTag) <= len(text) && lower[i:i+len(openTag)] == openTag { result.WriteString("[" + tag + "]") i += len(openTag) matched = true break } if i+len(openTagAttr) <= len(text) && 
lower[i:i+len(openTagAttr)] == openTagAttr { result.WriteString("[" + tag + " ") i += len(openTagAttr) matched = true break } } if !matched { result.WriteByte(text[i]) i++ } } return result.String() } // titleWordRe matches word tokens in titles (letters, digits, underscores). var titleWordRe = regexp.MustCompile(`[\w]+`) // queryWordsForTitleMatch extracts all meaningful words from the prompt // for title overlap matching. More permissive than extractKeyTerms — // includes short words (3+ chars) and alphanumeric tokens — because // title overlap scoring handles false positives via bidirectional threshold. func queryWordsForTitleMatch() []string { prompt := keyTermsPrompt if prompt == "" { return nil } words := titleWordRe.FindAllString(prompt, -1) seen := make(map[string]bool) var result []string for _, w := range words { lower := strings.ToLower(w) if len(w) < 3 { // Allow meaningful 2-char terms (domain acronyms commonly found // in vault note titles). Skips common English 2-char words. if len(w) == 2 && meaningful2CharTerms[lower] { // keep it } else { continue } } if titleMatchStopWords[lower] || seen[lower] { continue } result = append(result, w) seen[lower] = true } return result } // titleMatchStopWords filters common English words from title matching terms. // More comprehensive than the keyword stopWords since title matching extracts // shorter words (3+ chars). 
var titleMatchStopWords = map[string]bool{ // 3-letter "the": true, "and": true, "for": true, "are": true, "but": true, "not": true, "you": true, "all": true, "can": true, "has": true, "her": true, "his": true, "how": true, "its": true, "may": true, "new": true, "now": true, "our": true, "out": true, "own": true, "too": true, "use": true, "was": true, "who": true, "why": true, "did": true, "get": true, "got": true, "had": true, "let": true, "say": true, "she": true, "any": true, "way": true, "yet": true, // 4-letter "also": true, "area": true, "back": true, "been": true, "best": true, "call": true, "case": true, "come": true, "data": true, "does": true, "done": true, "each": true, "even": true, "find": true, "from": true, "give": true, "goes": true, "good": true, "have": true, "help": true, "here": true, "into": true, "just": true, "keep": true, "kind": true, "know": true, "last": true, "left": true, "like": true, "list": true, "long": true, "look": true, "made": true, "main": true, "make": true, "many": true, "more": true, "most": true, "much": true, "must": true, "need": true, "next": true, "once": true, "only": true, "open": true, "over": true, "part": true, "show": true, "side": true, "some": true, "such": true, "sure": true, "take": true, "talk": true, "tell": true, "test": true, "than": true, "that": true, "them": true, "then": true, "they": true, "this": true, "time": true, "turn": true, "type": true, "used": true, "uses": true, "very": true, "want": true, "well": true, "went": true, "were": true, "what": true, "when": true, "will": true, "with": true, "work": true, "your": true, // 5+ letter (same as keyword stopWords) "about": true, "above": true, "after": true, "again": true, "being": true, "below": true, "between": true, "could": true, "doing": true, "during": true, "every": true, "found": true, "going": true, "having": true, "might": true, "never": true, "other": true, "should": true, "their": true, "there": true, "these": true, "thing": true, "think": 
true, "those": true, "under": true, "until": true, "using": true, "where": true, "which": true, "while": true, "would": true, "write": true, "yours": true, "really": true, "please": true, "right": true, "since": true, "still": true, "today": true, // Query boilerplate "explain": true, "tracked": true, "defined": true, } // meaningful2CharTerms are short terms that carry domain-specific meaning // and should be kept as title match terms despite being only 2 chars. // These commonly appear in vault note titles (e.g., "AI Experiments Hub"). var meaningful2CharTerms = map[string]bool{ "ai": true, "os": true, "pm": true, "qa": true, "ui": true, "ux": true, "hr": true, "ml": true, "v1": true, "v2": true, "v3": true, "v4": true, "v5": true, } // titleOverlapScore computes bidirectional term overlap between query terms // and a note's title + path. Returns queryCoverage * wordCoverage in [0, 1]. // // Words are extracted from both the title and path (directory components), // with underscore splitting (team_roles -> team, roles) and simple plural // matching (project <-> projects). This catches notes where the project/folder // name contains query terms even if the filename is generic (e.g., design-brief.md). func titleOverlapScore(queryTerms []string, title, path string) float64 { if len(queryTerms) == 0 { return 0 } // Extract words from title allWords := titleWordRe.FindAllString(title, -1) // Extract words from path components (strip .md extension first) cleanPath := strings.TrimSuffix(path, ".md") for _, part := range strings.Split(cleanPath, "/") { allWords = append(allWords, titleWordRe.FindAllString(part, -1)...) 
} // Build lowercase set, splitting underscores and filtering short tokens wordSet := make(map[string]bool, len(allWords)) for _, w := range allWords { // Split underscore-separated words: "team_roles" -> "team", "roles" subWords := []string{w} if strings.Contains(w, "_") { subWords = strings.Split(w, "_") } for _, sub := range subWords { if len(sub) >= 2 { wordSet[strings.ToLower(sub)] = true } } } wordCount := len(wordSet) if wordCount == 0 { return 0 } // Expand hyphenated query terms: "chain-of-thought" -> ["chain","of","thought"] var expanded []string for _, t := range queryTerms { if strings.Contains(t, "-") { for _, part := range strings.Split(t, "-") { if len(part) >= 2 { expanded = append(expanded, part) } } } else { expanded = append(expanded, t) } } if len(expanded) == 0 { return 0 } // Count expanded terms that match. Each wordSet entry can only be matched // once to prevent inflated scores when multiple query terms match the same // word (e.g., "prompt" and "prompting" both matching wordSet "prompting"). // Matching cascades: exact -> plural -> edit distance 1 -> common stem. 
matchCount := 0 matchedEntries := make(map[string]bool, len(wordSet)) for _, t := range expanded { lower := strings.ToLower(t) var matched string if wordSet[lower] && !matchedEntries[lower] { matched = lower } else if wordSet[lower+"s"] && !matchedEntries[lower+"s"] { matched = lower + "s" } else if len(lower) > 2 && strings.HasSuffix(lower, "s") && wordSet[lower[:len(lower)-1]] && !matchedEntries[lower[:len(lower)-1]] { matched = lower[:len(lower)-1] } if matched == "" { // Fuzzy matching: edit distance 1 and common root/stem for w := range wordSet { if matchedEntries[w] { continue } if isEditDistance1(lower, w) || sharesStem(lower, w) { matched = w break } } } if matched != "" { matchedEntries[matched] = true matchCount++ } } if matchCount == 0 { return 0 } queryCoverage := float64(matchCount) / float64(len(expanded)) wordCoverage := float64(matchCount) / float64(wordCount) // Short word sets (1-2 unique words) are noisy for single-term matches // because wordCoverage is trivially high. Require higher queryCoverage // to compensate. if wordCount <= 2 && queryCoverage < 0.30 { return 0 } return queryCoverage * wordCoverage } // isEditDistance1 returns true if two lowercase strings differ by exactly one // character (insertion, deletion, or substitution). Only applies to words // >= 7 chars to avoid false positives on short words. // Example: "kubernetes" vs "kuberntes" (one deleted char). 
func isEditDistance1(a, b string) bool {
	// The check is skipped only when both inputs are under 7 bytes.
	if len(a) < 7 && len(b) < 7 {
		return false
	}

	// Order the pair so that long is never the shorter string.
	long, short := a, b
	if len(short) > len(long) {
		long, short = short, long
	}
	gap := len(long) - len(short)
	if gap > 1 {
		return false
	}

	// Length of the prefix the two strings have in common.
	p := 0
	for p < len(short) && long[p] == short[p] {
		p++
	}

	if gap == 0 {
		// Same length: identical strings are distance 0, otherwise everything
		// after the first differing byte has to agree (one substitution).
		return p < len(long) && long[p+1:] == short[p+1:]
	}

	// One byte longer: dropping long[p] must yield short (one insertion or
	// deletion). When the whole of short is a prefix, both sides are empty.
	return long[p+1:] == short[p:]
}

// sharesStem reports whether two lowercase words likely share the same root.
// Both words must be at least 5 bytes, their lengths may differ by at most 3,
// and their common prefix must be at least 5 bytes and cover all but at most
// the last byte of the shorter word.
// Examples: "invoice"/"invoicing", "finance"/"financing", "report"/"reporting".
func sharesStem(a, b string) bool {
	if len(a) < 5 || len(b) < 5 {
		return false
	}

	long, short := a, b
	if len(b) > len(a) {
		long, short = b, a
	}
	if len(long)-len(short) > 3 {
		return false
	}

	// Measure the shared prefix, bounded by the shorter word.
	prefix := 0
	for prefix < len(short) && long[prefix] == short[prefix] {
		prefix++
	}

	return prefix >= 5 && prefix >= len(short)-1
}

// overlapForSort returns the overlap score to use for sorting and gap-cap.
// Uses title-only overlap as the primary signal, but when path-inclusive
// overlap is strong (>= 0.25), provides a reduced score (half-strength)
// so path-matched notes survive gap-cap without competing with direct
// title matches. This allows notes like "design-brief.md" in well-named
// project directories to appear alongside title-matched results.
func overlapForSort(queryTerms []string, title, path string) float64 { titleOnly := titleOverlapScore(queryTerms, title, "") if titleOnly > 0 { return titleOnly } fullOverlap := titleOverlapScore(queryTerms, title, path) if fullOverlap >= 0.25 { return fullOverlap * 0.5 } return 0 }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sgx-labs/statelessagent'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

text_processing.go•17.8 KiB