package store
import (
"encoding/json"
"fmt"
"os"
"sort"
"strings"
)
// SearchResult represents a single search result with scoring.
type SearchResult struct {
Path string `json:"path"`
Title string `json:"title"`
ChunkHeading string `json:"chunk_heading"`
Score float64 `json:"score"`
Distance float64 `json:"distance"`
Snippet string `json:"snippet"`
Domain string `json:"domain"`
Workstream string `json:"workstream"`
Agent string `json:"agent,omitempty"`
Tags string `json:"tags"`
ContentType string `json:"content_type,omitempty"`
Confidence float64 `json:"confidence,omitempty"`
}
// SearchOptions configures a vector search.
type SearchOptions struct {
TopK int
Domain string
Workstream string
Agent string
Tags []string
}
// VectorSearch performs a KNN vector search with optional metadata filtering
// and per-path deduplication.
func (db *DB) VectorSearch(queryVec []float32, opts SearchOptions) ([]SearchResult, error) {
if opts.TopK <= 0 {
opts.TopK = 10
}
if opts.TopK > 100 {
opts.TopK = 100
}
vecData, err := serializeFloat32(queryVec)
if err != nil {
return nil, fmt.Errorf("serialize query: %w", err)
}
// Fetch extra results for deduplication and filtering
fetchK := opts.TopK * 5
rows, err := db.conn.Query(`
SELECT v.distance, n.id, n.path, n.title, n.chunk_heading, n.text,
n.domain, n.workstream, COALESCE(n.agent, ''), n.tags, n.content_type, n.confidence, n.modified
FROM vault_notes_vec v
JOIN vault_notes n ON n.id = v.note_id
WHERE v.embedding MATCH ? AND k = ?
AND UPPER(n.path) NOT LIKE '_PRIVATE/%%'
ORDER BY v.distance`,
vecData, fetchK,
)
if err != nil {
return nil, fmt.Errorf("vector search: %w", err)
}
defer rows.Close()
type rawResult struct {
distance float64
id int64
path string
title string
heading string
text string
domain string
workstream string
agent string
tags string
contentType string
confidence float64
modified float64
}
var raw []rawResult
for rows.Next() {
var r rawResult
if err := rows.Scan(
&r.distance, &r.id, &r.path, &r.title, &r.heading, &r.text,
&r.domain, &r.workstream, &r.agent, &r.tags, &r.contentType, &r.confidence, &r.modified,
); err != nil {
return nil, err
}
raw = append(raw, r)
}
if err := rows.Err(); err != nil {
return nil, err
}
// Apply metadata filters
filtered := raw[:0]
for _, r := range raw {
if opts.Domain != "" && !strings.EqualFold(r.domain, opts.Domain) {
continue
}
if opts.Workstream != "" && !strings.EqualFold(r.workstream, opts.Workstream) {
continue
}
if opts.Agent != "" && !strings.EqualFold(r.agent, opts.Agent) {
continue
}
if len(opts.Tags) > 0 && !hasTags(r.tags, opts.Tags) {
continue
}
filtered = append(filtered, r)
}
// Deduplicate by path (keep best-scoring chunk per note)
seen := make(map[string]bool)
var deduped []rawResult
for _, r := range filtered {
if seen[r.path] {
continue
}
seen[r.path] = true
deduped = append(deduped, r)
if len(deduped) >= opts.TopK {
break
}
}
if len(deduped) == 0 {
return nil, nil
}
// Score results using absolute distance thresholds combined with relative
// normalization. Pure relative scoring makes the top result always 1.0
// even when nothing is relevant. Instead, we use an absolute distance
// ceiling (absDistCeiling) as the reference point so that results near
// the ceiling score low regardless of relative position.
//
// absDistCeiling = 20.0 chosen to be above the maxDistance eval threshold
// (16.3) so that relevant results score well, while off-topic results
// (distance > 18) get appropriately low scores.
const absDistCeiling = 20.0
results := make([]SearchResult, 0, len(deduped))
for _, r := range deduped {
// Absolute score: 1.0 at distance 0, 0.0 at absDistCeiling
absScore := 1.0 - (r.distance / absDistCeiling)
if absScore < 0 {
absScore = 0
}
// Relative score within this result set (preserves original ranking signal)
minDist := deduped[0].distance
maxDist := deduped[len(deduped)-1].distance
distRange := maxDist - minDist
if distRange <= 0 {
distRange = 1.0
}
relScore := 1.0 - ((r.distance - minDist) / distRange)
// Blend: 70% absolute + 30% relative. This ensures poor absolute
// results score low while still differentiating within a result set.
score := 0.7*absScore + 0.3*relScore
snippet := r.text
if len(snippet) > 500 {
snippet = snippet[:500]
}
results = append(results, SearchResult{
Path: r.path,
Title: r.title,
ChunkHeading: r.heading,
Score: round3(score),
Distance: round1(r.distance),
Snippet: snippet,
Domain: r.domain,
Workstream: r.workstream,
Agent: r.agent,
Tags: r.tags,
ContentType: r.contentType,
Confidence: round3(r.confidence),
})
}
return results, nil
}
// VectorSearchRaw returns raw results with full metadata for composite scoring.
// Does not normalize scores — caller is responsible for scoring.
type RawSearchResult struct {
NoteID int64
Distance float64
Path string
Title string
Heading string
Text string
Domain string
Workstream string
Agent string
Tags string
ContentType string
Confidence float64
Modified float64
}
// VectorSearchRaw performs a raw vector search without score normalization.
func (db *DB) VectorSearchRaw(queryVec []float32, fetchK int) ([]RawSearchResult, error) {
vecData, err := serializeFloat32(queryVec)
if err != nil {
return nil, fmt.Errorf("serialize query: %w", err)
}
rows, err := db.conn.Query(`
SELECT v.distance, n.id, n.path, n.title, n.chunk_heading, n.text,
n.domain, n.workstream, COALESCE(n.agent, ''), n.tags, n.content_type, n.confidence, n.modified
FROM vault_notes_vec v
JOIN vault_notes n ON n.id = v.note_id
WHERE v.embedding MATCH ? AND k = ?
AND UPPER(n.path) NOT LIKE '_PRIVATE/%%'
ORDER BY v.distance`,
vecData, fetchK,
)
if err != nil {
return nil, fmt.Errorf("vector search: %w", err)
}
defer rows.Close()
var results []RawSearchResult
for rows.Next() {
var r RawSearchResult
if err := rows.Scan(
&r.Distance, &r.NoteID, &r.Path, &r.Title, &r.Heading, &r.Text,
&r.Domain, &r.Workstream, &r.Agent, &r.Tags, &r.ContentType, &r.Confidence, &r.Modified,
); err != nil {
return nil, err
}
results = append(results, r)
}
return results, rows.Err()
}
func hasTags(tagsJSON string, required []string) bool {
var noteTags []string
if err := json.Unmarshal([]byte(tagsJSON), ¬eTags); err != nil {
return false
}
noteTagsLower := make(map[string]bool, len(noteTags))
for _, t := range noteTags {
noteTagsLower[strings.ToLower(t)] = true
}
for _, req := range required {
if noteTagsLower[strings.ToLower(req)] {
return true
}
}
return false
}
// KeywordSearch performs a SQL LIKE search on title and text fields.
// Uses OR between terms and ranks by match count. Used as a fallback when
// vector search misses exact-term queries.
func (db *DB) KeywordSearch(terms []string, limit int) ([]RawSearchResult, error) {
if len(terms) == 0 || limit <= 0 {
return nil, nil
}
// Build a score expression: count how many terms match in title or text
var matchExprs []string
var args []interface{}
for _, term := range terms {
pattern := "%" + escapeLIKE(term) + "%"
matchExprs = append(matchExprs,
`(CASE WHEN LOWER(n.title) LIKE LOWER(?) ESCAPE '\' OR LOWER(n.text) LIKE LOWER(?) ESCAPE '\' THEN 1 ELSE 0 END)`)
args = append(args, pattern, pattern)
}
// Build OR conditions: at least one term must match
var conditions []string
for _, term := range terms {
pattern := "%" + escapeLIKE(term) + "%"
conditions = append(conditions, `(LOWER(n.title) LIKE LOWER(?) ESCAPE '\' OR LOWER(n.text) LIKE LOWER(?) ESCAPE '\')`)
args = append(args, pattern, pattern)
}
scoreExpr := strings.Join(matchExprs, " + ")
// Use EXISTS instead of IN for better query planning — SQLite can
// short-circuit once it finds the first matching chunk for each path.
query := fmt.Sprintf(`
SELECT 0 as distance, n.id, n.path, n.title, n.chunk_heading, n.text,
n.domain, n.workstream, COALESCE(n.agent, ''), n.tags, n.content_type, n.confidence, n.modified
FROM vault_notes n
WHERE n.chunk_id = 0 AND UPPER(n.path) NOT LIKE '_PRIVATE/%%' AND EXISTS (
SELECT 1 FROM vault_notes n2
WHERE n2.path = n.path AND (%s)
)
ORDER BY (%s) DESC, n.modified DESC
LIMIT ?`,
strings.Join(conditions, " OR "),
scoreExpr)
args = append(args, limit)
rows, err := db.conn.Query(query, args...)
if err != nil {
return nil, fmt.Errorf("keyword search: %w", err)
}
defer rows.Close()
var results []RawSearchResult
for rows.Next() {
var r RawSearchResult
if err := rows.Scan(
&r.Distance, &r.NoteID, &r.Path, &r.Title, &r.Heading, &r.Text,
&r.Domain, &r.Workstream, &r.Agent, &r.Tags, &r.ContentType, &r.Confidence, &r.Modified,
); err != nil {
return nil, err
}
results = append(results, r)
}
return results, rows.Err()
}
// ContentTermSearch finds notes where a minimum number of search terms
// appear across ANY chunk. Unlike KeywordSearch (which ranks by chunk_id=0
// match count), this function counts distinct terms across all chunks,
// correctly finding notes where terms appear in later sections.
// Returns chunk_id=0 data for matching notes, ranked by term coverage
// (highest first), then recency as tiebreaker.
func (db *DB) ContentTermSearch(terms []string, minTerms int, limit int) ([]RawSearchResult, error) {
if len(terms) == 0 || limit <= 0 || minTerms <= 0 {
return nil, nil
}
// Build per-term coverage expressions that check across all chunks.
// Each expression evaluates to 1 if ANY chunk of the note contains
// the term, 0 otherwise.
// Also build chunk-frequency expressions for content density ranking.
// Content relevance = chunk_freq^2 / chunk_count. This rewards notes
// that are genuinely ABOUT the topic (high frequency AND high density)
// over notes that are simply long (high frequency, low density) or
// short (high density from one mention).
var coverageExprs []string
var freqExprs []string
var covArgs []interface{}
var freqArgs []interface{}
for _, term := range terms {
pattern := "%" + escapeLIKE(term) + "%"
coverageExprs = append(coverageExprs,
`(CASE WHEN SUM(CASE WHEN LOWER(n2.title) LIKE LOWER(?) ESCAPE '\' OR LOWER(n2.text) LIKE LOWER(?) ESCAPE '\' THEN 1 ELSE 0 END) > 0 THEN 1 ELSE 0 END)`)
covArgs = append(covArgs, pattern, pattern)
freqExprs = append(freqExprs,
`SUM(CASE WHEN LOWER(n2.text) LIKE LOWER(?) ESCAPE '\' THEN 1 ELSE 0 END)`)
freqArgs = append(freqArgs, pattern)
}
coverageExpr := strings.Join(coverageExprs, " + ")
freqExpr := strings.Join(freqExprs, " + ")
// Args order: coverage args, freq args, minTerms, limit
var args []interface{}
args = append(args, covArgs...)
args = append(args, freqArgs...)
args = append(args, minTerms, limit)
query := fmt.Sprintf(`
WITH note_coverage AS (
SELECT n2.path, (%s) as cov,
(%s) as chunk_freq, COUNT(*) as chunk_count
FROM vault_notes n2
GROUP BY n2.path
)
SELECT 0 as distance, n.id, n.path, n.title, n.chunk_heading, n.text,
n.domain, n.workstream, COALESCE(n.agent, ''), n.tags, n.content_type, n.confidence, n.modified
FROM vault_notes n
JOIN note_coverage nc ON n.path = nc.path
WHERE n.chunk_id = 0 AND UPPER(n.path) NOT LIKE '_PRIVATE/%%' AND nc.cov >= ?
ORDER BY nc.cov DESC,
CAST(nc.chunk_freq * nc.chunk_freq AS REAL) / nc.chunk_count DESC,
n.modified DESC
LIMIT ?`,
coverageExpr, freqExpr)
rows, err := db.conn.Query(query, args...)
if err != nil {
return nil, fmt.Errorf("content term search: %w", err)
}
defer rows.Close()
var results []RawSearchResult
for rows.Next() {
var r RawSearchResult
if err := rows.Scan(
&r.Distance, &r.NoteID, &r.Path, &r.Title, &r.Heading, &r.Text,
&r.Domain, &r.Workstream, &r.Agent, &r.Tags, &r.ContentType, &r.Confidence, &r.Modified,
); err != nil {
return nil, err
}
results = append(results, r)
}
return results, rows.Err()
}
// KeywordSearchTitleMatch performs a keyword search on note titles (and
// optionally paths), requiring at least minMatches terms to appear.
// When titleOnly is true, only n.title is checked (more precise, avoids
// false positives from folder names). When false, both n.title and n.path
// are checked (catches folder-organized content like "projects/security-audit/").
func (db *DB) KeywordSearchTitleMatch(terms []string, minMatches int, limit int, titleOnly ...bool) ([]RawSearchResult, error) {
if len(terms) == 0 || limit <= 0 || minMatches <= 0 {
return nil, nil
}
onlyTitle := len(titleOnly) > 0 && titleOnly[0]
var matchExprs []string
var args []interface{}
for _, term := range terms {
pattern := "%" + escapeLIKE(term) + "%"
if onlyTitle {
matchExprs = append(matchExprs,
`(CASE WHEN LOWER(n.title) LIKE LOWER(?) ESCAPE '\' THEN 1 ELSE 0 END)`)
args = append(args, pattern)
} else {
matchExprs = append(matchExprs,
`(CASE WHEN LOWER(n.title) LIKE LOWER(?) ESCAPE '\' OR LOWER(n.path) LIKE LOWER(?) ESCAPE '\' THEN 1 ELSE 0 END)`)
args = append(args, pattern, pattern)
}
}
scoreExpr := strings.Join(matchExprs, " + ")
// scoreExpr appears twice (WHERE + ORDER BY), so build args in correct
// order: [WHERE match args..., minMatches, ORDER BY match args..., limit]
matchArgs := make([]interface{}, len(args))
copy(matchArgs, args)
var finalArgs []interface{}
finalArgs = append(finalArgs, args...) // WHERE match args
finalArgs = append(finalArgs, minMatches) // >= ?
finalArgs = append(finalArgs, matchArgs...) // ORDER BY match args
finalArgs = append(finalArgs, limit) // LIMIT ?
query := fmt.Sprintf(`
SELECT 0 as distance, n.id, n.path, n.title, n.chunk_heading, n.text,
n.domain, n.workstream, COALESCE(n.agent, ''), n.tags, n.content_type, n.confidence, n.modified
FROM vault_notes n
WHERE n.chunk_id = 0 AND UPPER(n.path) NOT LIKE '_PRIVATE/%%' AND (%s) >= ?
ORDER BY (%s) DESC, n.modified DESC
LIMIT ?`,
scoreExpr, scoreExpr)
rows, err := db.conn.Query(query, finalArgs...)
if err != nil {
return nil, fmt.Errorf("keyword title search: %w", err)
}
defer rows.Close()
var results []RawSearchResult
for rows.Next() {
var r RawSearchResult
if err := rows.Scan(
&r.Distance, &r.NoteID, &r.Path, &r.Title, &r.Heading, &r.Text,
&r.Domain, &r.Workstream, &r.Agent, &r.Tags, &r.ContentType, &r.Confidence, &r.Modified,
); err != nil {
return nil, err
}
results = append(results, r)
}
return results, rows.Err()
}
// HybridSearch combines vector KNN search with keyword title matching.
// Vector results fill most of TopK; keyword-only results are scored by
// term coverage and interleaved by score so strong title matches rank high.
func (db *DB) HybridSearch(queryVec []float32, queryText string, opts SearchOptions) ([]SearchResult, error) {
// 1. Vector search (primary)
vectorResults, err := db.VectorSearch(queryVec, opts)
if err != nil {
return nil, err
}
// 2. Keyword title search (supplemental)
terms := ExtractSearchTerms(queryText)
var kwResults []RawSearchResult
if len(terms) > 0 {
kwResults, _ = db.KeywordSearchTitleMatch(terms, 1, opts.TopK*2, true)
}
// Steps 3-8: Merge keyword results into vector results.
// Wrapped in a block so that step 9 (post-processing) always runs.
merged := vectorResults
if len(kwResults) > 0 {
// Build terms set (used for both vector boost and keyword scoring).
termsSet := make(map[string]bool, len(terms))
for _, t := range terms {
termsSet[t] = true
}
// 3. Build keyword path->score map for score fusion.
// When a vector result also appears in keyword results, we boost
// its score using the keyword signal so it ranks higher in the
// final sort.
kwPathScore := make(map[string]float64, len(kwResults))
for _, r := range kwResults {
titleLower := strings.ToLower(r.Title)
score := keywordTitleScore(titleLower, terms, termsSet)
if existing, ok := kwPathScore[r.Path]; !ok || score > existing {
kwPathScore[r.Path] = score
}
}
// Fuse keyword score into vector results additively, but only when
// the keyword match is strong (score >= 0.7, meaning most query terms
// appear in the title). Weak matches (few terms) tend to be noise
// and cause regressions when boosted.
for i, r := range merged {
if kwScore, ok := kwPathScore[r.Path]; ok && kwScore >= 0.7 {
merged[i].Score = round3(r.Score + 0.5*kwScore)
}
}
// 4. Cut vector results to make room for keyword additions.
// Reserve up to 30% of TopK (min 2) for keyword results.
maxReplace := (opts.TopK*3 + 9) / 10 // ceil(topK * 0.3)
if maxReplace < 2 {
maxReplace = 2
}
if len(merged) >= opts.TopK {
cutAt := len(merged) - maxReplace
if cutAt < 0 {
cutAt = 0
}
merged = merged[:cutAt]
}
// 5. Build seen map from REMAINING vector results (not pre-cut ones).
seen := make(map[string]bool, len(merged))
for _, r := range merged {
seen[r.Path] = true
}
var newKW []SearchResult
for _, r := range kwResults {
if seen[r.Path] {
continue
}
seen[r.Path] = true
snippet := r.Text
if len(snippet) > 500 {
snippet = snippet[:500]
}
titleLower := strings.ToLower(r.Title)
score := keywordTitleScore(titleLower, terms, termsSet)
newKW = append(newKW, SearchResult{
Path: r.Path,
Title: r.Title,
ChunkHeading: r.Heading,
Score: score,
Distance: 0,
Snippet: snippet,
Domain: r.Domain,
Workstream: r.Workstream,
Agent: r.Agent,
Tags: r.Tags,
ContentType: r.ContentType,
Confidence: round3(r.Confidence),
})
}
if len(newKW) > 0 {
// 6. Sort keyword results by score so highest-value matches
// (e.g. exact title matches at 0.95) get picked first when
// we have more candidates than available slots.
sort.Slice(newKW, func(i, j int) bool {
return newKW[i].Score > newKW[j].Score
})
// 7. Merge keyword results into vector results (filling up to TopK).
remaining := opts.TopK - len(merged)
if remaining > len(newKW) {
remaining = len(newKW)
}
full := make([]SearchResult, 0, opts.TopK)
full = append(full, merged...)
full = append(full, newKW[:remaining]...)
merged = full
}
// 8. Sort merged results by score descending so high-confidence
// keyword matches interleave with vector results instead of
// being stuck at the bottom.
sort.Slice(merged, func(i, j int) bool {
return merged[i].Score > merged[j].Score
})
// 8b. If slots remain unfilled, try fuzzy title matching
// to catch single-character typos/omissions.
filled := len(merged)
if filled < opts.TopK {
fuzzyResults, _ := db.FuzzyTitleSearch(terms, opts.TopK*2)
for _, r := range fuzzyResults {
if filled >= opts.TopK {
break
}
if seen[r.Path] {
continue
}
seen[r.Path] = true
snippet := r.Text
if len(snippet) > 500 {
snippet = snippet[:500]
}
merged = append(merged, SearchResult{
Path: r.Path,
Title: r.Title,
ChunkHeading: r.Heading,
Score: 0.4,
Distance: 0,
Snippet: snippet,
Domain: r.Domain,
Workstream: r.Workstream,
Agent: r.Agent,
Tags: r.Tags,
ContentType: r.ContentType,
Confidence: round3(r.Confidence),
})
filled++
}
}
}
// 9. Post-process: apply title-overlap-aware ranking.
// Uses bidirectional term overlap to re-sort results so that
// title-relevant notes rank above vector-only semantic matches.
// Also filters configured noise paths and near-dedups versioned files.
// This step ALWAYS runs, regardless of whether keyword results
// were merged, to ensure filtering and dedup apply to all results.
queryTerms := QueryWordsForTitleMatch(queryText)
if len(queryTerms) > 0 {
merged = RankSearchResults(merged, queryTerms)
}
return merged, nil
}
// keywordTitleScore computes a score for a keyword result based on how many
// search terms appear in its title. Exact title matches get the highest score.
// Scores are calibrated to interleave well with vector results (which range 0-1).
func keywordTitleScore(titleLower string, terms []string, termsSet map[string]bool) float64 {
trimmed := strings.TrimSpace(titleLower)
// Exact title match -> score just below top vector result
if termsSet[trimmed] {
return 0.95
}
// Count how many search terms appear in the title
matchCount := 0
for _, t := range terms {
if strings.Contains(titleLower, t) {
matchCount++
}
}
if matchCount == 0 {
return 0.5
}
// Score range: 0.55 (1 of many) to 0.85 (all terms match)
fraction := float64(matchCount) / float64(len(terms))
return round3(0.5 + 0.35*fraction)
}
// FuzzyTitleSearch finds notes whose titles contain words within edit distance 1
// of any search term. Only considers terms >= 5 chars to avoid false positives.
func (db *DB) FuzzyTitleSearch(terms []string, limit int) ([]RawSearchResult, error) {
// Filter to terms long enough for meaningful fuzzy matching
var fuzzyTerms []string
for _, t := range terms {
if len(t) >= 5 {
fuzzyTerms = append(fuzzyTerms, strings.ToLower(t))
}
}
if len(fuzzyTerms) == 0 || limit <= 0 {
return nil, nil
}
// Limit the scan to avoid full table scan on large vaults.
// We fetch more than needed since post-filtering reduces the set.
scanLimit := limit * 10
if scanLimit < 200 {
scanLimit = 200
}
rows, err := db.conn.Query(`
SELECT 0 as distance, n.id, n.path, n.title, n.chunk_heading, n.text,
n.domain, n.workstream, COALESCE(n.agent, ''), n.tags, n.content_type, n.confidence, n.modified
FROM vault_notes n WHERE n.chunk_id = 0 AND UPPER(n.path) NOT LIKE '_PRIVATE/%'
ORDER BY n.modified DESC
LIMIT ?`, scanLimit)
if err != nil {
return nil, err
}
defer rows.Close()
var results []RawSearchResult
for rows.Next() {
var r RawSearchResult
if err := rows.Scan(
&r.Distance, &r.NoteID, &r.Path, &r.Title, &r.Heading, &r.Text,
&r.Domain, &r.Workstream, &r.Agent, &r.Tags, &r.ContentType, &r.Confidence, &r.Modified,
); err != nil {
continue
}
titleLower := strings.ToLower(r.Title)
titleWords := splitTitleWords(titleLower)
for _, term := range fuzzyTerms {
matched := false
for _, word := range titleWords {
if editDistance1(term, word) {
matched = true
break
}
}
if matched {
results = append(results, r)
break
}
}
if len(results) >= limit {
break
}
}
return results, rows.Err()
}
// splitTitleWords splits a title into lowercase words, treating common
// punctuation as separators.
func splitTitleWords(title string) []string {
f := func(r rune) bool {
return r == ' ' || r == '-' || r == '_' || r == '(' || r == ')' ||
r == ',' || r == '.' || r == '/' || r == ':' || r == '\u2014' || r == '&'
}
return strings.FieldsFunc(title, f)
}
// editDistance1 checks if two strings have edit distance exactly 1
// (one substitution, insertion, or deletion).
func editDistance1(a, b string) bool {
la, lb := len(a), len(b)
if la == lb {
diffs := 0
for i := range a {
if a[i] != b[i] {
diffs++
}
if diffs > 1 {
return false
}
}
return diffs == 1
}
if la == lb+1 {
return canDeleteOne(a, b)
}
if la+1 == lb {
return canDeleteOne(b, a)
}
return false
}
// canDeleteOne checks if removing one character from longer produces shorter.
func canDeleteOne(longer, shorter string) bool {
i, j := 0, 0
skipped := false
for i < len(longer) && j < len(shorter) {
if longer[i] == shorter[j] {
i++
j++
} else if !skipped {
skipped = true
i++
} else {
return false
}
}
return true
}
// searchStopWords are common English words filtered from keyword search terms.
var searchStopWords = map[string]bool{
"the": true, "a": true, "an": true, "is": true, "are": true,
"was": true, "were": true, "be": true, "been": true, "being": true,
"have": true, "has": true, "had": true, "do": true, "does": true,
"did": true, "will": true, "would": true, "could": true, "should": true,
"may": true, "might": true, "shall": true, "can": true,
"of": true, "in": true, "to": true, "for": true, "with": true,
"on": true, "at": true, "from": true, "by": true, "about": true,
"as": true, "into": true, "through": true, "during": true,
"and": true, "or": true, "but": true, "not": true, "so": true,
"what": true, "how": true, "when": true, "where": true, "which": true,
"who": true, "whom": true, "this": true, "that": true, "these": true,
"those": true, "it": true, "its": true, "my": true, "your": true,
"our": true, "their": true, "i": true, "me": true, "we": true,
"you": true, "he": true, "she": true, "they": true, "them": true,
"explain": true, "describe": true, "tell": true, "show": true,
"work": true, "works": true, "tracked": true, "area": true,
"project": true, "help": true, "find": true, "search": true,
}
// meaningfulShortTerms are 2-character terms that carry domain meaning.
var meaningfulShortTerms = map[string]bool{
"ai": true, "os": true, "pm": true, "qa": true,
"ui": true, "ux": true, "hr": true, "ml": true,
}
// ExtractSearchTerms extracts meaningful search terms from a natural language
// query, filtering stop words and short terms. Exported for use by MCP and CLI.
func ExtractSearchTerms(query string) []string {
words := strings.Fields(query)
var terms []string
seen := make(map[string]bool)
for _, w := range words {
lower := strings.ToLower(w)
lower = strings.Trim(lower, ".,;:!?\"'()[]{}")
if len(lower) < 2 {
continue
}
if len(lower) == 2 && !meaningfulShortTerms[lower] {
continue
}
if searchStopWords[lower] || seen[lower] {
continue
}
seen[lower] = true
terms = append(terms, lower)
}
return terms
}
// sanitizeFTS5Term strips FTS5 special operators from a search term to prevent
// injection of FTS5 query syntax.
// escapeLIKE escapes SQL LIKE wildcard characters (% and _) in a term
// so they are matched literally. Use with ESCAPE '\' in the LIKE clause.
var likeEscaper = strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`)
func escapeLIKE(term string) string {
return likeEscaper.Replace(term)
}
func sanitizeFTS5Term(term string) string {
return strings.Map(func(r rune) rune {
switch r {
case '*', '^', '-', '"', '{', '}', '(', ')':
return -1
default:
return r
}
}, term)
}
// FTS5Search performs a full-text search using the FTS5 index with BM25 ranking.
// Used as a fallback when embedding provider is unavailable.
// Returns an error if FTS5 is not available.
func (db *DB) FTS5Search(query string, opts SearchOptions) ([]SearchResult, error) {
if !db.ftsAvailable {
return nil, fmt.Errorf("FTS5 not available")
}
if opts.TopK <= 0 {
opts.TopK = 10
}
// FTS5 query: use OR between terms so partial matches are included.
// BM25 ranking naturally scores documents with more matching terms higher,
// so multi-term matches rank above single-term matches without requiring AND.
terms := ExtractSearchTerms(query)
if len(terms) == 0 {
return nil, nil
}
// Sanitize terms to prevent FTS5 operator injection
sanitizedTerms := make([]string, 0, len(terms))
for _, term := range terms {
sanitized := sanitizeFTS5Term(term)
if sanitized != "" {
sanitizedTerms = append(sanitizedTerms, sanitized)
}
}
if len(sanitizedTerms) == 0 {
return nil, nil
}
ftsQuery := strings.Join(sanitizedTerms, " OR ")
rows, err := db.conn.Query(`
SELECT n.path, n.title, n.chunk_heading, n.text,
n.domain, n.workstream, COALESCE(n.agent, ''), n.tags, n.content_type, n.confidence, n.modified
FROM vault_notes_fts f
JOIN vault_notes n ON n.id = f.rowid
WHERE vault_notes_fts MATCH ? AND UPPER(n.path) NOT LIKE '_PRIVATE/%%'
ORDER BY bm25(vault_notes_fts) ASC
LIMIT ?`,
ftsQuery, opts.TopK*3,
)
if err != nil {
return nil, fmt.Errorf("FTS5 search: %w", err)
}
defer rows.Close()
seen := make(map[string]bool)
var results []SearchResult
for rows.Next() {
var r SearchResult
var modified float64
if err := rows.Scan(
&r.Path, &r.Title, &r.ChunkHeading, &r.Snippet,
&r.Domain, &r.Workstream, &r.Agent, &r.Tags, &r.ContentType, &r.Confidence, &modified,
); err != nil {
continue
}
// Dedup by path
if seen[r.Path] {
continue
}
seen[r.Path] = true
if len(r.Snippet) > 500 {
r.Snippet = r.Snippet[:500]
}
r.Score = 0.5 // FTS results get a baseline score
r.Distance = 0
// Apply metadata filters
if opts.Domain != "" && !strings.EqualFold(r.Domain, opts.Domain) {
continue
}
if opts.Workstream != "" && !strings.EqualFold(r.Workstream, opts.Workstream) {
continue
}
if opts.Agent != "" && !strings.EqualFold(r.Agent, opts.Agent) {
continue
}
if len(opts.Tags) > 0 && !hasTags(r.Tags, opts.Tags) {
continue
}
results = append(results, r)
if len(results) >= opts.TopK {
break
}
}
return results, rows.Err()
}
// FederatedResult extends SearchResult with the source vault name.
type FederatedResult struct {
SearchResult
Vault string `json:"vault"`
}
// MaxFederatedVaults is the maximum number of vaults that can be searched in
// a single federated search call. Prevents resource exhaustion.
const MaxFederatedVaults = 50
// FederatedSearch searches across multiple vault databases and merges results.
// Each entry in vaultDBPaths maps a vault alias to its database file path.
// Uses vector search (HybridSearch) if queryVec is non-nil and the vault has
// vectors; falls back to FTS5 or keyword search otherwise.
// SECURITY: DB paths should be pre-validated by the caller (derived from the
// vault registry, not from user input). Error messages use vault aliases only,
// never raw filesystem paths.
func FederatedSearch(vaultDBPaths map[string]string, queryVec []float32, queryText string, opts SearchOptions) ([]FederatedResult, error) {
if len(vaultDBPaths) == 0 {
return nil, nil
}
if len(vaultDBPaths) > MaxFederatedVaults {
return nil, fmt.Errorf("too many vaults (%d), maximum is %d", len(vaultDBPaths), MaxFederatedVaults)
}
queryText = strings.TrimSpace(queryText)
if queryText == "" {
return nil, nil
}
perVaultK := opts.TopK
if perVaultK <= 0 {
perVaultK = 10
}
var allResults []FederatedResult
var searchErrors []string
for alias, dbPath := range vaultDBPaths {
vaultDB, err := OpenPath(dbPath)
if err != nil {
// SECURITY: Use alias in error, not the raw filesystem path
searchErrors = append(searchErrors, fmt.Sprintf("vault %q: unavailable", alias))
continue
}
var results []SearchResult
vaultOpts := SearchOptions{
TopK: perVaultK,
Domain: opts.Domain,
Workstream: opts.Workstream,
Agent: opts.Agent,
Tags: opts.Tags,
}
if queryVec != nil && vaultDB.HasVectors() {
results, err = vaultDB.HybridSearch(queryVec, queryText, vaultOpts)
} else if vaultDB.FTSAvailable() {
results, err = vaultDB.FTS5Search(queryText, vaultOpts)
} else {
// Final fallback: keyword search on title/text
terms := ExtractSearchTerms(queryText)
if len(terms) > 0 {
raw, kwErr := vaultDB.KeywordSearch(terms, vaultOpts.TopK)
if kwErr == nil {
for _, r := range raw {
snippet := r.Text
if len(snippet) > 500 {
snippet = snippet[:500]
}
results = append(results, SearchResult{
Path: r.Path,
Title: r.Title,
ChunkHeading: r.Heading,
Score: 0.5,
Snippet: snippet,
Domain: r.Domain,
Workstream: r.Workstream,
Agent: r.Agent,
Tags: r.Tags,
ContentType: r.ContentType,
Confidence: round3(r.Confidence),
})
}
}
}
if len(results) == 0 {
if cerr := vaultDB.Close(); cerr != nil {
searchErrors = append(searchErrors, fmt.Sprintf("vault %q: close failed", alias))
}
continue
}
}
if cerr := vaultDB.Close(); cerr != nil {
searchErrors = append(searchErrors, fmt.Sprintf("vault %q: close failed", alias))
}
if err != nil {
// SECURITY: Use alias in error, not raw DB error which may contain paths
searchErrors = append(searchErrors, fmt.Sprintf("vault %q: search failed", alias))
continue
}
for _, r := range results {
allResults = append(allResults, FederatedResult{
SearchResult: r,
Vault: alias,
})
}
}
// Sort by score descending, then deduplicate by path+vault
sort.Slice(allResults, func(i, j int) bool {
return allResults[i].Score > allResults[j].Score
})
// Deduplicate: same path in same vault (shouldn't happen but defensive)
seen := make(map[string]bool)
var deduped []FederatedResult
for _, r := range allResults {
key := r.Vault + ":" + r.Path
if seen[key] {
continue
}
seen[key] = true
deduped = append(deduped, r)
}
// Trim to requested TopK
if opts.TopK > 0 && len(deduped) > opts.TopK {
deduped = deduped[:opts.TopK]
}
// Log any vault-level errors so users can diagnose issues.
if len(searchErrors) > 0 {
fmt.Fprintf(os.Stderr, "same: federated search: %d vault(s) skipped: %s\n",
len(searchErrors), strings.Join(searchErrors, "; "))
}
return deduped, nil
}
func round3(f float64) float64 {
return float64(int(f*1000+0.5)) / 1000
}
func round1(f float64) float64 {
return float64(int(f*10+0.5)) / 10
}