mod ignore_rules;
mod impact;
pub(crate) mod libs;
mod symbols;
use crate::error::{
repo_resolution_details, AppError, ERR_BACKOFF_REQUIRED, ERR_INVALID_ARGUMENT,
ERR_MISSING_INDEX, ERR_MISSING_REPO_PATH, ERR_REPO_STATE_MISMATCH, ERR_STALE_INDEX,
};
use crate::impact::{extract_import_edges, ImpactGraphEdge};
use crate::symbols::{
AstQuery, AstQueryMatch, AstResponseV1, AstSearchMatch, AstSearchMode, SymbolSearchMatch,
SymbolsParserStatus, SymbolsResponseV1, SymbolsStore,
};
use anyhow::{anyhow, Context, Result};
use ignore_rules::{build_ignore_matcher, IgnoreMatcher};
use once_cell::sync::Lazy;
use parking_lot::Mutex;
use regex::Regex;
use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fs::{self, File};
use std::io::{self, BufRead, BufReader, Read};
use std::path::{Component, Path, PathBuf};
use std::sync::{Arc, Once};
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::DocAddress;
use tantivy::{
doc, Document, Index, IndexReader, IndexWriter, ReloadPolicy, SnippetGenerator, Term,
};
use thiserror::Error;
use tracing::warn;
use walkdir::WalkDir;
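// Tuning knobs: Tantivy writer heap budget, the file-size threshold above which a binary sniff is
// performed before indexing, and how many leading bytes that sniff reads.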
const MAX_INDEX_RAM_BYTES: usize = 50 * 1024 * 1024;
const MAX_BINARY_FILE_BYTES: u64 = 5 * 1024 * 1024;
const BINARY_SNIFF_BYTES: usize = 8192;
const DOC_EXTENSIONS: &[&str] = &[".md", ".markdown", ".mdx", ".txt"];
const CODE_EXTENSIONS: &[&str] = &[
".rs", ".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".java", ".cs", ".c", ".h", ".cc", ".cpp",
".cxx", ".hh", ".hpp", ".hxx", ".php", ".kt", ".kts", ".swift", ".rb", ".lua", ".dart",
];
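// DEFAULT_EXTENSIONS is the union of DOC_EXTENSIONS and CODE_EXTENSIONS; keep the three lists in sync.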
const DEFAULT_EXTENSIONS: &[&str] = &[
".md",
".markdown",
".mdx",
".txt",
".rs",
".py",
".js",
".jsx",
".ts",
".tsx",
".go",
".java",
".cs",
".c",
".h",
".cc",
".cpp",
".cxx",
".hh",
".hpp",
".hxx",
".php",
".kt",
".kts",
".swift",
".rb",
".lua",
".dart",
];
const DEFAULT_EXCLUDED_DIR_NAMES: &[&str] = &[
// Core VCS / tooling
".git",
".idea",
".vscode",
".cache",
"tmp",
"temp",
".hg",
".svn",
".bzr",
".darcs",
".fossil",
".pijul",
"cvs",
// JS / TS / Node ecosystem
"node_modules",
".pnpm-store",
".yarn",
".yarn-cache",
".npm",
"dist",
"build",
"coverage",
".vite",
".turbo",
".nx",
".parcel-cache",
".rollup-cache",
".webpack-cache",
".tsbuildinfo",
".next",
".nuxt",
".svelte-kit",
".angular",
".expo",
// Python
"__pycache__",
".venv",
"venv",
".mypy_cache",
".pytest_cache",
".ruff_cache",
".tox",
".ipynb_checkpoints",
// Rust
"target",
".cargo",
// Go
"bin",
"go-build",
// Java / Kotlin / JVM
".gradle",
".mvn",
"out",
// .NET / C# / Visual Studio
"obj",
".vs",
".nuget",
"testresults",
// Swift / Xcode / Apple
"deriveddata",
".build",
".swiftpm",
"carthage",
// PHP / Composer
"vendor",
// Ruby / Bundler
".bundle",
// Dart / Flutter
".dart_tool",
".flutter-plugins",
".flutter-plugins-dependencies",
".pub-cache",
// Kotlin
".kotlin",
// Android
".android",
// iOS / CocoaPods
"pods",
// C / C++ / CMake / native
"debug",
"release",
"cmake-build-debug",
"cmake-build-release",
"cmakefiles",
".conan",
"vcpkg_installed",
// Lua
".luarocks",
// Haskell
".stack-work",
"dist-newstyle",
"cabal-dev",
// Elixir / Erlang
"_build",
"deps",
".elixir_ls",
// Scala / Metals / Bloop
".bloop",
".metals",
// Clojure
".cpcache",
// Elm
"elm-stuff",
// Nim
"nimcache",
// OCaml / Dune / opam
"_opam",
// R / RStudio
".rproj.user",
// Game engines: Unity / Unreal / Godot
"library",
"logs",
"obj",
"binaries",
"deriveddatacache",
"intermediate",
".godot",
// Infra / deployment / serverless
".docker",
"docker-data",
".terraform",
".serverless",
".vercel",
".netlify",
];
const DEFAULT_EXCLUDED_RELATIVE_PREFIXES: &[&str] = &[
"logs/",
".docdex/",
".docdex/logs/",
".docdex/tmp/",
".gpt-creator/logs/",
".gpt-creator/tmp/",
".mastercoda/logs/",
".mastercoda/tmp/",
"docker/.data/",
"docker-data/",
".docker/",
];
const MAX_SUMMARY_CHARS: usize = 360;
const MAX_SUMMARY_SEGMENTS: usize = 4;
const MAX_SNIPPET_CHARS: usize = 420;
const FALLBACK_PREVIEW_LINES: usize = 60;
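/// Per-repo indexing configuration: the state directory, excluded directory names and
/// relative path prefixes, and an optional `.gitignore`/`.docdexignore` matcher.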
#[derive(Clone)]
pub struct IndexConfig {
state_dir: PathBuf,
excluded_dir_names: Vec<String>,
excluded_relative_prefixes: Vec<String>,
symbols_enabled: bool,
ignore_matcher: Option<Arc<IgnoreMatcher>>,
}
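/// Tantivy-backed index over a single repository. Holds the schema fields, an optional
/// writer (absent in read-only mode), and the optional symbols/impact store.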
#[derive(Clone)]
pub struct Indexer {
repo_root: PathBuf,
config: IndexConfig,
index: Index,
reader: IndexReader,
doc_id_field: tantivy::schema::Field,
path_field: tantivy::schema::Field,
body_field: tantivy::schema::Field,
summary_field: tantivy::schema::Field,
token_field: tantivy::schema::Field,
kind_field: Option<tantivy::schema::Field>,
writer: Option<Arc<Mutex<IndexWriter>>>,
symbols_store: Option<SymbolsStore>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DocumentKind {
Doc,
Code,
}
impl DocumentKind {
fn as_str(&self) -> &'static str {
match self {
DocumentKind::Doc => "doc",
DocumentKind::Code => "code",
}
}
}
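/// A single full-text search result, including summary/snippet text and snippet provenance.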
#[derive(Debug, Clone, serde::Serialize)]
pub struct Hit {
pub doc_id: String,
pub rel_path: String,
// Stable search contract alias for `rel_path` (preferred by downstream clients).
pub path: String,
pub kind: DocumentKind,
pub score: f32,
pub summary: String,
pub snippet: String,
pub token_estimate: u64,
#[serde(skip_serializing_if = "Option::is_none")]
pub snippet_origin: Option<SearchSnippetOrigin>,
#[serde(skip_serializing_if = "Option::is_none")]
pub snippet_truncated: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub line_start: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub line_end: Option<usize>,
}
#[derive(Debug, Clone, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SearchSnippetOrigin {
Query,
Preview,
Summary,
}
#[derive(Debug, Clone, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryRewrite {
None,
Sanitized,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchQueryMeta {
pub raw: String,
pub effective: String,
pub rewrite: QueryRewrite,
}
#[derive(Error, Debug)]
pub enum SearchError {
#[error("invalid query: {reason}")]
InvalidQuery { reason: String },
}
#[derive(Debug, Clone, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SnippetOrigin {
Query,
Preview,
}
#[derive(Debug, Clone)]
pub struct SnippetResult {
pub text: String,
pub html: Option<String>,
pub truncated: bool,
pub origin: SnippetOrigin,
pub line_start: Option<usize>,
pub line_end: Option<usize>,
}
#[derive(Debug, serde::Serialize)]
pub struct DocSnapshot {
pub doc_id: String,
pub rel_path: String,
pub kind: DocumentKind,
pub summary: String,
pub token_estimate: u64,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct IndexStats {
pub num_docs: u64,
pub state_dir: PathBuf,
pub index_size_bytes: u64,
pub segments: usize,
#[serde(skip_serializing_if = "Option::is_none")]
pub avg_bytes_per_doc: Option<u64>,
pub generated_at_epoch_ms: u128,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_updated_epoch_ms: Option<u128>,
}
impl IndexConfig {
#[allow(dead_code)]
pub fn for_repo(repo_root: &Path) -> Result<Self> {
if env_flag_disabled("DOCDEX_ENABLE_SYMBOL_EXTRACTION") {
warn!(
target: "docdexd",
"symbol + impact extraction are always enabled; ignoring DOCDEX_ENABLE_SYMBOL_EXTRACTION=0"
);
}
Self::with_overrides(repo_root, None, Vec::new(), Vec::new(), true)
}
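    /// Builds a config on top of the defaults: extra excluded directory names are lowercased and
    /// deduplicated, extra prefixes are normalized (forward slashes, trailing `/`), and the state
    /// dir is auto-excluded when it lives inside the repo. The `symbols_enabled` flag is accepted
    /// for compatibility, but symbol extraction is always left enabled.
    ///
    /// Illustrative call (hypothetical paths; not a doctest):
    /// ```ignore
    /// let config = IndexConfig::with_overrides(
    ///     Path::new("/work/my-repo"),
    ///     None,
    ///     vec!["generated".into()],
    ///     vec!["docs/archive/".into()],
    ///     true,
    /// )?;
    /// ```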
pub fn with_overrides(
repo_root: &Path,
state_dir: Option<PathBuf>,
extra_excluded_dirs: Vec<String>,
extra_excluded_prefixes: Vec<String>,
symbols_enabled: bool,
) -> Result<Self> {
if !symbols_enabled {
warn!(
target: "docdexd",
"symbol + impact extraction are always enabled; ignoring symbols_enabled=false"
);
}
let state_dir = resolve_state_dir(repo_root, state_dir)?;
let mut excluded_dir_names: Vec<String> = DEFAULT_EXCLUDED_DIR_NAMES
.iter()
.map(|value| value.to_string())
.collect();
for dir in extra_excluded_dirs {
let lowered = dir.trim().to_lowercase();
if lowered.is_empty() {
continue;
}
if !excluded_dir_names.contains(&lowered) {
excluded_dir_names.push(lowered);
}
}
let mut excluded_relative_prefixes: Vec<String> = DEFAULT_EXCLUDED_RELATIVE_PREFIXES
.iter()
.map(|value| value.to_string())
.collect();
for prefix in extra_excluded_prefixes {
let normalized = normalize_prefix(&prefix);
if normalized.is_empty() {
continue;
}
if !excluded_relative_prefixes.contains(&normalized) {
excluded_relative_prefixes.push(normalized);
}
}
if let Ok(rel_state) = state_dir.strip_prefix(repo_root) {
let normalized = normalize_prefix(rel_state.to_string_lossy().as_ref());
if !normalized.is_empty() && !excluded_relative_prefixes.contains(&normalized) {
excluded_relative_prefixes.push(normalized);
}
}
let ignore_matcher = build_ignore_matcher(repo_root, &excluded_dir_names).map(Arc::new);
Ok(Self {
state_dir,
excluded_dir_names,
excluded_relative_prefixes,
symbols_enabled: true,
ignore_matcher,
})
}
pub fn state_dir(&self) -> &Path {
&self.state_dir
}
pub fn excluded_dir_names(&self) -> &[String] {
&self.excluded_dir_names
}
pub fn excluded_relative_prefixes(&self) -> &[String] {
&self.excluded_relative_prefixes
}
pub fn symbols_enabled(&self) -> bool {
self.symbols_enabled
}
pub fn ignore_matcher(&self) -> Option<&IgnoreMatcher> {
self.ignore_matcher.as_deref()
}
}
impl Indexer {
#[allow(dead_code)]
pub fn new(repo_root: PathBuf) -> Result<Self> {
if !repo_root.exists() {
return Err(missing_repo_path_error(&repo_root).into());
}
if !repo_root.is_dir() {
return Err(AppError::new(
ERR_INVALID_ARGUMENT,
format!("repo root is not a directory: {}", repo_root.display()),
)
.into());
}
let repo_root = repo_root.canonicalize().context("resolve repo root")?;
let config = IndexConfig::for_repo(&repo_root)?;
Self::with_config(repo_root, config)
}
pub fn with_config(repo_root: PathBuf, config: IndexConfig) -> Result<Self> {
if !repo_root.exists() {
return Err(missing_repo_path_error(&repo_root).into());
}
if !repo_root.is_dir() {
return Err(AppError::new(
ERR_INVALID_ARGUMENT,
format!("repo root is not a directory: {}", repo_root.display()),
)
.into());
}
let repo_root = repo_root.canonicalize().context("resolve repo root")?;
let created_state_dir = !config.state_dir().exists();
if created_state_dir {
ensure_state_dir_secure(config.state_dir())?;
}
        let (schema, ..) = build_schema();
let index = if config.state_dir().join("meta.json").exists() {
Index::open_in_dir(config.state_dir())
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?
} else {
Index::create_in_dir(config.state_dir(), schema.clone())?
};
ensure_state_dir_secure(config.state_dir())?;
hold_after_state_dir_created();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
let schema = index.schema();
let doc_id_field = schema
.get_field("doc_id")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let path_field = schema
.get_field("rel_path")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let body_field = schema
.get_field("body")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let summary_field = schema
.get_field("summary")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let token_field = schema
.get_field("token_estimate")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let kind_field = schema.get_field("kind").ok();
let writer = index.writer(MAX_INDEX_RAM_BYTES)?;
let symbols_store = if config.symbols_enabled() {
symbols::open_symbols_store(&repo_root, config.state_dir(), true)
} else {
None
};
if let Err(err) = crate::repo_manager::record_repo_opened(&repo_root, config.state_dir()) {
if let Some(identity) = err.downcast_ref::<crate::repo_manager::RepoIdentityError>() {
return Err(repo_state_mismatch_error(
&repo_root,
Some(config.state_dir()),
identity,
)
.into());
}
return Err(err).context("record repo identity metadata");
}
Ok(Self {
repo_root,
config,
index,
reader,
doc_id_field,
path_field,
body_field,
summary_field,
token_field,
kind_field,
writer: Some(Arc::new(Mutex::new(writer))),
symbols_store,
})
}
pub fn with_config_read_only(repo_root: PathBuf, config: IndexConfig) -> Result<Self> {
if !repo_root.exists() {
return Err(missing_repo_path_error(&repo_root).into());
}
if !repo_root.is_dir() {
return Err(AppError::new(
ERR_INVALID_ARGUMENT,
format!("repo root is not a directory: {}", repo_root.display()),
)
.into());
}
let repo_root = repo_root.canonicalize().context("resolve repo root")?;
if !config.state_dir().exists() {
return Err(AppError::new(
ERR_MISSING_INDEX,
format!(
"index not found at {}; run `docdexd index --repo {}` first",
config.state_dir().display(),
repo_root.display()
),
)
.into());
}
let index = Index::open_in_dir(config.state_dir())
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
let schema = index.schema();
let doc_id_field = schema
.get_field("doc_id")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let path_field = schema
.get_field("rel_path")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let body_field = schema
.get_field("body")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let summary_field = schema
.get_field("summary")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let token_field = schema
.get_field("token_estimate")
.map_err(|_| stale_index_error(config.state_dir(), Some(&repo_root)))?;
let kind_field = schema.get_field("kind").ok();
let symbols_store = if config.symbols_enabled() {
symbols::open_symbols_store(&repo_root, config.state_dir(), false)
} else {
None
};
if let Err(err) =
crate::repo_manager::validate_repo_state_dir(&repo_root, config.state_dir())
{
if let Some(identity) = err.downcast_ref::<crate::repo_manager::RepoIdentityError>() {
return Err(repo_state_mismatch_error(
&repo_root,
Some(config.state_dir()),
identity,
)
.into());
}
return Err(err).context("validate repo identity metadata");
}
Ok(Self {
repo_root,
config,
index,
reader,
doc_id_field,
path_field,
body_field,
summary_field,
token_field,
kind_field,
writer: None,
symbols_store,
})
}
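    /// Rebuilds the index from scratch: deletes every document, resets the symbols store, walks the
    /// repo indexing each eligible file, then commits and, when the symbols store is present,
    /// rewrites the impact graph.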
pub async fn reindex_all(&self) -> Result<()> {
let writer_arc = self.writer()?;
let mut writer = writer_arc.lock();
writer.delete_all_documents()?;
self.reset_symbols_store();
let mut impact_edges: BTreeSet<ImpactGraphEdge> = BTreeSet::new();
let mut impact_diagnostics: HashMap<String, crate::impact::ImpactDiagnostics> =
HashMap::new();
for entry in WalkDir::new(&self.repo_root)
.into_iter()
.filter_map(|e| e.ok())
.filter(|e| e.file_type().is_file())
{
let path = entry.path();
let decision = decide_file(path, &self.repo_root, &self.config);
if !decision.should_index() {
continue;
}
let ingest = self.add_document(&mut writer, path)?;
self.maybe_update_symbols(&ingest);
if self.symbols_store.is_some() {
for edge in ingest.impact_edges {
impact_edges.insert(edge);
}
if let Some(diag) = ingest.impact_diagnostics {
impact_diagnostics.insert(ingest.rel_path.clone(), diag);
}
}
}
writer.commit()?;
self.reader.reload()?;
if self.symbols_store.is_some() {
self.write_impact_graph(impact_edges.into_iter().collect(), impact_diagnostics)?;
}
Ok(())
}
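    /// Indexes a single file if it passes `decide_file`, replacing any existing document for the
    /// same relative path and updating symbols and impact data when the symbols store is present.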
pub async fn ingest_file(&self, file: PathBuf) -> Result<FileDecision> {
let path = file.canonicalize().context("resolve file")?;
let decision = decide_file(&path, &self.repo_root, &self.config);
if !decision.should_index() {
return Ok(decision);
}
let writer_arc = self.writer()?;
let mut writer = writer_arc.lock();
let rel = self.rel_path(&path)?;
let term = Term::from_field_text(self.doc_id_field, &rel);
writer.delete_term(term);
let ingest = self.add_document(&mut writer, &path)?;
self.maybe_update_symbols(&ingest);
writer.commit()?;
self.reader.reload()?;
if self.symbols_store.is_some() {
self.update_impact_graph_for_file(
&rel,
&ingest.impact_edges,
ingest.impact_diagnostics,
)?;
}
Ok(decision)
}
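    /// Removes a file's document, symbol records, and impact edges. Paths outside the repo root are
    /// treated as a no-op.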
pub async fn delete_file(&self, file: PathBuf) -> Result<()> {
let rel = match self.rel_path(&file) {
Ok(rel) => rel,
Err(_) => return Ok(()),
};
let writer_arc = self.writer()?;
let mut writer = writer_arc.lock();
let term = Term::from_field_text(self.doc_id_field, &rel);
writer.delete_term(term);
writer.commit()?;
self.reader.reload()?;
self.delete_symbols_record(&rel);
if self.symbols_store.is_some() {
self.remove_impact_edges_for_file(&rel)?;
}
Ok(())
}
#[allow(dead_code)]
pub fn search(&self, query: &str, limit: usize) -> Result<Vec<Hit>> {
let (hits, _meta) = self.search_with_query_meta(query, limit)?;
Ok(hits)
}
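    /// Runs a full-text query and also reports how it was interpreted: queries that fail to parse
    /// are retried with a sanitized form, and the rewrite is surfaced in `SearchQueryMeta`.
    ///
    /// Illustrative usage (assumes an already-built index; not a doctest):
    /// ```ignore
    /// let (hits, meta) = indexer.search_with_query_meta("error handling", 10)?;
    /// assert_eq!(meta.raw, "error handling");
    /// ```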
pub fn search_with_query_meta(
&self,
query: &str,
limit: usize,
) -> Result<(Vec<Hit>, SearchQueryMeta)> {
let raw = query.trim();
if raw.is_empty() {
return Err(SearchError::InvalidQuery {
reason: "query must not be empty".to_string(),
}
.into());
}
// Tantivy's query parser accepts some operator-only inputs (e.g. "!!!") that contain
// no searchable terms. Enforce a strict "must contain at least one term" rule for
// determinism and predictable validation behavior.
if sanitize_query(raw).trim().is_empty() {
return Err(SearchError::InvalidQuery {
reason: "query contains no searchable terms".to_string(),
}
.into());
}
let searcher = self.reader.searcher();
let parser = QueryParser::for_index(
&self.index,
vec![self.body_field, self.summary_field, self.path_field],
);
let (tantivy_query, query_meta) = match parser.parse_query(raw) {
Ok(q) => (
q,
SearchQueryMeta {
raw: raw.to_string(),
effective: raw.to_string(),
rewrite: QueryRewrite::None,
},
),
Err(err) => {
let sanitized = sanitize_query(raw);
if sanitized.trim().is_empty() {
return Err(SearchError::InvalidQuery {
reason: "query contains no searchable terms".to_string(),
}
.into());
}
match parser.parse_query(&sanitized) {
Ok(q) => (
q,
SearchQueryMeta {
raw: raw.to_string(),
effective: sanitized.clone(),
rewrite: QueryRewrite::Sanitized,
},
),
Err(err2) => {
return Err(SearchError::InvalidQuery {
reason: format!(
"query parse failed: {err}; sanitized parse failed: {err2}"
),
}
.into());
}
}
}
};
let mut snippet_generator =
SnippetGenerator::create(&searcher, tantivy_query.as_ref(), self.body_field).ok();
if let Some(generator) = snippet_generator.as_mut() {
generator.set_max_num_chars(MAX_SNIPPET_CHARS);
}
let top_docs = searcher.search(&tantivy_query, &TopDocs::with_limit(limit))?;
let mut results = Vec::with_capacity(top_docs.len());
for (score, addr) in top_docs {
let retrieved = searcher.doc(addr)?;
let body_text = retrieved
.get_first(self.body_field)
.and_then(|v| v.as_text())
.unwrap_or_default()
.to_string();
let doc_id = retrieved
.get_first(self.doc_id_field)
.and_then(|v| v.as_text().map(|s| s.to_string()))
.unwrap_or_default();
let rel_path = retrieved
.get_first(self.path_field)
.and_then(|v| v.as_text().map(|s| s.to_string()))
.unwrap_or_default();
let path = rel_path.clone();
let summary = retrieved
.get_first(self.summary_field)
.and_then(|v| v.as_text().map(|s| s.to_string()))
.unwrap_or_default();
let kind = self.document_kind_from_doc(&retrieved, &rel_path);
let token_estimate = retrieved
.get_first(self.token_field)
.and_then(|v| v.as_u64())
.unwrap_or(0);
let (snippet, snippet_origin, snippet_truncated, line_start, line_end) =
snippet_generator
.as_ref()
.and_then(|gen| {
let snippet = gen.snippet_from_doc(&retrieved);
let fragment = snippet.fragment().trim().to_string();
if fragment.is_empty() {
None
} else {
let range = line_range_for_fragment(&body_text, &fragment);
let inferred_truncated =
fragment.chars().count() >= MAX_SNIPPET_CHARS.saturating_sub(1);
Some((
fragment,
SearchSnippetOrigin::Query,
inferred_truncated,
range.map(|r| r.0),
range.map(|r| r.1),
))
}
})
.or_else(|| {
match self.preview_snippet(&rel_path, FALLBACK_PREVIEW_LINES) {
Ok(Some((text, truncated, start_line, end_line))) => {
Some((
text,
SearchSnippetOrigin::Preview,
truncated,
Some(start_line),
Some(end_line),
))
}
Ok(None) => None,
Err(err) => {
warn!(target: "docdexd", error = ?err, %rel_path, "failed to build fallback snippet");
None
}
}
})
.unwrap_or_else(|| {
(
summary.clone(),
SearchSnippetOrigin::Summary,
false,
None,
None,
)
});
results.push(Hit {
doc_id,
rel_path,
path,
kind,
score,
summary,
snippet,
token_estimate,
snippet_origin: Some(snippet_origin),
snippet_truncated: Some(snippet_truncated),
line_start,
line_end,
});
}
sort_hits_deterministically(&mut results);
Ok((results, query_meta))
}
fn fetch_document(&self, doc_id: &str) -> Result<Option<Document>> {
let searcher = self.reader.searcher();
let term = Term::from_field_text(self.doc_id_field, doc_id);
let term_query =
tantivy::query::TermQuery::new(term, tantivy::schema::IndexRecordOption::Basic);
let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;
if let Some((_score, addr)) = top_docs.into_iter().next() {
let doc = searcher.doc(addr)?;
return Ok(Some(doc));
}
Ok(None)
}
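    /// Builds a fallback snippet from the first `max_lines` lines of the file on disk, skipping
    /// blank lines. Only safe relative paths (no absolute or `..` components) are read.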
pub fn preview_snippet(
&self,
rel_path: &str,
max_lines: usize,
) -> Result<Option<(String, bool, usize, usize)>> {
if max_lines == 0 {
return Ok(None);
}
if !is_safe_rel_path(rel_path) {
return Ok(None);
}
let path = self.repo_root.join(rel_path);
let file = match File::open(&path) {
Ok(file) => file,
Err(err) => {
if err.kind() == io::ErrorKind::NotFound {
return Ok(None);
}
return Err(err).with_context(|| format!("open {}", path.display()));
}
};
let reader = BufReader::new(file);
let mut preview_lines: Vec<(usize, String)> = Vec::new();
let mut truncated = false;
for (idx, line_res) in reader.lines().enumerate() {
if idx >= max_lines {
truncated = true;
break;
}
let line = line_res?;
let trimmed = line.trim();
if !trimmed.is_empty() {
preview_lines.push((idx + 1, trimmed.to_string()));
}
}
if preview_lines.is_empty() {
return Ok(None);
}
let (snippet, snippet_truncated) = condense_snippet(
&preview_lines
.iter()
.map(|(_, text)| text.clone())
.collect::<Vec<_>>(),
MAX_SNIPPET_CHARS,
);
if snippet.is_empty() {
return Ok(None);
}
let start_line = preview_lines.first().map(|(line, _)| *line).unwrap_or(1);
let end_line = preview_lines
.last()
.map(|(line, _)| *line)
.unwrap_or(start_line);
Ok(Some((
snippet,
truncated || snippet_truncated,
start_line,
end_line,
)))
}
pub fn repo_root(&self) -> &Path {
&self.repo_root
}
pub fn read_symbols(&self, rel_path: &str) -> Result<Option<SymbolsResponseV1>> {
let Some(store) = self.symbols_store.as_ref() else {
return Ok(None);
};
if store.requires_reindex()? {
return Ok(None);
}
store.read_symbols(rel_path)
}
pub fn read_ast(&self, rel_path: &str, max_nodes: usize) -> Result<Option<AstResponseV1>> {
let Some(store) = self.symbols_store.as_ref() else {
return Ok(None);
};
if store.requires_reindex()? {
return Ok(None);
}
store.read_ast(rel_path, max_nodes)
}
pub fn symbols_parser_status(&self) -> Result<SymbolsParserStatus> {
match self.symbols_store.as_ref() {
Some(store) => store.parser_status(),
None => {
let store = SymbolsStore::new(self.repo_root(), self.config.state_dir())?;
store.parser_status()
}
}
}
pub fn symbols_reindex_required(&self) -> Result<bool> {
let status = match self.symbols_store.as_ref() {
Some(store) => store.parser_status()?,
None => {
let store = SymbolsStore::new(self.repo_root(), self.config.state_dir())?;
store.parser_status()?
}
};
Ok(status.requires_reindex || status.drift)
}
pub fn search_symbols(
&self,
query: &str,
max_files: usize,
max_symbols_per_file: usize,
) -> Result<Vec<SymbolSearchMatch>> {
let Some(store) = self.symbols_store.as_ref() else {
return Ok(Vec::new());
};
if store.requires_reindex()? {
return Ok(Vec::new());
}
store.search_symbols(query, max_files, max_symbols_per_file)
}
pub fn search_ast_kinds(
&self,
kinds: &[String],
max_files: usize,
) -> Result<Vec<AstSearchMatch>> {
let Some(store) = self.symbols_store.as_ref() else {
return Ok(Vec::new());
};
if store.requires_reindex()? {
return Ok(Vec::new());
}
store.search_ast_kinds(kinds, max_files)
}
pub fn search_ast_kinds_with_mode(
&self,
kinds: &[String],
max_files: usize,
mode: AstSearchMode,
) -> Result<Vec<AstSearchMatch>> {
let Some(store) = self.symbols_store.as_ref() else {
return Ok(Vec::new());
};
if store.requires_reindex()? {
return Ok(Vec::new());
}
store.search_ast_kinds_with_mode(kinds, max_files, mode)
}
pub fn ast_kind_counts_for_file(
&self,
rel_path: &str,
kinds: &[String],
) -> Result<BTreeMap<String, usize>> {
let Some(store) = self.symbols_store.as_ref() else {
return Ok(BTreeMap::new());
};
if store.requires_reindex()? {
return Ok(BTreeMap::new());
}
store.ast_kind_counts_for_file(rel_path, kinds)
}
pub fn query_ast(&self, query: &AstQuery) -> Result<Vec<AstQueryMatch>> {
let Some(store) = self.symbols_store.as_ref() else {
return Ok(Vec::new());
};
if store.requires_reindex()? {
return Ok(Vec::new());
}
store.query_ast(query)
}
pub fn state_dir(&self) -> &Path {
self.config.state_dir()
}
fn writer(&self) -> Result<Arc<Mutex<IndexWriter>>> {
self.writer.clone().ok_or_else(|| {
AppError::new(
ERR_BACKOFF_REQUIRED,
"index writer unavailable (another docdexd may be indexing); retry later",
)
.into()
})
}
pub fn config(&self) -> &IndexConfig {
&self.config
}
pub fn symbols_enabled(&self) -> bool {
self.config.symbols_enabled()
}
pub fn num_docs(&self) -> u64 {
let searcher = self.reader.searcher();
let mut num_docs: u64 = 0;
for segment_reader in searcher.segment_readers() {
let live_docs = segment_reader
.alive_bitset()
.map(|bits| bits.num_alive_docs() as u64)
.unwrap_or_else(|| segment_reader.max_doc() as u64);
num_docs = num_docs.saturating_add(live_docs);
}
num_docs
}
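    /// Gathers index statistics: live document count, segment count, total on-disk size of the
    /// state directory, and the newest modification time found under it.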
pub fn stats(&self) -> Result<IndexStats> {
let searcher = self.reader.searcher();
let num_docs = self.num_docs();
        let segments = searcher.segment_readers().len();
let state_dir = self.config.state_dir().to_path_buf();
let index_size_bytes = walkdir::WalkDir::new(&state_dir)
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.map(|meta| meta.len())
.sum();
let mut last_updated_epoch_ms: Option<u128> = None;
for entry in walkdir::WalkDir::new(&state_dir).into_iter().flatten() {
if let Ok(meta) = entry.metadata() {
if let Ok(modified) = meta.modified() {
if let Ok(dur) = modified.duration_since(std::time::UNIX_EPOCH) {
let millis = dur.as_millis();
if last_updated_epoch_ms
.map(|current| millis > current)
.unwrap_or(true)
{
last_updated_epoch_ms = Some(millis);
}
}
}
}
}
let generated_at_epoch_ms = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis();
let avg_bytes_per_doc = if num_docs > 0 {
Some(index_size_bytes / num_docs)
} else {
None
};
Ok(IndexStats {
num_docs,
state_dir,
index_size_bytes,
segments,
avg_bytes_per_doc,
generated_at_epoch_ms,
last_updated_epoch_ms,
})
}
pub fn snapshot_with_snippet(
&self,
doc_id: &str,
query: Option<&str>,
fallback_lines: usize,
) -> Result<Option<(DocSnapshot, Option<SnippetResult>)>> {
let Some(doc) = self.fetch_document(doc_id)? else {
return Ok(None);
};
let snapshot = self.snapshot_from_document(doc_id, &doc);
let snippet =
self.snippet_from_document(&doc, Some(&snapshot.rel_path), query, fallback_lines)?;
Ok(Some((snapshot, snippet)))
}
pub fn list_docs(&self, offset: usize, limit: usize) -> Result<(Vec<DocSnapshot>, u64)> {
let searcher = self.reader.searcher();
let mut snapshots = Vec::new();
let mut skipped = 0usize;
let mut total_live: u64 = 0;
'outer: for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
let alive = segment_reader.alive_bitset();
let max_doc = segment_reader.max_doc();
let live_in_segment = alive
.map(|bits| bits.num_alive_docs() as u64)
.unwrap_or_else(|| max_doc as u64);
total_live = total_live.saturating_add(live_in_segment);
let doc_iter: Box<dyn Iterator<Item = u32>> = if let Some(bits) = alive {
Box::new(bits.iter_alive())
} else {
Box::new(0..max_doc)
};
for doc_id in doc_iter {
if skipped < offset {
skipped += 1;
continue;
}
if snapshots.len() >= limit {
break 'outer;
}
let address = DocAddress::new(segment_ord as u32, doc_id);
let doc = searcher.doc(address)?;
let doc_id_text = doc
.get_first(self.doc_id_field)
.and_then(|v| v.as_text())
.unwrap_or_default();
snapshots.push(self.snapshot_from_document(doc_id_text, &doc));
}
}
Ok((snapshots, total_live))
}
fn add_document(&self, writer: &mut IndexWriter, path: &Path) -> Result<DocumentIngest> {
let rel = self.rel_path(path)?;
let rel_for_return = rel.clone();
let (content, read_error) = match fs::read_to_string(path) {
Ok(content) => (content, None),
Err(err) => (String::new(), Some(err.to_string())),
};
let content_for_symbols = if self.symbols_store.is_some() {
content.clone()
} else {
String::new()
};
let (impact_edges, impact_diagnostics) =
if self.symbols_store.is_some() && read_error.is_none() {
let result = extract_import_edges(
&self.repo_root,
self.config.state_dir(),
&rel_for_return,
&content,
);
(result.edges, result.diagnostics)
} else {
(Vec::new(), None)
};
let summary = summarize(&content);
let tokens = estimate_tokens(&content);
let kind = document_kind_for_path(&rel_for_return);
let mut document = doc!(
self.doc_id_field => rel.clone(),
self.path_field => rel,
self.body_field => content,
self.summary_field => summary,
self.token_field => tokens,
);
if let Some(kind_field) = self.kind_field {
document.add_text(kind_field, kind.as_str());
}
writer.add_document(document)?;
Ok(DocumentIngest {
rel_path: rel_for_return,
content: content_for_symbols,
read_error,
impact_edges,
impact_diagnostics,
})
}
fn rel_path(&self, path: &Path) -> Result<String> {
let rel = path
.strip_prefix(&self.repo_root)
.map_err(|_| anyhow!("{} is outside repo root", path.display()))?;
Ok(rel.to_string_lossy().replace('\\', "/"))
}
fn snapshot_from_document(&self, doc_id: &str, doc: &Document) -> DocSnapshot {
let rel_path = doc
.get_first(self.path_field)
.and_then(|v| v.as_text().map(|s| s.to_string()))
.unwrap_or_default();
let summary = doc
.get_first(self.summary_field)
.and_then(|v| v.as_text().map(|s| s.to_string()))
.unwrap_or_default();
let kind = self.document_kind_from_doc(doc, &rel_path);
let token_estimate = doc
.get_first(self.token_field)
.and_then(|v| v.as_u64())
.unwrap_or(0);
DocSnapshot {
doc_id: doc_id.to_string(),
rel_path,
kind,
summary,
token_estimate,
}
}
fn snippet_from_document(
&self,
doc: &Document,
rel_path_hint: Option<&str>,
query: Option<&str>,
fallback_lines: usize,
) -> Result<Option<SnippetResult>> {
let searcher = self.reader.searcher();
if let Some(query) = query.and_then(|q| {
let trimmed = q.trim();
if trimmed.is_empty() {
None
} else {
Some(trimmed)
}
}) {
let parser = QueryParser::for_index(&self.index, vec![self.body_field]);
if let Ok(parsed) = parser.parse_query(query) {
if let Ok(mut generator) =
SnippetGenerator::create(&searcher, parsed.as_ref(), self.body_field)
{
generator.set_max_num_chars(MAX_SNIPPET_CHARS);
let snippet = generator.snippet_from_doc(doc);
let fragment = snippet.fragment().trim();
if !fragment.is_empty() {
return Ok(Some(SnippetResult {
text: fragment.to_string(),
html: Some(snippet.to_html()),
truncated: false,
origin: SnippetOrigin::Query,
line_start: None,
line_end: None,
}));
}
}
}
}
        let rel_path = rel_path_hint.map(|p| p.to_string()).or_else(|| {
            doc.get_first(self.path_field)
                .and_then(|v| v.as_text().map(|s| s.to_string()))
        });
if let Some(rel_path) = rel_path {
if let Some((text, truncated, line_start, line_end)) =
self.preview_snippet(&rel_path, fallback_lines)?
{
return Ok(Some(SnippetResult {
text,
html: None,
truncated,
origin: SnippetOrigin::Preview,
line_start: Some(line_start),
line_end: Some(line_end),
}));
}
}
Ok(None)
}
}
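/// Per-file data captured during indexing, handed to the symbols store and impact graph updaters.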
struct DocumentIngest {
rel_path: String,
content: String,
read_error: Option<String>,
impact_edges: Vec<ImpactGraphEdge>,
impact_diagnostics: Option<crate::impact::ImpactDiagnostics>,
}
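/// Returns true when the environment variable is explicitly set to a falsy value
/// ("0", "false", "no", "off").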
fn env_flag_disabled(key: &str) -> bool {
std::env::var(key)
.ok()
.map(|v| {
matches!(
v.trim().to_lowercase().as_str(),
"0" | "false" | "no" | "off"
)
})
.unwrap_or(false)
}
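/// Declares the Tantivy schema: `doc_id`, `rel_path`, and `kind` as raw (untokenized) strings,
/// `body` and `summary` as tokenized text, and `token_estimate` as a fast u64. All fields are stored.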
fn build_schema() -> (
Schema,
tantivy::schema::Field,
tantivy::schema::Field,
tantivy::schema::Field,
tantivy::schema::Field,
tantivy::schema::Field,
tantivy::schema::Field,
) {
let mut builder = Schema::builder();
let doc_id_field = builder.add_text_field("doc_id", STRING | STORED);
let path_field = builder.add_text_field("rel_path", STRING | STORED);
let body_field = builder.add_text_field("body", TEXT | STORED);
let summary_field = builder.add_text_field("summary", TEXT | STORED);
let token_field = builder.add_u64_field("token_estimate", FAST | STORED);
let kind_field = builder.add_text_field("kind", STRING | STORED);
let schema = builder.build();
(
schema,
doc_id_field,
path_field,
body_field,
summary_field,
token_field,
kind_field,
)
}
fn document_kind_from_text(value: &str) -> Option<DocumentKind> {
match value.trim() {
"doc" => Some(DocumentKind::Doc),
"code" => Some(DocumentKind::Code),
_ => None,
}
}
fn document_kind_for_path(rel_path: &str) -> DocumentKind {
let extension = Path::new(rel_path)
.extension()
.and_then(|ext| ext.to_str())
.map(|ext| format!(".{}", ext.to_lowercase()));
if let Some(extension) = extension {
if DOC_EXTENSIONS.contains(&extension.as_str()) {
return DocumentKind::Doc;
}
if CODE_EXTENSIONS.contains(&extension.as_str()) {
return DocumentKind::Code;
}
}
DocumentKind::Doc
}
impl Indexer {
fn document_kind_from_doc(&self, doc: &Document, rel_path: &str) -> DocumentKind {
if let Some(kind_field) = self.kind_field {
if let Some(raw) = doc.get_first(kind_field).and_then(|v| v.as_text()) {
if let Some(kind) = document_kind_from_text(raw) {
return kind;
}
}
}
document_kind_for_path(rel_path)
}
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum FileDecisionOutcome {
Include,
Exclude,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case", tag = "code")]
pub enum FileDecisionReason {
OutsideRepo,
StateDir,
NotAFile,
ExcludedPrefix { prefix: String },
ExcludedDirName { name: String },
IgnoredByPattern,
MissingExtension,
UnsupportedExtension { extension: String },
BinaryTooLarge { bytes: u64 },
AllowedExtension { extension: String },
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub struct FileDecision {
pub decision: FileDecisionOutcome,
pub reason: FileDecisionReason,
}
impl FileDecision {
fn include(reason: FileDecisionReason) -> Self {
Self {
decision: FileDecisionOutcome::Include,
reason,
}
}
fn exclude(reason: FileDecisionReason) -> Self {
Self {
decision: FileDecisionOutcome::Exclude,
reason,
}
}
pub fn should_index(&self) -> bool {
matches!(self.decision, FileDecisionOutcome::Include)
}
}
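/// Decides whether a path should be indexed, and why. Checks run in order: state-dir membership,
/// non-file paths, repo membership, ignore patterns, excluded relative prefixes (longest match wins),
/// excluded directory names, the extension allow-list, and finally a size + binary sniff for large files.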
pub(crate) fn decide_file(path: &Path, repo_root: &Path, config: &IndexConfig) -> FileDecision {
if path.starts_with(config.state_dir()) {
return FileDecision::exclude(FileDecisionReason::StateDir);
}
if let (Ok(state_dir), Ok(canonical)) = (config.state_dir().canonicalize(), path.canonicalize())
{
if canonical.starts_with(state_dir) {
return FileDecision::exclude(FileDecisionReason::StateDir);
}
}
if path.exists() && !path.is_file() {
return FileDecision::exclude(FileDecisionReason::NotAFile);
}
let relative: PathBuf = if path.starts_with(repo_root) {
match path.strip_prefix(repo_root) {
Ok(value) => value.to_path_buf(),
Err(_) => {
return FileDecision::exclude(FileDecisionReason::OutsideRepo);
}
}
} else if let (Ok(repo_canon), Ok(path_canon)) = (repo_root.canonicalize(), path.canonicalize())
{
if path_canon.starts_with(&repo_canon) {
match path_canon.strip_prefix(&repo_canon) {
Ok(value) => value.to_path_buf(),
Err(_) => {
return FileDecision::exclude(FileDecisionReason::OutsideRepo);
}
}
} else {
return FileDecision::exclude(FileDecisionReason::OutsideRepo);
}
} else {
return FileDecision::exclude(FileDecisionReason::OutsideRepo);
};
    let normalized = relative
        .to_string_lossy()
        .replace('\\', "/")
        .trim_start_matches('/')
        .to_lowercase();
if let Some(matcher) = config.ignore_matcher() {
let is_dir = path.is_dir();
if matcher.is_ignored(path, is_dir) {
return FileDecision::exclude(FileDecisionReason::IgnoredByPattern);
}
}
let mut best_prefix: Option<&String> = None;
for prefix in config.excluded_relative_prefixes().iter() {
if !normalized.starts_with(prefix) {
continue;
}
best_prefix = match best_prefix {
None => Some(prefix),
Some(current) => {
if prefix.len() > current.len()
|| (prefix.len() == current.len() && prefix.as_str() < current.as_str())
{
Some(prefix)
} else {
Some(current)
}
}
};
}
if let Some(prefix) = best_prefix {
return FileDecision::exclude(FileDecisionReason::ExcludedPrefix {
prefix: prefix.clone(),
});
}
for component in relative.components() {
if let Component::Normal(name) = component {
let name_lower = name.to_string_lossy().to_lowercase();
if config
.excluded_dir_names()
.iter()
.any(|excluded| excluded == &name_lower)
{
return FileDecision::exclude(FileDecisionReason::ExcludedDirName {
name: name_lower,
});
}
}
}
let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
return FileDecision::exclude(FileDecisionReason::MissingExtension);
};
let extension = format!(".{}", ext.to_lowercase());
if !DEFAULT_EXTENSIONS.contains(&extension.as_str()) {
return FileDecision::exclude(FileDecisionReason::UnsupportedExtension { extension });
}
if let Ok(meta) = path.metadata() {
if meta.len() > MAX_BINARY_FILE_BYTES {
if is_probably_binary(path).unwrap_or(true) {
return FileDecision::exclude(FileDecisionReason::BinaryTooLarge {
bytes: meta.len(),
});
}
}
}
FileDecision::include(FileDecisionReason::AllowedExtension { extension })
}
pub(crate) fn should_index(path: &Path, repo_root: &Path, config: &IndexConfig) -> bool {
decide_file(path, repo_root, config).should_index()
}
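/// Sniffs up to `BINARY_SNIFF_BYTES` from the start of the file; NUL bytes or invalid UTF-8 in the
/// sample mark it as binary.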
fn is_probably_binary(path: &Path) -> io::Result<bool> {
let mut file = File::open(path)?;
let mut buffer = [0u8; BINARY_SNIFF_BYTES];
let read = file.read(&mut buffer)?;
let sample = &buffer[..read];
if sample.iter().any(|byte| *byte == 0) {
return Ok(true);
}
Ok(std::str::from_utf8(sample).is_err())
}
#[cfg(test)]
mod file_decision_tests {
use super::*;
use std::fs;
use tempfile::TempDir;
#[test]
fn decide_file_picks_longest_excluded_prefix() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let config = IndexConfig::with_overrides(
&repo_root,
None,
Vec::new(),
vec!["docs/".into(), "docs/private/".into()],
true,
)
.expect("config");
let file = repo_root.join("docs/private/a.md");
fs::create_dir_all(file.parent().expect("parent dir")).expect("mkdir");
fs::write(&file, "# test\n").expect("write file");
let decision = decide_file(&file, &repo_root, &config);
assert_eq!(decision.decision, FileDecisionOutcome::Exclude);
assert_eq!(
decision.reason,
FileDecisionReason::ExcludedPrefix {
prefix: "docs/private/".to_string()
}
);
}
#[test]
fn decide_file_excludes_state_dir_before_prefix_rules() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let state_dir = repo_root.join(".docdex-state");
let config = IndexConfig::with_overrides(
&repo_root,
Some(state_dir.clone()),
Vec::new(),
Vec::new(),
true,
)
.expect("config");
let file = config.state_dir().join("doc.md");
fs::create_dir_all(file.parent().expect("parent dir")).expect("mkdir");
fs::write(&file, "# state dir\n").expect("write file");
let decision = decide_file(&file, &repo_root, &config);
assert_eq!(decision.decision, FileDecisionOutcome::Exclude);
assert_eq!(decision.reason, FileDecisionReason::StateDir);
}
#[test]
fn decide_file_excludes_default_vendor_dir() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let config = IndexConfig::with_overrides(&repo_root, None, Vec::new(), Vec::new(), true)
.expect("config");
let file = repo_root.join("vendor/doc.md");
fs::create_dir_all(file.parent().expect("parent dir")).expect("mkdir");
fs::write(&file, "# vendor\n").expect("write file");
let decision = decide_file(&file, &repo_root, &config);
assert_eq!(decision.decision, FileDecisionOutcome::Exclude);
assert_eq!(
decision.reason,
FileDecisionReason::ExcludedDirName {
name: "vendor".to_string()
}
);
}
#[test]
fn decide_file_excludes_outside_repo() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let config = IndexConfig::with_overrides(&repo_root, None, Vec::new(), Vec::new(), true)
.expect("config");
let other = TempDir::new().expect("other repo");
let outside = other.path().join("note.md");
fs::write(&outside, "# outside\n").expect("write file");
let decision = decide_file(&outside, &repo_root, &config);
assert_eq!(decision.decision, FileDecisionOutcome::Exclude);
assert_eq!(decision.reason, FileDecisionReason::OutsideRepo);
}
#[test]
fn decide_file_excludes_large_binary() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let config = IndexConfig::with_overrides(&repo_root, None, Vec::new(), Vec::new(), true)
.expect("config");
let binary_path = repo_root.join("large.md");
let blob = vec![0u8; (MAX_BINARY_FILE_BYTES as usize) + 1];
fs::write(&binary_path, blob).expect("write binary");
let decision = decide_file(&binary_path, &repo_root, &config);
assert_eq!(
decision.reason,
FileDecisionReason::BinaryTooLarge {
bytes: (MAX_BINARY_FILE_BYTES + 1)
}
);
}
#[test]
fn decide_file_includes_supported_extensions() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let config = IndexConfig::with_overrides(&repo_root, None, Vec::new(), Vec::new(), true)
.expect("config");
let file = repo_root.join("docs/notes.txt");
fs::create_dir_all(file.parent().expect("parent dir")).expect("mkdir");
fs::write(&file, "hello\n").expect("write file");
let decision = decide_file(&file, &repo_root, &config);
assert_eq!(decision.decision, FileDecisionOutcome::Include);
assert_eq!(
decision.reason,
FileDecisionReason::AllowedExtension {
extension: ".txt".to_string()
}
);
}
#[test]
fn decide_file_respects_gitignore() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let ignore_path = repo_root.join(".gitignore");
fs::write(&ignore_path, "ignored.md\n").expect("write gitignore");
let file = repo_root.join("ignored.md");
fs::write(&file, "ignore me\n").expect("write file");
let config = IndexConfig::with_overrides(&repo_root, None, Vec::new(), Vec::new(), true)
.expect("config");
let decision = decide_file(&file, &repo_root, &config);
assert_eq!(decision.decision, FileDecisionOutcome::Exclude);
assert_eq!(decision.reason, FileDecisionReason::IgnoredByPattern);
}
#[test]
fn decide_file_respects_docdexignore() {
let repo = TempDir::new().expect("temp repo");
let repo_root = repo.path().canonicalize().expect("canonical repo root");
let ignore_path = repo_root.join(".docdexignore");
fs::write(&ignore_path, "docs/private/\n").expect("write docdexignore");
let file = repo_root.join("docs/private/notes.md");
fs::create_dir_all(file.parent().expect("parent dir")).expect("mkdir");
fs::write(&file, "ignore me\n").expect("write file");
let config = IndexConfig::with_overrides(&repo_root, None, Vec::new(), Vec::new(), true)
.expect("config");
let decision = decide_file(&file, &repo_root, &config);
assert_eq!(decision.decision, FileDecisionOutcome::Exclude);
assert_eq!(decision.reason, FileDecisionReason::IgnoredByPattern);
}
}
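/// Test hook: sleeps once after the state dir is created when
/// `DOCDEX_TEST_HOLD_AFTER_STATE_DIR_CREATED_MS` is set, to widen race windows in tests.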
fn hold_after_state_dir_created() {
let Ok(value) = std::env::var("DOCDEX_TEST_HOLD_AFTER_STATE_DIR_CREATED_MS") else {
return;
};
let Ok(ms) = value.trim().parse::<u64>() else {
return;
};
static HOLD_ONCE: Once = Once::new();
HOLD_ONCE.call_once(|| std::thread::sleep(std::time::Duration::from_millis(ms)));
}
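/// Creates the state directory (recursively) with mode 0o700 on Unix and best-effort tightens the
/// permissions of an existing directory; on non-Unix platforms it simply creates the directory.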
pub(crate) fn ensure_state_dir_secure(path: &Path) -> Result<()> {
#[cfg(unix)]
{
use std::fs::DirBuilder;
use std::os::unix::fs::DirBuilderExt;
use std::os::unix::fs::PermissionsExt;
let mut builder = DirBuilder::new();
builder.recursive(true);
builder.mode(0o700);
builder.create(path)?;
let metadata = fs::metadata(path)?;
let current = metadata.permissions().mode() & 0o777;
if current != 0o700 {
let mut perms = metadata.permissions();
perms.set_mode(0o700);
if let Err(err) = fs::set_permissions(path, perms) {
let is_perm_err = err.kind() == std::io::ErrorKind::PermissionDenied
|| err.raw_os_error() == Some(1);
if is_perm_err && can_write_dir(path) {
warn!(
target: "docdexd",
error = %err,
"state dir permissions could not be tightened; continuing with existing perms"
);
} else {
return Err(err.into());
}
}
}
}
#[cfg(not(unix))]
{
fs::create_dir_all(path)?;
}
Ok(())
}
#[cfg(unix)]
fn can_write_dir(path: &Path) -> bool {
let probe = path.join(format!(".docdex-perm-check-{}", std::process::id()));
match fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(&probe)
{
Ok(_) => {
let _ = fs::remove_file(&probe);
true
}
Err(_) => false,
}
}
fn normalize_for_error(path: &Path) -> String {
path.to_string_lossy().replace('\\', "/")
}
fn known_canonical_path_from_repo_meta(index_state_dir: &Path) -> Option<String> {
if index_state_dir.file_name().and_then(|s| s.to_str())? != "index" {
return None;
}
let state_key_dir = index_state_dir.parent()?;
let state_key = state_key_dir.file_name()?.to_string_lossy().to_string();
let repos_dir = state_key_dir.parent()?;
if repos_dir.file_name().and_then(|s| s.to_str())? != "repos" {
return None;
}
let base_dir = repos_dir.parent()?;
let registry_path = base_dir.join("repos").join("repo_registry.json");
    if let Ok(raw) = fs::read_to_string(&registry_path) {
if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&raw) {
if let Some(repos) = parsed.get("repos").and_then(|v| v.as_object()) {
for entry in repos.values() {
                    let Some(entry_state_key) = entry.get("state_key").and_then(|v| v.as_str())
                    else {
                        continue;
                    };
                    if entry_state_key == state_key {
                        return entry
                            .get("canonical_path")
                            .and_then(|v| v.as_str())
                            .map(|s| s.to_string());
                    }
}
}
}
}
None
}
fn canonical_path_from_repo_meta(repo_root: &Path) -> Option<String> {
let meta_path = repo_root.join("repo_meta.json");
let raw = fs::read_to_string(&meta_path).ok()?;
let parsed = serde_json::from_str::<serde_json::Value>(&raw).ok()?;
parsed
.get("canonical_path")
.and_then(|v| v.as_str())
.map(|s| s.to_string())
}
fn missing_repo_path_error(repo_root: &Path) -> AppError {
AppError::new(ERR_MISSING_REPO_PATH, "repo path not found").with_details(repo_resolution_details(
normalize_for_error(repo_root),
None,
None,
vec![
"Repo may have moved or been renamed.".to_string(),
"Re-run with the repo's current path.".to_string(),
format!(
"If you previously indexed this repo, you may need to reindex after moving it: `docdexd index --repo {}`.",
normalize_for_error(repo_root)
),
],
))
}
fn stale_index_error(state_dir: &Path, repo_root: Option<&Path>) -> AppError {
let reindex_hint = repo_root
.map(|root| format!("docdexd index --repo {}", normalize_for_error(root)))
.unwrap_or_else(|| "docdexd index --repo <repo>".to_string());
AppError::new(
ERR_STALE_INDEX,
format!(
"index schema mismatch at {}; reindex with `{}`",
state_dir.display(),
reindex_hint
),
)
}
fn repo_state_mismatch_error(
repo_root: &Path,
index_state_dir: Option<&Path>,
identity: &crate::repo_manager::RepoIdentityError,
) -> AppError {
let attempted_fingerprint = crate::repo_manager::repo_fingerprint_sha256(repo_root).ok();
let mut known_canonical_path = index_state_dir.and_then(known_canonical_path_from_repo_meta);
if known_canonical_path.is_none() {
known_canonical_path = canonical_path_from_repo_meta(repo_root);
}
if let crate::repo_manager::RepoIdentityError::CanonicalPathCollision {
canonical_path, ..
} = identity
{
known_canonical_path = Some(canonical_path.clone());
}
if let crate::repo_manager::RepoIdentityError::ReassociationRequired {
registered_canonical_path,
..
} = identity
{
known_canonical_path = Some(registered_canonical_path.clone());
}
AppError::new(
ERR_REPO_STATE_MISMATCH,
"repo state mismatch; refusing to associate this repo with the existing state directory",
)
.with_details(repo_resolution_details(
normalize_for_error(repo_root),
attempted_fingerprint,
known_canonical_path,
vec![
"Repo may have moved or been renamed.".to_string(),
"Verify you are using the correct `--repo` and `--state-dir` combination.".to_string(),
"Run: `docdexd repo inspect --repo <repo> --state-dir <shared_state_dir>` to see the repo fingerprint and any known canonical/alias mappings.".to_string(),
"To explicitly re-associate a moved repo to existing shared state, run: `docdexd repo reassociate --repo <new_path> --state-dir <shared_state_dir> --old-path <knownCanonicalPath>` (or `--fingerprint <attemptedFingerprint>`)."
.to_string(),
"Do not reuse a shared `--state-dir` across unrelated repos; choose a different state dir or clear the conflicting state."
.to_string(),
],
))
}
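/// Resolves where index state lives: an absolute path inside the repo is used as-is, an absolute
/// path elsewhere is treated as a shared base directory and scoped per repo, a relative path is
/// joined to the repo root, and `None` falls back to the default shared base directory.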
fn resolve_state_dir(repo_root: &Path, state_dir: Option<PathBuf>) -> Result<PathBuf> {
if !repo_root.exists() {
return Err(missing_repo_path_error(repo_root).into());
}
if !repo_root.is_dir() {
return Err(AppError::new(
ERR_INVALID_ARGUMENT,
format!("repo root is not a directory: {}", repo_root.display()),
)
.into());
}
match state_dir {
Some(custom) if custom.is_absolute() => {
// Guardrail: when an absolute state dir is provided outside the repo root,
// treat it as a shared *base* directory and scope all state under a repo id.
// This prevents accidental cross-repo mixing when the same `--state-dir` is
// used across multiple repos.
let repo_root = repo_root
.canonicalize()
.unwrap_or_else(|_| repo_root.to_path_buf());
if custom.starts_with(&repo_root) {
return Ok(custom);
}
match crate::repo_manager::resolve_shared_index_state_dir(&repo_root, &custom) {
Ok(path) => Ok(path),
Err(err) => {
if let Some(identity) =
err.downcast_ref::<crate::repo_manager::RepoIdentityError>()
{
let index_dir_hint = match identity {
crate::repo_manager::RepoIdentityError::StateMetaFingerprintMismatch { state_key, .. } => {
Some(custom.join("repos").join(state_key).join("index"))
}
crate::repo_manager::RepoIdentityError::StateKeyConflict {
existing_state_key,
..
} => Some(custom.join("repos").join(existing_state_key).join("index")),
_ => None,
};
return Err(repo_state_mismatch_error(
&repo_root,
index_dir_hint.as_deref(),
identity,
)
.into());
}
Err(err)
}
}
}
Some(custom) => Ok(repo_root.join(custom)),
None => {
let base_dir = crate::state_paths::default_state_base_dir()?;
let repo_root = repo_root
.canonicalize()
.unwrap_or_else(|_| repo_root.to_path_buf());
crate::repo_manager::resolve_shared_index_state_dir(&repo_root, &base_dir)
}
}
}
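/// Normalizes an exclusion prefix: forward slashes, lowercase, no leading slash, trailing `/`.
/// For example, `\Docs\Private` becomes `docs/private/`.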
fn normalize_prefix(input: &str) -> String {
let mut cleaned = input
.replace('\\', "/")
.trim()
.trim_start_matches('/')
.to_lowercase();
if cleaned.is_empty() {
return String::new();
}
if !cleaned.ends_with('/') {
cleaned.push('/');
}
cleaned
}
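/// Produces a short plain-text summary: strips front matter, skips fenced code blocks, joins headings
/// with the text that follows them, and caps the result at `MAX_SUMMARY_CHARS` characters.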
fn summarize(content: &str) -> String {
let cleaned = strip_front_matter(content);
let segments = collect_segments(cleaned, MAX_SUMMARY_SEGMENTS);
if segments.is_empty() {
let collapsed = collapse_whitespace(cleaned);
let (truncated, was_truncated) = truncate_to_limit(&collapsed, MAX_SUMMARY_CHARS);
return if was_truncated { truncated } else { collapsed };
}
let mut summary = String::new();
let mut awaiting_break_after_heading = false;
for segment in segments {
if summary.is_empty() {
summary.push_str(&segment.text);
awaiting_break_after_heading = segment.is_heading;
continue;
}
if awaiting_break_after_heading {
summary.push_str(" — ");
awaiting_break_after_heading = false;
} else {
summary.push(' ');
}
summary.push_str(&segment.text);
if summary.chars().count() >= MAX_SUMMARY_CHARS {
break;
}
}
let summary = summary.trim().to_string();
if summary.is_empty() {
let fallback = cleaned
.split_whitespace()
.take(60)
.collect::<Vec<_>>()
.join(" ");
let (truncated, was_truncated) = truncate_to_limit(&fallback, MAX_SUMMARY_CHARS);
return if was_truncated { truncated } else { fallback };
}
let (truncated, was_truncated) = truncate_to_limit(&summary, MAX_SUMMARY_CHARS);
if was_truncated {
truncated
} else {
summary
}
}
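/// Removes a leading BOM and a YAML front matter block delimited by `---` lines, if present.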
fn strip_front_matter(content: &str) -> &str {
let text = content.trim_start_matches('\u{feff}');
if !text.starts_with("---") {
return text;
}
let mut iter = text.split_inclusive('\n');
let Some(first_line) = iter.next() else {
return text;
};
if first_line.trim_end() != "---" {
return text;
}
let mut offset = first_line.len();
for line in iter {
offset += line.len();
if line.trim_end() == "---" {
let remainder = text[offset..].trim_start_matches(|c| c == '\n' || c == '\r');
return remainder;
}
}
text
}
#[derive(Clone)]
struct Segment {
text: String,
is_heading: bool,
}
fn collect_segments(text: &str, max_segments: usize) -> Vec<Segment> {
let mut segments = Vec::with_capacity(max_segments);
let mut buffer: Vec<String> = Vec::new();
let mut in_code_block = false;
for raw_line in text.lines() {
let trimmed = raw_line.trim();
if is_code_fence(trimmed) {
in_code_block = !in_code_block;
continue;
}
if in_code_block {
continue;
}
if trimmed.is_empty() {
push_buffer_segment(&mut segments, &mut buffer, max_segments);
if segments.len() >= max_segments {
break;
}
continue;
}
let Some((normalized, is_heading)) = normalize_line(trimmed) else {
continue;
};
if is_heading {
push_buffer_segment(&mut segments, &mut buffer, max_segments);
if segments.len() >= max_segments {
break;
}
segments.push(Segment {
text: normalized,
is_heading: true,
});
if segments.len() >= max_segments {
break;
}
} else {
buffer.push(normalized);
}
}
if segments.len() < max_segments {
push_buffer_segment(&mut segments, &mut buffer, max_segments);
}
segments
}
fn push_buffer_segment(segments: &mut Vec<Segment>, buffer: &mut Vec<String>, max_segments: usize) {
if buffer.is_empty() {
return;
}
let joined = buffer.join(" ");
buffer.clear();
if joined.trim().is_empty() {
return;
}
if segments.len() >= max_segments {
return;
}
let collapsed = collapse_whitespace(&joined);
if collapsed.is_empty() {
return;
}
segments.push(Segment {
text: collapsed,
is_heading: false,
});
}
fn normalize_line(line: &str) -> Option<(String, bool)> {
let mut text = line.trim();
if text.is_empty() {
return None;
}
let mut is_heading = false;
if text.starts_with('#') {
is_heading = true;
text = text.trim_start_matches('#').trim_start();
}
while text.starts_with('>') {
text = text[1..].trim_start();
}
text = strip_list_prefix(text);
if text.is_empty() {
return None;
}
let mut owned = text.to_string();
owned = MARKDOWN_LINK_RE.replace_all(&owned, "$1").into_owned();
owned = INLINE_CODE_RE.replace_all(&owned, "$1").into_owned();
owned = HTML_TAG_RE.replace_all(&owned, "").into_owned();
owned = owned.replace('`', "");
let collapsed = collapse_whitespace(&owned);
if collapsed.is_empty() {
return None;
}
Some((collapsed, is_heading))
}
fn strip_list_prefix(text: &str) -> &str {
let working = text.trim_start();
for prefix in &["- [ ]", "- [x]", "- [X]", "* [ ]", "* [x]", "* [X]"] {
if starts_with_case_insensitive(working, prefix) {
let (_, rest) = working.split_at(prefix.len());
return rest.trim_start();
}
}
for prefix in &["- ", "* ", "+ "] {
if working.starts_with(prefix) {
let (_, rest) = working.split_at(prefix.len());
return rest.trim_start();
}
}
if let Some(mat) = ORDERED_LIST_RE.find(working) {
let rest = working[mat.end()..].trim_start_matches(|c: char| c == ')' || c == '.');
return rest.trim_start();
}
working
}
fn starts_with_case_insensitive(value: &str, prefix: &str) -> bool {
value
.get(0..prefix.len())
.map(|candidate| candidate.eq_ignore_ascii_case(prefix))
.unwrap_or(false)
}
fn is_code_fence(line: &str) -> bool {
let trimmed = line.trim_start();
trimmed.starts_with("```") || trimmed.starts_with("~~~")
}
fn collapse_whitespace(text: &str) -> String {
MULTISPACE_RE.replace_all(text, " ").trim().to_string()
}
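/// Truncates to at most `max_chars` characters, trimming trailing whitespace and appending `…` when
/// anything was cut. Returns the text plus a flag indicating whether truncation happened.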
fn truncate_to_limit(text: &str, max_chars: usize) -> (String, bool) {
if max_chars == 0 {
return (String::new(), true);
}
let char_count = text.chars().count();
if char_count <= max_chars {
return (text.to_string(), false);
}
    // Reserve one character for the trailing ellipsis appended below.
    let take_chars = max_chars.saturating_sub(1);
    let mut truncated: String = text.chars().take(take_chars).collect();
while truncated
.chars()
.last()
.map(|c| c.is_whitespace())
.unwrap_or(false)
{
truncated.pop();
}
truncated.push('…');
(truncated, true)
}
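/// Joins preview lines into a single whitespace-collapsed string and accumulates whole sentences
/// until `max_chars` is reached, truncating the final result if it still runs over.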
fn condense_snippet(lines: &[String], max_chars: usize) -> (String, bool) {
if lines.is_empty() {
return (String::new(), false);
}
let joined = lines
.iter()
.map(|line| line.trim())
.filter(|line| !line.is_empty())
.collect::<Vec<_>>()
.join(" ");
if joined.is_empty() {
return (String::new(), false);
}
let normalized = collapse_whitespace(&joined);
let mut snippet = String::new();
let mut total_chars = 0usize;
for part in SENTENCE_SPLIT_RE.split(&normalized) {
let sentence = part.trim();
if sentence.is_empty() {
continue;
}
if !snippet.is_empty() {
snippet.push(' ');
total_chars += 1;
}
snippet.push_str(sentence);
total_chars += sentence.chars().count();
if total_chars >= max_chars {
break;
}
}
if snippet.is_empty() {
return (String::new(), false);
}
if total_chars > max_chars || snippet.chars().count() > max_chars {
let (truncated, _) = truncate_to_limit(&snippet, max_chars);
return (truncated, true);
}
(snippet, false)
}
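/// Rejects absolute paths and any path containing `..` or other non-normal components.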
fn is_safe_rel_path(rel_path: &str) -> bool {
let path = Path::new(rel_path);
if path.is_absolute() {
return false;
}
path.components()
.all(|component| matches!(component, Component::CurDir | Component::Normal(_)))
}
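/// Reduces a query to alphanumeric/underscore tokens joined by single spaces. Used both to validate
/// that a query contains at least one searchable term and as a retry when the raw query fails to parse.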
fn sanitize_query(input: &str) -> String {
let cleaned: String = input
.chars()
.map(|c| {
if c.is_alphanumeric() || c.is_whitespace() || c == '_' {
c
} else {
' '
}
})
.collect();
    cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
}
fn estimate_tokens(text: &str) -> u64 {
text.split_whitespace().count() as u64
}
static MARKDOWN_LINK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]+)\]\([^)]+\)").unwrap());
static INLINE_CODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"`([^`]+)`").unwrap());
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
static MULTISPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
static SENTENCE_SPLIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[.!?]+\s+").unwrap());
static ORDERED_LIST_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(?:\d+[\.)])+").unwrap());
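/// Maps a snippet fragment back to 1-based line numbers in the body, falling back to locating the
/// fragment's first and last non-empty lines when the fragment does not appear verbatim.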
fn line_range_for_fragment(body: &str, fragment: &str) -> Option<(usize, usize)> {
if fragment.is_empty() {
return None;
}
if let Some(idx) = body.find(fragment) {
let prefix = &body[..idx];
let start_line = prefix.chars().filter(|&c| c == '\n').count() + 1;
let lines_in_fragment = fragment.lines().count().max(1);
let end_line = start_line + lines_in_fragment - 1;
return Some((start_line, end_line));
}
    // Fallback: match on the fragment's first and last non-empty lines.
let frag_lines: Vec<&str> = fragment.lines().filter(|l| !l.trim().is_empty()).collect();
if frag_lines.is_empty() {
return None;
}
let body_lines: Vec<&str> = body.lines().collect();
let first = frag_lines.first().copied().unwrap_or("");
let last = frag_lines.last().copied().unwrap_or(first);
let mut start_line = None;
for (idx, line) in body_lines.iter().enumerate() {
if line.contains(first) {
start_line = Some(idx + 1);
break;
}
}
    let start = start_line?;
let mut end_line_val = start;
for (idx, line) in body_lines.iter().enumerate().skip(start - 1) {
if line.contains(last) {
end_line_val = idx + 1;
break;
}
}
Some((start, end_line_val))
}
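/// Orders hits by descending score, then relative path, then doc id, so equal-score results are
/// stable across runs.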
fn sort_hits_deterministically(hits: &mut [Hit]) {
hits.sort_by(|a, b| {
let score_cmp = b.score.total_cmp(&a.score);
if score_cmp != Ordering::Equal {
return score_cmp;
}
let path_cmp = a.rel_path.cmp(&b.rel_path);
if path_cmp != Ordering::Equal {
return path_cmp;
}
a.doc_id.cmp(&b.doc_id)
});
}
#[cfg(test)]
mod tests {
use super::{sort_hits_deterministically, DocumentKind, Hit};
fn hit(doc_id: &str, rel_path: &str, score: f32) -> Hit {
Hit {
doc_id: doc_id.to_string(),
rel_path: rel_path.to_string(),
path: rel_path.to_string(),
kind: DocumentKind::Doc,
score,
summary: String::new(),
snippet: String::new(),
token_estimate: 0,
snippet_origin: None,
snippet_truncated: None,
line_start: None,
line_end: None,
}
}
#[test]
fn deterministic_sorting_orders_by_score_then_path_then_doc_id() {
let mut hits = vec![
hit("b", "docs/b.md", 1.0),
hit("a", "docs/a.md", 1.0),
hit("z", "docs/c.md", 2.0),
hit("c", "docs/a.md", 1.0),
];
sort_hits_deterministically(&mut hits);
let ordered = hits
.iter()
.map(|h| (h.score, h.rel_path.as_str(), h.doc_id.as_str()))
.collect::<Vec<_>>();
assert_eq!(
ordered,
vec![
(2.0, "docs/c.md", "z"),
(1.0, "docs/a.md", "a"),
(1.0, "docs/a.md", "c"),
(1.0, "docs/b.md", "b"),
]
);
}
}