mod.rs•41.4 kB
use anyhow::{anyhow, Context, Result};
use once_cell::sync::Lazy;
use parking_lot::Mutex;
use regex::Regex;
use std::fs::{self, File};
use std::io::{self, BufRead, BufReader};
use std::path::{Component, Path, PathBuf};
use std::sync::Arc;
use tantivy::collector::TopDocs;
use tantivy::query::{AllQuery, QueryParser};
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::DocAddress;
use tantivy::{
doc, Document, Index, IndexReader, IndexWriter, ReloadPolicy, SnippetGenerator, Term,
};
use tracing::warn;
use walkdir::WalkDir;
/// Memory budget in bytes (50 MiB) passed to `Index::writer` when the index is opened for writing.
const MAX_INDEX_RAM_BYTES: usize = 50 * 1024 * 1024;
/// File extensions (lower-case, leading dot included) that `should_index` accepts.
const DEFAULT_EXTENSIONS: &[&str] = &[".md", ".markdown", ".mdx", ".txt"];
/// Path component names that exclude a file from indexing.
///
/// `should_index` lower-cases every component of the repo-relative path and
/// rejects the file when any component equals one of these entries, so all
/// entries here must be lower-case.
const DEFAULT_EXCLUDED_DIR_NAMES: &[&str] = &[
// Core VCS / tooling
".git",
".idea",
".vscode",
".cache",
"tmp",
"temp",
".hg",
".svn",
// JS / TS / Node ecosystem
"node_modules",
".pnpm-store",
".yarn",
".yarn-cache",
".npm",
"dist",
"build",
"coverage",
".vite",
".turbo",
".nx",
".parcel-cache",
".rollup-cache",
".webpack-cache",
".tsbuildinfo",
".next",
".nuxt",
".svelte-kit",
".angular",
".expo",
// Python
"__pycache__",
".venv",
"venv",
".mypy_cache",
".pytest_cache",
".ruff_cache",
".tox",
".ipynb_checkpoints",
// Rust
"target",
".cargo",
// Go
"bin",
"pkg",
"go-build",
// Java / Kotlin / JVM
".gradle",
".mvn",
"out",
// .NET / C# / Visual Studio
"obj",
".vs",
// Swift / Xcode / Apple
"deriveddata",
// PHP / Composer
"vendor",
// Ruby / Bundler
".bundle",
// Dart / Flutter
".dart_tool",
".flutter-plugins",
".flutter-plugins-dependencies",
".pub-cache",
// Android
".android",
// iOS / CocoaPods
"pods",
// C / C++ / CMake / native
"debug",
"release",
"cmake-build-debug",
"cmake-build-release",
"cmakefiles",
".conan",
"vcpkg_installed",
// Haskell
".stack-work",
"dist-newstyle",
"cabal-dev",
// Elixir / Erlang
"_build",
"deps",
".elixir_ls",
// Scala / Metals / Bloop
".bloop",
".metals",
// Clojure
".cpcache",
// Elm
"elm-stuff",
// Nim
"nimcache",
// OCaml / Dune / opam
"_opam",
// R / RStudio
".rproj.user",
// Game engines: Unity / Unreal / Godot
"library",
"logs",
// NOTE(review): "obj" is already listed in the .NET group above; this duplicate is harmless.
"obj",
"binaries",
"deriveddatacache",
"intermediate",
".godot",
// Infra / deployment / serverless
".docker",
"docker-data",
".terraform",
".serverless",
".vercel",
".netlify",
];
/// Repo-relative path prefixes that exclude a file from indexing.
///
/// `should_index` compares these against the lower-cased, `/`-separated
/// relative path using `starts_with`, so every entry is lower-case and ends
/// with `/`.
const DEFAULT_EXCLUDED_RELATIVE_PREFIXES: &[&str] = &[
"logs/",
".docdex/",
// Already covered by ".docdex/" above; kept as explicit entries.
".docdex/logs/",
".docdex/tmp/",
".gpt-creator/logs/",
".gpt-creator/tmp/",
".mastercoda/logs/",
".mastercoda/tmp/",
"docker/.data/",
"docker-data/",
".docker/",
];
/// Upper bound, in characters, on the summary produced by `summarize`.
const MAX_SUMMARY_CHARS: usize = 360;
/// Maximum number of heading/paragraph segments `summarize` requests from `collect_segments`.
const MAX_SUMMARY_SEGMENTS: usize = 4;
/// Character budget for snippets (both the tantivy snippet generator and `condense_snippet`).
const MAX_SNIPPET_CHARS: usize = 420;
/// Number of leading file lines `search` reads when it falls back to a preview snippet.
const FALLBACK_PREVIEW_LINES: usize = 60;
/// Resolved indexing configuration: where the index lives and which paths are skipped.
#[derive(Clone)]
pub struct IndexConfig {
/// Directory that holds the on-disk index (resolved by `resolve_state_dir`).
state_dir: PathBuf,
/// Lower-cased path component names; any match excludes the file.
excluded_dir_names: Vec<String>,
/// Lower-cased, `/`-terminated repo-relative prefixes; any match excludes the file.
excluded_relative_prefixes: Vec<String>,
}
/// Handle on the document index of a single repository.
///
/// Holds the tantivy index, a reader, the schema field handles and, unless it
/// was opened through `with_config_read_only`, a shared writer.
#[derive(Clone)]
pub struct Indexer {
/// Canonicalized repository root; document paths are stored relative to it.
repo_root: PathBuf,
/// Exclusion rules and state directory in effect for this index.
config: IndexConfig,
index: Index,
reader: IndexReader,
/// `doc_id` field: holds the repo-relative path and is used as the delete/lookup key.
doc_id_field: tantivy::schema::Field,
/// `rel_path` field: repo-relative path with `/` separators.
path_field: tantivy::schema::Field,
/// `body` field: full file content.
body_field: tantivy::schema::Field,
/// `summary` field: condensed summary produced by `summarize`.
summary_field: tantivy::schema::Field,
/// `token_estimate` field: whitespace-separated word count of the content.
token_field: tantivy::schema::Field,
/// Shared index writer; `None` when the index was opened read-only.
writer: Option<Arc<Mutex<IndexWriter>>>,
}
/// One search result as returned by `Indexer::search`.
#[derive(Debug, serde::Serialize)]
pub struct Hit {
/// Stored document id (the repo-relative path).
pub doc_id: String,
/// Repo-relative path of the matched file.
pub rel_path: String,
/// Relevance score reported by tantivy for this hit.
pub score: f32,
/// Stored summary of the document.
pub summary: String,
/// Query-highlight fragment, a file preview, or the summary as a last resort.
pub snippet: String,
/// Stored word-count estimate for the document.
pub token_estimate: u64,
/// 1-based first line of the snippet, when it could be located.
#[serde(skip_serializing_if = "Option::is_none")]
pub line_start: Option<usize>,
/// 1-based last line of the snippet, when it could be located.
#[serde(skip_serializing_if = "Option::is_none")]
pub line_end: Option<usize>,
}
/// How a `SnippetResult` was produced; serialized in snake_case.
#[derive(Debug, Clone, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SnippetOrigin {
/// Fragment generated by the tantivy snippet generator for a query.
Query,
/// Leading lines of the file read from disk by `preview_snippet`.
Preview,
}
/// Snippet for a single document, produced by `snippet_from_document`.
#[derive(Debug, Clone)]
pub struct SnippetResult {
/// Plain-text snippet.
pub text: String,
/// Highlighted HTML rendering; only set for query-derived snippets.
pub html: Option<String>,
/// Whether the preview was cut short (always `false` for query-derived snippets).
pub truncated: bool,
/// Whether the snippet came from the query or from a file preview.
pub origin: SnippetOrigin,
/// 1-based first line covered by a preview snippet; `None` for query snippets.
pub line_start: Option<usize>,
/// 1-based last line covered by a preview snippet; `None` for query snippets.
pub line_end: Option<usize>,
}
/// Stored metadata of one indexed document (everything except the body).
#[derive(Debug, serde::Serialize)]
pub struct DocSnapshot {
/// Document id (the repo-relative path).
pub doc_id: String,
/// Repo-relative path of the file.
pub rel_path: String,
/// Stored summary text.
pub summary: String,
/// Stored word-count estimate.
pub token_estimate: u64,
}
/// Point-in-time statistics about the index, produced by `Indexer::stats`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct IndexStats {
/// Number of live (non-deleted) documents across all segments.
pub num_docs: u64,
/// Directory that holds the index files.
pub state_dir: PathBuf,
/// Sum of the metadata lengths of every entry found under `state_dir`.
pub index_size_bytes: u64,
/// Number of segments visible to the searcher.
pub segments: usize,
/// `index_size_bytes / num_docs`; omitted when the index is empty.
#[serde(skip_serializing_if = "Option::is_none")]
pub avg_bytes_per_doc: Option<u64>,
/// Wall-clock time at which these stats were computed, in ms since the Unix epoch.
pub generated_at_epoch_ms: u128,
/// Newest modification time found under `state_dir`, in ms since the Unix epoch.
#[serde(skip_serializing_if = "Option::is_none")]
pub last_updated_epoch_ms: Option<u128>,
}
impl IndexConfig {
    /// Default configuration for `repo_root`: default state directory, no extra exclusions.
    #[allow(dead_code)]
    pub fn for_repo(repo_root: &Path) -> Self {
        Self::with_overrides(repo_root, None, Vec::new(), Vec::new())
    }

    /// Builds a configuration from the built-in defaults plus caller overrides.
    ///
    /// * `state_dir` - optional index location; resolved by `resolve_state_dir`.
    /// * `extra_excluded_dirs` - additional directory names; trimmed, lower-cased,
    ///   blanks and duplicates dropped.
    /// * `extra_excluded_prefixes` - additional repo-relative prefixes; normalized
    ///   with `normalize_prefix`, blanks and duplicates dropped.
    ///
    /// When the state directory lies inside `repo_root`, its relative prefix is
    /// appended to the excluded prefixes as well.
    pub fn with_overrides(
        repo_root: &Path,
        state_dir: Option<PathBuf>,
        extra_excluded_dirs: Vec<String>,
        extra_excluded_prefixes: Vec<String>,
    ) -> Self {
        let state_dir = resolve_state_dir(repo_root, state_dir);

        let mut excluded_dir_names: Vec<String> = DEFAULT_EXCLUDED_DIR_NAMES
            .iter()
            .map(|name| (*name).to_owned())
            .collect();
        let extra_names = extra_excluded_dirs
            .iter()
            .map(|raw| raw.trim().to_lowercase())
            .filter(|name| !name.is_empty());
        for name in extra_names {
            if !excluded_dir_names.contains(&name) {
                excluded_dir_names.push(name);
            }
        }

        let mut excluded_relative_prefixes: Vec<String> = DEFAULT_EXCLUDED_RELATIVE_PREFIXES
            .iter()
            .map(|prefix| (*prefix).to_owned())
            .collect();
        // The state directory (if it is inside the repo) is considered last,
        // after all caller-supplied prefixes.
        let state_prefix = state_dir
            .strip_prefix(repo_root)
            .ok()
            .map(|relative| normalize_prefix(&relative.to_string_lossy()));
        let candidates = extra_excluded_prefixes
            .iter()
            .map(|raw| normalize_prefix(raw))
            .chain(state_prefix);
        for prefix in candidates {
            if prefix.is_empty() || excluded_relative_prefixes.contains(&prefix) {
                continue;
            }
            excluded_relative_prefixes.push(prefix);
        }

        Self {
            state_dir,
            excluded_dir_names,
            excluded_relative_prefixes,
        }
    }

    /// Directory that holds the on-disk index.
    pub fn state_dir(&self) -> &Path {
        self.state_dir.as_path()
    }

    /// Lower-cased directory names that exclude a file when they appear in its path.
    pub fn excluded_dir_names(&self) -> &[String] {
        self.excluded_dir_names.as_slice()
    }

    /// Normalized repo-relative prefixes that exclude a file.
    pub fn excluded_relative_prefixes(&self) -> &[String] {
        self.excluded_relative_prefixes.as_slice()
    }
}
impl Indexer {
/// Opens (or creates) a writable index for `repo_root` using the default configuration.
///
/// Fails when the repository root cannot be canonicalized or the index cannot be opened.
#[allow(dead_code)]
pub fn new(repo_root: PathBuf) -> Result<Self> {
    let canonical_root = repo_root.canonicalize().context("resolve repo root")?;
    let default_config = IndexConfig::for_repo(&canonical_root);
    Self::with_config(canonical_root, default_config)
}
/// Opens (or creates) a writable index for `repo_root` with an explicit configuration.
///
/// The state directory is created with owner-only permissions where supported,
/// the schema is built, and both a reader (reloading on commit) and a writer
/// limited to `MAX_INDEX_RAM_BYTES` are attached.
pub fn with_config(repo_root: PathBuf, config: IndexConfig) -> Result<Self> {
    let repo_root = repo_root.canonicalize().context("resolve repo root")?;
    ensure_state_dir_secure(config.state_dir())?;

    let (schema, doc_id_field, path_field, body_field, summary_field, token_field) =
        build_schema();
    let directory = tantivy::directory::MmapDirectory::open(config.state_dir())?;
    let index = Index::open_or_create(directory, schema)?;

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;
    let writer = Arc::new(Mutex::new(index.writer(MAX_INDEX_RAM_BYTES)?));

    Ok(Self {
        repo_root,
        config,
        index,
        reader,
        doc_id_field,
        path_field,
        body_field,
        summary_field,
        token_field,
        writer: Some(writer),
    })
}
/// Opens an existing index for searching only; no writer is attached.
///
/// # Errors
///
/// Returns an error when the repository root cannot be canonicalized, when
/// the state directory does not exist, when the index cannot be opened, or
/// when the on-disk schema lacks one of the fields this indexer relies on
/// (for example an index written by an incompatible version).
pub fn with_config_read_only(repo_root: PathBuf, config: IndexConfig) -> Result<Self> {
    let repo_root = repo_root.canonicalize().context("resolve repo root")?;
    if !config.state_dir().exists() {
        return Err(anyhow!(
            "index not found at {}; run `docdexd index` first",
            config.state_dir().display()
        ));
    }
    let index = Index::open_in_dir(config.state_dir())?;
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;
    // The schema comes from disk, so a missing field is an input problem and
    // must surface as an error rather than a panic.
    let schema = index.schema();
    let doc_id_field = schema
        .get_field("doc_id")
        .context("index schema is missing the `doc_id` field")?;
    let path_field = schema
        .get_field("rel_path")
        .context("index schema is missing the `rel_path` field")?;
    let body_field = schema
        .get_field("body")
        .context("index schema is missing the `body` field")?;
    let summary_field = schema
        .get_field("summary")
        .context("index schema is missing the `summary` field")?;
    let token_field = schema
        .get_field("token_estimate")
        .context("index schema is missing the `token_estimate` field")?;
    Ok(Self {
        repo_root,
        config,
        index,
        reader,
        doc_id_field,
        path_field,
        body_field,
        summary_field,
        token_field,
        writer: None,
    })
}
/// Rebuilds the whole index from the files currently under the repository root.
///
/// All existing documents are deleted, every file accepted by `should_index`
/// is added, and the result is committed in one go before the reader is
/// reloaded. Entries the directory walk cannot read are skipped.
///
/// # Errors
///
/// Fails when the index is read-only, when a document cannot be added, or
/// when the commit or reader reload fails.
pub async fn reindex_all(&self) -> Result<()> {
    let writer_arc = self.writer()?;
    let mut writer = writer_arc.lock();
    writer.delete_all_documents()?;

    // Prune excluded directories during the walk instead of descending into
    // them (e.g. `.git`, `node_modules`, `target`) and rejecting every file
    // one by one. `should_index` rejects any path containing such a
    // component, so pruning never drops a file that would have been indexed.
    let excluded_dirs = self.config.excluded_dir_names();
    let walker = WalkDir::new(&self.repo_root)
        .into_iter()
        .filter_entry(|entry| {
            // Depth 0 is the repo root itself; its name is never part of a
            // repo-relative path and must not be matched.
            if entry.depth() == 0 || !entry.file_type().is_dir() {
                return true;
            }
            let name = entry.file_name().to_string_lossy().to_lowercase();
            !excluded_dirs.iter().any(|excluded| *excluded == name)
        });

    for entry in walker
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
    {
        let path = entry.path();
        if !should_index(path, &self.repo_root, &self.config) {
            continue;
        }
        self.add_document(&mut writer, path)?;
    }
    writer.commit()?;
    self.reader.reload()?;
    Ok(())
}
/// Adds or refreshes a single file in the index.
///
/// The path is canonicalized first. Files rejected by `should_index` are
/// ignored without error; otherwise any existing document with the same id is
/// deleted, the file is re-added, and the change is committed.
pub async fn ingest_file(&self, file: PathBuf) -> Result<()> {
    let canonical = file.canonicalize().context("resolve file")?;
    if should_index(&canonical, &self.repo_root, &self.config) {
        let doc_key = self.rel_path(&canonical)?;
        let shared_writer = self.writer()?;
        let mut guard = shared_writer.lock();
        guard.delete_term(Term::from_field_text(self.doc_id_field, &doc_key));
        self.add_document(&mut guard, &canonical)?;
        guard.commit()?;
        self.reader.reload()?;
    }
    Ok(())
}
/// Removes the document for `file` from the index and commits.
///
/// The path is used as given (it is not canonicalized); a path that is not
/// under the repository root is ignored and reported as success.
pub async fn delete_file(&self, file: PathBuf) -> Result<()> {
    let Ok(doc_key) = self.rel_path(&file) else {
        return Ok(());
    };
    let shared_writer = self.writer()?;
    let mut guard = shared_writer.lock();
    guard.delete_term(Term::from_field_text(self.doc_id_field, &doc_key));
    guard.commit()?;
    self.reader.reload()?;
    Ok(())
}
/// Runs `query` against the body, summary and path fields and returns up to
/// `limit` hits, best score first.
///
/// Query handling is lenient: if the raw query does not parse, it is reduced
/// to plain word tokens by `sanitize_query` and parsed again; if that is
/// empty or still fails, every document matches (`AllQuery`). Each fallback
/// is logged as a warning.
///
/// The snippet of each hit is chosen in this order: a query-highlight
/// fragment from the stored body (with its line range when it can be
/// located), a preview of the first `FALLBACK_PREVIEW_LINES` lines of the
/// file on disk, and finally the stored summary.
///
/// A `limit` of zero yields an empty result.
///
/// # Errors
///
/// Fails when the search itself or loading a stored document fails.
pub fn search(&self, query: &str, limit: usize) -> Result<Vec<Hit>> {
    // `TopDocs::with_limit` panics on a zero limit; an empty page is the
    // only sensible answer, so return it before touching the index.
    if limit == 0 {
        return Ok(Vec::new());
    }
    let searcher = self.reader.searcher();
    let parser = QueryParser::for_index(
        &self.index,
        vec![self.body_field, self.summary_field, self.path_field],
    );
    let tantivy_query = match parser.parse_query(query) {
        Ok(q) => q,
        Err(err) => {
            let sanitized = sanitize_query(query);
            if sanitized.trim().is_empty() {
                warn!(
                    target: "docdexd",
                    error = ?err,
                    "query parse failed; using AllQuery fallback"
                );
                Box::new(AllQuery)
            } else {
                match parser.parse_query(&sanitized) {
                    Ok(q) => {
                        warn!(
                            target: "docdexd",
                            error = ?err,
                            sanitized = %sanitized,
                            "query parse failed; using sanitized query"
                        );
                        q
                    }
                    Err(err2) => {
                        warn!(
                            target: "docdexd",
                            error = ?err2,
                            sanitized = %sanitized,
                            "sanitized query parse failed; using AllQuery fallback"
                        );
                        Box::new(AllQuery)
                    }
                }
            }
        }
    };
    // Snippet generation is best-effort: without a generator every hit uses
    // the preview/summary fallbacks below.
    let mut snippet_generator =
        SnippetGenerator::create(&searcher, tantivy_query.as_ref(), self.body_field).ok();
    if let Some(generator) = snippet_generator.as_mut() {
        generator.set_max_num_chars(MAX_SNIPPET_CHARS);
    }
    let top_docs = searcher.search(&tantivy_query, &TopDocs::with_limit(limit))?;
    let mut results = Vec::with_capacity(top_docs.len());
    for (score, addr) in top_docs {
        let retrieved = searcher.doc(addr)?;
        // Borrowed from the stored document; only needed to locate the
        // snippet's line range, so no copy of the body is made.
        let body_text = retrieved
            .get_first(self.body_field)
            .and_then(|v| v.as_text())
            .unwrap_or_default();
        let doc_id = retrieved
            .get_first(self.doc_id_field)
            .and_then(|v| v.as_text().map(|s| s.to_string()))
            .unwrap_or_default();
        let rel_path = retrieved
            .get_first(self.path_field)
            .and_then(|v| v.as_text().map(|s| s.to_string()))
            .unwrap_or_default();
        let summary = retrieved
            .get_first(self.summary_field)
            .and_then(|v| v.as_text().map(|s| s.to_string()))
            .unwrap_or_default();
        let token_estimate = retrieved
            .get_first(self.token_field)
            .and_then(|v| v.as_u64())
            .unwrap_or(0);
        let (snippet, line_start, line_end) = snippet_generator
            .as_ref()
            .and_then(|gen| {
                let snippet = gen.snippet_from_doc(&retrieved);
                let fragment = snippet.fragment().trim().to_string();
                if fragment.is_empty() {
                    None
                } else {
                    let range = line_range_for_fragment(body_text, &fragment);
                    Some((fragment, range.map(|r| r.0), range.map(|r| r.1)))
                }
            })
            .or_else(|| {
                match self.preview_snippet(&rel_path, FALLBACK_PREVIEW_LINES) {
                    Ok(Some((text, _truncated, start_line, end_line))) => {
                        Some((text, Some(start_line), Some(end_line)))
                    }
                    Ok(None) => None,
                    Err(err) => {
                        warn!(target: "docdexd", error = ?err, %rel_path, "failed to build fallback snippet");
                        None
                    }
                }
            })
            .unwrap_or_else(|| (summary.clone(), None, None));
        results.push(Hit {
            doc_id,
            rel_path,
            score,
            summary,
            snippet,
            token_estimate,
            line_start,
            line_end,
        });
    }
    Ok(results)
}
/// Looks up the stored document whose `doc_id` field equals `doc_id` exactly.
///
/// Returns `Ok(None)` when no such document exists.
fn fetch_document(&self, doc_id: &str) -> Result<Option<Document>> {
    let searcher = self.reader.searcher();
    let lookup = tantivy::query::TermQuery::new(
        Term::from_field_text(self.doc_id_field, doc_id),
        tantivy::schema::IndexRecordOption::Basic,
    );
    let matches = searcher.search(&lookup, &TopDocs::with_limit(1))?;
    match matches.into_iter().next() {
        Some((_score, address)) => Ok(Some(searcher.doc(address)?)),
        None => Ok(None),
    }
}
/// Builds a condensed preview from the first `max_lines` lines of a file.
///
/// Returns `(text, truncated, first_line, last_line)` where the line numbers
/// are 1-based and refer to the first and last non-blank line that was read,
/// and `truncated` is set when the file has more than `max_lines` lines or
/// the text had to be shortened to `MAX_SNIPPET_CHARS`.
///
/// Returns `Ok(None)` when `max_lines` is zero, when `rel_path` is not a
/// plain relative path, when the file does not exist, or when no non-blank
/// text was found.
///
/// # Errors
///
/// Fails when the file cannot be opened for a reason other than "not found"
/// or when a line cannot be read.
pub fn preview_snippet(
    &self,
    rel_path: &str,
    max_lines: usize,
) -> Result<Option<(String, bool, usize, usize)>> {
    if max_lines == 0 || !is_safe_rel_path(rel_path) {
        return Ok(None);
    }
    let path = self.repo_root.join(rel_path);
    let file = match File::open(&path) {
        Ok(handle) => handle,
        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
        Err(err) => return Err(err).with_context(|| format!("open {}", path.display())),
    };

    // Line numbers and texts of the non-blank lines, kept in parallel.
    let mut line_numbers: Vec<usize> = Vec::new();
    let mut texts: Vec<String> = Vec::new();
    let mut truncated = false;
    for (idx, line_res) in BufReader::new(file).lines().enumerate() {
        if idx >= max_lines {
            truncated = true;
            break;
        }
        let line = line_res?;
        let trimmed = line.trim();
        if !trimmed.is_empty() {
            line_numbers.push(idx + 1);
            texts.push(trimmed.to_string());
        }
    }

    let (Some(&start_line), Some(&end_line)) = (line_numbers.first(), line_numbers.last())
    else {
        return Ok(None);
    };
    let (snippet, snippet_truncated) = condense_snippet(&texts, MAX_SNIPPET_CHARS);
    if snippet.is_empty() {
        return Ok(None);
    }
    Ok(Some((
        snippet,
        truncated || snippet_truncated,
        start_line,
        end_line,
    )))
}
/// Canonicalized repository root this indexer operates on.
pub fn repo_root(&self) -> &Path {
    self.repo_root.as_path()
}
/// Returns a handle to the shared index writer, or an error when the index
/// was opened read-only.
fn writer(&self) -> Result<Arc<Mutex<IndexWriter>>> {
    match &self.writer {
        Some(shared) => Ok(Arc::clone(shared)),
        None => Err(anyhow!("index opened in read-only mode; writer unavailable")),
    }
}
pub fn config(&self) -> &IndexConfig {
&self.config
}
pub fn stats(&self) -> Result<IndexStats> {
let searcher = self.reader.searcher();
let mut num_docs: u64 = 0;
let mut segments: usize = 0;
for segment_reader in searcher.segment_readers() {
segments += 1;
let live_docs = segment_reader
.alive_bitset()
.map(|bits| bits.num_alive_docs() as u64)
.unwrap_or_else(|| segment_reader.max_doc() as u64);
num_docs = num_docs.saturating_add(live_docs);
}
let state_dir = self.config.state_dir().to_path_buf();
let index_size_bytes = walkdir::WalkDir::new(&state_dir)
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.map(|meta| meta.len())
.sum();
let mut last_updated_epoch_ms: Option<u128> = None;
for entry in walkdir::WalkDir::new(&state_dir).into_iter().flatten() {
if let Ok(meta) = entry.metadata() {
if let Ok(modified) = meta.modified() {
if let Ok(dur) = modified.duration_since(std::time::UNIX_EPOCH) {
let millis = dur.as_millis();
if last_updated_epoch_ms
.map(|current| millis > current)
.unwrap_or(true)
{
last_updated_epoch_ms = Some(millis);
}
}
}
}
}
let generated_at_epoch_ms = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis();
let avg_bytes_per_doc = if num_docs > 0 {
Some(index_size_bytes / num_docs)
} else {
None
};
Ok(IndexStats {
num_docs,
state_dir,
index_size_bytes,
segments,
avg_bytes_per_doc,
generated_at_epoch_ms,
last_updated_epoch_ms,
})
}
/// Fetches the metadata of `doc_id` together with an optional snippet.
///
/// The snippet is query-driven when `query` is given and matches, otherwise a
/// preview of up to `fallback_lines` lines of the file. Returns `Ok(None)`
/// when the document is not in the index.
pub fn snapshot_with_snippet(
    &self,
    doc_id: &str,
    query: Option<&str>,
    fallback_lines: usize,
) -> Result<Option<(DocSnapshot, Option<SnippetResult>)>> {
    match self.fetch_document(doc_id)? {
        None => Ok(None),
        Some(doc) => {
            let snapshot = self.snapshot_from_document(doc_id, &doc);
            let rel_path_hint = Some(snapshot.rel_path.as_str());
            let snippet =
                self.snippet_from_document(&doc, rel_path_hint, query, fallback_lines)?;
            Ok(Some((snapshot, snippet)))
        }
    }
}
/// Returns one page of document snapshots plus the total number of live documents.
///
/// Documents are enumerated segment by segment in doc-id order; the first
/// `offset` live documents are skipped and at most `limit` snapshots are
/// returned. The second tuple element is the number of live documents in the
/// whole index, independent of `offset` and `limit`.
///
/// # Errors
///
/// Fails when a stored document cannot be loaded.
pub fn list_docs(&self, offset: usize, limit: usize) -> Result<(Vec<DocSnapshot>, u64)> {
    let searcher = self.reader.searcher();
    let mut snapshots = Vec::new();
    let mut skipped = 0usize;
    let mut total_live: u64 = 0;
    for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
        let alive = segment_reader.alive_bitset();
        let max_doc = segment_reader.max_doc();
        let live_in_segment = alive
            .map(|bits| bits.num_alive_docs() as u64)
            .unwrap_or_else(|| max_doc as u64);
        total_live = total_live.saturating_add(live_in_segment);
        // Once the page is full, later segments are still visited so that
        // the total covers the whole index, but none of their documents are
        // loaded.
        if snapshots.len() >= limit {
            continue;
        }
        let doc_iter: Box<dyn Iterator<Item = u32>> = if let Some(bits) = alive {
            Box::new(bits.iter_alive())
        } else {
            Box::new(0..max_doc)
        };
        for doc_id in doc_iter {
            if skipped < offset {
                skipped += 1;
                continue;
            }
            if snapshots.len() >= limit {
                break;
            }
            let address = DocAddress::new(segment_ord as u32, doc_id);
            let doc = searcher.doc(address)?;
            let doc_id_text = doc
                .get_first(self.doc_id_field)
                .and_then(|v| v.as_text())
                .unwrap_or_default();
            snapshots.push(self.snapshot_from_document(doc_id_text, &doc));
        }
    }
    Ok((snapshots, total_live))
}
fn add_document(&self, writer: &mut IndexWriter, path: &Path) -> Result<()> {
let rel = self.rel_path(path)?;
let content = fs::read_to_string(path).unwrap_or_default();
let summary = summarize(&content);
let tokens = estimate_tokens(&content);
writer.add_document(doc!(
self.doc_id_field => rel.clone(),
self.path_field => rel,
self.body_field => content,
self.summary_field => summary,
self.token_field => tokens,
))?;
Ok(())
}
/// Converts `path` into a repo-relative string with `/` separators.
///
/// Fails when `path` does not start with the repository root.
fn rel_path(&self, path: &Path) -> Result<String> {
    match path.strip_prefix(&self.repo_root) {
        Ok(relative) => Ok(relative.to_string_lossy().replace('\\', "/")),
        Err(_) => Err(anyhow!("{} is outside repo root", path.display())),
    }
}
fn snapshot_from_document(&self, doc_id: &str, doc: &Document) -> DocSnapshot {
let rel_path = doc
.get_first(self.path_field)
.and_then(|v| v.as_text().map(|s| s.to_string()))
.unwrap_or_default();
let summary = doc
.get_first(self.summary_field)
.and_then(|v| v.as_text().map(|s| s.to_string()))
.unwrap_or_default();
let token_estimate = doc
.get_first(self.token_field)
.and_then(|v| v.as_u64())
.unwrap_or(0);
DocSnapshot {
doc_id: doc_id.to_string(),
rel_path,
summary,
token_estimate,
}
}
/// Produces a snippet for `doc`.
///
/// With a non-blank `query` that parses against the body field and yields a
/// non-empty fragment, the result is a query snippet (plain text plus HTML,
/// never marked truncated, no line range). Otherwise the file named by
/// `rel_path_hint`, or by the stored path when no hint is given, is previewed
/// via `preview_snippet` using `fallback_lines`. Returns `Ok(None)` when
/// neither route produces text.
fn snippet_from_document(
    &self,
    doc: &Document,
    rel_path_hint: Option<&str>,
    query: Option<&str>,
    fallback_lines: usize,
) -> Result<Option<SnippetResult>> {
    let searcher = self.reader.searcher();

    let effective_query = query.map(str::trim).filter(|text| !text.is_empty());
    if let Some(query_text) = effective_query {
        let parser = QueryParser::for_index(&self.index, vec![self.body_field]);
        // Parse and generator failures both fall through to the preview path.
        let generator = parser.parse_query(query_text).ok().and_then(|parsed| {
            SnippetGenerator::create(&searcher, parsed.as_ref(), self.body_field).ok()
        });
        if let Some(mut generator) = generator {
            generator.set_max_num_chars(MAX_SNIPPET_CHARS);
            let snippet = generator.snippet_from_doc(doc);
            let fragment = snippet.fragment().trim();
            if !fragment.is_empty() {
                return Ok(Some(SnippetResult {
                    text: fragment.to_string(),
                    html: Some(snippet.to_html()),
                    truncated: false,
                    origin: SnippetOrigin::Query,
                    line_start: None,
                    line_end: None,
                }));
            }
        }
    }

    let rel_path = match rel_path_hint {
        Some(hint) => Some(hint.to_owned()),
        None => doc
            .get_first(self.path_field)
            .and_then(|value| value.as_text())
            .map(str::to_owned),
    };
    let Some(rel_path) = rel_path else {
        return Ok(None);
    };
    let preview = self.preview_snippet(&rel_path, fallback_lines)?;
    Ok(preview.map(|(text, truncated, line_start, line_end)| SnippetResult {
        text,
        html: None,
        truncated,
        origin: SnippetOrigin::Preview,
        line_start: Some(line_start),
        line_end: Some(line_end),
    }))
}
}
/// Builds the index schema and returns it with its field handles, in the
/// order `(schema, doc_id, rel_path, body, summary, token_estimate)`.
///
/// `doc_id` and `rel_path` use `STRING` (so `doc_id` can be matched as an
/// exact term for deletes and lookups), `body` and `summary` use `TEXT`, and
/// `token_estimate` is a `FAST` u64. All fields are `STORED`.
fn build_schema() -> (
Schema,
tantivy::schema::Field,
tantivy::schema::Field,
tantivy::schema::Field,
tantivy::schema::Field,
tantivy::schema::Field,
) {
let mut builder = Schema::builder();
// NOTE(review): the registration order below presumably has to stay stable
// for compatibility with existing on-disk indexes; confirm before reordering.
let doc_id_field = builder.add_text_field("doc_id", STRING | STORED);
let path_field = builder.add_text_field("rel_path", STRING | STORED);
let body_field = builder.add_text_field("body", TEXT | STORED);
let summary_field = builder.add_text_field("summary", TEXT | STORED);
let token_field = builder.add_u64_field("token_estimate", FAST | STORED);
let schema = builder.build();
(
schema,
doc_id_field,
path_field,
body_field,
summary_field,
token_field,
)
}
/// Decides whether the file at `path` belongs in the index.
///
/// A file is rejected when it lies inside the state directory (checked both
/// literally and after canonicalization), when its lower-cased repo-relative
/// path starts with an excluded prefix, when any path component equals an
/// excluded directory name, or when its extension is not one of
/// `DEFAULT_EXTENSIONS`. A path outside `repo_root` is matched as given.
pub(crate) fn should_index(path: &Path, repo_root: &Path, config: &IndexConfig) -> bool {
    let state_dir = config.state_dir();
    if path.starts_with(state_dir) {
        return false;
    }
    // Second look through canonical paths; skipped when either side cannot
    // be resolved.
    if let (Ok(canonical_state), Ok(canonical_path)) =
        (state_dir.canonicalize(), path.canonicalize())
    {
        if canonical_path.starts_with(canonical_state) {
            return false;
        }
    }

    let relative = path.strip_prefix(repo_root).unwrap_or(path);
    let normalized = relative
        .to_string_lossy()
        .replace('\\', "/")
        .trim_start_matches('/')
        .to_lowercase();
    let under_excluded_prefix = config
        .excluded_relative_prefixes()
        .iter()
        .any(|prefix| normalized.starts_with(prefix.as_str()));
    if under_excluded_prefix {
        return false;
    }

    let has_excluded_component = relative.components().any(|component| match component {
        Component::Normal(name) => {
            let lowered = name.to_string_lossy().to_lowercase();
            config
                .excluded_dir_names()
                .iter()
                .any(|excluded| *excluded == lowered)
        }
        _ => false,
    });
    if has_excluded_component {
        return false;
    }

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => {
            let dotted = format!(".{}", ext.to_lowercase());
            DEFAULT_EXTENSIONS.contains(&dotted.as_str())
        }
        None => false,
    }
}
/// Creates the state directory, including missing parents.
///
/// On Unix the directory is created with mode `0o700`, and an existing
/// directory with different permission bits is reset to `0o700`. On other
/// platforms the directory is simply created.
fn ensure_state_dir_secure(path: &Path) -> Result<()> {
    #[cfg(unix)]
    {
        use std::fs::DirBuilder;
        use std::os::unix::fs::{DirBuilderExt, PermissionsExt};

        const OWNER_ONLY: u32 = 0o700;

        DirBuilder::new()
            .recursive(true)
            .mode(OWNER_ONLY)
            .create(path)?;
        // Creation leaves a pre-existing directory untouched, so check and
        // tighten its permission bits explicitly.
        let mut permissions = fs::metadata(path)?.permissions();
        if permissions.mode() & 0o777 != OWNER_ONLY {
            permissions.set_mode(OWNER_ONLY);
            fs::set_permissions(path, permissions)?;
        }
    }
    #[cfg(not(unix))]
    {
        fs::create_dir_all(path)?;
    }
    Ok(())
}
/// Determines where the index lives.
///
/// An absolute override is used as is and a relative override is joined onto
/// `repo_root`. Without an override the default is `<repo>/.docdex/index`;
/// the legacy `<repo>/.gpt-creator/docdex/index` is used (with a warning)
/// only when it exists and the default does not.
fn resolve_state_dir(repo_root: &Path, state_dir: Option<PathBuf>) -> PathBuf {
    if let Some(custom) = state_dir {
        return if custom.is_absolute() {
            custom
        } else {
            repo_root.join(custom)
        };
    }

    let default_dir = repo_root.join(".docdex").join("index");
    let legacy_dir = repo_root.join(".gpt-creator").join("docdex").join("index");
    if default_dir.exists() || !legacy_dir.exists() {
        return default_dir;
    }
    warn!(
        target: "docdexd",
        legacy = %legacy_dir.display(),
        default = %default_dir.display(),
        "using legacy docdex index path; consider migrating to the new default"
    );
    legacy_dir
}
/// Normalizes a user-supplied path prefix for exclusion matching.
///
/// Backslashes become `/`, surrounding whitespace and leading slashes are
/// removed, the text is lower-cased, and a trailing `/` is ensured. An input
/// with nothing left after trimming yields an empty string.
fn normalize_prefix(input: &str) -> String {
    let unified = input.replace('\\', "/");
    let stem = unified.trim().trim_start_matches('/');
    if stem.is_empty() {
        return String::new();
    }
    let mut normalized = stem.to_lowercase();
    if !normalized.ends_with('/') {
        normalized.push('/');
    }
    normalized
}
/// Produces a summary of at most `MAX_SUMMARY_CHARS` characters.
///
/// Front matter is stripped and up to `MAX_SUMMARY_SEGMENTS` heading or
/// paragraph segments are joined with spaces; a heading that opens the
/// summary is followed by " — " instead. When no segment can be extracted,
/// the whitespace-collapsed text itself is used.
fn summarize(content: &str) -> String {
    let cleaned = strip_front_matter(content);
    let segments = collect_segments(cleaned, MAX_SUMMARY_SEGMENTS);
    if segments.is_empty() {
        return truncate_to_limit(&collapse_whitespace(cleaned), MAX_SUMMARY_CHARS).0;
    }

    let mut summary = String::new();
    // True only directly after a heading that opened the summary.
    let mut heading_pending = false;
    for segment in segments {
        let separator = match (summary.is_empty(), heading_pending) {
            (true, _) => {
                summary.push_str(&segment.text);
                heading_pending = segment.is_heading;
                continue;
            }
            (false, true) => " — ",
            (false, false) => " ",
        };
        heading_pending = false;
        summary.push_str(separator);
        summary.push_str(&segment.text);
        if summary.chars().count() >= MAX_SUMMARY_CHARS {
            break;
        }
    }

    let summary = summary.trim();
    if summary.is_empty() {
        // Last resort: the first 60 whitespace-separated words of the text.
        let fallback = cleaned
            .split_whitespace()
            .take(60)
            .collect::<Vec<_>>()
            .join(" ");
        return truncate_to_limit(&fallback, MAX_SUMMARY_CHARS).0;
    }
    truncate_to_limit(summary, MAX_SUMMARY_CHARS).0
}
/// Removes a leading byte-order mark and a leading `---` … `---` front-matter block.
///
/// The opening line must be exactly `---` (trailing whitespace allowed). When
/// a closing `---` line follows, the text after it is returned with leading
/// newline characters removed. Without a closing line the text is returned
/// unchanged apart from the byte-order mark.
fn strip_front_matter(content: &str) -> &str {
    let text = content.trim_start_matches('\u{feff}');
    let mut lines = text.split_inclusive('\n');
    let opener = match lines.next() {
        Some(line) if line.trim_end() == "---" => line,
        _ => return text,
    };
    // Byte offset just past the lines consumed so far.
    let mut consumed = opener.len();
    for line in lines {
        consumed += line.len();
        if line.trim_end() == "---" {
            return text[consumed..].trim_start_matches(|c: char| matches!(c, '\n' | '\r'));
        }
    }
    text
}
/// One unit of summary text extracted by `collect_segments`.
#[derive(Clone)]
struct Segment {
/// Normalized, whitespace-collapsed text of the segment.
text: String,
/// `true` when the segment came from a line starting with `#`.
is_heading: bool,
}
/// Splits `text` into at most `max_segments` heading and paragraph segments.
///
/// Fenced code blocks are skipped. Consecutive non-heading lines accumulate
/// into one paragraph, which is closed by a blank line, a heading, or the end
/// of the text. Lines that normalize to nothing are ignored.
fn collect_segments(text: &str, max_segments: usize) -> Vec<Segment> {
    let mut segments = Vec::with_capacity(max_segments);
    let mut paragraph: Vec<String> = Vec::new();
    let mut inside_fence = false;

    for raw_line in text.lines() {
        let line = raw_line.trim();
        if is_code_fence(line) {
            inside_fence = !inside_fence;
            continue;
        }
        if inside_fence {
            continue;
        }
        if line.is_empty() {
            // Blank line: close the paragraph collected so far.
            push_buffer_segment(&mut segments, &mut paragraph, max_segments);
            if segments.len() >= max_segments {
                break;
            }
            continue;
        }
        match normalize_line(line) {
            None => {}
            Some((normalized, false)) => paragraph.push(normalized),
            Some((normalized, true)) => {
                // A heading closes the pending paragraph and forms its own segment.
                push_buffer_segment(&mut segments, &mut paragraph, max_segments);
                if segments.len() >= max_segments {
                    break;
                }
                segments.push(Segment {
                    text: normalized,
                    is_heading: true,
                });
                if segments.len() >= max_segments {
                    break;
                }
            }
        }
    }

    if segments.len() < max_segments {
        push_buffer_segment(&mut segments, &mut paragraph, max_segments);
    }
    segments
}
/// Flushes the accumulated paragraph lines into `segments` as one non-heading segment.
///
/// `buffer` is always emptied. Nothing is pushed when the joined text is
/// blank or when `segments` already holds `max_segments` entries.
fn push_buffer_segment(segments: &mut Vec<Segment>, buffer: &mut Vec<String>, max_segments: usize) {
    if buffer.is_empty() {
        return;
    }
    let joined = buffer.join(" ");
    buffer.clear();
    if segments.len() >= max_segments {
        return;
    }
    let collapsed = collapse_whitespace(&joined);
    if !collapsed.is_empty() {
        segments.push(Segment {
            text: collapsed,
            is_heading: false,
        });
    }
}
/// Strips Markdown decoration from a single line.
///
/// Returns the cleaned text and whether the line was a heading (it started
/// with `#`), or `None` when nothing remains. Removed: heading markers,
/// block-quote markers, list prefixes, link targets (the label is kept),
/// inline-code backticks and HTML tags; whitespace is collapsed.
fn normalize_line(line: &str) -> Option<(String, bool)> {
    let trimmed = line.trim();
    if trimmed.is_empty() {
        return None;
    }

    let is_heading = trimmed.starts_with('#');
    let mut rest = if is_heading {
        trimmed.trim_start_matches('#').trim_start()
    } else {
        trimmed
    };
    while let Some(after_quote) = rest.strip_prefix('>') {
        rest = after_quote.trim_start();
    }
    let rest = strip_list_prefix(rest);
    if rest.is_empty() {
        return None;
    }

    let without_links = MARKDOWN_LINK_RE.replace_all(rest, "$1");
    let without_code = INLINE_CODE_RE.replace_all(&without_links, "$1");
    let without_tags = HTML_TAG_RE.replace_all(&without_code, "");
    // Drop any backticks that were not part of a matched inline-code pair.
    let collapsed = collapse_whitespace(&without_tags.replace('`', ""));
    if collapsed.is_empty() {
        None
    } else {
        Some((collapsed, is_heading))
    }
}
/// Removes a leading Markdown list marker from `text`.
///
/// Checked in order: task-list checkboxes (`- [ ]`, `- [x]`, `* [ ]`,
/// `* [x]`, matched case-insensitively), bullets (`- `, `* `, `+ `) and
/// ordered markers such as `1.` or `2)`. The remainder is returned with
/// leading whitespace removed; text without a marker is only left-trimmed.
fn strip_list_prefix(text: &str) -> &str {
    let working = text.trim_start();

    // The comparison ignores ASCII case, so the lower-case forms also cover `[X]`.
    let checkbox = ["- [ ]", "- [x]", "* [ ]", "* [x]"]
        .iter()
        .find(|marker| starts_with_case_insensitive(working, marker));
    if let Some(marker) = checkbox {
        return working[marker.len()..].trim_start();
    }

    let after_bullet = ["- ", "* ", "+ "]
        .iter()
        .find_map(|bullet| working.strip_prefix(*bullet));
    if let Some(rest) = after_bullet {
        return rest.trim_start();
    }

    match ORDERED_LIST_RE.find(working) {
        Some(marker) => working[marker.end()..]
            .trim_start_matches(|c: char| c == ')' || c == '.')
            .trim_start(),
        None => working,
    }
}
/// Reports whether `value` begins with `prefix`, ignoring ASCII case.
///
/// Returns `false` when `value` is shorter than `prefix` or when the prefix
/// length does not fall on a character boundary of `value`.
fn starts_with_case_insensitive(value: &str, prefix: &str) -> bool {
    match value.get(..prefix.len()) {
        Some(head) => head.eq_ignore_ascii_case(prefix),
        None => false,
    }
}
/// Reports whether `line`, after leading whitespace, opens or closes a fenced
/// code block (three backticks or three tildes).
fn is_code_fence(line: &str) -> bool {
    let candidate = line.trim_start();
    ["```", "~~~"]
        .iter()
        .any(|fence| candidate.starts_with(*fence))
}
/// Replaces every run of whitespace with a single space and trims both ends.
fn collapse_whitespace(text: &str) -> String {
    let squeezed = MULTISPACE_RE.replace_all(text, " ");
    squeezed.trim().to_owned()
}
/// Shortens `text` to at most `max_chars` characters.
///
/// Returns the resulting text and whether it was shortened. Shortened text
/// keeps the first `max_chars - 1` characters, drops trailing whitespace and
/// ends with `…`. A limit of zero yields an empty string flagged as shortened.
fn truncate_to_limit(text: &str, max_chars: usize) -> (String, bool) {
    if max_chars == 0 {
        return (String::new(), true);
    }
    if text.chars().count() <= max_chars {
        return (text.to_string(), false);
    }
    // Reserve one character of the budget for the ellipsis.
    let kept: String = text.chars().take(max_chars - 1).collect();
    let mut shortened = kept.trim_end().to_owned();
    shortened.push('…');
    (shortened, true)
}
/// Condenses preview lines into one snippet of at most `max_chars` characters.
///
/// Non-blank lines are joined with spaces, whitespace is collapsed, and the
/// text is split at sentence terminators followed by whitespace. The pieces
/// are re-joined with single spaces until the budget is reached; the result
/// is then shortened with `truncate_to_limit` if it overshoots. Returns the
/// snippet and whether it was shortened.
fn condense_snippet(lines: &[String], max_chars: usize) -> (String, bool) {
    let joined = lines
        .iter()
        .map(|line| line.trim())
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join(" ");
    if joined.is_empty() {
        return (String::new(), false);
    }

    let normalized = collapse_whitespace(&joined);
    let sentences = SENTENCE_SPLIT_RE
        .split(&normalized)
        .map(str::trim)
        .filter(|sentence| !sentence.is_empty());

    let mut snippet = String::new();
    // Character (not byte) length of `snippet`, maintained incrementally.
    let mut length = 0usize;
    for sentence in sentences {
        if !snippet.is_empty() {
            snippet.push(' ');
            length += 1;
        }
        snippet.push_str(sentence);
        length += sentence.chars().count();
        if length >= max_chars {
            break;
        }
    }

    if snippet.is_empty() {
        return (String::new(), false);
    }
    if length > max_chars {
        return (truncate_to_limit(&snippet, max_chars).0, true);
    }
    (snippet, false)
}
/// Reports whether `rel_path` is a plain relative path.
///
/// Absolute paths are rejected, and so is any path containing a component
/// other than `.` or a normal name (that is: `..`, a root, or a prefix).
fn is_safe_rel_path(rel_path: &str) -> bool {
    let candidate = Path::new(rel_path);
    if candidate.is_absolute() {
        return false;
    }
    candidate.components().all(|part| match part {
        Component::CurDir | Component::Normal(_) => true,
        Component::Prefix(_) | Component::RootDir | Component::ParentDir => false,
    })
}
/// Reduces a query to plain word tokens.
///
/// Every character that is not alphanumeric or `_` acts as a separator; the
/// remaining tokens are joined with single spaces.
fn sanitize_query(input: &str) -> String {
    let is_separator = |c: char| !(c.is_alphanumeric() || c == '_');
    input
        .split(is_separator)
        .filter(|token| !token.is_empty())
        .collect::<Vec<_>>()
        .join(" ")
}
/// Rough token estimate: the number of whitespace-separated words in `text`.
fn estimate_tokens(text: &str) -> u64 {
    let words = text.split_whitespace();
    words.count() as u64
}
/// Markdown inline link `[label](target)`; capture group 1 is the label.
static MARKDOWN_LINK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]+)\]\([^)]+\)").unwrap());
/// Inline code span between single backticks; capture group 1 is the code text.
static INLINE_CODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"`([^`]+)`").unwrap());
/// Anything between `<` and the next `>`; used to drop HTML tags.
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
/// One or more whitespace characters.
static MULTISPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
/// Sentence boundary: a run of `.`, `!` or `?` followed by whitespace.
static SENTENCE_SPLIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[.!?]+\s+").unwrap());
/// Ordered-list marker at the start of the text: digits followed by `.` or `)`, possibly repeated.
static ORDERED_LIST_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(?:\d+[\.)])+").unwrap());
/// Locates `fragment` inside `body` and returns its 1-based `(first, last)` line numbers.
///
/// An exact substring match is tried first. Failing that, the first and last
/// non-blank lines of the fragment are searched for individually: the start
/// is the first body line containing the fragment's first line, the end is
/// the first body line at or after the start containing its last line (or the
/// start itself when there is none). Returns `None` for an empty fragment or
/// when no start line can be found.
fn line_range_for_fragment(body: &str, fragment: &str) -> Option<(usize, usize)> {
    if fragment.is_empty() {
        return None;
    }
    if let Some(offset) = body.find(fragment) {
        let first_line = body[..offset].matches('\n').count() + 1;
        let span = fragment.lines().count().max(1);
        return Some((first_line, first_line + span - 1));
    }

    // Fallback: anchor on the first and last non-blank fragment lines.
    let mut meaningful = fragment.lines().filter(|line| !line.trim().is_empty());
    let head = meaningful.next()?;
    let tail = meaningful.last().unwrap_or(head);

    let body_lines: Vec<&str> = body.lines().collect();
    let start_idx = body_lines.iter().position(|line| line.contains(head))?;
    let end_idx = body_lines[start_idx..]
        .iter()
        .position(|line| line.contains(tail))
        .map_or(start_idx, |relative| start_idx + relative);
    Some((start_idx + 1, end_idx + 1))
}