use crate::config;
use crate::index::Hit;
use crate::index::Indexer;
use crate::libs::LibsIndexer;
use crate::llm::adapter::{resolve_agent_adapter, LlmClient, LlmCompletion, LlmFuture};
use crate::max_size::truncate_utf8_chars;
use crate::mcoda::registry::McodaRegistry;
use crate::ollama::OllamaClient;
use crate::search;
use crate::state_layout::StateLayout;
use crate::tier2::{Tier2Unavailable, Tier2UnavailableReason};
use crate::util;
use crate::web::cache;
use crate::web::ddg::{DdgDiscovery, WebDiscoveryResponse, WebDiscoveryResult};
use crate::web::normalize::{dedupe_urls, unwrap_ddg_redirect};
use crate::web::readability::extract_readable_text;
use crate::web::scraper::ScraperEngine;
use crate::web::status::fetch_status;
use crate::web::WebConfig;
use anyhow::Context;
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::env;
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::{info, warn};
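// Tunables for the web research pipeline: gate thresholds, LLM token/time
// budgets, and fetch/quality limits. These are compile-time defaults; the gate
// thresholds can be overridden via DOCDEX_WEB_* environment variables (see
// WebGateConfig::from_env).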
const DEFAULT_WEB_TRIGGER_THRESHOLD: f32 = 0.7;
const DEFAULT_WEB_MIN_MATCH_RATIO: f32 = 0.2;
const DEFAULT_LOCAL_RELEVANCE_THRESHOLD: f32 = 0.7;
const MAX_WEB_SUMMARY_TOKENS: u32 = 256;
const WEB_SUMMARY_TIMEOUT_MS: u64 = 15_000;
const DEFAULT_WEB_SUMMARY_INPUT_MAX_CHARS: usize = 6000;
const MAX_QUERY_CATEGORY_TOKENS: u32 = 48;
const QUERY_CATEGORY_TIMEOUT_MS: u64 = 4_000;
const WEB_CONTEXT_MIN_RELEVANCE_SCORE: f32 = 0.2;
const WEB_HTML2TEXT_WRAP_COLS: usize = 120;
const MAX_MATCH_HITS: usize = 3;
const LOCAL_RELEVANCE_TIMEOUT_MS: u64 = 8_000;
const LOCAL_RELEVANCE_MAX_TOKENS: u32 = 96;
const MAX_LOCAL_RELEVANCE_INPUT_CHARS: usize = 800;
const WEB_BATCH_SIZE: usize = 10;
const WEB_MAX_BATCHES: usize = 2;
const MAX_CODE_BLOCKS: usize = 4;
const MAX_CODE_BLOCK_CHARS: usize = 1800;
const WEB_GOOD_RELEVANCE_SCORE: f32 = 0.7;
const WEB_DISCOVERY_MULTIPLIER: usize = 4;
const WEB_DISCOVERY_MIN_RESULTS: usize = 4;
const WEB_DISCOVERY_MAX_QUERY_TOKENS: usize = 6;
const WEB_MIN_CONTENT_CHARS: usize = 200;
const WEB_MIN_CONTENT_WORDS: usize = 30;
const WEB_MAX_RESULTS_PER_DOMAIN: usize = 2;
const WEB_QUALITY_FAIL_THRESHOLD: u32 = 3;
const WEB_QUALITY_BLOCK_THRESHOLD: u32 = 2;
const WEB_QUALITY_CHALLENGE_THRESHOLD: u32 = 1;
const WEB_QUALITY_COOLDOWN_SECS: u64 = 600;
const WEB_QUALITY_TTL_SECS: u64 = 86_400;
const COMMON_STOPWORDS: &[&str] = &[
"a", "an", "and", "are", "as", "at", "be", "by", "do", "does", "for", "from", "how", "i", "if",
"in", "is", "it", "of", "on", "or", "the", "to", "use", "using", "was", "we", "what", "when",
"where", "who", "why", "with", "you", "your",
];
const MATCH_STOPWORDS_EXTRA: &[&str] = &["add", "append", "build", "create", "insert", "make"];
const MATCH_STOPWORDS_GENERIC: &[&str] = &[
"code",
"sample",
"samples",
"example",
"examples",
"snippet",
"snippets",
"tutorial",
"tutorials",
"guide",
"guides",
"docs",
"documentation",
"reference",
"references",
];
const DOMAIN_STOPWORDS: &[&str] = &[
"code",
"sample",
"samples",
"example",
"examples",
"tutorial",
"tutorials",
"guide",
"guides",
"docs",
"documentation",
"reference",
"references",
"overview",
"intro",
"introduction",
"getting",
"started",
"learn",
"learning",
];
static STOPWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
COMMON_STOPWORDS
.iter()
.chain(DOMAIN_STOPWORDS.iter())
.copied()
.collect()
});
static MATCH_STOPWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
COMMON_STOPWORDS
.iter()
.chain(MATCH_STOPWORDS_EXTRA.iter())
.chain(MATCH_STOPWORDS_GENERIC.iter())
.copied()
.collect()
});
#[derive(Clone, Copy)]
struct CachedQueryCategory {
category: QueryCategory,
source: QueryCategorySource,
}
static QUERY_CATEGORY_CACHE: Lazy<Mutex<HashMap<String, CachedQueryCategory>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
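// Repair regexes for text recovered from HTML/DOM extraction, which often
// fuses adjacent tokens ("wordHeading", "city.Next", "foo.comBar") or leaks
// ANSI escapes. Each one marks a join point that the spacing-normalization
// helpers use to split the text back apart.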
static ANSI_ESCAPE_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"\x1b\[[0-9;]*[A-Za-z]").expect("valid ansi escape regex"));
static HEADING_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([a-z0-9])#([A-Za-z])").expect("valid heading join regex"));
static TAG_ATTR_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"(<[A-Za-z]+)([a-z]{2,})(=)").expect("valid tag attr join regex")
});
static TLD_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"\.(com|org|net|io|dev|co|us|uk|edu|gov)([A-Z])")
.expect("valid tld join regex")
});
static LOWER_UPPER_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([a-z])([A-Z])").expect("valid lower upper join regex"));
static AND_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([a-z])and([A-Z])").expect("valid and join regex"));
static AND_LOWER_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"([a-z]{3,})and([a-z]{3,})").expect("valid and lower join regex")
});
static CAPITAL_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"([A-Z][a-z]{2,})([A-Z][a-z]{2,})").expect("valid capital join regex")
});
static BRACKET_LEFT_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([A-Za-z])\[").expect("valid bracket left join regex"));
static BRACKET_RIGHT_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"\]([A-Za-z])").expect("valid bracket right join regex"));
static LOWER_JOIN_STOPWORD_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"([A-Za-z]{3,})(and|of|in|to|with|by|as|for|from|its|is)([A-Za-z]{3,})")
.expect("valid lower join stopword regex")
});
static TITLE_AND_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"([A-Z][a-z]{2,})and(\s+[a-z]{3,})").expect("valid title and join regex")
});
static PREFIX_COMMON_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(
r"(?i)\b(the|a|an|of|in|to|with|by|as|its|is)(capital|largest|city|population|area|state|country|province|district|region|union|metropolitan|inhabitants|limit|limits|river|county|kingdom|republic)",
)
.expect("valid prefix common join regex")
});
static LONG_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"([a-z]{5,})(within|into|over|under|between|across)([a-z]{3,})")
.expect("valid long join regex")
});
static PUNCT_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([.!?])([A-Z])").expect("valid punctuation join regex"));
static COMMA_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([,])([A-Za-z])").expect("valid comma join regex"));
static COLON_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([:;])([A-Za-z])").expect("valid colon join regex"));
static WORD_METHOD_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"([A-Za-z]{2,})([a-z]{2,}\()").expect("valid word method join regex")
});
static PAREN_WORD_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"(\))([A-Za-z])").expect("valid paren word join regex"));
static LABEL_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(
r"(?i)\b(example|syntax|description|parameters|returns?|usage|notes)([A-Za-z])",
)
.expect("valid label join regex")
});
static CODE_KEYWORD_JOIN_RE: Lazy<regex::Regex> = Lazy::new(|| {
regex::Regex::new(r"\b(const|let|var|function|return|class|struct|enum)([A-Za-z_])")
.expect("valid code keyword join regex")
});
static ALLCAPS_JOIN_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([a-z])([A-Z]{2,})").expect("valid allcaps join regex"));
static CAMEL_BREAK_RE: Lazy<regex::Regex> =
Lazy::new(|| regex::Regex::new(r"([a-z])([A-Z])").expect("valid camel break regex"));
#[derive(Clone, Debug)]
pub struct WebGateConfig {
pub enabled: bool,
pub trigger_threshold: f32,
pub min_local_match_ratio: f32,
pub browser_hint: Option<String>,
pub browser_available: bool,
}
impl WebGateConfig {
pub fn from_env() -> Self {
let enabled = env_boolish("DOCDEX_WEB_ENABLED").unwrap_or(false);
let trigger_threshold = env_f32("DOCDEX_WEB_TRIGGER_THRESHOLD")
.or_else(config_web_trigger_threshold)
.unwrap_or(DEFAULT_WEB_TRIGGER_THRESHOLD);
let trigger_threshold = trigger_threshold.clamp(0.0, 1.0);
let min_local_match_ratio = env_f32("DOCDEX_WEB_MIN_MATCH_RATIO")
.or_else(config_web_min_match_ratio)
.unwrap_or(DEFAULT_WEB_MIN_MATCH_RATIO)
.clamp(0.0, 1.0);
let browser_hint = env_string("DOCDEX_WEB_BROWSER").or_else(config_web_browser_path);
let browser_available = resolve_browser_available(browser_hint.as_deref());
Self {
enabled,
trigger_threshold,
min_local_match_ratio,
browser_hint,
browser_available,
}
}
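    /// Returns true when web discovery should run: always when forced; when
    /// `use_match_ratio` is set and the local match ratio falls below the
    /// configured minimum; or when the normalized top score is missing or
    /// below the trigger threshold.
    ///
    /// ```ignore
    /// // Illustrative only: a weak normalized score falls through the gate.
    /// let gate = WebGateConfig {
    ///     enabled: true,
    ///     trigger_threshold: 0.7,
    ///     min_local_match_ratio: 0.2,
    ///     browser_hint: None,
    ///     browser_available: true,
    /// };
    /// assert!(gate.should_attempt(Some(0.5), None, false, false));
    /// assert!(!gate.should_attempt(Some(0.9), None, false, false));
    /// ```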
pub fn should_attempt(
&self,
top_score_normalized: Option<f32>,
local_match_ratio: Option<f32>,
force_web: bool,
use_match_ratio: bool,
) -> bool {
if force_web {
return true;
}
if use_match_ratio {
if let Some(local_match_ratio) = local_match_ratio {
if local_match_ratio < self.min_local_match_ratio {
return true;
}
}
}
top_score_normalized.map_or(true, |score| score < self.trigger_threshold)
}
}
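/// Builds the gate metadata attached to every `WebDiscoveryStatus`; the
/// `*_camel` fields mirror their snake_case counterparts so responses carry
/// both JSON casings.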
pub(crate) fn build_gate_meta(
gate: &WebGateConfig,
top_score: Option<f32>,
top_score_normalized: Option<f32>,
local_match_ratio: Option<f32>,
force_web: bool,
) -> WebGateMeta {
WebGateMeta {
enabled: gate.enabled,
forced: force_web,
threshold: gate.trigger_threshold,
top_score,
top_score_normalized,
top_score_normalized_camel: top_score_normalized,
local_match_ratio,
local_match_ratio_camel: local_match_ratio,
}
}
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum WebDiscoveryStatusCode {
Skipped,
Disabled,
Unavailable,
Served,
}
#[derive(Debug, Clone, Serialize)]
pub struct WebGateMeta {
pub enabled: bool,
pub forced: bool,
pub threshold: f32,
#[serde(skip_serializing_if = "Option::is_none")]
pub top_score: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub top_score_normalized: Option<f32>,
#[serde(rename = "topScoreNormalized", skip_serializing_if = "Option::is_none")]
pub top_score_normalized_camel: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub local_match_ratio: Option<f32>,
#[serde(rename = "localMatchRatio", skip_serializing_if = "Option::is_none")]
pub local_match_ratio_camel: Option<f32>,
}
#[derive(Debug, Clone, Serialize)]
pub struct WebDiscoveryStatus {
pub status: WebDiscoveryStatusCode,
#[serde(skip_serializing_if = "Option::is_none")]
pub reason: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub message: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub unavailable: Option<Tier2Unavailable>,
#[serde(skip_serializing_if = "Option::is_none")]
pub discovery: Option<WebDiscoveryResponse>,
#[serde(skip_serializing_if = "Option::is_none")]
pub fetches: Option<Vec<WebFetchResult>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub debug: Option<Vec<String>>,
pub gate: WebGateMeta,
}
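/// Selects fetches suitable for answer context: entries scoring below
/// `WEB_CONTEXT_MIN_RELEVANCE_SCORE` or lacking usable content are dropped,
/// and error/debug fields are stripped from the returned clones.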
pub fn web_context_from_status(status: &WebDiscoveryStatus) -> Option<Vec<WebFetchResult>> {
let fetches = status.fetches.as_ref()?;
let mut items = Vec::new();
for item in fetches {
if item
.relevance_score
.map_or(false, |score| score < WEB_CONTEXT_MIN_RELEVANCE_SCORE)
{
continue;
}
let content = item.ai_digested_content.as_ref().or(item.content.as_ref());
let Some(content) = content else {
continue;
};
if content.trim().is_empty() {
continue;
}
let mut cloned = item.clone();
cloned.error = None;
cloned.debug = None;
cloned.debug_html = None;
cloned.debug_dom_text = None;
items.push(cloned);
}
if items.is_empty() {
None
} else {
Some(items)
}
}
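/// A fetched-and-evaluated web document: `content` holds the cleaned page
/// text, `ai_digested_content`/`ai_digested_kind` the LLM digest when one was
/// produced.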
#[derive(Debug, Clone, Serialize)]
pub struct WebFetchResult {
pub url: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub status: Option<u16>,
#[serde(skip_serializing_if = "Option::is_none")]
pub fetched_at_epoch_ms: Option<u128>,
pub cached: bool,
#[serde(skip_serializing_if = "Option::is_none")]
pub content: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub ai_digested_content: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub ai_digested_kind: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub relevance_score: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub debug_html: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub debug_dom_text: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub debug: Option<Vec<String>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct WebFetchCacheEntry {
url: String,
status: Option<u16>,
fetched_at_epoch_ms: u128,
content: String,
#[serde(default)]
code_blocks: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct WebSummaryCacheEntry {
query_hash: String,
content_hash: String,
relevance_score: f32,
kind: String,
output: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct WebPhraseCacheEntry {
query_hash: String,
fetched_at_epoch_ms: u128,
ai_digested_kind: String,
ai_digested_content: String,
#[serde(default)]
url: String,
#[serde(default)]
relevance_score: Option<f32>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DomainQualityEntry {
host: String,
fail_count: u32,
blocked_count: u32,
challenge_count: u32,
last_failure_epoch_ms: u64,
cooldown_until_epoch_ms: u64,
}
#[derive(Debug, Clone, Copy)]
enum DomainFailureKind {
Fetch,
Blocked,
Challenge,
}
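/// Adapts a raw `OllamaClient` to the generic `LlmClient` interface used by
/// the prompt helpers below.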
struct OllamaPromptClient {
client: OllamaClient,
model: String,
adapter: String,
}
impl LlmClient for OllamaPromptClient {
fn generate<'a>(
&'a self,
prompt: &'a str,
max_tokens: u32,
timeout: Duration,
) -> LlmFuture<'a> {
Box::pin(async move {
let output = self
.client
.generate(&self.model, prompt, max_tokens, timeout)
.await
.context("ollama generate")?;
Ok(LlmCompletion {
output,
adapter: self.adapter.clone(),
model: Some(self.model.clone()),
metadata: None,
})
})
}
}
#[derive(Clone)]
struct QueryCategoryClient {
client: Arc<dyn LlmClient>,
max_tokens: u32,
timeout: Duration,
}
#[derive(Clone)]
struct WebSummaryClient {
client: Arc<dyn LlmClient>,
max_tokens: u32,
timeout: Duration,
}
#[derive(Debug, Deserialize)]
struct WebEvalResponse {
relevant: bool,
score: f32,
kind: String,
output: String,
}
#[derive(Debug, Deserialize)]
struct QueryCategoryResponse {
category: String,
}
struct WebEvalOutput {
relevance_score: f32,
kind: String,
output: String,
}
#[derive(Clone)]
struct LocalRelevanceClient {
client: Arc<dyn LlmClient>,
max_tokens: u32,
timeout: Duration,
}
#[derive(Debug, Deserialize)]
struct LocalRelevanceResponse {
relevant: bool,
score: f32,
}
const WEB_SUMMARY_INSTRUCTIONS: &str = include_str!("../../prompts/web_summary_instructions.txt");
const QUERY_CATEGORY_INSTRUCTIONS: &str =
include_str!("../../prompts/query_category_instructions.txt");
const LOCAL_RELEVANCE_INSTRUCTIONS: &str =
include_str!("../../prompts/local_relevance_instructions.txt");
impl WebSummaryClient {
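    /// Asks the LLM whether the fetched page answers the query. Expects a
    /// JSON `WebEvalResponse`; falls back to lenient field extraction, then to
    /// treating the raw output as a summary (or code, when the query has code
    /// intent and verbatim code blocks were captured) with a conservative
    /// relevance score.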
async fn evaluate(
&self,
query: &str,
category: QueryCategory,
content: &str,
code_blocks: &[String],
) -> Option<WebEvalOutput> {
let trimmed = content.trim();
if trimmed.is_empty() {
return None;
}
let intent = detect_query_intent(query);
let allow_code = (matches!(intent, QueryIntent::Code)
|| matches!(category, QueryCategory::CodeExample))
&& !code_blocks.is_empty();
let prompt = build_summary_prompt(query, category, trimmed, code_blocks);
let result = self
.client
.generate(&prompt, self.max_tokens, self.timeout)
.await
.ok()?;
let output = result.output;
let parsed: WebEvalResponse = match parse_json_response(&output) {
Some(parsed) => parsed,
None => {
if let Some(parsed) = parse_web_eval_response_lenient(&output) {
parsed
} else {
let raw = output.trim();
if raw.is_empty() {
return None;
}
if looks_like_web_eval_metadata(raw) {
return None;
}
let kind = if allow_code && looks_like_code_output(raw) {
"code"
} else {
"summary"
};
let output = if kind == "code" {
clean_code_text(raw)
} else {
clean_summary_text(raw)
};
if output.is_empty() {
return None;
}
return Some(WebEvalOutput {
relevance_score: 0.4,
kind: kind.to_string(),
output,
});
}
}
};
let relevant = parsed.relevant;
let score = parsed.score.clamp(0.0, 1.0);
let mut kind = parsed.kind.trim().to_ascii_lowercase();
if kind == "code" && !allow_code {
kind = "summary".to_string();
}
let output = if kind == "code" {
let cleaned = clean_code_text(&parsed.output);
format_md_code(&cleaned)
} else {
clean_summary_text(&parsed.output)
};
if !relevant || output.is_empty() {
return None;
}
let kind = if kind == "code" { "code" } else { "summary" };
Some(WebEvalOutput {
relevance_score: score,
kind: kind.to_string(),
output,
})
}
}
impl LocalRelevanceClient {
async fn evaluate(&self, query: &str, hit: &Hit) -> Option<LocalRelevanceResponse> {
let prompt = build_local_relevance_prompt(query, hit);
let result = self
.client
.generate(&prompt, self.max_tokens, self.timeout)
.await
.ok()?;
let parsed: LocalRelevanceResponse = parse_json_response(&result.output)?;
let score = parsed.score.clamp(0.0, 1.0);
Some(LocalRelevanceResponse {
relevant: parsed.relevant,
score,
})
}
}
impl QueryCategoryClient {
async fn evaluate(&self, query: &str) -> Option<QueryCategory> {
let prompt = build_query_category_prompt(query);
let result = self
.client
.generate(&prompt, self.max_tokens, self.timeout)
.await
.ok()?;
let parsed: QueryCategoryResponse = parse_json_response(&result.output)?;
parse_query_category(&parsed.category)
}
}
fn load_query_category_client(
model_override: Option<&str>,
llm_agent: Option<&str>,
) -> Option<QueryCategoryClient> {
let client = load_llm_client(model_override, llm_agent)?;
let max_tokens = load_llm_config(model_override)
.map(|config| config.max_answer_tokens.min(MAX_QUERY_CATEGORY_TOKENS))
.unwrap_or(MAX_QUERY_CATEGORY_TOKENS);
Some(QueryCategoryClient {
client,
max_tokens,
timeout: Duration::from_millis(QUERY_CATEGORY_TIMEOUT_MS),
})
}
fn load_web_summary_client(
model_override: Option<&str>,
llm_agent: Option<&str>,
) -> Option<WebSummaryClient> {
let client = load_llm_client(model_override, llm_agent)?;
let max_tokens = load_llm_config(model_override)
.map(|config| config.max_answer_tokens.min(MAX_WEB_SUMMARY_TOKENS))
.unwrap_or(MAX_WEB_SUMMARY_TOKENS);
Some(WebSummaryClient {
client,
max_tokens,
timeout: Duration::from_millis(WEB_SUMMARY_TIMEOUT_MS),
})
}
fn load_local_relevance_client(
model_override: Option<&str>,
llm_agent: Option<&str>,
) -> Option<LocalRelevanceClient> {
let client = load_llm_client(model_override, llm_agent)?;
let max_tokens = load_llm_config(model_override)
.map(|config| config.max_answer_tokens.min(LOCAL_RELEVANCE_MAX_TOKENS))
.unwrap_or(LOCAL_RELEVANCE_MAX_TOKENS);
Some(LocalRelevanceClient {
client,
max_tokens,
timeout: Duration::from_millis(LOCAL_RELEVANCE_TIMEOUT_MS),
})
}
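/// Loads the LLM section of the app config, falling back to built-in defaults
/// when no config file exists, and applies `model_override` on top.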
fn load_llm_config(model_override: Option<&str>) -> Option<config::LlmConfig> {
let path = config::default_config_path().ok();
let mut config = if let Some(path) = path {
if path.exists() {
config::load_config_from_path(&path).ok()?
} else {
let mut config = config::AppConfig::default();
config.apply_defaults().ok()?;
config
}
} else {
let mut config = config::AppConfig::default();
config.apply_defaults().ok()?;
config
};
config.apply_defaults().ok()?;
if let Some(model_override) = model_override {
let trimmed = model_override.trim();
if !trimmed.is_empty() {
config.llm.default_model = trimmed.to_string();
}
}
Some(config.llm)
}
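/// Resolves an LLM client: a registered mcoda agent when `llm_agent` is set
/// (warning and returning None on lookup failure), otherwise the configured
/// Ollama provider when one is usable.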
fn load_llm_client(
model_override: Option<&str>,
llm_agent: Option<&str>,
) -> Option<Arc<dyn LlmClient>> {
if let Some(agent_id) = llm_agent {
let registry = match McodaRegistry::load_default() {
Ok(Some(registry)) => registry,
Ok(None) => {
warn!("mcoda registry not found; skipping agent {agent_id}");
return None;
}
Err(err) => {
warn!("failed to load mcoda registry: {err}");
return None;
}
};
let agent = registry
.agent_by_id(agent_id)
.or_else(|| registry.agent_by_slug(agent_id));
let agent = match agent {
Some(agent) => agent,
None => {
warn!("mcoda agent not found: {agent_id}");
return None;
}
};
match resolve_agent_adapter(agent) {
Ok(adapter) => return Some(Arc::new(adapter)),
Err(err) => {
warn!("failed to resolve mcoda agent {agent_id}: {err}");
return None;
}
}
}
let config = load_llm_config(model_override)?;
if !config.provider.trim().eq_ignore_ascii_case("ollama") {
return None;
}
let base_url = config.base_url.trim();
let model = config.default_model.trim();
if base_url.is_empty() || model.is_empty() {
return None;
}
let client = OllamaClient::new(base_url.to_string()).ok()?;
let adapter = OllamaPromptClient {
client,
model: model.to_string(),
adapter: "ollama".to_string(),
};
Some(Arc::new(adapter))
}
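/// Assembles the page-evaluation prompt: cleaned page text, verbatim code
/// blocks, the user query, its category, and the shared instruction preamble.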
fn build_summary_prompt(
query: &str,
category: QueryCategory,
content: &str,
code_blocks: &[String],
) -> String {
let snippet = truncate_summary_input(content);
let query = query.trim();
let mut prompt = String::new();
prompt.push_str("Here is what I found online.\n");
prompt.push_str("The answer to the user query is in the page text below.\n");
prompt.push_str("The text below is the main content with headers/menus removed.\n");
prompt.push_str("\nPage text (cleaned):\n");
prompt.push_str(&snippet);
prompt.push_str("\n\n");
if code_blocks.is_empty() {
prompt.push_str("Code blocks (verbatim): <none>\n");
} else {
prompt.push_str("Code blocks (verbatim):\n");
for (idx, block) in code_blocks.iter().enumerate() {
prompt.push_str(&format!(
"[code {}]\n{}\n[/code {}]\n",
idx + 1,
block,
idx + 1
));
}
}
prompt.push_str("\nUser query:\n");
if query.is_empty() {
prompt.push_str("<empty>");
} else {
prompt.push_str(query);
}
prompt.push_str("\n\nQuery category: ");
prompt.push_str(category.as_str());
prompt.push('\n');
prompt.push_str(WEB_SUMMARY_INSTRUCTIONS);
prompt
}
fn build_query_category_prompt(query: &str) -> String {
let query = query.trim();
let mut prompt = String::new();
if query.is_empty() {
prompt.push_str("User query: <empty>\n\n");
} else {
prompt.push_str("User query:\n");
prompt.push_str(query);
prompt.push_str("\n\n");
}
prompt.push_str(QUERY_CATEGORY_INSTRUCTIONS);
prompt
}
fn build_local_relevance_prompt(query: &str, hit: &Hit) -> String {
let query = query.trim();
let intent = detect_query_intent(query);
let summary = hit.summary.trim();
let snippet = hit.snippet.trim();
let (summary_trimmed, _) = truncate_utf8_chars(summary, MAX_LOCAL_RELEVANCE_INPUT_CHARS);
let (snippet_trimmed, _) = truncate_utf8_chars(snippet, MAX_LOCAL_RELEVANCE_INPUT_CHARS);
let mut prompt = String::new();
if query.is_empty() {
prompt.push_str("User query: <empty>\n\n");
} else {
prompt.push_str("User query:\n");
prompt.push_str(query);
prompt.push_str("\n\n");
}
prompt.push_str("Local result:\n");
prompt.push_str("Path: ");
prompt.push_str(&hit.rel_path);
prompt.push('\n');
if !summary_trimmed.is_empty() {
prompt.push_str("Summary:\n");
prompt.push_str(&summary_trimmed);
prompt.push('\n');
}
if !snippet_trimmed.is_empty() {
prompt.push_str("Snippet:\n");
prompt.push_str(&snippet_trimmed);
prompt.push('\n');
}
match intent {
QueryIntent::Code => {
prompt.push_str("Query intent: code example/snippet\n");
}
QueryIntent::Definition => {
prompt.push_str("Query intent: documentation/summary\n");
}
QueryIntent::General => {
prompt.push_str("Query intent: general\n");
}
}
prompt.push('\n');
prompt.push_str(LOCAL_RELEVANCE_INSTRUCTIONS);
prompt
}
fn clean_summary_text(text: &str) -> String {
let mut lines = Vec::new();
for line in text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
lines.push(trimmed.to_string());
}
lines.join("\n")
}
fn clean_code_text(text: &str) -> String {
let trimmed = text.trim();
if trimmed.starts_with("```") {
let mut lines = trimmed.lines();
lines.next();
let mut body: Vec<&str> = lines.collect();
if body.last().map(|line| line.trim()) == Some("```") {
body.pop();
}
let joined = body.join("\n");
return sanitize_code_block_text(&joined);
}
sanitize_code_block_text(trimmed)
}
fn sanitize_code_block_text(text: &str) -> String {
let mut cleaned = Vec::new();
for line in text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
if is_code_marker_line(trimmed) {
continue;
}
let stripped = strip_copy_prefix(trimmed);
if stripped.is_empty() {
continue;
}
cleaned.push(stripped.to_string());
}
cleaned.join("\n").trim().to_string()
}
fn is_code_marker_line(line: &str) -> bool {
let lower = line.to_ascii_lowercase();
lower.starts_with("[code ") || lower.starts_with("[/code ") || lower == "```"
}
fn strip_copy_prefix(line: &str) -> &str {
let lower = line.to_ascii_lowercase();
let prefixes = ["copy code", "copycode", "textcopy", "copy"];
for prefix in prefixes {
if lower.starts_with(prefix) {
return line[prefix.len()..].trim_start();
}
}
line
}
fn format_md_output(kind: &str, output: &str) -> String {
match kind {
"code" => format_md_code(&clean_code_text(output)),
_ => clean_summary_text(output),
}
}
fn format_md_code(text: &str) -> String {
let trimmed = text.trim();
if trimmed.is_empty() {
return String::new();
}
if trimmed.contains("```") {
return trimmed.to_string();
}
format!("```\n{}\n```", trimmed)
}
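/// Parses JSON out of LLM output, tolerating surrounding prose: tries the full
/// text, then the outermost `{...}` slice, then the same slice with raw
/// newlines inside strings escaped.
///
/// ```ignore
/// // Illustrative only: prose-wrapped JSON still parses.
/// let parsed: QueryCategoryResponse =
///     parse_json_response("Sure! {\"category\": \"general\"}").unwrap();
/// ```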
fn parse_json_response<T: serde::de::DeserializeOwned>(text: &str) -> Option<T> {
let trimmed = text.trim();
if let Ok(parsed) = serde_json::from_str::<T>(trimmed) {
return Some(parsed);
}
let start = trimmed.find('{')?;
let end = trimmed.rfind('}')?;
if end <= start {
return None;
}
let slice = &trimmed[start..=end];
if let Ok(parsed) = serde_json::from_str::<T>(slice) {
return Some(parsed);
}
let fixed = escape_unescaped_json_newlines(slice);
serde_json::from_str::<T>(&fixed).ok()
}
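/// Last-resort parser for near-JSON eval replies: scans fields out of the raw
/// text and fills defaults (`relevant: true`, `score: 0.5`, `kind: "summary"`)
/// for everything except `output`, which must be present.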
fn parse_web_eval_response_lenient(raw: &str) -> Option<WebEvalResponse> {
let output = extract_loose_output_field(raw)?;
let kind = extract_loose_string_field(raw, "kind")
.or_else(|| extract_loose_string_field(raw, "type"))
.unwrap_or_else(|| "summary".to_string())
.to_ascii_lowercase();
let relevant = extract_loose_bool_field(raw, "relevant").unwrap_or(true);
let score = extract_loose_float_field(raw, "score").unwrap_or(0.5);
Some(WebEvalResponse {
relevant,
score,
kind,
output,
})
}
fn looks_like_web_eval_metadata(raw: &str) -> bool {
let lower = raw.to_ascii_lowercase();
let has_relevant = lower.contains("\"relevant\"");
let has_score = lower.contains("\"score\"");
let has_kind = lower.contains("\"kind\"") || lower.contains("\"type\"");
let has_output = lower.contains("\"output\"");
if has_relevant && has_score && has_kind && !has_output {
return true;
}
lower.contains("```json") && has_relevant && has_score
}
fn extract_loose_output_field(raw: &str) -> Option<String> {
let lower = raw.to_ascii_lowercase();
let key = "\"output\"";
let key_pos = lower.find(key)?;
let after_key = &raw[key_pos + key.len()..];
let colon_pos = after_key.find(':')?;
let mut idx = key_pos + key.len() + colon_pos + 1;
let bytes = raw.as_bytes();
while idx < bytes.len() && bytes[idx].is_ascii_whitespace() {
idx += 1;
}
if idx >= bytes.len() {
return None;
}
if bytes[idx] == b'"' {
idx += 1;
let end = raw.rfind('"')?;
if end <= idx {
return None;
}
return Some(raw[idx..end].to_string());
}
let trimmed = raw[idx..].trim();
if trimmed.is_empty() {
None
} else {
Some(trimmed.to_string())
}
}
fn extract_loose_string_field(raw: &str, field: &str) -> Option<String> {
let lower = raw.to_ascii_lowercase();
let key = format!("\"{}\"", field);
let key_pos = lower.find(&key)?;
let after_key = &raw[key_pos + key.len()..];
let colon_pos = after_key.find(':')?;
let mut idx = key_pos + key.len() + colon_pos + 1;
let bytes = raw.as_bytes();
while idx < bytes.len() && bytes[idx].is_ascii_whitespace() {
idx += 1;
}
if idx >= bytes.len() {
return None;
}
if bytes[idx] == b'"' {
idx += 1;
let rest = &raw[idx..];
if let Some(end) = rest.find('"') {
return Some(rest[..end].to_string());
}
}
None
}
fn extract_loose_bool_field(raw: &str, field: &str) -> Option<bool> {
let lower = raw.to_ascii_lowercase();
let key = format!("\"{}\"", field);
let key_pos = lower.find(&key)?;
let after_key = &lower[key_pos + key.len()..];
let colon_pos = after_key.find(':')?;
let mut idx = key_pos + key.len() + colon_pos + 1;
let bytes = lower.as_bytes();
while idx < bytes.len() && bytes[idx].is_ascii_whitespace() {
idx += 1;
}
if lower[idx..].starts_with("true") {
return Some(true);
}
if lower[idx..].starts_with("false") {
return Some(false);
}
None
}
fn extract_loose_float_field(raw: &str, field: &str) -> Option<f32> {
let lower = raw.to_ascii_lowercase();
let key = format!("\"{}\"", field);
let key_pos = lower.find(&key)?;
let after_key = &lower[key_pos + key.len()..];
let colon_pos = after_key.find(':')?;
let mut idx = key_pos + key.len() + colon_pos + 1;
let bytes = lower.as_bytes();
while idx < bytes.len() && bytes[idx].is_ascii_whitespace() {
idx += 1;
}
let rest = &lower[idx..];
let mut end = 0usize;
for (i, ch) in rest.char_indices() {
if ch.is_ascii_digit() || ch == '.' {
end = i + ch.len_utf8();
} else if end > 0 {
break;
} else if ch == '-' {
end = i + ch.len_utf8();
} else if ch.is_ascii_whitespace() {
continue;
} else {
break;
}
}
if end == 0 {
return None;
}
rest[..end].trim().parse::<f32>().ok()
}
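/// Escapes literal newline, carriage-return, and tab characters that appear
/// inside JSON string values, a common LLM output defect.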
fn escape_unescaped_json_newlines(text: &str) -> String {
let mut out = String::with_capacity(text.len());
let mut in_string = false;
let mut escaped = false;
for ch in text.chars() {
if in_string {
if escaped {
out.push(ch);
escaped = false;
continue;
}
if ch == '\\' {
out.push(ch);
escaped = true;
continue;
}
if ch == '"' {
out.push(ch);
in_string = false;
continue;
}
match ch {
'\n' => {
out.push_str("\\n");
continue;
}
'\r' => {
out.push_str("\\r");
continue;
}
'\t' => {
out.push_str("\\t");
continue;
}
_ => {}
}
out.push(ch);
} else {
if ch == '"' {
in_string = true;
}
out.push(ch);
}
}
out
}
#[derive(Debug, Clone, Serialize)]
pub struct WebResearchResponse {
pub completion: String,
pub hits: Vec<Hit>,
#[serde(skip_serializing_if = "Option::is_none")]
pub top_score: Option<f32>,
#[serde(rename = "topScore", skip_serializing_if = "Option::is_none")]
pub top_score_camel: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub top_score_normalized: Option<f32>,
#[serde(rename = "topScoreNormalized", skip_serializing_if = "Option::is_none")]
pub top_score_normalized_camel: Option<f32>,
#[serde(rename = "webDiscovery")]
pub web_discovery: WebDiscoveryStatus,
}
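/// End-to-end research entry point: runs (optionally LLM-filtered) local
/// search, builds a completion from the surviving hits, then consults the
/// confidence gate to decide whether web discovery should run as well.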
pub async fn run_web_research(
request_id: &str,
indexer: &Indexer,
libs_indexer: Option<&LibsIndexer>,
query: &str,
limit: usize,
web_limit: Option<usize>,
force_web: bool,
gate: &WebGateConfig,
llm_filter_local_results: bool,
skip_local_search: bool,
disable_web_cache: bool,
llm_model: Option<&str>,
llm_agent: Option<&str>,
) -> Result<WebResearchResponse, anyhow::Error> {
let query = query.trim();
let intent = detect_query_intent(query);
let (hits, top_score, top_score_normalized, local_match_ratio) = if skip_local_search {
(Vec::new(), None, None, None)
} else {
let search_response = search::run_query(
indexer,
libs_indexer,
query,
limit,
search::RankingSurface::Search,
)
.await?;
let original_top_score_normalized = search_response.top_score_normalized;
let mut hits = filter_local_hits_with_llm(
query,
intent,
search_response.hits,
original_top_score_normalized,
llm_filter_local_results,
llm_model,
llm_agent,
)
.await;
let mut top_score = hits.first().map(|hit| hit.score);
let mut top_score_normalized = top_score.map(search::normalize_score);
let mut local_match_ratio = local_match_ratio(query, &hits);
if matches!(intent, QueryIntent::Code) && local_match_ratio == Some(0.0) {
hits.clear();
top_score = None;
top_score_normalized = None;
local_match_ratio = Some(0.0);
}
(hits, top_score, top_score_normalized, local_match_ratio)
};
let completion = build_completion(query, &hits);
let web_limit = resolve_web_limit(web_limit, limit);
let web_discovery = if !gate.enabled {
let unavailable = Tier2Unavailable::new(
Tier2UnavailableReason::Disabled,
"web discovery is disabled",
)
.with_correlation_id(request_id);
WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Disabled,
reason: Some("disabled".to_string()),
message: Some(unavailable.message.clone()),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
}
} else if !gate.should_attempt(
top_score_normalized,
local_match_ratio,
force_web,
llm_filter_local_results,
) {
WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Skipped,
reason: Some("confidence_above_threshold".to_string()),
message: Some("web discovery skipped by confidence gate".to_string()),
unavailable: None,
discovery: None,
fetches: None,
debug: None,
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
}
} else {
run_web_discovery(
request_id,
gate,
query,
web_limit,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
disable_web_cache,
llm_model,
llm_agent,
)
.await
};
Ok(WebResearchResponse {
completion,
hits,
top_score,
top_score_camel: top_score,
top_score_normalized,
top_score_normalized_camel: top_score_normalized,
web_discovery,
})
}
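/// Filters local hits by query-token overlap and, when `use_llm` is set, by a
/// per-hit LLM relevance check. Code-intent queries additionally require code
/// markers and a match on a specific (non-generic) token; when every LLM call
/// fails or rejects everything, the pure overlap filter is used as fallback.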
pub(crate) async fn filter_local_hits_with_llm(
query: &str,
intent: QueryIntent,
hits: Vec<Hit>,
top_score_normalized: Option<f32>,
use_llm: bool,
llm_model: Option<&str>,
llm_agent: Option<&str>,
) -> Vec<Hit> {
if hits.is_empty() {
return hits;
}
if query.trim().is_empty() {
return hits;
}
let query_tokens = tokenize_terms_for_match(query);
if query_tokens.is_empty() {
if matches!(intent, QueryIntent::Code) {
return Vec::new();
}
return hits;
}
let query_len = query_tokens.len();
let min_required = min_required_matches(query_len);
let min_ratio = min_overlap_ratio_for_intent(intent, query_len);
let code_intent = matches!(intent, QueryIntent::Code);
let threshold = resolve_local_relevance_threshold();
let client = if use_llm {
load_local_relevance_client(llm_model, llm_agent)
} else {
None
};
if !code_intent {
if let (Some(score), None) = (top_score_normalized, client.as_ref()) {
if score >= threshold {
return hits;
}
}
}
    // Shared overlap/code gate: returns the (possibly penalty-adjusted) hit
    // when it passes, so the LLM path and both fallbacks filter identically.
    let filter_hit = |hit: &Hit| -> Option<Hit> {
        let (matched, ratio) = hit_match_stats(&query_tokens, query_len, hit)?;
        let overlap_ok = matched >= min_required && min_ratio.map_or(true, |min| ratio >= min);
        let has_code = hit_has_code_markers(hit);
        let specific_match = hit_matches_specific_token(&query_tokens, hit);
        if code_intent {
            if !overlap_ok || !has_code || !specific_match {
                return None;
            }
        } else if !overlap_ok {
            return None;
        }
        let mut adjusted = hit.clone();
        if code_intent {
            apply_code_intent_penalty(&mut adjusted, has_code);
        }
        Some(adjusted)
    };
    let sort_desc = |hits: &mut Vec<Hit>| {
        hits.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
    };
    let Some(client) = client else {
        let mut filtered: Vec<Hit> = hits.iter().filter_map(|hit| filter_hit(hit)).collect();
        sort_desc(&mut filtered);
        return filtered;
    };
    let all_hits = hits;
    let mut filtered = Vec::new();
    let mut llm_responses = 0usize;
    let mut llm_failures = 0usize;
    for hit in &all_hits {
        match client.evaluate(query, hit).await {
            Some(response) => {
                llm_responses += 1;
                if !response.relevant {
                    continue;
                }
                if let Some(adjusted) = filter_hit(hit) {
                    filtered.push(adjusted);
                }
            }
            None => {
                llm_failures += 1;
            }
        }
    }
    // The LLM never answered: fall back to the pure overlap filter.
    if llm_responses == 0 && llm_failures > 0 {
        let mut fallback: Vec<Hit> = all_hits.iter().filter_map(|hit| filter_hit(hit)).collect();
        sort_desc(&mut fallback);
        return fallback;
    }
    sort_desc(&mut filtered);
    if !filtered.is_empty() {
        return filtered;
    }
    // The LLM rejected everything: retry with the overlap filter alone.
    let mut fallback: Vec<Hit> = all_hits.iter().filter_map(|hit| filter_hit(hit)).collect();
    sort_desc(&mut fallback);
    if !fallback.is_empty() {
        return fallback;
    }
    if code_intent {
        return Vec::new();
    }
    all_hits
}
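/// Maps the gate configuration and current scores to a gate-only
/// `WebDiscoveryStatus` (disabled, skipped, or unavailable) without performing
/// discovery.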
pub(crate) fn evaluate_gate_status(
request_id: &str,
gate: &WebGateConfig,
top_score: Option<f32>,
top_score_normalized: Option<f32>,
local_match_ratio: Option<f32>,
force_web: bool,
use_match_ratio: bool,
) -> WebDiscoveryStatus {
let gate_meta = build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
);
if !gate.enabled {
let unavailable = Tier2Unavailable::new(
Tier2UnavailableReason::Disabled,
"web discovery is disabled",
)
.with_correlation_id(request_id);
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Disabled,
reason: Some("disabled".to_string()),
message: Some(unavailable.message.clone()),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: gate_meta,
};
}
if !gate.should_attempt(
top_score_normalized,
local_match_ratio,
force_web,
use_match_ratio,
) {
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Skipped,
reason: Some("confidence_above_threshold".to_string()),
message: Some("web discovery skipped by confidence gate".to_string()),
unavailable: None,
discovery: None,
fetches: None,
debug: None,
gate: gate_meta,
};
}
if !gate.browser_available {
let message = match gate.browser_hint.as_deref() {
Some(hint) => format!("web browser not available: {hint}; run `docdexd browser setup`"),
None => "web browser not available; run `docdexd browser setup`".to_string(),
};
let unavailable =
Tier2Unavailable::new(Tier2UnavailableReason::StartupFailed, message.clone())
.with_correlation_id(request_id);
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Unavailable,
reason: Some("missing_dependency".to_string()),
message: Some(message),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: gate_meta,
};
}
let unavailable = Tier2Unavailable::new(
Tier2UnavailableReason::StartupFailed,
"web discovery is not configured",
)
.with_correlation_id(request_id);
WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Unavailable,
reason: Some("not_configured".to_string()),
message: Some(unavailable.message.clone()),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: gate_meta,
}
}
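/// Performs web discovery for a query: serves from the exact-phrase cache when
/// possible, verifies the browser, classifies the query category, retries a
/// simplified query when results are empty, then fetches and evaluates the
/// discovered pages.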
async fn run_web_discovery(
request_id: &str,
gate: &WebGateConfig,
query: &str,
web_limit: usize,
top_score: Option<f32>,
top_score_normalized: Option<f32>,
local_match_ratio: Option<f32>,
force_web: bool,
disable_web_cache: bool,
llm_model: Option<&str>,
llm_agent: Option<&str>,
) -> WebDiscoveryStatus {
    let mut config = WebConfig::from_env();
let cache_enabled = !disable_web_cache && !config.cache_ttl.is_zero();
if !cache_enabled {
config.cache_ttl = Duration::ZERO;
}
let cache_key = WebCacheKey {
query,
web_limit,
force_web,
llm_model: normalize_cache_opt(llm_model),
llm_agent: normalize_cache_opt(llm_agent),
};
let query_hash = phrase_cache_hash(&cache_key);
if !config.enabled {
let unavailable = Tier2Unavailable::new(
Tier2UnavailableReason::Disabled,
"web discovery is disabled",
)
.with_correlation_id(request_id);
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Disabled,
reason: Some("disabled".to_string()),
message: Some(unavailable.message.clone()),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
};
}
if cache_enabled && !query.trim().is_empty() {
if let Some(layout) = cache::cache_layout_from_config() {
if let Some(entry) = read_phrase_cache(&layout, &query_hash, config.cache_ttl) {
if entry.url.trim().is_empty() {
// Ignore legacy phrase cache entries without a source URL.
} else {
let result = WebFetchResult {
url: entry.url.clone(),
status: None,
fetched_at_epoch_ms: Some(entry.fetched_at_epoch_ms),
cached: true,
content: None,
ai_digested_content: Some(entry.ai_digested_content),
ai_digested_kind: Some(entry.ai_digested_kind),
relevance_score: Some(entry.relevance_score.unwrap_or(1.0)),
debug_html: None,
debug_dom_text: None,
error: None,
debug: None,
};
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Served,
reason: Some("phrase_cache".to_string()),
message: Some("web discovery served from exact phrase cache".to_string()),
unavailable: None,
discovery: None,
fetches: Some(vec![result]),
debug: None,
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
};
}
}
}
}
if !gate.browser_available {
let message = match gate.browser_hint.as_deref() {
Some(hint) => format!("web browser not available: {hint}; run `docdexd browser setup`"),
None => "web browser not available; run `docdexd browser setup`".to_string(),
};
let unavailable =
Tier2Unavailable::new(Tier2UnavailableReason::StartupFailed, message.clone())
.with_correlation_id(request_id);
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Unavailable,
reason: Some("missing_dependency".to_string()),
message: Some(message),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
};
}
let discovery = match DdgDiscovery::new(config.clone()) {
Ok(discovery) => discovery,
Err(err) => {
let unavailable = Tier2Unavailable::new(
Tier2UnavailableReason::StartupFailed,
format!("web discovery init failed: {err}"),
)
.with_correlation_id(request_id);
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Unavailable,
reason: Some("discovery_init_failed".to_string()),
message: Some(unavailable.message.clone()),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
};
}
};
let debug_enabled = env_boolish("DOCDEX_WEB_DEBUG").unwrap_or(false);
let (query_category, category_source) =
classify_query_category(query, llm_model, llm_agent).await;
let mut discovery_limit = (web_limit * WEB_DISCOVERY_MULTIPLIER)
.max(web_limit)
.max(WEB_DISCOVERY_MIN_RESULTS);
discovery_limit = discovery_limit.min(config.max_results.max(web_limit));
match discovery.discover(query, discovery_limit).await {
Ok(response) => {
let mut debug = Vec::new();
if debug_enabled {
debug.push(format!(
"query_category: {} ({})",
query_category.as_str(),
category_source.as_str()
));
}
let (mut discovery_response, mut urls) =
normalize_discovery_response(response, &config, web_limit, query_category);
if urls.is_empty() {
debug.push(format!(
"discovery returned empty results for query: {}",
query
));
if let Some(fallback_query) = simplify_discovery_query(query) {
if !fallback_query.eq_ignore_ascii_case(query) {
match discovery.discover(&fallback_query, discovery_limit).await {
Ok(fallback_response) => {
let normalized = normalize_discovery_response(
fallback_response,
&config,
web_limit,
query_category,
);
discovery_response = normalized.0;
urls = normalized.1;
debug.push(format!("fallback discovery query: {}", fallback_query));
if urls.is_empty() {
debug.push(
"fallback discovery returned empty results".to_string(),
);
}
}
Err(err) => {
debug.push(format!("fallback discovery failed: {err}"));
}
}
}
}
}
if urls.is_empty() {
debug.push(format!(
"discovery query used: {}",
discovery_response.query
));
return WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Unavailable,
reason: Some("discovery_empty".to_string()),
message: Some("web discovery returned empty results".to_string()),
unavailable: None,
discovery: Some(discovery_response),
fetches: None,
debug: Some(debug),
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
};
}
discovery_response.results = urls
.iter()
.take(web_limit)
.map(|url| WebDiscoveryResult { url: url.clone() })
.collect();
let fetches = fetch_web_documents(
query,
&query_hash,
&urls,
&config,
web_limit,
query_category,
gate.trigger_threshold,
llm_model,
llm_agent,
)
.await;
            WebDiscoveryStatus {
                status: WebDiscoveryStatusCode::Served,
                reason: Some("discovery".to_string()),
                // Browser availability was already verified before discovery ran.
                message: None,
unavailable: None,
discovery: Some(discovery_response),
fetches: if fetches.is_empty() {
None
} else {
Some(fetches)
},
debug: if debug.is_empty() { None } else { Some(debug) },
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
}
}
Err(err) => {
let unavailable = Tier2Unavailable::new(
Tier2UnavailableReason::StartupFailed,
format!("web discovery failed: {err}"),
)
.with_correlation_id(request_id);
WebDiscoveryStatus {
status: WebDiscoveryStatusCode::Unavailable,
reason: Some("discovery_failed".to_string()),
message: Some(unavailable.message.clone()),
unavailable: Some(unavailable),
discovery: None,
fetches: None,
debug: None,
gate: build_gate_meta(
gate,
top_score,
top_score_normalized,
local_match_ratio,
force_web,
),
}
}
}
}
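/// Cleans raw discovery output: unwraps DDG redirect URLs, dedupes, applies
/// the blocklist and tracking filter, ranks URLs for the query category, and
/// caps results per domain.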
fn normalize_discovery_response(
response: WebDiscoveryResponse,
config: &WebConfig,
limit: usize,
query_category: QueryCategory,
) -> (WebDiscoveryResponse, Vec<String>) {
let mut urls = Vec::with_capacity(response.results.len());
for result in response.results {
let raw = result.url;
let unwrapped = unwrap_ddg_redirect(&raw).unwrap_or(raw);
urls.push(unwrapped);
}
let mut urls = dedupe_urls(urls);
urls.retain(|value| is_allowed_url(value, &config.blocklist));
urls.retain(|value| !is_tracking_url(value));
sort_urls_for_category(&mut urls, query_category);
let urls = enforce_domain_diversity(urls, WEB_MAX_RESULTS_PER_DOMAIN);
let results = urls
.iter()
.take(limit)
.map(|url| WebDiscoveryResult { url: url.clone() })
.collect();
(
WebDiscoveryResponse {
provider: response.provider,
query: response.query,
results,
},
urls,
)
}
fn simplify_discovery_query(query: &str) -> Option<String> {
let tokens = tokenize_terms(query);
if tokens.is_empty() {
return None;
}
let mut seen = HashSet::new();
let mut kept = Vec::new();
for token in tokens {
if seen.insert(token.clone()) {
kept.push(token);
}
if kept.len() >= WEB_DISCOVERY_MAX_QUERY_TOKENS {
break;
}
}
let simplified = kept.join(" ");
let simplified = simplified.trim();
if simplified.is_empty() {
return None;
}
if simplified.eq_ignore_ascii_case(query.trim()) {
return None;
}
Some(simplified.to_string())
}
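/// Keeps at most `max_per_domain` URLs per host, preserving input order; URLs
/// without a parseable host are dropped.
///
/// ```ignore
/// // Illustrative only: the third a.dev URL is trimmed.
/// let urls = vec![
///     "https://a.dev/1".to_string(),
///     "https://a.dev/2".to_string(),
///     "https://a.dev/3".to_string(),
///     "https://b.dev/1".to_string(),
/// ];
/// assert_eq!(enforce_domain_diversity(urls, 2).len(), 3);
/// ```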
fn enforce_domain_diversity(urls: Vec<String>, max_per_domain: usize) -> Vec<String> {
if max_per_domain == 0 {
return Vec::new();
}
let mut out = Vec::new();
    let mut counts = HashMap::new();
for url in urls {
let host = match url::Url::parse(&url) {
Ok(parsed) => parsed
.host_str()
.map(|value| value.trim().to_ascii_lowercase()),
Err(_) => None,
};
let Some(host) = host else {
continue;
};
if host.is_empty() {
continue;
}
let entry = counts.entry(host).or_insert(0usize);
if *entry >= max_per_domain {
continue;
}
*entry += 1;
out.push(url);
}
out
}
fn sort_urls_for_category(urls: &mut Vec<String>, category: QueryCategory) {
if urls.len() < 2 || matches!(category, QueryCategory::General) {
return;
}
let mut scored = Vec::with_capacity(urls.len());
for (idx, url) in urls.iter().enumerate() {
let score = score_url_for_category(url, category);
scored.push((score, idx, url.clone()));
}
scored.sort_by(|a, b| b.0.cmp(&a.0).then_with(|| a.1.cmp(&b.1)));
*urls = scored.into_iter().map(|(_, _, url)| url).collect();
}
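/// Heuristic category-fit score for a URL (higher ranks earlier); signals are
/// substring matches against the host and the full lowercased URL.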
fn score_url_for_category(url: &str, category: QueryCategory) -> i32 {
let url_lc = url.trim().to_ascii_lowercase();
let parsed = url::Url::parse(&url_lc).ok();
let host = parsed
.as_ref()
.and_then(|item| item.host_str())
.unwrap_or("");
let mut score = 0;
match category {
QueryCategory::CodeExample => {
if url_contains_any(
host,
&[
"github.com",
"gitlab.com",
"bitbucket.org",
"gist.github.com",
],
) {
score += 6;
}
if url_contains_any(
&url_lc,
&[
"example",
"examples",
"sample",
"snippet",
"code",
"repository",
"repo",
],
) {
score += 3;
}
if url_contains_any(&url_lc, &["docs", "reference"]) {
score -= 1;
}
if url_contains_any(&url_lc, &["blog", "news"]) {
score -= 1;
}
}
QueryCategory::ApiReference => {
if url_contains_any(host, &["docs.", "developer.", "api."]) {
score += 3;
}
if url_contains_any(&url_lc, &["/docs", "/reference", "/api", "sdk"]) {
score += 2;
}
if url_contains_any(host, &["github.com", "gitlab.com"]) {
score -= 1;
}
}
QueryCategory::HowToGuide => {
if url_contains_any(
&url_lc,
&[
"how-to",
"howto",
"tutorial",
"guide",
"walkthrough",
"getting-started",
],
) {
score += 3;
}
if url_contains_any(&url_lc, &["blog", "learn", "academy"]) {
score += 1;
}
}
QueryCategory::ConceptDefinition => {
if url_contains_any(
&url_lc,
&["overview", "introduction", "what-is", "concept", "glossary"],
) {
score += 3;
}
if url_contains_any(&url_lc, &["docs", "reference"]) {
score += 1;
}
}
QueryCategory::Troubleshooting => {
if url_contains_any(
host,
&["stackoverflow.com", "serverfault.com", "superuser.com"],
) {
score += 4;
}
if url_contains_any(&url_lc, &["issue", "issues", "error", "fix", "debug"]) {
score += 2;
}
if url_contains_any(&url_lc, &["github.com/issues", "gitlab.com/issues"]) {
score += 2;
}
}
QueryCategory::SpecStandard => {
if url_contains_any(
host,
&["eips.ethereum.org", "rfc-editor.org", "ietf.org", "w3.org"],
) {
score += 5;
}
if url_contains_any(&url_lc, &["spec", "standard", "eip-", "erc-", "rfc"]) {
score += 2;
}
}
QueryCategory::ComparisonOpinion => {
if url_contains_any(
&url_lc,
&[
"compare",
"comparison",
"-vs-",
"/vs/",
"versus",
"best",
"top",
],
) {
score += 3;
}
if url_contains_any(&url_lc, &["review", "pros", "cons", "alternatives"]) {
score += 2;
}
}
QueryCategory::NewsRelease => {
if url_contains_any(
&url_lc,
&[
"blog",
"news",
"release",
"changelog",
"announcement",
"press",
],
) {
score += 3;
}
if url_contains_any(&url_lc, &["/blog", "/news"]) {
score += 1;
}
}
QueryCategory::General => {}
}
score
}
fn is_allowed_url(raw: &str, blocklist: &[String]) -> bool {
let url = match url::Url::parse(raw) {
Ok(url) => url,
Err(_) => return false,
};
let host = match url.host_str() {
Some(host) => host.trim().to_ascii_lowercase(),
None => return false,
};
if host.is_empty() {
return false;
}
for entry in blocklist {
let trimmed = entry.trim().trim_start_matches('.').to_ascii_lowercase();
if trimmed.is_empty() {
continue;
}
if host == trimmed || host.ends_with(&format!(".{trimmed}")) {
return false;
}
}
true
}
fn url_contains_any(haystack: &str, needles: &[&str]) -> bool {
needles.iter().any(|needle| haystack.contains(needle))
}
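/// Flags likely tracker/redirector URLs: anything unparseable or hostless, or
/// whose query string smuggles another absolute URL.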
fn is_tracking_url(raw: &str) -> bool {
let url = match url::Url::parse(raw) {
Ok(url) => url,
Err(_) => return true,
};
let host = url.host_str().unwrap_or("").trim().to_ascii_lowercase();
if host.is_empty() {
return true;
}
if let Some(query) = url.query() {
if query.len() > 4 {
for (_, value) in url.query_pairs() {
let val = value.trim();
if val.is_empty() {
continue;
}
let val_lc = val.to_ascii_lowercase();
if val_lc.starts_with("http://")
|| val_lc.starts_with("https://")
|| val_lc.contains("http%3a")
|| val_lc.contains("https%3a")
{
return true;
}
}
}
}
false
}
fn domain_quality_key(host: &str) -> String {
format!("quality:{host}")
}
fn read_domain_quality(layout: &StateLayout, host: &str) -> Option<DomainQualityEntry> {
let key = domain_quality_key(host);
let ttl = Duration::from_secs(WEB_QUALITY_TTL_SECS);
let payload = cache::read_cache_entry_with_ttl(layout, &key, ttl).ok()??;
serde_json::from_slice(&payload).ok()
}
fn write_domain_quality(layout: &StateLayout, entry: &DomainQualityEntry) {
if let Ok(payload) = serde_json::to_vec(entry) {
let _ = cache::write_cache_entry(layout, &domain_quality_key(&entry.host), &payload);
}
}
fn domain_in_cooldown(layout: &StateLayout, host: &str, now_ms: u64) -> Option<u64> {
let entry = read_domain_quality(layout, host)?;
if entry.cooldown_until_epoch_ms > now_ms {
Some(entry.cooldown_until_epoch_ms)
} else {
None
}
}
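/// Records a failure of the given kind for a host and starts a cooldown once
/// the corresponding threshold is crossed.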
fn record_domain_failure(
layout: Option<&StateLayout>,
host: &str,
kind: DomainFailureKind,
now_ms: u64,
) {
let layout = match layout {
Some(layout) => layout,
None => return,
};
let mut entry = read_domain_quality(layout, host).unwrap_or(DomainQualityEntry {
host: host.to_string(),
fail_count: 0,
blocked_count: 0,
challenge_count: 0,
last_failure_epoch_ms: now_ms,
cooldown_until_epoch_ms: 0,
});
match kind {
DomainFailureKind::Fetch => {
entry.fail_count = entry.fail_count.saturating_add(1);
}
DomainFailureKind::Blocked => {
entry.blocked_count = entry.blocked_count.saturating_add(1);
}
DomainFailureKind::Challenge => {
entry.challenge_count = entry.challenge_count.saturating_add(1);
}
}
entry.last_failure_epoch_ms = now_ms;
if entry.fail_count >= WEB_QUALITY_FAIL_THRESHOLD
|| entry.blocked_count >= WEB_QUALITY_BLOCK_THRESHOLD
|| entry.challenge_count >= WEB_QUALITY_CHALLENGE_THRESHOLD
{
entry.cooldown_until_epoch_ms = now_ms.saturating_add(WEB_QUALITY_COOLDOWN_SECS * 1000);
}
write_domain_quality(layout, &entry);
}
fn record_domain_success(layout: Option<&StateLayout>, host: &str) {
let layout = match layout {
Some(layout) => layout,
None => return,
};
let mut entry = read_domain_quality(layout, host).unwrap_or(DomainQualityEntry {
host: host.to_string(),
fail_count: 0,
blocked_count: 0,
challenge_count: 0,
last_failure_epoch_ms: 0,
cooldown_until_epoch_ms: 0,
});
entry.fail_count = 0;
entry.blocked_count = 0;
entry.challenge_count = 0;
entry.cooldown_until_epoch_ms = 0;
write_domain_quality(layout, &entry);
}
fn classify_status_failure(status: Option<u16>) -> Option<DomainFailureKind> {
match status {
Some(401 | 403 | 429 | 451) => Some(DomainFailureKind::Blocked),
Some(code) if code >= 500 => Some(DomainFailureKind::Fetch),
Some(404 | 408) => Some(DomainFailureKind::Fetch),
Some(code) if code >= 400 => Some(DomainFailureKind::Blocked),
_ => None,
}
}
fn hash_text(text: &str) -> String {
let mut hasher = Sha256::new();
hasher.update(text.as_bytes());
hex::encode(hasher.finalize())
}
#[derive(Serialize)]
struct WebCacheKey<'a> {
query: &'a str,
web_limit: usize,
force_web: bool,
llm_model: Option<&'a str>,
llm_agent: Option<&'a str>,
}
fn normalize_cache_opt(value: Option<&str>) -> Option<&str> {
value.and_then(|raw| {
let trimmed = raw.trim();
if trimmed.is_empty() {
None
} else {
Some(trimmed)
}
})
}
fn phrase_cache_key(query_hash: &str) -> String {
format!("phrase:{query_hash}")
}
fn phrase_cache_hash(key: &WebCacheKey<'_>) -> String {
let payload = serde_json::to_string(key).unwrap_or_default();
hash_text(&payload)
}
fn summary_cache_key(query_hash: &str, content_hash: &str) -> String {
format!("summary:{query_hash}:{content_hash}")
}
fn summary_cache_entry(
query_hash: &str,
content_text: &str,
code_blocks: &[String],
) -> (String, String) {
let mut content_input = String::new();
content_input.push_str(content_text);
if !code_blocks.is_empty() {
content_input.push_str("\n\n");
content_input.push_str(&code_blocks.join("\n\n"));
}
let content_hash = hash_text(&content_input);
(query_hash.to_string(), content_hash)
}
fn read_phrase_cache(
layout: &StateLayout,
query_hash: &str,
ttl: Duration,
) -> Option<WebPhraseCacheEntry> {
let key = phrase_cache_key(query_hash);
let payload = cache::read_cache_entry_with_ttl(layout, &key, ttl).ok()??;
serde_json::from_slice::<WebPhraseCacheEntry>(&payload).ok()
}
fn write_phrase_cache(layout: &StateLayout, query_hash: &str, entry: &WebPhraseCacheEntry) {
if let Ok(payload) = serde_json::to_vec(entry) {
let _ = cache::write_cache_entry(layout, &phrase_cache_key(query_hash), &payload);
}
}
fn read_summary_cache(
layout: &StateLayout,
query_hash: &str,
content_hash: &str,
ttl: Duration,
) -> Option<WebEvalOutput> {
if ttl.is_zero() {
return None;
}
let key = summary_cache_key(query_hash, content_hash);
let payload = cache::read_cache_entry_with_ttl(layout, &key, ttl).ok()??;
let entry: WebSummaryCacheEntry = serde_json::from_slice(&payload).ok()?;
if entry.query_hash != query_hash || entry.content_hash != content_hash {
return None;
}
if entry.output.trim().is_empty() {
return None;
}
Some(WebEvalOutput {
relevance_score: entry.relevance_score.clamp(0.0, 1.0),
kind: entry.kind,
output: entry.output,
})
}
fn write_summary_cache(
layout: &StateLayout,
query_hash: &str,
content_hash: &str,
evaluation: &WebEvalOutput,
) {
let entry = WebSummaryCacheEntry {
query_hash: query_hash.to_string(),
content_hash: content_hash.to_string(),
relevance_score: evaluation.relevance_score.clamp(0.0, 1.0),
kind: evaluation.kind.clone(),
output: evaluation.output.clone(),
};
if let Ok(payload) = serde_json::to_vec(&entry) {
let _ = cache::write_cache_entry(
layout,
&summary_cache_key(query_hash, content_hash),
&payload,
);
}
}
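/// Fetches and evaluates candidate URLs in batches of `WEB_BATCH_SIZE`
/// (at most `WEB_MAX_BATCHES` batches). Each URL goes through cache lookup,
/// domain-cooldown checks, a status preflight, DOM extraction, boilerplate
/// and JS-challenge filtering, and LLM (or heuristic) summarization. The loop
/// stops early once enough good results or an early-stop score is reached;
/// results are then sorted by relevance, the best one may be written to the
/// phrase cache, and the list is truncated to the desired count.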
async fn fetch_web_documents(
query: &str,
query_hash: &str,
urls: &[String],
config: &WebConfig,
target_count: usize,
query_category: QueryCategory,
early_stop_score: f32,
llm_model: Option<&str>,
llm_agent: Option<&str>,
) -> Vec<WebFetchResult> {
if urls.is_empty() {
return Vec::new();
}
let desired_count = target_count.max(1);
let layout = cache::cache_layout_from_config();
let summary_client = load_web_summary_client(llm_model, llm_agent);
let debug_enabled = env_boolish("DOCDEX_WEB_DEBUG").unwrap_or(false);
let early_stop_score = early_stop_score.clamp(0.0, 1.0);
let scraper = match ScraperEngine::from_web_config(config) {
Ok(scraper) => scraper,
Err(err) => {
return vec![WebFetchResult {
url: String::new(),
status: None,
fetched_at_epoch_ms: None,
cached: false,
content: None,
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: None,
debug_html: None,
debug_dom_text: None,
error: Some(err.to_string()),
debug: None,
}];
}
};
let boilerplate_phrases = &config.boilerplate_phrases;
let mut all_results = Vec::new();
let mut good_count = 0usize;
let mut last_good: Option<WebFetchResult> = None;
'batch_loop: for batch in urls.chunks(WEB_BATCH_SIZE).take(WEB_MAX_BATCHES) {
let mut batch_results = Vec::new();
let mut early_stop_now = false;
macro_rules! push_result {
($result:expr) => {{
batch_results.push($result);
if let Some(item) = batch_results.last() {
if early_stop_now {
all_results.clear();
all_results.push(item.clone());
break 'batch_loop;
}
if item.relevance_score.unwrap_or(0.0) >= WEB_GOOD_RELEVANCE_SCORE {
good_count += 1;
last_good = Some(item.clone());
}
}
if good_count >= desired_count {
if desired_count == 1 {
all_results.clear();
if let Some(best) = last_good.take() {
all_results.push(best);
}
} else {
all_results.extend(batch_results);
}
break 'batch_loop;
}
}};
}
for raw in batch {
let url = match url::Url::parse(raw) {
Ok(url) => url,
Err(_) => continue,
};
if debug_enabled {
info!("web fetch start url={}", url.as_str());
}
let host = match url.host_str() {
Some(host) => host.trim().to_ascii_lowercase(),
None => continue,
};
let cache_key = url.as_str();
let mut cached = false;
let mut fetched_at_epoch_ms = None;
let mut status: Option<u16> = None;
let mut content: Option<String> = None;
let mut content_error: Option<String> = None;
let mut skip_summary = false;
let mut code_blocks: Vec<String> = Vec::new();
let mut quality_scale = 1.0f32;
let mut debug_notes: Vec<String> = Vec::new();
let mut debug_html: Option<String> = None;
let mut debug_dom_text: Option<String> = None;
let intent = resolve_query_intent(query, query_category);
if let Some(layout) = layout.as_ref() {
if let Ok(Some(payload)) =
cache::read_cache_entry_with_ttl(layout, cache_key, config.cache_ttl)
{
if let Ok(entry) = serde_json::from_slice::<WebFetchCacheEntry>(&payload) {
cached = true;
fetched_at_epoch_ms = Some(entry.fetched_at_epoch_ms);
status = entry.status;
content = Some(normalize_text_spacing(&entry.content));
code_blocks = entry.code_blocks;
tighten_code_blocks_for_category(query_category, &mut code_blocks);
if debug_enabled {
info!("web fetch cache hit url={}", url.as_str());
}
}
}
}
if content.is_none() {
let now_ms = now_epoch_ms_u64();
if let Some(layout) = layout.as_ref() {
if let Some(until_ms) = domain_in_cooldown(layout, &host, now_ms) {
if debug_enabled {
info!(
"web fetch skipped cooldown url={} until={}",
url.as_str(),
until_ms
);
}
push_result!(WebFetchResult {
url: url.to_string(),
status: None,
fetched_at_epoch_ms: None,
cached: false,
content: None,
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: None,
debug_html: debug_html.clone(),
debug_dom_text: debug_dom_text.clone(),
error: Some(format!(
"web fetch skipped for host cooldown until {until_ms}"
)),
debug: None,
});
continue;
}
}
crate::web::fetch::enforce_domain_delay(&url, config.fetch_delay).await;
fetched_at_epoch_ms = Some(now_epoch_ms());
let status_probe =
fetch_status(&url, &config.user_agent, config.request_timeout).await;
if should_skip_status(status_probe) {
if debug_enabled {
info!(
"web fetch skipped preflight url={} status={:?}",
url.as_str(),
status_probe
);
}
push_result!(WebFetchResult {
url: url.to_string(),
status: status_probe,
fetched_at_epoch_ms,
cached: false,
content: None,
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: None,
debug_html: debug_html.clone(),
debug_dom_text: debug_dom_text.clone(),
error: Some("web fetch skipped due to preflight status".to_string()),
debug: None,
});
continue;
}
match scraper.fetch_dom(&url).await {
Ok(fetch_result) => {
status = fetch_result.status.or(status_probe);
let html = fetch_result.html;
let readable_opt = extract_readable_text(&html, &url);
if debug_enabled {
info!("web fetch dom ok url={} status={:?}", url.as_str(), status);
}
if debug_enabled {
debug_html = Some(truncate_debug_html(&html));
if let Some(readable) = readable_opt.as_ref() {
debug_dom_text = Some(truncate_debug_text(readable));
}
}
if debug_enabled {
if let Some(final_url) = fetch_result.final_url.as_ref() {
if final_url == "about:blank" {
debug_notes.push(
"browser navigation stayed on about:blank".to_string(),
);
}
} else {
debug_notes.push("browser final_url missing".to_string());
}
}
code_blocks = extract_code_blocks(&html);
let ad_markers = count_ad_markers(&html);
let formatted_html = format_html_text(&html);
if debug_enabled {
if readable_opt.is_some() {
debug_notes.push("used readability content".to_string());
} else if !formatted_html.trim().is_empty() {
debug_notes.push("readability failed; using html2text".to_string());
} else {
debug_notes
.push("readability failed; using html tag strip".to_string());
}
}
let mut readable = if let Some(readable) = readable_opt {
readable
} else if !formatted_html.trim().is_empty() {
formatted_html
} else {
clean_web_text(&html)
};
readable = normalize_text_spacing(&readable);
if is_js_challenge(&html, &readable) {
record_domain_failure(
layout.as_ref(),
&host,
DomainFailureKind::Challenge,
now_ms,
);
if debug_enabled {
debug_notes.push(
"js challenge detected (multiple signals + short text)"
.to_string(),
);
info!("web fetch blocked by js challenge url={}", url.as_str());
}
push_result!(WebFetchResult {
url: url.to_string(),
status,
fetched_at_epoch_ms,
cached: false,
content: None,
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: None,
debug_html: debug_html.clone(),
debug_dom_text: debug_dom_text.clone(),
error: Some("web fetch blocked by JS challenge".to_string()),
debug: if debug_enabled && !debug_notes.is_empty() {
Some(debug_notes.clone())
} else {
None
},
});
continue;
}
let banner_result = strip_banner_lines(&readable);
if banner_result.removed_lines > 0 && debug_enabled {
debug_notes.push(format!(
"banner lines removed: {}/{}",
banner_result.removed_lines, banner_result.total_lines
));
}
let banner_only = is_banner_only(&banner_result);
let mut readable = banner_result.filtered;
if banner_only {
if debug_enabled {
debug_notes.push("content appears to be banner-only".to_string());
}
content_error = Some("banner-only".to_string());
content = None;
} else {
if debug_enabled && readable.trim().is_empty() {
debug_notes
.push("text extraction empty after fallback".to_string());
}
let boiler_ratio = boilerplate_ratio(&readable, boilerplate_phrases);
let penalty = quality_penalty(boiler_ratio, ad_markers);
if penalty == 0.0 {
if debug_enabled {
info!("web fetch skipped boilerplate url={}", url.as_str());
}
push_result!(WebFetchResult {
url: url.to_string(),
status: status_probe,
fetched_at_epoch_ms,
cached: false,
content: None,
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: None,
debug_html: debug_html.clone(),
debug_dom_text: debug_dom_text.clone(),
error: Some(
"web fetch skipped due to boilerplate noise".to_string()
),
debug: None,
});
continue;
}
quality_scale = penalty;
let filtered =
filter_boilerplate_text(query, &readable, boilerplate_phrases);
readable = if filtered.trim().is_empty() {
readable
} else {
filtered
};
if matches!(intent, QueryIntent::Code) && code_blocks.is_empty() {
let fallback_blocks = extract_probable_code_blocks(&readable);
if !fallback_blocks.is_empty() {
if debug_enabled {
debug_notes.push(format!(
"code fallback extracted {} block(s)",
fallback_blocks.len()
));
}
code_blocks = fallback_blocks;
}
}
tighten_code_blocks_for_category(query_category, &mut code_blocks);
let trimmed = truncate_content_output(&readable);
if trimmed.trim().is_empty() {
if debug_enabled {
debug_notes
.push("extracted text empty after cleanup".to_string());
}
content = None;
} else {
let char_count = trimmed.chars().count();
let word_count = trimmed.split_whitespace().count();
let allow_short =
matches!(intent, QueryIntent::Code) && !code_blocks.is_empty();
if !allow_short
&& char_count < WEB_MIN_CONTENT_CHARS
&& word_count < WEB_MIN_CONTENT_WORDS
{
skip_summary = true;
content_error = Some("low_content".to_string());
if debug_enabled {
debug_notes.push(format!(
"low content: {char_count} chars, {word_count} words"
));
}
}
content = Some(trimmed);
}
}
}
Err(err) => {
let failure_kind = classify_status_failure(status_probe)
.unwrap_or(DomainFailureKind::Fetch);
record_domain_failure(layout.as_ref(), &host, failure_kind, now_ms);
if debug_enabled {
info!(
"web fetch failed url={} status={:?} err={}",
url.as_str(),
status_probe,
err
);
}
push_result!(WebFetchResult {
url: url.to_string(),
status: status_probe,
fetched_at_epoch_ms,
cached: false,
content: None,
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: None,
debug_html: debug_html.clone(),
debug_dom_text: debug_dom_text.clone(),
error: Some(format!("web fetch failed: {err}")),
debug: None,
});
continue;
}
};
}
let Some(content_text) = content.as_ref() else {
let empty_error = content_error
.clone()
.unwrap_or_else(|| "content empty".to_string());
push_result!(WebFetchResult {
url: url.to_string(),
status,
fetched_at_epoch_ms,
cached,
content: Some(String::new()),
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: Some(0.0),
debug_html: debug_html.clone(),
debug_dom_text: debug_dom_text.clone(),
error: Some(empty_error),
debug: if debug_enabled && !debug_notes.is_empty() {
Some(debug_notes.clone())
} else {
None
},
});
continue;
};
if !skip_summary {
let char_count = content_text.chars().count();
let word_count = content_text.split_whitespace().count();
let allow_short = matches!(intent, QueryIntent::Code) && !code_blocks.is_empty();
if !allow_short
&& char_count < WEB_MIN_CONTENT_CHARS
&& word_count < WEB_MIN_CONTENT_WORDS
{
skip_summary = true;
content_error = Some("low_content".to_string());
if debug_enabled {
debug_notes.push(format!(
"low content (post-cache): {char_count} chars, {word_count} words"
));
}
}
}
if skip_summary {
let error = content_error
.clone()
.unwrap_or_else(|| "low_content".to_string());
push_result!(WebFetchResult {
url: url.to_string(),
status,
fetched_at_epoch_ms,
cached,
content: Some(content_text.clone()),
ai_digested_content: None,
ai_digested_kind: None,
relevance_score: Some(0.0),
debug_html: debug_html.clone(),
debug_dom_text: debug_dom_text.clone(),
error: Some(error),
debug: if debug_enabled && !debug_notes.is_empty() {
Some(debug_notes.clone())
} else {
None
},
});
continue;
}
let allow_code_summary = matches!(intent, QueryIntent::Code)
|| matches!(query_category, QueryCategory::CodeExample);
let summary_blocks: Vec<String> = if allow_code_summary {
code_blocks.clone()
} else {
Vec::new()
};
let (summary_query_hash, content_hash) =
summary_cache_entry(query_hash, content_text, &summary_blocks);
let cached_summary = layout.as_ref().and_then(|layout| {
read_summary_cache(layout, &summary_query_hash, &content_hash, config.cache_ttl)
});
let used_cached_summary = cached_summary.is_some();
let llm_available = summary_client.is_some();
let evaluation = if let Some(summary) = cached_summary {
Some(summary)
} else if let Some(summary_client) = summary_client.as_ref() {
summary_client
.evaluate(query, query_category, content_text, &summary_blocks)
.await
} else {
None
};
let evaluation = match evaluation {
Some(value) => value,
None => {
let wants_code = matches!(intent, QueryIntent::Code)
|| matches!(query_category, QueryCategory::CodeExample);
if wants_code && !code_blocks.is_empty() {
let selected = select_best_code_block(query, &code_blocks)
.unwrap_or_else(|| code_blocks.join("\n\n"));
WebEvalOutput {
relevance_score: 0.5,
kind: "code".to_string(),
output: selected,
}
} else {
WebEvalOutput {
relevance_score: 0.0,
kind: "summary".to_string(),
output: if llm_available {
String::new()
} else {
clean_summary_text(content_text)
},
}
}
}
};
let formatted_output = format_md_output(&evaluation.kind, &evaluation.output);
let mut ai_kind = evaluation.kind.clone();
let mut summary_error = None;
let mut ai_digested_content = if formatted_output.trim().is_empty() {
None
} else {
Some(formatted_output)
};
if ai_kind == "code" {
if let Some(output) = ai_digested_content.as_ref() {
if !looks_like_code_output(output) {
ai_digested_content = None;
}
} else {
ai_digested_content = None;
}
}
if ai_digested_content.is_none() {
if matches!(intent, QueryIntent::Code) && !code_blocks.is_empty() {
let selected = select_best_code_block(query, &code_blocks)
.unwrap_or_else(|| code_blocks.join("\n\n"));
let fallback_code = format_md_output("code", &selected);
if !fallback_code.trim().is_empty() {
ai_kind = "code".to_string();
ai_digested_content = Some(fallback_code);
}
}
}
if ai_digested_content.is_none() {
let fallback = clean_summary_text(content_text);
if !fallback.trim().is_empty() {
ai_kind = "summary".to_string();
ai_digested_content = Some(fallback);
} else {
summary_error = Some("summary empty".to_string());
}
}
let ai_digested_kind = ai_digested_content.as_ref().map(|_| ai_kind.clone());
let debug = if debug_enabled && !debug_notes.is_empty() {
Some(debug_notes.clone())
} else {
None
};
if !cached {
if let Some(layout) = layout.as_ref() {
if config.cache_ttl.as_secs() > 0 {
if let Some(fetched_at_epoch_ms) = fetched_at_epoch_ms {
let entry = WebFetchCacheEntry {
url: url.to_string(),
status,
fetched_at_epoch_ms,
content: content_text.clone(),
code_blocks: code_blocks.clone(),
};
if let Ok(payload) = serde_json::to_vec(&entry) {
let _ = cache::write_cache_entry(layout, cache_key, &payload);
}
}
}
}
}
if !used_cached_summary {
if let Some(layout) = layout.as_ref() {
if config.cache_ttl.as_secs() > 0 {
write_summary_cache(
layout,
&summary_query_hash,
&content_hash,
&evaluation,
);
}
}
}
let match_stats = web_match_stats(query, content_text, &code_blocks);
let mut relevance_score =
(blend_relevance_score(evaluation.relevance_score, &match_stats) * quality_scale)
.clamp(0.0, 1.0);
if matches!(intent, QueryIntent::Code) {
let code_score = code_block_score(&code_blocks);
if code_score > 0.0 {
relevance_score = (relevance_score + (0.15 * code_score)).clamp(0.0, 1.0);
if ai_kind == "code" {
relevance_score = (relevance_score + 0.05).clamp(0.0, 1.0);
}
} else if ai_kind != "code" {
relevance_score = (relevance_score * 0.8).clamp(0.0, 1.0);
}
} else if matches!(intent, QueryIntent::Definition) {
if ai_kind == "summary" {
relevance_score = (relevance_score + 0.05).clamp(0.0, 1.0);
} else {
relevance_score = (relevance_score * 0.8).clamp(0.0, 1.0);
}
} else if ai_kind == "code" {
relevance_score = (relevance_score * 0.9).clamp(0.0, 1.0);
}
let category_multiplier =
category_relevance_multiplier(query_category, &url, &code_blocks, &ai_kind);
relevance_score = (relevance_score * category_multiplier).clamp(0.0, 1.0);
let has_content = ai_digested_content
.as_ref()
.map(|value| !value.trim().is_empty())
.unwrap_or(false)
|| !content_text.trim().is_empty();
early_stop_now = should_stop_early(
desired_count,
relevance_score,
&match_stats,
query_category,
&ai_kind,
has_content,
early_stop_score,
);
if debug_enabled && early_stop_now {
info!(
"web fetch early stop url={} score={:.3}",
url.as_str(),
relevance_score
);
}
record_domain_success(layout.as_ref(), &host);
push_result!(WebFetchResult {
url: url.to_string(),
status,
fetched_at_epoch_ms,
cached,
content: Some(content_text.clone()),
ai_digested_content,
ai_digested_kind,
relevance_score: Some(relevance_score),
debug_html,
debug_dom_text,
error: summary_error,
debug,
});
}
if !batch_results.is_empty() {
all_results.extend(batch_results);
if good_count >= desired_count {
break;
}
}
}
if all_results.is_empty() {
return Vec::new();
}
all_results.sort_by(|a, b| {
b.relevance_score
.unwrap_or(0.0)
.partial_cmp(&a.relevance_score.unwrap_or(0.0))
.unwrap_or(std::cmp::Ordering::Equal)
});
if let Some(best) = all_results.first() {
if !config.cache_ttl.is_zero() && !query.trim().is_empty() {
if let (Some(content), Some(kind)) = (
best.ai_digested_content.as_ref(),
best.ai_digested_kind.as_ref(),
) {
if let Some(layout) = cache::cache_layout_from_config() {
let fetched_at_epoch_ms = best.fetched_at_epoch_ms.unwrap_or_else(now_epoch_ms);
let entry = WebPhraseCacheEntry {
query_hash: query_hash.to_string(),
fetched_at_epoch_ms,
ai_digested_kind: kind.clone(),
ai_digested_content: content.clone(),
url: best.url.clone(),
relevance_score: best.relevance_score,
};
write_phrase_cache(&layout, query_hash, &entry);
}
}
}
if best.relevance_score.unwrap_or(0.0) >= early_stop_score {
all_results.truncate(1);
return all_results;
}
}
if all_results.len() > desired_count {
all_results.truncate(desired_count);
}
all_results
}
fn clean_web_text(html: &str) -> String {
let cleaned = format_html_text(html);
normalize_text_spacing(&cleaned)
.lines()
.map(|line| line.trim())
.filter(|line| !line.is_empty())
.collect::<Vec<_>>()
.join("\n")
}
fn format_html_text(html: &str) -> String {
let cleaned = ammonia::clean(html);
if cleaned.trim().is_empty() {
return String::new();
}
html2text::from_read(cleaned.as_bytes(), WEB_HTML2TEXT_WRAP_COLS).unwrap_or_default()
}
fn truncate_debug_html(html: &str) -> String {
let limit = env_usize("DOCDEX_WEB_DEBUG_HTML_MAX_CHARS").unwrap_or(0);
if limit == 0 {
return html.to_string();
}
let (snippet, _) = truncate_utf8_chars(html, limit.max(1));
snippet
}
fn truncate_debug_text(text: &str) -> String {
let limit = env_usize("DOCDEX_WEB_DEBUG_TEXT_MAX_CHARS").unwrap_or(0);
if limit == 0 {
return text.to_string();
}
let (snippet, _) = truncate_utf8_chars(text, limit.max(1));
snippet
}
fn truncate_summary_input(text: &str) -> String {
let limit = env_usize("DOCDEX_WEB_SUMMARY_INPUT_MAX_CHARS")
.unwrap_or(DEFAULT_WEB_SUMMARY_INPUT_MAX_CHARS);
if limit == 0 {
return text.to_string();
}
let (snippet, _) = truncate_utf8_chars(text, limit.max(1));
snippet
}
fn truncate_content_output(text: &str) -> String {
let limit = env_usize("DOCDEX_WEB_CONTENT_MAX_CHARS").unwrap_or(0);
if limit == 0 {
return text.to_string();
}
let (snippet, _) = truncate_utf8_chars(text, limit.max(1));
snippet
}
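/// Repairs extraction artifacts line by line: strips ANSI escapes and
/// zero-width characters, then applies a pipeline of join/split regexes to
/// re-separate words that HTML-to-text conversion fused together. Lines that
/// pass `is_strict_code_line` bypass the regex pipeline and are kept as-is.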
fn normalize_text_spacing(text: &str) -> String {
let mut lines = Vec::new();
for line in text.lines() {
let stripped = ANSI_ESCAPE_RE.replace_all(line, "");
let cleaned = strip_invisible_chars(stripped.as_ref());
let trimmed = cleaned.trim();
if trimmed.is_empty() {
lines.push(String::new());
continue;
}
let mut updated = trimmed.to_string();
updated = TAG_ATTR_JOIN_RE
.replace_all(&updated, "$1 $2$3")
.to_string();
updated = HEADING_JOIN_RE.replace_all(&updated, "$1\n#$2").to_string();
updated = COLON_JOIN_RE.replace_all(&updated, "$1 $2").to_string();
updated = COMMA_JOIN_RE.replace_all(&updated, "$1 $2").to_string();
updated = LABEL_JOIN_RE.replace_all(&updated, "$1 $2").to_string();
updated = TLD_JOIN_RE.replace_all(&updated, ".$1 $2").to_string();
updated = BRACKET_LEFT_JOIN_RE
.replace_all(&updated, "$1 [")
.to_string();
updated = BRACKET_RIGHT_JOIN_RE
.replace_all(&updated, "] $1")
.to_string();
updated = LOWER_JOIN_STOPWORD_RE
.replace_all(&updated, "$1 $2 $3")
.to_string();
updated = TITLE_AND_JOIN_RE
.replace_all(&updated, "$1 and$2")
.to_string();
updated = AND_JOIN_RE.replace_all(&updated, "$1 and $2").to_string();
updated = AND_LOWER_JOIN_RE
.replace_all(&updated, "$1 and $2")
.to_string();
updated = PREFIX_COMMON_JOIN_RE
.replace_all(&updated, "$1 $2")
.to_string();
updated = LONG_JOIN_RE.replace_all(&updated, "$1 $2 $3").to_string();
updated = LOWER_UPPER_JOIN_RE
.replace_all(&updated, "$1 $2")
.to_string();
updated = CAPITAL_JOIN_RE.replace_all(&updated, "$1 $2").to_string();
updated = WORD_METHOD_JOIN_RE
.replace_all(&updated, "$1 $2")
.to_string();
updated = PAREN_WORD_JOIN_RE
.replace_all(&updated, "$1 $2")
.to_string();
updated = PUNCT_JOIN_RE.replace_all(&updated, "$1 $2").to_string();
updated = ALLCAPS_JOIN_RE.replace_all(&updated, "$1 $2").to_string();
updated = CAMEL_BREAK_RE.replace_all(&updated, "$1 $2").to_string();
if looks_codeish(&updated) {
updated = CODE_KEYWORD_JOIN_RE
.replace_all(&updated, "$1 $2")
.to_string();
}
let normalized = updated.split_whitespace().collect::<Vec<_>>().join(" ");
if is_strict_code_line(trimmed) {
lines.push(trimmed.to_string());
} else {
let fixed = if looks_codeish(&updated) {
normalized
} else {
rejoin_split_words(&normalized)
};
lines.push(fixed);
}
}
lines.join("\n")
}
fn strip_invisible_chars(text: &str) -> String {
text.replace('\u{200B}', "")
.replace('\u{200C}', "")
.replace('\u{200D}', "")
.replace('\u{FEFF}', "")
.replace('\u{00AD}', "")
}
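/// Merges three-way word splits such as "infor ma tion" back into a single
/// word when the middle fragment is at most two lowercase characters and
/// neither neighbor is a common stopword.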
fn rejoin_split_words(line: &str) -> String {
let tokens: Vec<&str> = line.split_whitespace().collect();
if tokens.len() < 3 {
return line.to_string();
}
let mut out: Vec<String> = Vec::with_capacity(tokens.len());
let mut i = 0usize;
while i < tokens.len() {
if i + 2 < tokens.len() {
let prev = tokens[i];
let mid = tokens[i + 1];
let next = tokens[i + 2];
if should_rejoin_split(prev, mid, next) {
out.push(format!("{prev}{mid}{next}"));
i += 3;
continue;
}
}
out.push(tokens[i].to_string());
i += 1;
}
out.join(" ")
}
fn should_rejoin_split(prev: &str, mid: &str, next: &str) -> bool {
if prev.len() < 3 || next.len() < 3 || mid.len() > 2 {
return false;
}
if !(is_lower_word_token(prev) && is_lower_word_token(mid) && is_lower_word_token(next)) {
return false;
}
if is_common_stopword(prev) || is_common_stopword(next) {
return false;
}
let combined_len = prev.len() + mid.len() + next.len();
combined_len >= 8
}
fn is_lower_word_token(token: &str) -> bool {
!token.is_empty() && token.chars().all(|ch| ch.is_ascii_lowercase())
}
fn is_common_stopword(token: &str) -> bool {
matches!(
token,
"the"
| "and"
| "of"
| "in"
| "to"
| "for"
| "by"
| "with"
| "as"
| "on"
| "at"
| "from"
| "is"
| "are"
| "was"
| "were"
| "be"
| "been"
| "it"
| "its"
| "this"
| "that"
| "these"
| "those"
| "a"
| "an"
| "or"
| "but"
| "if"
| "than"
| "then"
| "so"
| "while"
| "when"
)
}
fn looks_codeish(text: &str) -> bool {
text.contains('=')
|| text.contains(';')
|| text.contains('{')
|| text.contains('}')
|| text.contains("()")
|| text.contains("=>")
|| text.contains("->")
}
fn is_probable_code_line(line: &str) -> bool {
let trimmed = line.trim();
if trimmed.is_empty() {
return false;
}
if trimmed.starts_with("```") {
return true;
}
if trimmed.starts_with("//") {
return true;
}
if trimmed.starts_with("/**") || trimmed.starts_with("///") || trimmed.starts_with("/*") {
return true;
}
let symbols = ['{', '}', ';', '=', '<', '>', '[', ']', '(', ')'];
let symbol_hits = trimmed.chars().filter(|ch| symbols.contains(ch)).count();
let leading_ws = line.len().saturating_sub(line.trim_start().len());
if symbol_hits >= 2 {
return true;
}
if leading_ws >= 2 && symbol_hits >= 1 {
return true;
}
trimmed.contains("::") || trimmed.contains("->") || trimmed.contains("=>")
}
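/// Stricter variant of `is_probable_code_line`: requires fence/shebang
/// markers, a denser mix of code symbols, or `::`/`->`/`=>` operators before
/// treating a line as code that must not be re-wrapped.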
fn is_strict_code_line(line: &str) -> bool {
let trimmed = line.trim();
if trimmed.is_empty() {
return false;
}
if trimmed.starts_with("```") || trimmed.starts_with("#!") {
return true;
}
let symbols = ['{', '}', ';', '=', '<', '>', '[', ']', '(', ')'];
let symbol_hits = trimmed.chars().filter(|ch| symbols.contains(ch)).count();
let hard_symbols = ['{', '}', ';', '=', '<', '>'];
let hard_hits = trimmed
.chars()
.filter(|ch| hard_symbols.contains(ch))
.count();
let len = trimmed.len();
if hard_hits >= 1 && symbol_hits >= 3 {
return true;
}
if (symbol_hits >= 6 && len <= 120) || (symbol_hits >= 4 && len <= 60) {
return true;
}
trimmed.contains("::") || trimmed.contains("->") || trimmed.contains("=>")
}
fn extract_code_blocks(html: &str) -> Vec<String> {
let formatted = format_html_text(html);
extract_probable_code_blocks(&formatted)
}
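/// Groups consecutive code-looking lines into blocks, flushing the current
/// block at blank lines or when a prose line interrupts. A block is kept if
/// it has at least two code lines or an overall code-like shape, up to
/// `MAX_CODE_BLOCKS` blocks.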
fn extract_probable_code_blocks(text: &str) -> Vec<String> {
let mut blocks = Vec::new();
let mut seen = HashSet::new();
let mut current = Vec::new();
let mut code_lines = 0usize;
for line in text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
if !current.is_empty() {
if code_lines >= 2 || block_has_code_shape(&current) {
let joined = current.join("\n");
push_code_block(&mut blocks, &mut seen, &joined, true);
if blocks.len() >= MAX_CODE_BLOCKS {
return blocks;
}
}
current.clear();
code_lines = 0;
}
continue;
}
if is_probable_code_line(trimmed) {
code_lines += 1;
current.push(trimmed.to_string());
} else if !current.is_empty() {
if code_lines >= 2 || block_has_code_shape(&current) {
let joined = current.join("\n");
push_code_block(&mut blocks, &mut seen, &joined, true);
if blocks.len() >= MAX_CODE_BLOCKS {
return blocks;
}
}
current.clear();
code_lines = 0;
}
}
if !current.is_empty() && (code_lines >= 2 || block_has_code_shape(&current)) {
let joined = current.join("\n");
push_code_block(&mut blocks, &mut seen, &joined, true);
if blocks.len() >= MAX_CODE_BLOCKS {
return blocks;
}
}
blocks
}
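/// Shape test for a candidate block: several indented lines plus code
/// symbols, or a symbol density of at least 6% over 80+ characters.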
fn block_has_code_shape(lines: &[String]) -> bool {
let mut symbol_hits = 0usize;
let mut total_chars = 0usize;
let mut indented = 0usize;
for line in lines {
let trimmed = line.trim_end();
if trimmed.is_empty() {
continue;
}
total_chars += trimmed.len();
if line.len() > trimmed.len() + 1 {
indented += 1;
}
for ch in trimmed.chars() {
if matches!(
ch,
'{' | '}' | ';' | '=' | '<' | '>' | '[' | ']' | '(' | ')' | ':' | ','
) {
symbol_hits += 1;
}
}
}
if total_chars == 0 {
return false;
}
let symbol_ratio = symbol_hits as f32 / total_chars as f32;
if indented >= 2 && symbol_hits >= 3 {
return true;
}
symbol_ratio >= 0.06 && total_chars >= 80
}
fn push_code_block(
blocks: &mut Vec<String>,
seen: &mut HashSet<String>,
raw: &str,
require_blocklike: bool,
) {
let unescaped = html_unescape_text(raw);
let normalized = unescaped.replace("\r\n", "\n");
let trimmed = normalized.trim();
if trimmed.is_empty() {
return;
}
let cleaned = sanitize_code_block_text(trimmed);
if cleaned.is_empty() {
return;
}
if is_tiny_code_fragment(&cleaned) {
return;
}
if require_blocklike && !is_probable_code_block(&cleaned) {
return;
}
let lowered = cleaned.to_ascii_lowercase();
let key = normalize_text_key(&lowered);
if key.is_empty() || !seen.insert(key) {
return;
}
let (snippet, _) = truncate_utf8_chars(&cleaned, MAX_CODE_BLOCK_CHARS);
blocks.push(snippet);
}
fn normalize_text_key(line: &str) -> String {
let mut out = String::new();
let mut last_space = false;
for ch in line.chars() {
if ch.is_ascii_alphanumeric() {
out.push(ch.to_ascii_lowercase());
last_space = false;
} else if !last_space {
out.push(' ');
last_space = true;
}
}
out.trim().to_string()
}
fn is_probable_code_block(text: &str) -> bool {
let trimmed = text.trim();
if trimmed.is_empty() {
return false;
}
if trimmed.contains('\n') {
return true;
}
if is_tiny_code_fragment(trimmed) {
return false;
}
if trimmed.len() >= 80 {
return true;
}
let code_symbols = ['{', '}', ';', '=', '>', '<', '(', ')', '[', ']', ':'];
let mut symbol_hits = 0usize;
for ch in trimmed.chars() {
if code_symbols.contains(&ch) {
symbol_hits += 1;
if symbol_hits >= 2 && trimmed.len() >= 30 {
return true;
}
}
}
false
}
fn is_tiny_code_fragment(text: &str) -> bool {
let trimmed = text.trim();
if trimmed.is_empty() {
return true;
}
if trimmed.contains('\n') {
return false;
}
let len = trimmed.len();
if len < 12 {
return true;
}
let has_statement_markers = trimmed.contains(';')
|| trimmed.contains('=')
|| trimmed.contains('{')
|| trimmed.contains('}')
|| trimmed.contains("=>")
|| trimmed.contains("->");
if has_statement_markers {
return false;
}
let token_count = trimmed.split_whitespace().count();
if token_count >= 2 && len >= 40 {
return false;
}
let mut all_ident = true;
for ch in trimmed.chars() {
if ch.is_ascii_alphanumeric()
|| matches!(
ch,
'.' | '_' | '<' | '>' | ':' | '(' | ')' | '[' | ']' | '#' | '?' | '!' | ','
)
{
continue;
}
all_ident = false;
break;
}
if all_ident && len < 60 {
return true;
}
token_count <= 1 && len < 60
}
fn tighten_code_blocks_for_category(category: QueryCategory, blocks: &mut Vec<String>) {
if !matches!(category, QueryCategory::CodeExample) {
return;
}
let filtered: Vec<String> = blocks
.iter()
.filter(|block| is_strong_code_sample(block))
.cloned()
.collect();
if filtered.is_empty() {
blocks.clear();
} else {
*blocks = filtered;
}
}
fn is_strong_code_sample(text: &str) -> bool {
let trimmed = text.trim();
if trimmed.is_empty() {
return false;
}
if trimmed.contains('\n') {
return true;
}
if trimmed.len() < 18 {
return false;
}
if trimmed.contains("=>") || trimmed.contains("->") {
return true;
}
if trimmed.contains(';') || trimmed.contains('=') {
return true;
}
let symbol_hits = trimmed
.chars()
.filter(|ch| ['{', '}', '[', ']', '(', ')', ':', ','].contains(ch))
.count();
if symbol_hits >= 2 && trimmed.len() >= 30 {
return true;
}
trimmed.contains(' ') && trimmed.contains('(') && trimmed.contains(')')
}
fn html_unescape_text(value: &str) -> String {
// Decode the common HTML entities that survive text extraction. The entity
// names are reconstructed here (the duplicate apostrophe and space arms
// plausibly covered &#39;/&apos; and the numeric nbsp variants); `&amp;` is
// decoded last so double-escaped input such as `&amp;lt;` is not unescaped
// twice.
value
.replace("&quot;", "\"")
.replace("&#39;", "'")
.replace("&apos;", "'")
.replace("&nbsp;", " ")
.replace("&#160;", " ")
.replace("&#xa0;", " ")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&amp;", "&")
}
fn now_epoch_ms() -> u128 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_millis()
}
fn now_epoch_ms_u64() -> u64 {
now_epoch_ms().try_into().unwrap_or(u64::MAX)
}
fn build_completion(query: &str, hits: &[Hit]) -> String {
let trimmed = query.trim();
if hits.is_empty() {
if trimmed.is_empty() {
return "No local documents matched the query.".to_string();
}
return format!("No local documents matched query: {}", trimmed);
}
let mut lines = Vec::new();
if !trimmed.is_empty() {
lines.push(format!("Local matches for query: {}", trimmed));
} else {
lines.push("Local matches:".to_string());
}
for hit in hits.iter().take(3) {
let summary = hit.summary.trim();
if summary.is_empty() {
lines.push(format!("- {}", hit.rel_path));
} else {
lines.push(format!("- {}: {}", hit.rel_path, summary));
}
}
lines.join("\n")
}
pub(crate) fn local_match_ratio(query: &str, hits: &[Hit]) -> Option<f32> {
let query_tokens = tokenize_terms_for_match(query);
if query_tokens.is_empty() {
return None;
}
let query_len = query_tokens.len();
let min_required = min_required_matches(query_len);
let mut best_ratio = 0.0f32;
let mut best_matches = 0usize;
for hit in hits.iter().take(MAX_MATCH_HITS) {
if let Some((matched, ratio)) = hit_match_stats(&query_tokens, query_len, hit) {
if ratio > best_ratio {
best_ratio = ratio;
best_matches = matched;
}
}
}
if best_matches < min_required {
return Some(0.0);
}
Some(best_ratio)
}
fn hit_match_stats(query_tokens: &[String], query_len: usize, hit: &Hit) -> Option<(usize, f32)> {
if query_tokens.is_empty() {
return None;
}
let mut hit_tokens = HashSet::new();
collect_match_tokens(&hit.summary, &mut hit_tokens);
collect_match_tokens(&hit.snippet, &mut hit_tokens);
if hit_tokens.is_empty() {
return None;
}
let matched = query_tokens
.iter()
.filter(|token| hit_tokens.contains(*token))
.count();
let ratio = matched as f32 / query_len as f32;
Some((matched, ratio))
}
fn min_required_matches(query_len: usize) -> usize {
if query_len >= 3 {
2
} else {
1
}
}
fn tokenize_terms(text: &str) -> Vec<String> {
tokenize_terms_with_filter(text, should_keep_token)
}
fn tokenize_terms_for_match(text: &str) -> Vec<String> {
tokenize_terms_with_filter(text, should_keep_match_token)
}
fn tokenize_terms_with_filter(text: &str, keep: fn(&str) -> bool) -> Vec<String> {
let mut tokens = Vec::new();
let mut buf = String::new();
for ch in text.chars() {
if ch.is_ascii_alphanumeric() {
buf.push(ch.to_ascii_lowercase());
} else if !buf.is_empty() {
push_token_with_filter(&mut tokens, &mut buf, keep);
}
}
if !buf.is_empty() {
push_token_with_filter(&mut tokens, &mut buf, keep);
}
tokens
}
fn collect_tokens(text: &str, out: &mut HashSet<String>) {
collect_tokens_with_filter(text, out, should_keep_token);
}
fn collect_match_tokens(text: &str, out: &mut HashSet<String>) {
collect_tokens_with_filter(text, out, should_keep_match_token);
}
fn collect_tokens_with_filter(text: &str, out: &mut HashSet<String>, keep: fn(&str) -> bool) {
let mut buf = String::new();
for ch in text.chars() {
if ch.is_ascii_alphanumeric() {
buf.push(ch.to_ascii_lowercase());
} else if !buf.is_empty() {
if keep(&buf) {
out.insert(buf.clone());
}
buf.clear();
}
}
if !buf.is_empty() && keep(&buf) {
out.insert(buf.clone());
}
}
#[derive(Debug, Clone, Copy)]
pub(crate) enum QueryIntent {
Code,
Definition,
General,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum QueryCategory {
CodeExample,
ApiReference,
HowToGuide,
ConceptDefinition,
Troubleshooting,
SpecStandard,
ComparisonOpinion,
NewsRelease,
General,
}
#[derive(Debug, Clone, Copy)]
enum QueryCategorySource {
Llm,
Heuristic,
}
impl QueryCategory {
fn as_str(self) -> &'static str {
match self {
QueryCategory::CodeExample => "code_example",
QueryCategory::ApiReference => "api_reference",
QueryCategory::HowToGuide => "how_to_guide",
QueryCategory::ConceptDefinition => "concept_definition",
QueryCategory::Troubleshooting => "troubleshooting",
QueryCategory::SpecStandard => "spec_standard",
QueryCategory::ComparisonOpinion => "comparison_opinion",
QueryCategory::NewsRelease => "news_release",
QueryCategory::General => "general",
}
}
}
impl QueryCategorySource {
fn as_str(self) -> &'static str {
match self {
QueryCategorySource::Llm => "llm",
QueryCategorySource::Heuristic => "heuristic",
}
}
}
pub(crate) fn detect_query_intent(query: &str) -> QueryIntent {
let query_lc = query.trim().to_ascii_lowercase();
if query_lc.is_empty() {
return QueryIntent::General;
}
let tokens = tokenize_terms_with_filter(&query_lc, should_keep_category_token);
let code_intent = tokens.iter().any(|token| {
matches!(
token.as_str(),
"code"
| "example"
| "examples"
| "sample"
| "snippet"
| "snippets"
| "implement"
| "implementation"
| "tutorial"
| "demo"
| "template"
| "boilerplate"
)
}) || query_lc.contains("how to ");
if code_intent {
return QueryIntent::Code;
}
let doc_tokens = tokenize_terms_for_match(&query_lc);
let doc_intent = doc_tokens.iter().any(|token| {
matches!(
token.as_str(),
"doc"
| "docs"
| "documentation"
| "reference"
| "references"
| "manual"
| "guide"
| "guides"
| "api"
)
});
if doc_intent {
return QueryIntent::Definition;
}
let definition_intent = tokens.iter().any(|token| {
matches!(
token.as_str(),
"define" | "definition" | "meaning" | "explain" | "overview" | "concept" | "what"
)
}) || query_lc.starts_with("what is ")
|| query_lc.starts_with("what's ");
if definition_intent {
return QueryIntent::Definition;
}
QueryIntent::General
}
fn resolve_query_intent(query: &str, category: QueryCategory) -> QueryIntent {
match category {
QueryCategory::CodeExample => QueryIntent::Code,
QueryCategory::ApiReference
| QueryCategory::ConceptDefinition
| QueryCategory::SpecStandard => QueryIntent::Definition,
_ => detect_query_intent(query),
}
}
fn parse_query_category(value: &str) -> Option<QueryCategory> {
let normalized = value.trim().to_ascii_lowercase();
let category = match normalized.as_str() {
"code_example" | "code" | "example" | "code-sample" | "code_sample" => {
QueryCategory::CodeExample
}
"api_reference" | "api" | "reference" | "documentation" => QueryCategory::ApiReference,
"how_to_guide" | "how-to" | "how_to" | "guide" | "tutorial" => QueryCategory::HowToGuide,
"concept_definition" | "definition" | "concept" | "overview" => {
QueryCategory::ConceptDefinition
}
"troubleshooting" | "debugging" | "error" | "issue" => QueryCategory::Troubleshooting,
"spec_standard" | "spec" | "standard" | "rfc" => QueryCategory::SpecStandard,
"comparison_opinion" | "comparison" | "compare" | "opinion" => {
QueryCategory::ComparisonOpinion
}
"news_release" | "news" | "release" | "announcement" => QueryCategory::NewsRelease,
"general" | "other" | "unknown" => QueryCategory::General,
_ => return None,
};
Some(category)
}
fn detect_query_category_heuristic(query: &str) -> QueryCategory {
let query_lc = query.trim().to_ascii_lowercase();
if query_lc.is_empty() {
return QueryCategory::General;
}
let tokens = tokenize_terms_with_filter(&query_lc, should_keep_category_token);
if cfg!(test)
&& std::env::var("DOCDEX_DEBUG_QUERY_CATEGORY")
.map(|value| value.trim() == "1")
.unwrap_or(false)
{
eprintln!("[web] category tokens={tokens:?}");
}
let has_token = |values: &[&str]| tokens.iter().any(|token| values.contains(&token.as_str()));
let has_phrase = |phrase: &str| query_lc.contains(phrase);
if has_token(&[
"error",
"issue",
"issues",
"bug",
"bugs",
"debug",
"debugging",
"fix",
"fixed",
"failure",
"failed",
"panic",
"exception",
"stacktrace",
"stack",
"trace",
]) {
return QueryCategory::Troubleshooting;
}
if has_token(&[
"spec",
"specs",
"standard",
"standards",
"rfc",
"eip",
"eips",
"erc",
"ercs",
"draft",
]) {
return QueryCategory::SpecStandard;
}
if has_token(&[
"code",
"example",
"examples",
"sample",
"samples",
"snippet",
"snippets",
"implementation",
"template",
"boilerplate",
]) {
return QueryCategory::CodeExample;
}
if has_phrase("how to ") || has_phrase("how-to") || has_phrase("getting started") {
return QueryCategory::HowToGuide;
}
if has_token(&[
"how",
"guide",
"guides",
"tutorial",
"tutorials",
"walkthrough",
]) {
return QueryCategory::HowToGuide;
}
if has_token(&[
"api",
"reference",
"docs",
"documentation",
"sdk",
"endpoint",
"schema",
]) {
return QueryCategory::ApiReference;
}
if has_token(&[
"vs",
"versus",
"compare",
"comparison",
"best",
"top",
"pros",
"cons",
"alternatives",
]) {
return QueryCategory::ComparisonOpinion;
}
if has_token(&[
"release",
"releases",
"changelog",
"announcement",
"news",
"roadmap",
"version",
]) {
return QueryCategory::NewsRelease;
}
if has_phrase("what is ")
|| has_phrase("what's ")
|| has_token(&[
"define",
"definition",
"meaning",
"overview",
"concept",
"intro",
])
{
return QueryCategory::ConceptDefinition;
}
QueryCategory::General
}
async fn classify_query_category(
query: &str,
llm_model: Option<&str>,
llm_agent: Option<&str>,
) -> (QueryCategory, QueryCategorySource) {
let query_key = query.trim().to_ascii_lowercase();
if query_key.is_empty() {
return (QueryCategory::General, QueryCategorySource::Heuristic);
}
if let Ok(cache) = QUERY_CATEGORY_CACHE.lock() {
if let Some(cached) = cache.get(&query_key).copied() {
return (cached.category, cached.source);
}
}
let heuristic = detect_query_category_heuristic(query);
let Some(client) = load_query_category_client(llm_model, llm_agent) else {
let source = QueryCategorySource::Heuristic;
if let Ok(mut cache) = QUERY_CATEGORY_CACHE.lock() {
cache.insert(
query_key,
CachedQueryCategory {
category: heuristic,
source,
},
);
}
return (heuristic, source);
};
let intent = detect_query_intent(query);
let (mut category, mut source) = match client.evaluate(query).await {
Some(category) => (category, QueryCategorySource::Llm),
None => (heuristic, QueryCategorySource::Heuristic),
};
if matches!(category, QueryCategory::CodeExample)
&& !matches!(intent, QueryIntent::Code)
&& !matches!(heuristic, QueryCategory::CodeExample)
{
category = heuristic;
source = QueryCategorySource::Heuristic;
}
if let Ok(mut cache) = QUERY_CATEGORY_CACHE.lock() {
cache.insert(query_key, CachedQueryCategory { category, source });
}
(category, source)
}
fn code_block_score(blocks: &[String]) -> f32 {
if blocks.is_empty() {
return 0.0;
}
let mut total_chars = 0usize;
let mut total_lines = 0usize;
for block in blocks.iter().take(3) {
total_chars += block.len();
total_lines += block.lines().count();
}
let char_score = (total_chars as f32 / 800.0).clamp(0.0, 1.0);
let line_score = (total_lines as f32 / 20.0).clamp(0.0, 1.0);
(0.6 * line_score + 0.4 * char_score).clamp(0.0, 1.0)
}
fn score_code_block(block: &str) -> f32 {
let total_chars = block.len();
let total_lines = block.lines().count();
let char_score = (total_chars as f32 / 800.0).clamp(0.0, 1.0);
let line_score = (total_lines as f32 / 24.0).clamp(0.0, 1.0);
(0.6 * line_score + 0.4 * char_score).clamp(0.0, 1.0)
}
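/// Picks the code block that best matches the query: viable blocks are scored
/// by query-token overlap (60%) blended with a size score (40%), scaled down
/// for very long blocks and nudged up by a structural bonus from symbol and
/// indentation density.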
fn select_best_code_block(query: &str, blocks: &[String]) -> Option<String> {
if blocks.is_empty() {
return None;
}
let mut candidates: Vec<&str> = Vec::new();
for block in blocks {
if is_viable_code_block(block) {
candidates.push(block.as_str());
}
}
let source: Vec<&str> = if candidates.is_empty() {
blocks.iter().map(|b| b.as_str()).collect()
} else {
candidates
};
let tokens = tokenize_terms(&query.to_ascii_lowercase());
let token_count = tokens.len();
let mut best_score = -1.0f32;
let mut best_block: Option<&str> = None;
for block in source {
let lowered = block.to_ascii_lowercase();
let mut matched = 0usize;
if token_count > 0 {
for token in &tokens {
if token.len() < 3 {
continue;
}
if lowered.contains(token) {
matched += 1;
}
}
}
let overlap = if token_count == 0 {
0.0
} else {
matched as f32 / token_count as f32
};
let base = score_code_block(block);
let line_count = block.lines().count();
let length_factor = if line_count > 140 {
0.8
} else if line_count > 80 {
0.9
} else {
1.0
};
let mut symbol_hits = 0usize;
let mut total_chars = 0usize;
let mut indented = 0usize;
for line in block.lines() {
let trimmed = line.trim_end();
if trimmed.is_empty() {
continue;
}
total_chars += trimmed.len();
if line.len() > trimmed.len() + 1 {
indented += 1;
}
for ch in trimmed.chars() {
if matches!(
ch,
'{' | '}' | ';' | '=' | '<' | '>' | '[' | ']' | '(' | ')' | ':' | ','
) {
symbol_hits += 1;
}
}
}
let symbol_ratio = if total_chars == 0 {
0.0
} else {
symbol_hits as f32 / total_chars as f32
};
let indent_ratio = if line_count == 0 {
0.0
} else {
indented as f32 / line_count as f32
};
let structure_bonus = ((symbol_ratio * 1.4) + (indent_ratio * 0.6)).clamp(0.0, 1.0) * 0.2;
let score = if token_count == 0 {
base * length_factor + structure_bonus
} else {
(0.6 * overlap + 0.4 * base) * length_factor + structure_bonus
};
if score > best_score {
best_score = score;
best_block = Some(block);
}
}
best_block.map(|value| value.to_string())
}
fn looks_like_code_output(text: &str) -> bool {
let trimmed = text.trim();
if trimmed.is_empty() {
return false;
}
if is_probable_code_block(trimmed) {
return true;
}
let mut symbol_hits = 0usize;
let mut total = 0usize;
let mut indented = 0usize;
for line in trimmed.lines().take(60) {
if line.trim().is_empty() {
continue;
}
total += line.len();
if line.len() > line.trim_start().len() + 1 {
indented += 1;
}
for ch in line.chars() {
if matches!(
ch,
'{' | '}' | ';' | '=' | '<' | '>' | '[' | ']' | '(' | ')' | ':' | ','
) {
symbol_hits += 1;
}
}
}
if total > 0 {
let ratio = symbol_hits as f32 / total as f32;
if ratio >= 0.08 && total >= 40 {
return true;
}
if indented >= 3 {
return true;
}
}
let mut code_lines = 0usize;
let mut total_lines = 0usize;
for line in trimmed.lines().take(20) {
if line.trim().is_empty() {
continue;
}
total_lines += 1;
if is_probable_code_line(line) {
code_lines += 1;
}
}
total_lines > 0 && (code_lines * 2 >= total_lines)
}
fn is_viable_code_block(block: &str) -> bool {
let line_count = block.lines().count();
if line_count < 4 {
return false;
}
let mut code_lines = 0usize;
let mut symbol_hits = 0usize;
let mut total_chars = 0usize;
let mut indented = 0usize;
for line in block.lines() {
let trimmed = line.trim_end();
if trimmed.is_empty() {
continue;
}
total_chars += trimmed.len();
if line.len() > trimmed.len() + 1 {
indented += 1;
}
if is_probable_code_line(trimmed) {
code_lines += 1;
}
for ch in trimmed.chars() {
if matches!(
ch,
'{' | '}' | ';' | '=' | '<' | '>' | '[' | ']' | '(' | ')' | ':' | ','
) {
symbol_hits += 1;
}
}
}
if code_lines * 2 >= line_count {
return true;
}
if total_chars == 0 {
return false;
}
let symbol_ratio = symbol_hits as f32 / total_chars as f32;
if indented >= 2 && symbol_hits >= 3 {
return true;
}
symbol_ratio >= 0.06 && total_chars >= 120
}
fn filter_boilerplate_text(_query: &str, text: &str, phrases: &[String]) -> String {
let mut kept = Vec::new();
let mut seen = HashSet::new();
for line in text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let lower = trimmed.to_ascii_lowercase();
if is_boilerplate_line(trimmed, &lower, phrases) {
continue;
}
let key = normalize_text_key(trimmed);
if !key.is_empty() && !seen.insert(key) {
continue;
}
kept.push(trimmed.to_string());
}
kept.join("\n")
}
#[derive(Debug, Clone)]
struct BannerFilterResult {
filtered: String,
removed_lines: usize,
total_lines: usize,
}
fn strip_banner_lines(text: &str) -> BannerFilterResult {
let mut kept = Vec::new();
let mut removed = 0usize;
let mut total = 0usize;
for line in text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
total += 1;
if is_banner_line(trimmed) {
removed += 1;
continue;
}
kept.push(trimmed);
}
BannerFilterResult {
filtered: kept.join("\n"),
removed_lines: removed,
total_lines: total,
}
}
fn is_banner_only(result: &BannerFilterResult) -> bool {
if result.total_lines == 0 {
return false;
}
let remaining_lines = result.total_lines.saturating_sub(result.removed_lines);
if remaining_lines == 0 {
return true;
}
let remaining = result.filtered.trim();
if remaining.is_empty() {
return true;
}
let removed_ratio = result.removed_lines as f32 / result.total_lines as f32;
let remaining_words = remaining.split_whitespace().count();
removed_ratio >= 0.6 && remaining.len() < 200 && remaining_words < 40
}
fn is_banner_line(line: &str) -> bool {
let len = line.len();
if len == 0 {
return false;
}
let token_count = line.split_whitespace().count();
let separators = ['|', '•', '»', '›', '>', '/'];
let sep_count = line.chars().filter(|ch| separators.contains(ch)).count();
let alpha_count = line.chars().filter(|ch| ch.is_ascii_alphabetic()).count();
let non_alpha_ratio = 1.0 - (alpha_count as f32 / len.max(1) as f32);
if len < 80 && sep_count >= 2 {
return true;
}
if len < 120 && non_alpha_ratio > 0.45 && token_count <= 8 {
return true;
}
len < 100 && token_count <= 4 && non_alpha_ratio > 0.35
}
fn boilerplate_ratio(text: &str, phrases: &[String]) -> f32 {
let mut total = 0usize;
let mut boiler = 0usize;
for line in text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
total += 1;
let lower = trimmed.to_ascii_lowercase();
if is_boilerplate_line(trimmed, &lower, phrases) {
boiler += 1;
}
}
if total == 0 {
0.0
} else {
boiler as f32 / total as f32
}
}
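/// Converts the boilerplate ratio and ad-marker count into a multiplicative
/// quality scale in [0, 1]; a boilerplate ratio of 0.6 or more rejects the
/// page outright (penalty 0.0).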
fn quality_penalty(boiler_ratio: f32, ad_markers: usize) -> f32 {
if boiler_ratio >= 0.6 {
return 0.0;
}
let mut penalty: f32 = 1.0;
if boiler_ratio >= 0.4 {
penalty *= 0.6;
} else if boiler_ratio >= 0.25 {
penalty *= 0.8;
}
if ad_markers >= 8 {
penalty *= 0.7;
} else if ad_markers >= 4 {
penalty *= 0.85;
}
penalty.clamp(0.0, 1.0)
}
fn should_skip_status(status: Option<u16>) -> bool {
matches!(status, Some(404 | 410))
}
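/// Heuristic for JS-challenge interstitials: a large HTML payload whose
/// readable text is tiny relative to the markup, combined with several
/// script/noscript/form tags, is treated as a challenge page.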
fn is_js_challenge(html: &str, readable_text: &str) -> bool {
let trimmed = readable_text.trim();
let text_len = trimmed.chars().count();
let html_len = html.chars().count();
if html_len < 500 {
return false;
}
let density = text_len as f32 / html_len.max(1) as f32;
let lower = html.to_ascii_lowercase();
let script_count = lower.matches("<script").count();
let noscript_count = lower.matches("<noscript").count();
let form_count = lower.matches("<form").count() + lower.matches("<input").count();
if density < 0.015 && text_len < 400 && (script_count + noscript_count + form_count) >= 3 {
return true;
}
text_len < 120 && html_len > 5000 && (script_count + noscript_count) >= 2
}
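/// Rough ad-noise estimate: iframes weigh double, embeds/objects/asides
/// weigh one each, and every ten script tags add one.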
fn count_ad_markers(html: &str) -> usize {
let lower = html.to_ascii_lowercase();
let iframe_count = lower.matches("<iframe").count();
let embed_count = lower.matches("<embed").count();
let object_count = lower.matches("<object").count();
let aside_count = lower.matches("<aside").count();
let script_count = lower.matches("<script").count();
iframe_count
.saturating_mul(2)
.saturating_add(embed_count)
.saturating_add(object_count)
.saturating_add(aside_count)
.saturating_add(script_count / 10)
}
fn is_boilerplate_line(line: &str, lower: &str, phrases: &[String]) -> bool {
let len = line.len();
if !phrases.is_empty() {
for phrase in phrases {
if phrase.is_empty() {
continue;
}
if lower.contains(phrase) {
return true;
}
}
}
if lower.starts_with("http://") || lower.starts_with("https://") {
return len < 200;
}
let separators = ['|', '•', '»', '›', '>', '/'];
let sep_count = line.chars().filter(|ch| separators.contains(ch)).count();
if len < 80 && sep_count >= 2 {
return true;
}
let alpha_count = line.chars().filter(|ch| ch.is_ascii_alphabetic()).count();
let digit_count = line.chars().filter(|ch| ch.is_ascii_digit()).count();
let ratio = alpha_count as f32 / len.max(1) as f32;
if ratio < 0.5 && len < 140 {
return true;
}
if digit_count > alpha_count && len < 120 {
return true;
}
let mut token_count = 0usize;
let mut token_len_sum = 0usize;
for token in line.split_whitespace() {
token_count += 1;
token_len_sum += token.len();
}
if token_count > 0 {
let avg_token = token_len_sum as f32 / token_count as f32;
if avg_token <= 2.3 && len < 90 {
return true;
}
if token_count <= 3 && len < 50 {
return true;
}
}
false
}
struct WebMatchStats {
overlap_ratio: f32,
matched: usize,
query_len: usize,
}
fn web_match_stats(query: &str, content: &str, code_blocks: &[String]) -> WebMatchStats {
let query_tokens = tokenize_terms(query);
if query_tokens.is_empty() {
return WebMatchStats {
overlap_ratio: 0.0,
matched: 0,
query_len: 0,
};
}
let mut hit_tokens = HashSet::new();
collect_tokens(content, &mut hit_tokens);
for block in code_blocks {
collect_tokens(block, &mut hit_tokens);
}
if hit_tokens.is_empty() {
return WebMatchStats {
overlap_ratio: 0.0,
matched: 0,
query_len: query_tokens.len(),
};
}
let matched = query_tokens
.iter()
.filter(|token| hit_tokens.contains(*token))
.count();
let overlap_ratio = matched as f32 / query_tokens.len() as f32;
WebMatchStats {
overlap_ratio,
matched,
query_len: query_tokens.len(),
}
}
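/// Blends the model's relevance score (60%) with raw token overlap (40%),
/// then penalizes results where only one query token matched or where
/// overlap stays low for multi-token queries.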
fn blend_relevance_score(model_score: f32, stats: &WebMatchStats) -> f32 {
let model_score = model_score.clamp(0.0, 1.0);
let overlap_score = stats.overlap_ratio.clamp(0.0, 1.0);
let blended = (model_score * 0.6) + (overlap_score * 0.4);
let penalty = if stats.query_len <= 1 {
1.0
} else if stats.query_len == 2 {
if stats.matched <= 1 {
0.75
} else {
1.0
}
} else if stats.matched <= 1 {
0.5
} else if stats.overlap_ratio < 0.5 {
0.8
} else {
1.0
};
(blended * penalty).clamp(0.0, 1.0)
}
fn category_relevance_multiplier(
category: QueryCategory,
url: &url::Url,
code_blocks: &[String],
ai_kind: &str,
) -> f32 {
let url_lc = url.as_str().to_ascii_lowercase();
let host = url.host_str().unwrap_or("").to_ascii_lowercase();
let has_code = !code_blocks.is_empty();
match category {
QueryCategory::CodeExample => {
if ai_kind == "code" && has_code {
1.15
} else if !has_code {
0.7
} else {
0.85
}
}
QueryCategory::ApiReference => {
if url_contains_any(&url_lc, &["/api", "/reference", "api", "reference", "sdk"])
|| url_contains_any(&host, &["docs.", "developer.", "api."])
{
1.1
} else {
0.95
}
}
QueryCategory::HowToGuide => {
if url_contains_any(
&url_lc,
&[
"how-to",
"howto",
"tutorial",
"guide",
"walkthrough",
"getting-started",
],
) {
1.1
} else {
0.97
}
}
QueryCategory::ConceptDefinition => {
if url_contains_any(
&url_lc,
&["overview", "introduction", "what-is", "concept", "glossary"],
) {
1.1
} else {
0.97
}
}
QueryCategory::Troubleshooting => {
if url_contains_any(&url_lc, &["error", "issues", "issue", "fix", "debug"])
|| url_contains_any(
&host,
&["stackoverflow.com", "serverfault.com", "superuser.com"],
)
{
1.1
} else {
0.97
}
}
QueryCategory::SpecStandard => {
if url_contains_any(&url_lc, &["spec", "standard", "eip-", "erc-", "rfc"])
|| url_contains_any(
&host,
&["eips.ethereum.org", "rfc-editor.org", "ietf.org", "w3.org"],
)
{
1.15
} else {
0.95
}
}
QueryCategory::ComparisonOpinion => {
if url_contains_any(
&url_lc,
&[
"compare",
"comparison",
"-vs-",
"/vs/",
"versus",
"best",
"top",
],
) {
1.1
} else {
0.95
}
}
QueryCategory::NewsRelease => {
if url_contains_any(
&url_lc,
&[
"blog",
"news",
"release",
"changelog",
"announcement",
"press",
],
) {
1.1
} else {
0.95
}
}
QueryCategory::General => 1.0,
}
}
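/// Early-stop policy for the fetch loop: always stop once the score clears
/// `early_stop_score`; when only one result is wanted, also accept strong
/// token overlap, or a slightly lower score for short definition, general,
/// and API-reference style queries.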
fn should_stop_early(
desired_count: usize,
score: f32,
match_stats: &WebMatchStats,
query_category: QueryCategory,
ai_kind: &str,
has_content: bool,
early_stop_score: f32,
) -> bool {
if !has_content {
return false;
}
if score >= early_stop_score {
return true;
}
if desired_count != 1 {
return false;
}
if matches!(
query_category,
QueryCategory::ConceptDefinition | QueryCategory::General | QueryCategory::ApiReference
) {
let short_threshold = (early_stop_score * 0.8).clamp(0.35, early_stop_score);
if match_stats.query_len <= 4 && score >= short_threshold {
return true;
}
}
if match_stats.query_len >= 3 && match_stats.overlap_ratio >= 0.75 {
return true;
}
if matches!(
query_category,
QueryCategory::ConceptDefinition | QueryCategory::General | QueryCategory::ApiReference
) && match_stats.query_len >= 2
&& match_stats.overlap_ratio >= 0.6
&& !ai_kind.eq_ignore_ascii_case("code")
{
return true;
}
false
}
fn min_overlap_ratio_for_intent(intent: QueryIntent, query_len: usize) -> Option<f32> {
if matches!(intent, QueryIntent::Code) && query_len >= 4 {
return Some(0.5);
}
None
}
fn apply_code_intent_penalty(hit: &mut Hit, has_code: bool) {
if !has_code {
hit.score *= 0.6;
return;
}
if is_markdown_path(&hit.rel_path) {
hit.score *= 0.85;
}
}
fn hit_has_code_markers(hit: &Hit) -> bool {
if is_code_path(&hit.rel_path) {
return true;
}
if hit.summary.contains("```") || hit.snippet.contains("```") {
return true;
}
for line in hit.summary.lines().chain(hit.snippet.lines()) {
if is_probable_code_line(line) {
return true;
}
}
false
}
fn hit_matches_specific_token(query_tokens: &[String], hit: &Hit) -> bool {
if query_tokens.is_empty() {
return false;
}
hit_match_stats(query_tokens, query_tokens.len(), hit)
.map(|(matched, _)| matched > 0)
.unwrap_or(false)
}
fn is_markdown_path(path: &str) -> bool {
let lower = path.to_ascii_lowercase();
lower.ends_with(".md") || lower.ends_with(".markdown") || lower.ends_with(".mdx")
}
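/// Currently always returns `false`: extension-based detection is stubbed
/// out, so `hit_has_code_markers` falls back to content-based signals.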
fn is_code_path(path: &str) -> bool {
let _ = path;
false
}
fn push_token_with_filter(tokens: &mut Vec<String>, buf: &mut String, keep: fn(&str) -> bool) {
if keep(buf) {
tokens.push(buf.clone());
}
buf.clear();
}
fn should_keep_token(token: &str) -> bool {
let token = token.trim();
if token.len() < 2 {
return false;
}
!STOPWORDS.contains(token)
}
fn should_keep_match_token(token: &str) -> bool {
let token = token.trim();
if token.len() < 2 {
return false;
}
!MATCH_STOPWORDS.contains(token)
}
fn should_keep_category_token(token: &str) -> bool {
let token = token.trim();
if token.len() < 2 {
return false;
}
!COMMON_STOPWORDS.contains(&token)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn should_attempt_accounts_for_threshold_and_force_web() {
let gate = WebGateConfig {
enabled: true,
trigger_threshold: 0.5,
min_local_match_ratio: 0.2,
browser_hint: None,
browser_available: true,
};
assert!(gate.should_attempt(Some(0.3), None, false, false));
assert!(!gate.should_attempt(Some(0.8), Some(0.6), false, false));
assert!(!gate.should_attempt(Some(0.8), Some(0.1), false, false));
assert!(gate.should_attempt(Some(0.8), Some(0.1), false, true));
assert!(gate.should_attempt(Some(0.8), None, true, false));
assert!(gate.should_attempt(None, None, false, false));
}
#[test]
fn evaluate_gate_status_skips_when_confident() {
let gate = WebGateConfig {
enabled: true,
trigger_threshold: 0.45,
min_local_match_ratio: 0.2,
browser_hint: None,
browser_available: true,
};
let status =
evaluate_gate_status("req", &gate, Some(0.8), Some(0.8), Some(0.9), false, false);
assert_eq!(status.status, WebDiscoveryStatusCode::Skipped);
assert_eq!(status.reason.as_deref(), Some("confidence_above_threshold"));
}
#[test]
fn evaluate_gate_status_reports_unavailable_without_browser() {
let gate = WebGateConfig {
enabled: true,
trigger_threshold: 0.45,
min_local_match_ratio: 0.2,
browser_hint: Some("playwright".to_string()),
browser_available: false,
};
let status = evaluate_gate_status("req", &gate, Some(0.1), Some(0.1), None, false, false);
assert_eq!(status.status, WebDiscoveryStatusCode::Unavailable);
assert_eq!(status.reason.as_deref(), Some("missing_dependency"));
assert!(status.message.as_deref().unwrap().contains("playwright"));
}
#[test]
fn detect_query_category_heuristic_code_example() {
let category = detect_query_category_heuristic("user approval code sample");
assert_eq!(category, QueryCategory::CodeExample);
}
#[test]
fn detect_query_category_heuristic_troubleshooting() {
let category = detect_query_category_heuristic("timeout error fix");
assert_eq!(category, QueryCategory::Troubleshooting);
}
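// Extra sanity tests for the pure helpers in this module. They rely only on
// behavior visible in this file (the stopword lists, the two-character token
// floor, extension matching) and read no config state; the one env probe
// below uses a deliberately unset key and never calls set_var.
#[test]
fn should_keep_token_drops_stopwords_and_short_tokens() {
assert!(!should_keep_token("a")); // below the two-character floor
assert!(!should_keep_token("the")); // in COMMON_STOPWORDS
assert!(!should_keep_token("tutorial")); // in DOMAIN_STOPWORDS
assert!(should_keep_token("tokio"));
}
#[test]
fn is_markdown_path_is_case_insensitive() {
assert!(is_markdown_path("docs/README.MD"));
assert!(is_markdown_path("guide.mdx"));
assert!(!is_markdown_path("src/main.rs"));
}
#[test]
fn min_overlap_ratio_only_applies_to_longer_code_queries() {
assert_eq!(min_overlap_ratio_for_intent(QueryIntent::Code, 4), Some(0.5));
assert_eq!(min_overlap_ratio_for_intent(QueryIntent::Code, 3), None);
}
#[test]
fn push_token_with_filter_moves_kept_tokens_and_clears_buffer() {
let mut tokens = Vec::new();
let mut buf = String::from("tokio");
push_token_with_filter(&mut tokens, &mut buf, should_keep_token);
assert_eq!(tokens, vec!["tokio".to_string()]);
assert!(buf.is_empty());
let mut rejected = String::from("the");
push_token_with_filter(&mut tokens, &mut rejected, should_keep_token);
assert_eq!(tokens.len(), 1);
assert!(rejected.is_empty());
}
#[test]
fn env_boolish_treats_unset_keys_as_none() {
// Assumes this contrived key is absent from the test environment.
assert_eq!(env_boolish("DOCDEX_TEST_SURELY_UNSET_BOOLISH"), None);
}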
}
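// env_* helpers: read an override from the environment, treating missing,
// empty, or malformed values as unset so callers can fall through to config
// and compiled defaults.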
fn env_boolish(key: &str) -> Option<bool> {
let raw = env::var(key).ok()?;
let trimmed = raw.trim().to_ascii_lowercase();
match trimmed.as_str() {
"1" | "true" | "t" | "yes" | "y" | "on" => Some(true),
"0" | "false" | "f" | "no" | "n" | "off" => Some(false),
_ => None,
}
}
fn env_f32(key: &str) -> Option<f32> {
let raw = env::var(key).ok()?;
let trimmed = raw.trim();
if trimmed.is_empty() {
return None;
}
trimmed.parse::<f32>().ok()
}
fn env_usize(key: &str) -> Option<usize> {
let raw = env::var(key).ok()?;
let trimmed = raw.trim();
if trimmed.is_empty() {
return None;
}
trimmed.parse::<usize>().ok()
}
fn env_string(key: &str) -> Option<String> {
let raw = env::var(key).ok()?;
let trimmed = raw.trim();
if trimmed.is_empty() {
None
} else {
Some(trimmed.to_string())
}
}
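// config_* helpers: each reads a single field from the on-disk config,
// returning None when the file is absent or fails to load so env overrides
// and compiled defaults take over.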
fn config_web_trigger_threshold() -> Option<f32> {
let path = config::default_config_path().ok()?;
if !path.exists() {
return None;
}
let config = config::load_config_from_path(&path).ok()?;
Some(config.search.web_trigger_threshold)
}
fn config_web_min_match_ratio() -> Option<f32> {
let path = config::default_config_path().ok()?;
if !path.exists() {
return None;
}
let config = config::load_config_from_path(&path).ok()?;
Some(config.search.web_min_match_ratio)
}
fn config_local_relevance_threshold() -> Option<f32> {
let path = config::default_config_path().ok()?;
if !path.exists() {
return None;
}
let config = config::load_config_from_path(&path).ok()?;
Some(config.search.local_relevance_threshold)
}
fn config_web_max_hits() -> Option<usize> {
let path = config::default_config_path().ok()?;
if !path.exists() {
return None;
}
let config = config::load_config_from_path(&path).ok()?;
Some(config.search.max_web_hits)
}
fn config_web_browser_path() -> Option<String> {
let path = config::default_config_path().ok()?;
if !path.exists() {
return None;
}
let config = config::load_config_from_path(&path).ok()?;
config
.web
.scraper
.chrome_binary_path
.map(|path| path.to_string_lossy().to_string())
}
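// Resolution order: env var, then config file, then the compiled default,
// with the final value clamped into [0.0, 1.0].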
fn resolve_local_relevance_threshold() -> f32 {
env_f32("DOCDEX_LOCAL_RELEVANCE_THRESHOLD")
.or_else(config_local_relevance_threshold)
.unwrap_or(DEFAULT_LOCAL_RELEVANCE_THRESHOLD)
.clamp(0.0, 1.0)
}
pub(crate) fn resolve_browser_available(hint: Option<&str>) -> bool {
// An explicit hint wins: the browser is available exactly when the hinted
// path points at a file.
if let Some(path) = hint {
return Path::new(path).is_file();
}
// Otherwise consult the Playwright manifest, if one can be read.
util::read_playwright_manifest()
.map(|manifest| {
manifest
.browsers
.iter()
.any(|browser| browser.path.is_file())
})
.unwrap_or(false)
}
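// Clamp the requested limit by any configured maximum (env beats config),
// while always permitting at least one web hit.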
fn resolve_web_limit(requested: Option<usize>, fallback: usize) -> usize {
let mut limit = requested.unwrap_or(fallback);
if let Some(max_hits) = env_usize("DOCDEX_WEB_MAX_HITS").or_else(config_web_max_hits) {
if max_hits > 0 {
limit = limit.min(max_hits);
}
}
limit.max(1)
}