// lib.rs (30.3 kB)
#![feature(iter_from_coroutine, coroutines)]
#![feature(let_chains)]
#![feature(try_blocks)]
#![feature(ptr_metadata)]
#![feature(iterator_try_collect)]
#![feature(assert_matches)]
#![feature(impl_trait_in_assoc_type)]
#![feature(trait_alias)]
mod aggregation;
mod archive;
mod constants;
mod convex_query;
pub mod disk_index;
pub mod fragmented_segment;
mod incremental_index;
mod intersection;
mod levenshtein_dfa;
mod memory_index;
pub mod metrics;
pub mod query;
pub mod scoring;
pub mod searcher;
mod tantivy_query;
mod text_index_manager;
use std::{
    cmp,
    collections::{
        BTreeMap,
        BTreeSet,
    },
    sync::Arc,
};
use aggregation::PostingListMatchAggregator;
use anyhow::Context;
use common::{
    bootstrap_model::index::{
        text_index::TextIndexSpec,
        IndexConfig,
    },
    document::ResolvedDocument,
    index::IndexKeyBytes,
    query::{
        search_value_to_bytes,
        InternalSearch,
        InternalSearchFilterExpression,
        SearchVersion,
    },
    runtime::{
        block_in_place,
        JoinSet,
    },
    types::{
        IndexName,
        Timestamp,
    },
};
use constants::CONVEX_EN_TOKENIZER;
pub use constants::{
    convex_en,
    EXACT_SEARCH_MAX_WORD_LENGTH,
    MAX_CANDIDATE_REVISIONS,
    MAX_FILTER_CONDITIONS,
    MAX_QUERY_TERMS,
    SINGLE_TYPO_SEARCH_MAX_WORD_LENGTH,
};
use convex_query::OrTerm;
use errors::ErrorMetadata;
use indexing::index_registry::Index;
use itertools::Itertools;
use metrics::log_search_token_limit_exceeded;
pub use query::{
    CandidateRevision,
    FilterConditionRead,
    QueryReads,
    QueryResults,
    TextQueryTermRead,
};
use query::{
    RevisionWithKeys,
    TextQueryTerm,
};
use searcher::FragmentedTextStorageKeys;
use storage::Storage;
pub use tantivy::Document as TantivyDocument;
use tantivy::{
    schema::{
        BytesOptions,
        Field,
        IndexRecordOption,
        Schema,
        TextFieldIndexing,
        TextOptions,
        FAST,
    },
    tokenizer::{
        TextAnalyzer,
        Token,
    },
    Term,
};
pub use tantivy_query::SearchQueryResult;
use value::{
    values_to_bytes,
    ConvexValue,
    FieldPath,
};
use self::query::{
    CompiledFilterCondition,
    CompiledQuery,
    QueryTerm,
};
pub use self::{
    incremental_index::{
        build_new_segment,
        fetch_term_ordinals_and_remap_deletes,
        NewTextSegment,
        PreviousTextSegments,
        SegmentStatisticsUpdates,
        TextSegmentPaths,
        UpdatableTextSegment,
    },
    memory_index::{
        build_term_weights,
        MemoryTextIndex,
    },
    searcher::{
        Searcher,
        SegmentTermMetadataFetcher,
    },
    text_index_manager::{
        DiskIndex,
        SnapshotInfo,
        TextIndex,
        TextIndexManager,
        TextIndexManagerState,
    },
};
use crate::{
    aggregation::TokenMatchAggregator,
    constants::MAX_UNIQUE_QUERY_TERMS,
    metrics::log_num_segments_searched_total,
    searcher::{
        Bm25Stats,
        PostingListQuery,
        TokenQuery,
    },
};
/// The field ID of the search field in tantivy. DON'T CHANGE THIS!
///
/// Tantivy assigns field ids in declaration order, and
/// `TantivySearchIndexSchema::new` declares the search field fourth (after the
/// internal ID, timestamp and creation time fields), hence the value `3`.
const SEARCH_FIELD_ID: u32 = 3;
/// The field name for the internal ID field. DON'T CHANGE THIS!
///
/// Registered as a fast bytes field when the schema is built.
pub const INTERNAL_ID_FIELD_NAME: &str = "internal_id";
/// The field name for the timestamp field. DON'T CHANGE THIS!
///
/// Registered as a fast `u64` field when the schema is built.
pub const TS_FIELD_NAME: &str = "ts";
/// The field name for the creation time field. DON'T CHANGE THIS!
///
/// Registered as a fast `f64` field when the schema is built.
pub const CREATION_TIME_FIELD_NAME: &str = "creation_time";
/// A single tantivy term extracted from a document, tagged with the kind of
/// field it was produced from.
#[derive(Debug, Clone)]
pub enum DocumentTerm {
    /// A token of the search field, together with the position the tokenizer
    /// reported for it.
    Search { term: Term, pos: FieldPosition },
    /// The byte value of one filter field. Filter terms carry no position of
    /// their own.
    Filter { term: Term },
}
impl DocumentTerm {
    /// Borrows the underlying tantivy term, whichever variant this is.
    pub fn term(&self) -> &Term {
        let (Self::Search { term, .. } | Self::Filter { term }) = self;
        term
    }

    /// Returns the position of this term within its field.
    ///
    /// Only search terms have a real position; filter terms report a dummy
    /// position of 0.
    pub fn position(&self) -> FieldPosition {
        match *self {
            Self::Filter { .. } => FieldPosition(0),
            Self::Search { pos, .. } => pos,
        }
    }

    /// Returns the tantivy field id of the field this term belongs to.
    pub fn field_id(&self) -> u32 {
        let field = self.term().field();
        field.field_id()
    }
}
impl From<DocumentTerm> for Term {
    /// Unwraps the tantivy term, discarding the position of a search term.
    fn from(doc_term: DocumentTerm) -> Self {
        let (DocumentTerm::Search { term, .. } | DocumentTerm::Filter { term }) = doc_term;
        term
    }
}
/// Integer type used to carry an edit distance.
// NOTE(review): presumably the Levenshtein distance between a query term and a
// matched term (this crate declares a `levenshtein_dfa` module) — confirm at
// the use sites, which are outside this file.
pub type EditDistance = u8;
/// Position of a term inside a document.
///
/// For now the position is relative to the document's token stream, so this
/// type is meant for internal use only.
#[derive(Debug, Clone, Copy, Default, PartialOrd, Ord, Eq, PartialEq)]
pub struct FieldPosition(u32);

impl FieldPosition {
    /// Test-only constructor that wraps a raw position.
    #[cfg(test)]
    pub fn new_for_test(pos: u32) -> Self {
        FieldPosition(pos)
    }
}

impl From<FieldPosition> for u32 {
    /// Extracts the raw position.
    fn from(value: FieldPosition) -> Self {
        let FieldPosition(raw) = value;
        raw
    }
}
impl TryFrom<&Token> for FieldPosition {
    type Error = anyhow::Error;
    fn try_from(value: &Token) -> Result<Self, Self::Error> {
        Ok(Self(u32::try_from(value.position)?))
    }
}
/// The tantivy schema for one text search index, together with handles to
/// every field declared in it and the analyzer used to tokenize text.
#[derive(Clone)]
pub struct TantivySearchIndexSchema {
    /// Tokenizer pipeline applied both to document text when indexing and to
    /// the search text when compiling a query.
    analyzer: TextAnalyzer,
    /// Fast bytes field holding the document's internal ID.
    internal_id_field: Field,
    /// Fast `u64` field holding the timestamp passed in at indexing time.
    ts_field: Field,
    /// Fast `f64` field holding the document's creation time.
    creation_time_field: Field,
    /// Path of the document field whose string value is full-text indexed.
    search_field_path: FieldPath,
    /// Tantivy text field backing `search_field_path`.
    pub search_field: Field,
    /// Tantivy bytes field for each document field path that can be used in
    /// equality filters.
    pub filter_fields: BTreeMap<FieldPath, Field>,
    /// The built tantivy schema containing all of the fields above.
    pub(crate) schema: Schema,
}
impl From<&TantivySearchIndexSchema> for pb::searchlight::SearchIndexConfig {
    /// Builds the protobuf index config from the schema's search field path
    /// and the paths of its filter fields.
    fn from(schema: &TantivySearchIndexSchema) -> Self {
        let search_path = schema.search_field_path.clone();
        Self {
            search_field_path: Some(search_path.into()),
            filter_fields: schema
                .filter_fields
                .keys()
                .map(|path| path.clone().into())
                .collect(),
        }
    }
}
impl TantivySearchIndexSchema {
    pub fn new(index_config: &TextIndexSpec) -> Self {
        let analyzer = convex_en();
        let mut schema_builder = Schema::builder();
        let internal_id_field = schema_builder.add_bytes_field(INTERNAL_ID_FIELD_NAME, FAST);
        let ts_field = schema_builder.add_u64_field(TS_FIELD_NAME, FAST);
        let creation_time_field = schema_builder.add_f64_field(CREATION_TIME_FIELD_NAME, FAST);
        let search_field_path = index_config.search_field.clone();
        let index_opts = TextFieldIndexing::default()
            .set_tokenizer(CONVEX_EN_TOKENIZER)
            .set_fieldnorms(true)
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let field_opts = TextOptions::default().set_indexing_options(index_opts);
        let field_name = format!("user/search/{}", String::from(search_field_path.clone()));
        let search_field = schema_builder.add_text_field(&field_name, field_opts);
        // NB: It's important that we iterate over `index_config.filter_fields` in
        // sorted order since tantivy assigns field ids in declaration order.
        let mut filter_fields = BTreeMap::new();
        for field_path in &index_config.filter_fields {
            // We store filter fields as the SHA256 hash of their index key.
            let field_name = format!("user/filter/{}", String::from(field_path.clone()));
            let field_opts = BytesOptions::default().set_indexed();
            let filter_field = schema_builder.add_bytes_field(&field_name, field_opts);
            filter_fields.insert(field_path.clone(), filter_field);
        }
        let schema = schema_builder.build();
        Self {
            analyzer,
            internal_id_field,
            ts_field,
            creation_time_field,
            search_field_path,
            search_field,
            filter_fields,
            schema,
        }
    }
    pub fn new_for_index(
        index: &Index,
        printable_index_name: &IndexName,
    ) -> anyhow::Result<TantivySearchIndexSchema> {
        let IndexConfig::Text { ref spec, .. } = index.metadata().config else {
            anyhow::bail!(ErrorMetadata::bad_request(
                "IndexNotASearchIndexError",
                format!("Index {printable_index_name} is not a search index"),
            ));
        };
        Ok(Self::new(spec))
    }
    /// Reconstructs the text index spec (search field path plus filter field
    /// paths) that this schema describes.
    pub fn to_index_config(&self) -> TextIndexSpec {
        let search_field = self.search_field_path.clone();
        let filter_fields = self.filter_fields.keys().cloned().collect();
        TextIndexSpec {
            search_field,
            filter_fields,
        }
    }
    /// Looks up `field_path` in `document` and encodes whatever is found there
    /// (possibly nothing) as the bytes used for filter terms.
    fn filter_field_bytes(document: &ResolvedDocument, field_path: &FieldPath) -> Vec<u8> {
        search_value_to_bytes(document.value().get_path(field_path))
    }
    /// This is a pretty wild over-estimate for documents with lots of shared
    /// terms. But it does at least provide some maximum value we can use
    /// when a super rough estimate is sufficient (e.g. capping the maximum
    /// size of a new segment).
    pub fn estimate_size(&self, document: &ResolvedDocument) -> u64 {
        let document_size = if let Some(ConvexValue::String(ref s)) =
            document.value().get_path(&self.search_field_path)
        {
            s.len()
        } else {
            0
        };
        let mut filter_field_sizes = 0;
        for field_path in self.filter_fields.keys() {
            let value = TantivySearchIndexSchema::filter_field_bytes(document, field_path);
            filter_field_sizes += value.len();
        }
        (document_size + filter_field_sizes) as u64
    }
    pub fn index_into_terms(
        &self,
        document: &ResolvedDocument,
    ) -> anyhow::Result<Vec<DocumentTerm>> {
        let _timer = metrics::index_into_terms_timer();
        let mut doc_terms = vec![];
        if let Some(ConvexValue::String(ref s)) = document.value().get_path(&self.search_field_path)
        {
            let mut token_stream = self.analyzer.token_stream(&s[..]);
            while let Some(token) = token_stream.next() {
                metrics::log_text_term(&token.text);
                doc_terms.push(DocumentTerm::Search {
                    term: Term::from_field_text(self.search_field, &token.text),
                    pos: FieldPosition::try_from(token)?,
                });
            }
        }
        for (field_path, tantivy_field) in &self.filter_fields {
            let value = TantivySearchIndexSchema::filter_field_bytes(document, field_path);
            metrics::log_filter_term(&value);
            doc_terms.push(DocumentTerm::Filter {
                term: Term::from_field_bytes(*tantivy_field, &value),
            });
        }
        Ok(doc_terms)
    }
    pub fn index_into_tantivy_document(
        &self,
        document: &ResolvedDocument,
        ts: Timestamp,
    ) -> TantivyDocument {
        let _timer = metrics::index_into_tantivy_document_timer();
        let mut tantivy_document = TantivyDocument::default();
        let internal_id_bytes = Vec::<u8>::from(document.id().internal_id());
        tantivy_document.add_bytes(self.internal_id_field, internal_id_bytes);
        tantivy_document.add_u64(self.ts_field, ts.into());
        let creation_time = document.creation_time();
        tantivy_document.add_f64(self.creation_time_field, creation_time.into());
        if let Some(ConvexValue::String(ref s)) = document.value().get_path(&self.search_field_path)
        {
            tantivy_document.add_text(self.search_field, s);
        }
        for (field_path, tantivy_field) in &self.filter_fields {
            let value = TantivySearchIndexSchema::filter_field_bytes(document, field_path);
            tantivy_document.add_bytes(*tantivy_field, value);
        }
        tantivy_document
    }
    /// Measures the byte lengths of the indexed values in a tantivy document.
    ///
    /// The search field length is 0 when the document has no string value
    /// for it; filter fields without a bytes value are left out of the map.
    pub fn document_lengths(&self, document: &TantivyDocument) -> DocumentLengths {
        let search_field = match document.get_first(self.search_field) {
            Some(tantivy::schema::Value::Str(text)) => text.len(),
            _ => 0,
        };
        let filter_fields = self
            .filter_fields
            .iter()
            .filter_map(|(path, field)| match document.get_first(*field) {
                Some(tantivy::schema::Value::Bytes(bytes)) => Some((path.clone(), bytes.len())),
                _ => None,
            })
            .collect();
        DocumentLengths {
            search_field,
            filter_fields,
        }
    }
    /// Runs a compiled text query against the in-memory index and every disk
    /// segment, returning the best candidate revisions together with their
    /// index keys.
    ///
    /// The query runs as a multi-phase pipeline. In each phase one task per
    /// disk segment is spawned on a `JoinSet`, the memory index is queried
    /// inside `block_in_place`, and the results are merged:
    ///
    /// 1. Turn the text terms and filter conditions into `TokenQuery`s.
    /// 2. Find the best matching terms (capped at `MAX_UNIQUE_QUERY_TERMS`).
    /// 3. Gather BM25 statistics for those terms.
    /// 4. Split the terms into scored OR terms (text) and AND terms (filters).
    /// 5. Query the memory index's tombstones for internal IDs to exclude.
    /// 6. Query posting lists (capped at `MAX_CANDIDATE_REVISIONS`).
    /// 7. Convert the matches into `(CandidateRevision, IndexKeyBytes)` pairs.
    ///
    /// # Arguments
    ///
    /// * `compiled_query` - text terms and filter conditions to execute.
    /// * `memory_index` - the in-memory index, queried alongside the segments.
    /// * `search_storage` - storage handle forwarded to every `searcher` call.
    /// * `segments` - the disk segments to search.
    /// * `disk_index_ts` - timestamp forwarded to the memory index when it
    ///   updates BM25 stats, queries tombstones and queries posting lists.
    /// * `searcher` - executes the per-segment queries.
    ///
    /// # Returns
    ///
    /// An empty result when no term matches, when fewer distinct filter terms
    /// matched than there are filter conditions, or when no text term matched.
    ///
    /// # Errors
    ///
    /// Propagates failures from the spawned tasks, the searcher and the memory
    /// index, and fails if one of the internal consistency checks
    /// (`anyhow::ensure!`) does not hold.
    #[fastrace::trace]
    pub async fn search(
        &self,
        compiled_query: CompiledQuery,
        memory_index: &MemoryTextIndex,
        search_storage: Arc<dyn Storage>,
        segments: Vec<FragmentedTextStorageKeys>,
        disk_index_ts: Timestamp,
        searcher: Arc<dyn Searcher>,
    ) -> anyhow::Result<RevisionWithKeys> {
        log_num_segments_searched_total(segments.len());
        // Step 1: Map the old `CompiledQuery` struct onto `TokenQuery`s.
        let mut token_queries = vec![];
        // Token queries at an index below this value are text (OR) terms; the
        // checks on `token_ord` below treat everything at or above it as a
        // filter (AND) term.
        // NOTE(review): `as u32` would truncate a huge term count; presumably
        // `compile` has already capped it at `MAX_QUERY_TERMS` — confirm.
        let num_text_query_terms = compiled_query.text_query.len() as u32;
        for query_term in compiled_query.text_query {
            let query = TokenQuery {
                max_distance: query_term.max_distance(),
                prefix: query_term.prefix(),
                term: query_term.into_term(),
            };
            token_queries.push(query);
        }
        let mut exist_filter_conditions = false;
        let mut num_expected_filter_conditions = 0;
        // Filter conditions are appended after the text terms and must match
        // exactly: no edit distance and no prefix matching.
        for CompiledFilterCondition::Must(term) in compiled_query.filter_conditions {
            exist_filter_conditions = true;
            num_expected_filter_conditions += 1;
            let query = TokenQuery {
                term,
                max_distance: 0,
                prefix: false,
            };
            token_queries.push(query);
        }
        // Step 2: Execute the token queries across both the memory and disk indexes,
        // and merge the results to get the top terms. Note that we spawn the calls
        // into the joinset *before* calling `block_in_place` so we can make progress
        // while this thread gets transitioned to being a blocking thread.
        let mut token_query_futures = JoinSet::new();
        for segment in &segments {
            let searcher = searcher.clone();
            let search_storage = search_storage.clone();
            let segment = segment.clone();
            let token_queries = token_queries.clone();
            token_query_futures.spawn("query_tokens", async move {
                searcher
                    .query_tokens(
                        search_storage,
                        segment,
                        token_queries,
                        MAX_UNIQUE_QUERY_TERMS,
                    )
                    .await
            });
        }
        let mut match_aggregator = TokenMatchAggregator::new(MAX_UNIQUE_QUERY_TERMS);
        block_in_place(|| memory_index.query_tokens(&token_queries, &mut match_aggregator))?;
        while let Some(result) = token_query_futures.join_next().await {
            // The outer `?` surfaces a failed join, the inner one a failed query.
            let segment_token_matches = result??;
            anyhow::ensure!(segment_token_matches.is_sorted());
            for m in segment_token_matches {
                // Since each segment returns results in sorted order, we can stop early once we
                // know that we've already seen `MAX_UNIQUE_QUERY_TERMS` better results.
                if !match_aggregator.insert(m) {
                    break;
                }
            }
        }
        // Deduplicate terms, using the best distance for each term.
        let mut results_by_term = BTreeMap::new();
        for token_match in match_aggregator.into_results() {
            // Tuples compare lexicographically, so the minimum taken below
            // prefers a lower distance, then exact over prefix, then the
            // lower `token_ord`.
            let sort_key = (
                token_match.distance,
                token_match.prefix,
                token_match.token_ord,
            );
            let existing_key = results_by_term.entry(token_match.term).or_insert(sort_key);
            // NB: Since OR and AND queries are on different fields, we can assume their
            // terms are disjoint. Assert this condition here since we're deduplicating
            // terms and taking their minimum `token_ord`, which could potentially lose
            // intersection conditions otherwise.
            let (_, _, existing_token_ord) = *existing_key;
            let existing_is_intersection = existing_token_ord >= num_text_query_terms;
            let is_intersection = token_match.token_ord >= num_text_query_terms;
            anyhow::ensure!(existing_is_intersection == is_intersection);
            *existing_key = cmp::min(*existing_key, sort_key);
        }
        let terms = results_by_term.keys().cloned().collect_vec();
        // If there are no matches, short-circuit and return an empty result.
        // NOTE(review): this counts *distinct* matched filter terms, so two
        // identical filter conditions would presumably collapse into a single
        // entry here — confirm whether duplicate conditions can reach this
        // point.
        let not_enough_and_tokens_present = results_by_term
            .iter()
            .filter(|(_, (_, _, token_ord))| *token_ord >= num_text_query_terms)
            .count()
            < num_expected_filter_conditions;
        let no_filter_matches = exist_filter_conditions && not_enough_and_tokens_present;
        if terms.is_empty() || no_filter_matches {
            return Ok(vec![]);
        }
        // Step 3: Given the terms we decided on, query BM25 statistics across all of
        // the indexes and merge their results.
        let mut bm25_futures = JoinSet::new();
        for segment in &segments {
            let searcher = searcher.clone();
            let search_storage = search_storage.clone();
            let segment = segment.clone();
            let terms = terms.clone();
            bm25_futures.spawn("query_bm25_stats", async move {
                searcher
                    .query_bm25_stats(search_storage, segment, terms)
                    .await
            });
        }
        let mut bm25_stats = Bm25Stats::empty();
        while let Some(result) = bm25_futures.join_next().await {
            let segment_bm25_stats = result??;
            bm25_stats += segment_bm25_stats;
        }
        // Let the memory index adjust the merged segment statistics.
        let bm25_stats =
            block_in_place(|| memory_index.update_bm25_stats(disk_index_ts, &terms, bm25_stats))?;
        // Step 4: Decide on our posting list queries given the previous results.
        let mut or_terms = vec![];
        let mut and_terms = vec![];
        for (term, (distance, prefix, token_ord)) in results_by_term {
            if token_ord >= num_text_query_terms {
                // Filter terms were queried exactly, so anything else is a bug.
                anyhow::ensure!(distance == 0 && !prefix);
                and_terms.push(term);
            } else {
                let doc_frequency = *bm25_stats
                    .doc_frequencies
                    .get(&term)
                    .context("Missing term frequency")?;
                // TODO: Come up with a smarter way to boost scores based on edit distance.
                // Eventually this will be in user space so developers can tweak
                // it as they desire.
                // Exact matches get a boost of 1, distance 1 gets 1/2, distance
                // 2 gets 1/3, and prefix matches are halved on top of that.
                let mut boost = 1. / (1. + distance as f32);
                if prefix {
                    boost *= 0.5;
                }
                let or_term = OrTerm {
                    term,
                    doc_frequency,
                    bm25_boost: boost,
                };
                metrics::log_search_term_edit_distance(distance, prefix);
                or_terms.push(or_term);
            }
        }
        // or_terms is the set of text tokens that matches our query. and_terms only
        // filters these terms further. So if we have no or_terms, our result is
        // empty regardless of any matching and_terms.
        if or_terms.is_empty() {
            return Ok(vec![]);
        }
        // Step 5: Execute the posting list query against the memory index's tombstones
        // to know which `InternalId`s to exclude when querying the disk
        // indexes.
        let (prepared_memory_query, query) = block_in_place(|| {
            let prepared_memory_query =
                memory_index.prepare_posting_list_query(&and_terms, &or_terms, &bm25_stats)?;
            // Stays empty when the memory index did not produce a prepared query.
            let mut deleted_internal_ids = BTreeSet::new();
            if let Some(ref prepared_query) = prepared_memory_query {
                deleted_internal_ids =
                    memory_index.query_tombstones(disk_index_ts, prepared_query)?;
            }
            let query = PostingListQuery {
                deleted_internal_ids,
                num_terms_by_field: bm25_stats.num_terms_by_field,
                num_documents: bm25_stats.num_documents,
                or_terms,
                and_terms,
                max_results: MAX_CANDIDATE_REVISIONS,
            };
            anyhow::Ok((prepared_memory_query, query))
        })?;
        // Step 6: Query the posting lists across the indexes and take the best
        // results.
        let mut posting_list_futures = JoinSet::new();
        for segment in &segments {
            let searcher = searcher.clone();
            let search_storage = search_storage.clone();
            let segment = segment.clone();
            let query = query.clone();
            posting_list_futures.spawn("query_posting_lists", async move {
                searcher
                    .query_posting_lists(search_storage, segment, query)
                    .await
            });
        }
        let mut match_aggregator = PostingListMatchAggregator::new(MAX_CANDIDATE_REVISIONS);
        if let Some(ref prepared_query) = prepared_memory_query {
            block_in_place(|| {
                memory_index.query_posting_lists(
                    disk_index_ts,
                    prepared_query,
                    &mut match_aggregator,
                )
            })?;
        }
        while let Some(result) = posting_list_futures.join_next().await {
            let segment_matches = result??;
            for m in segment_matches {
                // Stop consuming this segment's matches as soon as the
                // aggregator rejects one.
                if !match_aggregator.insert(m) {
                    break;
                }
            }
        }
        // Step 7: Convert the matches into the final result format.
        let result = block_in_place(|| {
            let mut result = vec![];
            for m in match_aggregator.into_results() {
                let candidate = CandidateRevision {
                    score: m.bm25_score,
                    id: m.internal_id,
                    ts: m.ts,
                    creation_time: m.creation_time,
                };
                // The index key is (-score, -creation_time, internal_id).
                // NOTE(review): the negation presumably makes ascending key
                // order yield the highest score (then the newest document)
                // first — confirm against the consumer of these keys.
                let index_fields = vec![
                    Some(ConvexValue::Float64(-f64::from(m.bm25_score))),
                    Some(ConvexValue::Float64(-f64::from(m.creation_time))),
                    Some(ConvexValue::Bytes(
                        Vec::<u8>::from(m.internal_id)
                            .try_into()
                            .expect("Could not convert internal ID to value"),
                    )),
                ];
                let bytes = values_to_bytes(&index_fields);
                let index_key_bytes = IndexKeyBytes(bytes);
                result.push((candidate, index_key_bytes));
            }
            result
        });
        Ok(result)
    }
    /// Turns the query tokens into `QueryTerm`s on `search_field`, flagging
    /// only the final token as a prefix term.
    ///
    /// # Arguments
    ///
    /// * `search_field` - the tantivy field the terms are built against.
    /// * `tokens` - the tokenized query text, in order. Accepts any slice, so
    ///   callers holding a `Vec<String>` can keep passing `&tokens`.
    ///
    /// # Errors
    ///
    /// Fails if a constructed term cannot be read back as a UTF-8 string.
    // NOTE(review): how the prefix flag affects matching (and where typo
    // tolerance is applied) is decided inside `QueryTerm::new`, which is not
    // visible here.
    fn compile_tokens_with_typo_tolerance(
        search_field: Field,
        tokens: &[String],
    ) -> anyhow::Result<Vec<QueryTerm>> {
        // `None` for an empty slice, in which case the iterator below is empty
        // as well.
        let last_index = tokens.len().checked_sub(1);
        tokens
            .iter()
            .enumerate()
            .map(|(index, text)| {
                let term = Term::from_field_text(search_field, text);
                anyhow::ensure!(term.as_str().is_some(), "Term was not valid UTF8");
                let is_prefix = Some(index) == last_index;
                Ok(QueryTerm::new(term, is_prefix))
            })
            .collect()
    }
    /// Validates a search query against this schema and compiles it into the
    /// terms to execute plus the read set describing what the query depends
    /// on.
    ///
    /// # Arguments
    ///
    /// * `query` - the search filters: exactly one text search filter on the
    ///   indexed search field, plus any number of equality filters on
    ///   declared filter fields (up to `MAX_FILTER_CONDITIONS`).
    /// * `version` - `V1` compiles every token as a non-prefix term; `V2`
    ///   additionally flags the last token as a prefix term.
    ///
    /// # Returns
    ///
    /// The `CompiledQuery` to execute and the matching `QueryReads`.
    ///
    /// # Errors
    ///
    /// Returns a bad-request error when:
    /// * a search filter targets a field other than the indexed search field
    ///   (`IncorrectSearchField`),
    /// * there is more than one search filter (`DuplicateSearchFiltersError`),
    /// * an equality filter targets a field that is not a filter field
    ///   (`IncorrectFilterFieldError`),
    /// * there is no search filter at all (`MissingSearchFilterError`),
    /// * there are more than `MAX_FILTER_CONDITIONS` equality filters
    ///   (`TooManyFilterConditionsInSearchQueryError`).
    pub fn compile(
        &self,
        query: &InternalSearch,
        version: SearchVersion,
    ) -> anyhow::Result<(CompiledQuery, QueryReads)> {
        let timer = metrics::compile_timer();
        let mut search_text: Option<&str> = None;
        let mut filter_conditions = Vec::new();
        let mut filter_reads = Vec::new();
        // First pass: validate every filter, remembering the single text
        // search filter and compiling the equality filters into terms.
        for filter in query.filters.iter() {
            match filter {
                InternalSearchFilterExpression::Search(field_path, text_query) => {
                    if *field_path != self.search_field_path {
                        anyhow::bail!(ErrorMetadata::bad_request(
                            "IncorrectSearchField",
                            format!(
                                "Search query against {} contains a search filter against {:?}, \
                                 which doesn't match the indexed `searchField` {:?}.",
                                query.printable_index_name()?,
                                field_path,
                                self.search_field_path,
                            ),
                        ))
                    }
                    if search_text.is_some() {
                        anyhow::bail!(ErrorMetadata::bad_request(
                            "DuplicateSearchFiltersError",
                            format!(
                                "Search query against {} contains multiple search filters against \
                                 {field_path:?}. Only one is allowed.",
                                query.printable_index_name()?,
                            )
                        ))
                    }
                    search_text = Some(text_query)
                },
                InternalSearchFilterExpression::Eq(field_path, value) => {
                    let Some(field) = self.filter_fields.get(field_path) else {
                        anyhow::bail!(ErrorMetadata::bad_request(
                            "IncorrectFilterFieldError",
                            format!(
                                "Search query against {} contains an equality filter on \
                                 {field_path:?} but that field isn't indexed for filtering in \
                                 `filterFields`.",
                                query.printable_index_name()?,
                            )
                        ))
                    };
                    let term = Term::from_field_bytes(*field, value);
                    filter_conditions.push(CompiledFilterCondition::Must(term));
                    filter_reads.push(FilterConditionRead::Must(field_path.clone(), value.clone()));
                },
            }
        }
        let Some(search_text) = search_text else {
            anyhow::bail!(ErrorMetadata::bad_request(
                "MissingSearchFilterError",
                format!(
                    "Search query against {} does not contain any search filters. You must \
                     include a search filter like `q.search(\"{:?}\", searchText)`.",
                    query.printable_index_name()?,
                    self.search_field_path,
                )
            ))
        };
        // Tokenize the search text with the same analyzer used at indexing
        // time, keeping at most `MAX_QUERY_TERMS` tokens.
        let mut token_stream = self.analyzer.token_stream(search_text);
        let mut tokens = vec![];
        // TODO(CX-5693): Consider how/if we should surface this to developers.
        while tokens.len() < MAX_QUERY_TERMS
            && let Some(token) = token_stream.next()
        {
            tokens.push(token.text.clone());
        }
        // Tokens past the limit are dropped silently; we only log that it
        // happened.
        if tokens.len() == MAX_QUERY_TERMS && token_stream.next().is_some() {
            log_search_token_limit_exceeded();
        }
        let text_query = match version {
            SearchVersion::V1 => tokens
                .iter()
                .map(|text| {
                    let term = Term::from_field_text(self.search_field, text);
                    anyhow::ensure!(term.as_str().is_some(), "Term was not valid UTF8");
                    Ok(QueryTerm::new(term, false))
                })
                .collect::<anyhow::Result<Vec<_>>>()?,
            // Only the V2 search codepath can generate QueryTerm::Fuzzy
            SearchVersion::V2 => {
                Self::compile_tokens_with_typo_tolerance(self.search_field, &tokens)?
            },
        };
        // Record one text read per compiled term. The clone is needed because
        // the conversion consumes each term while `text_query` is still
        // returned below.
        let text_reads = text_query
            .clone()
            .into_iter()
            .map(|t| {
                anyhow::Ok(TextQueryTermRead::new(
                    self.search_field_path.clone(),
                    TextQueryTerm::try_from(t)?,
                ))
            })
            .collect::<anyhow::Result<_>>()?;
        if filter_conditions.len() > MAX_FILTER_CONDITIONS {
            anyhow::bail!(ErrorMetadata::bad_request(
                "TooManyFilterConditionsInSearchQueryError",
                format!(
                    "Search query against {} has too many filter conditions. Max: {} Actual: {}",
                    query.printable_index_name()?,
                    MAX_FILTER_CONDITIONS,
                    filter_conditions.len()
                )
            ))
        }
        let query = CompiledQuery {
            text_query,
            filter_conditions,
        };
        let reads = QueryReads::new(text_reads, filter_reads.into());
        metrics::log_compiled_query(&query);
        timer.finish();
        Ok((query, reads))
    }
}
pub struct DocumentLengths {
    pub search_field: usize,
    pub filter_fields: BTreeMap<FieldPath, usize>,
}
/// The kinds of files handled for search indexes.
// NOTE(review): judging by the variant names these cover the files of both
// vector indexes and text indexes; the exact contents of each file kind are
// defined outside this file — confirm before relying on the names.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum SearchFileType {
    VectorSegment,
    FragmentedVectorSegment,
    VectorDeletedBitset,
    VectorIdTracker,
    Text,
    TextIdTracker,
    TextAliveBitset,
    TextDeletedTerms,
}
#[cfg(test)]
mod test {
    use std::collections::BTreeSet;

    use common::bootstrap_model::index::text_index::TextIndexSpec;

    use crate::{
        TantivySearchIndexSchema,
        SEARCH_FIELD_ID,
    };

    /// DO NOT CHANGE CONSTANTS!
    /// Guards against accidentally changing the field IDs that tantivy
    /// assigns to our system fields and to the search field.
    #[test]
    fn test_field_ids_dont_change() -> anyhow::Result<()> {
        let spec = TextIndexSpec {
            search_field: "mySearchField".parse()?,
            filter_fields: BTreeSet::new(),
        };
        let schema = TantivySearchIndexSchema::new(&spec);
        let actual_and_expected = [
            (schema.internal_id_field.field_id(), 0),
            (schema.ts_field.field_id(), 1),
            (schema.creation_time_field.field_id(), 2),
            (schema.search_field.field_id(), SEARCH_FIELD_ID),
        ];
        for (actual, expected) in actual_and_expected {
            assert_eq!(actual, expected);
        }
        Ok(())
    }
}