GenAIScript

Official

Overview Schema Related Servers Score Discussions

LocalDocumentIndex.ts•16.5 KiB

import * as fs from "fs/promises" import * as path from "path" import { v4 } from "uuid" import { CreateIndexConfig, LocalIndex } from "./LocalIndex" import { MetadataFilter, EmbeddingsModel, MetadataTypes, EmbeddingsResponse, QueryResult, DocumentChunkMetadata, DocumentCatalogStats, } from "./types" import { LocalDocumentResult } from "./LocalDocumentResult" import { LocalDocument } from "./LocalDocument" import { TextSplitter, TextSplitterConfig } from "../textsplitter" /** * Options for querying documents in the index. */ export interface DocumentQueryOptions { /** * Optional. Maximum number of documents to return. * @remarks * Default is 10. */ maxDocuments?: number /** * Maximum number of chunks to return per document. * @remarks * Default is 50. */ maxChunks?: number /** * Optional. Filter to apply to the document metadata. */ filter?: MetadataFilter } /** * Configuration settings for a local document index. */ export interface LocalDocumentIndexConfig { /** * Folder path where the index is stored. */ folderPath: string /** * Optional. Embeddings model to use for generating document embeddings. */ embeddings?: EmbeddingsModel /** * Optional. Tokenizer to use for splitting text into tokens. */ tokenizer?: Tokenizer /** * Optional. Configuration settings for splitting text into chunks. */ chunkingConfig?: Partial<TextSplitterConfig> } /** * Represents a local index of documents stored on disk. */ export class LocalDocumentIndex extends LocalIndex<DocumentChunkMetadata> { private readonly _embeddings?: EmbeddingsModel private readonly _tokenizer: Tokenizer private readonly _chunkingConfig?: TextSplitterConfig private _catalog?: DocumentCatalog private _newCatalog?: DocumentCatalog /** * Creates a new `LocalDocumentIndex` instance. * @param config Configuration settings for the document index. */ public constructor(config: LocalDocumentIndexConfig) { super(config.folderPath) this._embeddings = config.embeddings this._chunkingConfig = Object.assign( { keepSeparators: true, chunkSize: 512, chunkOverlap: 0, } as TextSplitterConfig, config.chunkingConfig ) this._tokenizer = config.tokenizer ?? this._chunkingConfig.tokenizer this._chunkingConfig.tokenizer = this._tokenizer } /** * Returns the embeddings model used by the index (if configured.) */ public get embeddings(): EmbeddingsModel | undefined { return this._embeddings } /** * Returns the tokenizer used by the index. */ public get tokenizer(): Tokenizer { return this._tokenizer } /** * Returns true if the document catalog exists. */ public async isCatalogCreated(): Promise<boolean> { try { await fs.access(path.join(this.folderPath, "catalog.json")) return true } catch (err: unknown) { return false } } /** * Returns the document ID for the given URI. * @param uri URI of the document to lookup. * @returns Document ID or undefined if not found. */ public async getDocumentId(uri: string): Promise<string | undefined> { await this.loadIndexData() return this._catalog?.uriToId[uri] } /** * Returns the document URI for the given ID. * @param documentId ID of the document to lookup. * @returns Document URI or undefined if not found. */ public async getDocumentUri( documentId: string ): Promise<string | undefined> { await this.loadIndexData() return this._catalog?.idToUri[documentId] } /** * Loads the document catalog from disk and returns its stats. * @returns Catalog stats. */ public async getCatalogStats(): Promise<DocumentCatalogStats> { const stats = await this.getIndexStats() return { version: this._catalog!.version, documents: this._catalog!.count, chunks: stats.items, metadata_config: stats.metadata_config, } } /** * Deletes a document from the index. * @param uri URI of the document to delete. */ public async deleteDocument(uri: string): Promise<void> { // Lookup document ID const documentId = await this.getDocumentId(uri) if (documentId == undefined) { return } // Delete document chunks from index and remove from catalog await this.beginUpdate() try { // Get list of chunks for document const chunks = await this.listItemsByMetadata({ documentId }) // Delete chunks for (const chunk of chunks) { await this.deleteItem(chunk.id) } // Remove entry from catalog delete this._newCatalog!.uriToId[uri] delete this._newCatalog!.idToUri[documentId] this._newCatalog!.count-- // Commit changes await this.endUpdate() } catch (err: unknown) { // Cancel update and raise error this.cancelUpdate() throw new Error( `Error deleting document "${uri}": ${(err as any).toString()}` ) } // Delete text file from disk try { await fs.unlink(path.join(this.folderPath, `${documentId}.txt`)) } catch (err: unknown) { throw new Error( `Error removing text file for document "${uri}" from disk: ${(err as any).toString()}` ) } // Delete metadata file from disk try { await fs.unlink(path.join(this.folderPath, `${documentId}.json`)) } catch (err: unknown) { // Ignore error } } /** * Adds a document to the catalog. * @remarks * A new update is started if one is not already in progress. If an document with the same uri * already exists, it will be replaced. * @param uri - Document URI * @param text - Document text * @param docType - Optional. Document type * @param metadata - Optional. Document metadata to index * @returns Inserted document */ public async upsertDocument( uri: string, text: string, docType?: string, metadata?: Record<string, MetadataTypes> ): Promise<LocalDocument> { // Ensure embeddings configured if (!this._embeddings) { throw new Error(`Embeddings model not configured.`) } // Check for existing document ID let documentId = await this.getDocumentId(uri) if (documentId != undefined) { // Delete existing document await this.deleteDocument(uri) } else { // Generate new document ID documentId = v4() } // Initialize text splitter settings const config = Object.assign({ docType }, this._chunkingConfig) if (config.docType == undefined) { // Populate docType based on extension const pos = uri.lastIndexOf(".") if (pos >= 0) { const ext = uri.substring(pos + 1).toLowerCase() config.docType = ext } } // Split text into chunks const splitter = new TextSplitter(config) const chunks = splitter.split(text) // Break chunks into batches for embedding generation let totalTokens = 0 const chunkBatches: string[][] = [] let currentBatch: string[] = [] for (const chunk of chunks) { totalTokens += chunk.tokens.length if (totalTokens > this._embeddings.maxTokens) { chunkBatches.push(currentBatch) currentBatch = [] totalTokens = chunk.tokens.length } currentBatch.push(chunk.text.replace(/\n/g, " ")) } if (currentBatch.length > 0) { chunkBatches.push(currentBatch) } // Generate embeddings for chunks const embeddings: number[][] = [] for (const batch of chunkBatches) { let response: EmbeddingsResponse try { response = await this._embeddings.createEmbeddings(batch) } catch (err: unknown) { throw new Error( `Error generating embeddings: ${(err as any).toString()}` ) } // Check for error if (response.status != "success") { throw new Error( `Error generating embeddings: ${response.message}` ) } // Add embeddings to output for (const embedding of response.output!) { embeddings.push(embedding) } } // Add document chunks to index await this.beginUpdate() try { // Add chunks to index for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i] const embedding = embeddings[i] const chunkMetadata: DocumentChunkMetadata = Object.assign( { documentId, startPos: chunk.startPos, endPos: chunk.endPos, }, metadata ) await this.insertItem({ id: v4(), metadata: chunkMetadata, vector: embedding, }) } // Save metadata file to disk if (metadata != undefined) { await fs.writeFile( path.join(this.folderPath, `${documentId}.json`), JSON.stringify(metadata) ) } // Save text file to disk await fs.writeFile( path.join(this.folderPath, `${documentId}.txt`), text ) // Add entry to catalog this._newCatalog!.uriToId[uri] = documentId this._newCatalog!.idToUri[documentId] = uri this._newCatalog!.count++ // Commit changes await this.endUpdate() } catch (err: unknown) { // Cancel update and raise error this.cancelUpdate() throw new Error( `Error adding document "${uri}": ${(err as any).toString()}` ) } // Return document return new LocalDocument(this, documentId, uri) } /** * Returns all documents in the index. * @remarks * Each document will contain all of the documents indexed chunks. * @returns Array of documents. */ public async listDocuments(): Promise<LocalDocumentResult[]> { // Sort chunks by document ID const docs: { [documentId: string]: QueryResult<DocumentChunkMetadata>[] } = {} const chunks = await this.listItems() chunks.forEach((chunk) => { const metadata = chunk.metadata if (docs[metadata.documentId] == undefined) { docs[metadata.documentId] = [] } docs[metadata.documentId].push({ item: chunk, score: 1.0 }) }) // Create document results const results: LocalDocumentResult[] = [] for (const documentId in docs) { const uri = (await this.getDocumentUri(documentId)) as string const documentResult = new LocalDocumentResult( this, documentId, uri, docs[documentId], this._tokenizer ) results.push(documentResult) } return results } /** * Queries the index for documents similar to the given query. * @param query Text to query for. * @param options Optional. Query options. * @returns Array of document results. */ public async queryDocuments( query: string, options?: DocumentQueryOptions ): Promise<LocalDocumentResult[]> { // Ensure embeddings configured if (!this._embeddings) { throw new Error(`Embeddings model not configured.`) } // Ensure options are defined options = Object.assign( { maxDocuments: 10, maxChunks: 50, }, options ) // Generate embeddings for query let embeddings: EmbeddingsResponse try { embeddings = await this._embeddings.createEmbeddings( query.replace(/\n/g, " ") ) } catch (err: unknown) { throw new Error( `Error generating embeddings for query: ${(err as any).toString()}` ) } // Check for error if (embeddings.status != "success") { throw new Error( `Error generating embeddings for query: ${embeddings.message}` ) } // Query index for chunks const results = await this.queryItems( embeddings.output![0], options.maxChunks!, options.filter ) // Group chunks by document const documentChunks: { [documentId: string]: QueryResult<DocumentChunkMetadata>[] } = {} for (const result of results) { const metadata = result.item.metadata if (documentChunks[metadata.documentId] == undefined) { documentChunks[metadata.documentId] = [] } documentChunks[metadata.documentId].push(result) } // Create a document result for each document const documentResults: LocalDocumentResult[] = [] for (const documentId in documentChunks) { const chunks = documentChunks[documentId] const uri = (await this.getDocumentUri(documentId)) as string const documentResult = new LocalDocumentResult( this, documentId, uri, chunks, this._tokenizer ) documentResults.push(documentResult) } // Sort document results by score and return top results return documentResults .sort((a, b) => b.score - a.score) .slice(0, options.maxDocuments!) } // Overrides public async beginUpdate(): Promise<void> { await super.beginUpdate() this._newCatalog = Object.assign({}, this._catalog) } public cancelUpdate(): void { super.cancelUpdate() this._newCatalog = undefined } public async createIndex(config?: CreateIndexConfig): Promise<void> { await super.createIndex(config) await this.loadIndexData() } public async endUpdate(): Promise<void> { await super.endUpdate() try { // Save catalog await fs.writeFile( path.join(this.folderPath, "catalog.json"), JSON.stringify(this._newCatalog) ) this._catalog = this._newCatalog this._newCatalog = undefined } catch (err: unknown) { throw new Error( `Error saving document catalog: ${(err as any).toString()}` ) } } protected async loadIndexData(): Promise<void> { await super.loadIndexData() if (this._catalog) { return } const catalogPath = path.join(this.folderPath, "catalog.json") if (await this.isCatalogCreated()) { // Load catalog const buffer = await fs.readFile(catalogPath) this._catalog = JSON.parse(buffer.toString()) } else { try { // Initialize catalog this._catalog = { version: 1, count: 0, uriToId: {}, idToUri: {}, } await fs.writeFile(catalogPath, JSON.stringify(this._catalog)) } catch (err: unknown) { throw new Error( `Error creating document catalog: ${(err as any).toString()}` ) } } } } interface DocumentCatalog { version: number count: number uriToId: { [uri: string]: string } idToUri: { [id: string]: string } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/microsoft/genaiscript'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

LocalDocumentIndex.ts•16.5 KiB