import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { getMarkdownMetadata } from '@intlayer/core';
import { getBlogs, getDocs, getFrequentQuestions } from '@intlayer/docs';
import dotenv from 'dotenv';
import { OpenAI } from 'openai';
const OUTPUT_EMBEDDINGS_DIR = 'src/utils/AI/askDocQuestion/embeddings';
const __dirname = dirname(fileURLToPath(import.meta.url));
/**
 * Reads previously persisted embeddings for a file, keyed by chunk name.
 * Returns an empty object if no embeddings file exists or it cannot be parsed.
 */
const readEmbeddingsForFile = (fileKey: string): Record<string, number[]> => {
  try {
    return JSON.parse(
      readFileSync(
        join(__dirname, 'embeddings', fileKey.replace('.md', '.json')),
        'utf-8'
      )
    ) as Record<string, number[]>;
  } catch {
    return {};
  }
};
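// Note: reads resolve against this module's directory, while writes resolve
// OUTPUT_EMBEDDINGS_DIR against the process working directory. This assumes
// the process runs from the project root, so both point at the same folder.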
/**
 * Persists embeddings for a file to `OUTPUT_EMBEDDINGS_DIR/<fileKey>.json`,
 * creating the target directory if needed.
 */
const writeEmbeddingsForFile = (
  fileKey: string,
  data: Record<string, number[]>
): void => {
  const filePath = join(
    OUTPUT_EMBEDDINGS_DIR,
    fileKey.replace('.md', '.json')
  );
  const dir = dirname(filePath);
  if (!existsSync(dir)) {
    mkdirSync(dir, { recursive: true });
  }
  writeFileSync(filePath, JSON.stringify(data));
};
type VectorStoreEl = {
fileKey: string;
chunkNumber: number;
content: string;
embedding: number[];
docUrl: string;
docName: string;
};
/**
 * Simple in-memory vector store to hold document embeddings and their content.
 * Each entry contains:
 * - fileKey: A unique key identifying the file
 * - chunkNumber: The number of the chunk within the document
 * - content: The chunk content
 * - embedding: The numerical embedding vector for the chunk
 * - docUrl: The URL of the source document
 * - docName: The title of the source document
 */
const vectorStore: VectorStoreEl[] = [];
/*
* Embedding model configuration
*/
const EMBEDDING_MODEL: OpenAI.EmbeddingModel = 'text-embedding-3-large'; // Model to use for embedding generation
const OVERLAP_TOKENS: number = 200; // Number of tokens to overlap between chunks
const MAX_CHUNK_TOKENS: number = 800; // Maximum number of tokens per chunk
const CHAR_BY_TOKEN: number = 4.15; // Pessimistic approximation of characters per token; use a tokenizer such as `tiktoken` for a precise count
const MAX_CHARS: number = MAX_CHUNK_TOKENS * CHAR_BY_TOKEN;
const OVERLAP_CHARS: number = OVERLAP_TOKENS * CHAR_BY_TOKEN;
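// With the defaults above, MAX_CHARS = 800 * 4.15 = 3320 and
// OVERLAP_CHARS = 200 * 4.15 = 830, so consecutive chunks share roughly
// 830 characters (~200 tokens) of context.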
const skipDocEmbeddingsIndex = process.env.SKIP_DOC_EMBEDDINGS_INDEX === 'true';
/**
* Splits a given text into chunks ensuring each chunk does not exceed MAX_CHARS.
* @param text - The input text to split.
* @returns - Array of text chunks.
*/
const chunkText = (text: string): string[] => {
const chunks: string[] = [];
let start = 0;
while (start < text.length) {
let end = Math.min(start + MAX_CHARS, text.length);
// Ensure we don't cut words in the middle (find nearest space)
if (end < text.length) {
const lastSpace = text.lastIndexOf(' ', end);
if (lastSpace > start) {
end = lastSpace;
}
}
chunks.push(text.substring(start, end));
    // Advance the window, keeping OVERLAP_CHARS of trailing context
const nextStart = end - OVERLAP_CHARS;
if (nextStart <= start) {
// Prevent infinite loop if overlap is too large
start = end;
} else {
start = nextStart;
}
}
return chunks;
};
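// For illustration: with the defaults above, a 10,000-character document
// yields chunks starting near offsets 0, ~2490, ~4980, ... (3320-char windows
// stepped by 3320 - 830 = 2490), with each boundary pulled back to the
// nearest space so words are not split.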
/**
 * Generates an embedding for a given text using OpenAI's embedding API.
 * Returns an empty array if the request fails, so callers should treat an
 * empty embedding as "not generated".
 *
 * @param text - The input text to generate an embedding for
 * @returns The embedding vector as a number array
 */
const generateEmbedding = async (text: string): Promise<number[]> => {
try {
const openaiClient = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
const response = await openaiClient.embeddings.create({
model: EMBEDDING_MODEL,
input: text,
});
return response.data[0].embedding;
} catch (error) {
console.error('Error generating embedding:', error);
return [];
}
};
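// Note: `embeddings.create` also accepts an array of inputs, so chunks could
// be batched into a single request to reduce round trips. A minimal sketch,
// assuming all chunks fit within the model's per-request token limit:
//
//   const response = await openaiClient.embeddings.create({
//     model: EMBEDDING_MODEL,
//     input: fileChunks, // string[] of chunk contents
//   });
//   const embeddings = response.data.map((d) => d.embedding);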
/**
* Indexes all Markdown documents by generating embeddings for each chunk and storing them in memory.
* Persists per-document embeddings under `embeddings/<fileKey>.json`.
* Handles cases where files have been updated and chunk counts have changed.
*/
export const indexMarkdownFiles = async (): Promise<void> => {
  const env = process.env.NODE_ENV ?? 'development'; // Avoid `.env.undefined.*` lookups when NODE_ENV is unset
dotenv.config({
path: [`.env.${env}.local`, `.env.${env}`, '.env.local', '.env'],
});
  // Retrieve documentation, blog posts, and frequent questions (English locale)
const frequentQuestions = await getFrequentQuestions();
const docs = await getDocs();
const blogs = await getBlogs();
  const files = { ...docs, ...blogs, ...frequentQuestions }; // Combine docs, blogs, and FAQs into a single object
// Iterate over each file key (identifier) in the combined files
  for (const fileKey of Object.keys(files)) {
// Get the metadata of the file
const fileMetadata = getMarkdownMetadata(
files[fileKey as keyof typeof files] as string
);
    // Split the document into fixed-size chunks with overlap (see chunkText)
    const fileChunks = chunkText(
      files[fileKey as keyof typeof files] as string
    );
// Read existing embeddings for this file
const existingEmbeddings = readEmbeddingsForFile(fileKey);
// Check if the number of chunks has changed for this file
const existingChunksForFile = Object.keys(existingEmbeddings);
const currentChunkCount = fileChunks.length;
const previousChunkCount = existingChunksForFile.length;
let shouldRegenerateFileEmbeddings = false;
    // If chunk count differs, regenerate embeddings for this file (unless indexing is skipped)
    if (currentChunkCount !== previousChunkCount) {
      shouldRegenerateFileEmbeddings = !skipDocEmbeddingsIndex;
      console.info(
        `File "${fileKey}" chunk count changed: ${previousChunkCount} -> ${currentChunkCount}.${shouldRegenerateFileEmbeddings ? ' Regenerating embeddings.' : ' Skipping regeneration (SKIP_DOC_EMBEDDINGS_INDEX).'}`
      );
    }
// Iterate over each chunk within the current file
    const resultForFile: Record<string, number[]> = {};
    for (const [chunkIndex, fileChunk] of fileChunks.entries()) {
      const chunkNumber = chunkIndex + 1; // Chunk number starts at 1
      const chunksNumber = fileChunks.length;
const chunkKeyName = `chunk_${chunkNumber}`; // Unique key for the chunk within the file
      // Retrieve the precomputed embedding if available and the file hasn't changed
      const docEmbedding = !shouldRegenerateFileEmbeddings
        ? existingEmbeddings[chunkKeyName]
        : undefined;
      let embedding = docEmbedding; // Reuse the cached embedding when valid
      if (!embedding?.length) {
        // Generate an embedding when none is cached, the file changed, or a
        // failed request previously persisted an empty vector
        embedding = await generateEmbedding(fileChunk);
        console.info(`- Generated new embedding: ${fileKey}/${chunkKeyName}`);
      }
      // Record the embedding in the file-scoped result object
      resultForFile[chunkKeyName] = embedding;
// Store the embedding and content in the in-memory vector store
vectorStore.push({
fileKey,
chunkNumber,
embedding,
content: fileChunk,
docUrl: fileMetadata.url,
docName: fileMetadata.title,
});
      console.info(`- Indexed: ${fileKey} (${chunkNumber}/${chunksNumber})`);
}
// Persist per-file embeddings if changed
try {
if (
JSON.stringify(resultForFile) !== JSON.stringify(existingEmbeddings)
) {
writeEmbeddingsForFile(fileKey, resultForFile);
}
} catch (error) {
console.error(error);
}
}
};
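
// Illustrative sketch (not part of the indexing flow): retrieval over this
// store is typically done with cosine similarity. The helper names below are
// assumptions for illustration, not an existing API of this module.
const cosineSimilarity = (a: number[], b: number[]): number => {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB) || 1);
};

// Example usage: embed a query, then rank the stored chunks by similarity
export const searchChunks = async (
  query: string,
  topK = 5
): Promise<VectorStoreEl[]> => {
  const queryEmbedding = await generateEmbedding(query);
  return vectorStore
    .map((entry) => ({
      entry,
      score: cosineSimilarity(queryEmbedding, entry.embedding),
    }))
    .sort((a, b) => b.score - a.score)
    .slice(0, topK)
    .map(({ entry }) => entry);
};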