indexMarkdownFiles.ts (7.64 kB)
```typescript
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { getMarkdownMetadata } from '@intlayer/core';
import { getBlogs, getDocs, getFrequentQuestions } from '@intlayer/docs';
import dotenv from 'dotenv';
import { OpenAI } from 'openai';

const OUTPUT_EMBEDDINGS_DIR = 'src/utils/AI/askDocQuestion/embeddings';

const __dirname = dirname(fileURLToPath(import.meta.url));

const readEmbeddingsForFile = (fileKey: string): Record<string, number[]> => {
  try {
    return JSON.parse(
      readFileSync(
        `${__dirname}/embeddings/${fileKey.replace('.md', '.json')}`,
        'utf-8'
      )
    ) as Record<string, number[]>;
  } catch {
    return {};
  }
};

const writeEmbeddingsForFile = (
  fileKey: string,
  data: Record<string, number[]>
): void => {
  const filePath = join(OUTPUT_EMBEDDINGS_DIR, fileKey.replace('.md', '.json'));
  const dir = dirname(filePath);

  if (!existsSync(dir)) {
    mkdirSync(dir, { recursive: true });
  }

  writeFileSync(filePath, JSON.stringify(data));
};

type VectorStoreEl = {
  fileKey: string;
  chunkNumber: number;
  content: string;
  embedding: number[];
  docUrl: string;
  docName: string;
};

/**
 * Simple in-memory vector store to hold document embeddings and their content.
 * Each entry contains:
 * - fileKey: A unique key identifying the file
 * - chunkNumber: The number of the chunk within the document
 * - content: The chunk content
 * - embedding: The numerical embedding vector for the chunk
 * - docUrl: The URL of the source document
 * - docName: The title of the source document
 */
const vectorStore: VectorStoreEl[] = [];

/*
 * Embedding model configuration
 */
const EMBEDDING_MODEL: OpenAI.EmbeddingModel = 'text-embedding-3-large'; // Model to use for embedding generation
const OVERLAP_TOKENS: number = 200; // Number of tokens to overlap between chunks
const MAX_CHUNK_TOKENS: number = 800; // Maximum number of tokens per chunk
const CHAR_BY_TOKEN: number = 4.15; // Pessimistic estimate of characters per token
// A tokenizer such as `tiktoken` could be used to compute this more precisely
const MAX_CHARS: number = MAX_CHUNK_TOKENS * CHAR_BY_TOKEN;
const OVERLAP_CHARS: number = OVERLAP_TOKENS * CHAR_BY_TOKEN;

const skipDocEmbeddingsIndex = process.env.SKIP_DOC_EMBEDDINGS_INDEX === 'true';

/**
 * Splits a given text into chunks, ensuring each chunk does not exceed MAX_CHARS.
 * @param text - The input text to split.
 * @returns Array of text chunks.
 */
const chunkText = (text: string): string[] => {
  const chunks: string[] = [];
  let start = 0;

  while (start < text.length) {
    let end = Math.min(start + MAX_CHARS, text.length);

    // Ensure we don't cut words in the middle (find the nearest space)
    if (end < text.length) {
      const lastSpace = text.lastIndexOf(' ', end);
      if (lastSpace > start) {
        end = lastSpace;
      }
    }

    chunks.push(text.substring(start, end));

    // Stop once the end of the text is reached; otherwise the trailing
    // overlap window would be emitted again as a redundant extra chunk
    if (end >= text.length) {
      break;
    }

    // Move start forward, keeping an overlap with the previous chunk
    const nextStart = end - OVERLAP_CHARS;
    if (nextStart <= start) {
      // Prevent an infinite loop if the overlap is too large
      start = end;
    } else {
      start = nextStart;
    }
  }

  return chunks;
};

/**
 * Generates an embedding for a given text using OpenAI's embedding API.
 * Inputs are expected to already be capped at MAX_CHARS by `chunkText`.
 *
 * @param text - The input text to generate an embedding for
 * @returns The embedding vector as a number array
 */
const generateEmbedding = async (text: string): Promise<number[]> => {
  try {
    const openaiClient = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

    const response = await openaiClient.embeddings.create({
      model: EMBEDDING_MODEL,
      input: text,
    });

    return response.data[0].embedding;
  } catch (error) {
    console.error('Error generating embedding:', error);
    return [];
  }
};

/**
 * Indexes all Markdown documents by generating embeddings for each chunk and storing them in memory.
 * Persists per-document embeddings under `embeddings/<fileKey>.json`.
 * Handles cases where files have been updated and chunk counts have changed.
 */
export const indexMarkdownFiles = async (): Promise<void> => {
  const env = process.env.NODE_ENV;

  dotenv.config({
    path: [`.env.${env}.local`, `.env.${env}`, '.env.local', '.env'],
  });

  // Retrieve documentation, blog posts, and FAQs in the English locale
  const frequentQuestions = await getFrequentQuestions();
  const docs = await getDocs();
  const blogs = await getBlogs();

  // Combine docs, blogs, and FAQs into a single object
  const files = { ...docs, ...blogs, ...frequentQuestions };

  // Iterate over each file key (identifier) in the combined files
  for (const fileKey of Object.keys(files)) {
    const fileContent = files[fileKey as keyof typeof files] as string;

    // Get the metadata of the file
    const fileMetadata = getMarkdownMetadata(fileContent);

    // Split the document into size-bounded, overlapping chunks
    const fileChunks = chunkText(fileContent);

    // Read existing embeddings for this file
    const existingEmbeddings = readEmbeddingsForFile(fileKey);

    // Check if the number of chunks has changed for this file
    const existingChunksForFile = Object.keys(existingEmbeddings);
    const currentChunkCount = fileChunks.length;
    const previousChunkCount = existingChunksForFile.length;

    let shouldRegenerateFileEmbeddings = false;

    // If the chunk count differs, regenerate the embeddings for this file
    if (currentChunkCount !== previousChunkCount) {
      console.info(
        `File "${fileKey}" chunk count changed: ${previousChunkCount} -> ${currentChunkCount}. Regenerating embeddings.`
      );
      shouldRegenerateFileEmbeddings = !skipDocEmbeddingsIndex;
    }

    // Iterate over each chunk within the current file
    let resultForFile: Record<string, number[]> = {};

    for (const [chunkIndex, fileChunk] of fileChunks.entries()) {
      const chunkNumber = chunkIndex + 1; // Chunk numbering starts at 1
      const chunksNumber = fileChunks.length;
      const chunkKeyName = `chunk_${chunkNumber}`; // Unique key for the chunk within the file

      // Retrieve the precomputed embedding if available and the file hasn't changed
      const docEmbedding = !shouldRegenerateFileEmbeddings
        ? existingEmbeddings[chunkKeyName]
        : undefined;

      // Use the existing embedding if available and valid
      let embedding = docEmbedding;

      if (!embedding) {
        // Generate an embedding if none is present or the file changed
        embedding = await generateEmbedding(fileChunk);
        console.info(`- Generated new embedding: ${fileKey}/${chunkKeyName}`);
      }

      // Update the file-scoped result object with the embedding
      resultForFile = { ...resultForFile, [chunkKeyName]: embedding };

      // Store the embedding and content in the in-memory vector store
      vectorStore.push({
        fileKey,
        chunkNumber,
        embedding,
        content: fileChunk,
        docUrl: fileMetadata.url,
        docName: fileMetadata.title,
      });

      console.info(`- Indexed: ${fileKey}/${chunkKeyName}/${chunksNumber}`);
    }

    // Persist the per-file embeddings if they changed
    try {
      if (
        JSON.stringify(resultForFile) !== JSON.stringify(existingEmbeddings)
      ) {
        writeEmbeddingsForFile(fileKey, resultForFile);
      }
    } catch (error) {
      console.error(error);
    }
  }
};
```
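This module exports only `indexMarkdownFiles`; the populated `vectorStore` is consumed elsewhere in `askDocQuestion`. As a minimal sketch of how such a store is typically queried, here is a cosine-similarity ranking over entries shaped like `VectorStoreEl` above. The `topKChunks` helper, the `StoredChunk` type, and the import path are hypothetical illustrations, not part of this file:

```typescript
// Hypothetical consumer sketch; adjust the import path to where the
// indexer module actually lives in the repository.
import { indexMarkdownFiles } from './indexMarkdownFiles';

// Mirrors the VectorStoreEl shape defined in the indexer above
type StoredChunk = {
  fileKey: string;
  chunkNumber: number;
  content: string;
  embedding: number[];
  docUrl: string;
  docName: string;
};

// Cosine similarity between two equal-length vectors
const cosineSimilarity = (a: number[], b: number[]): number => {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  return denominator === 0 ? 0 : dot / denominator;
};

// Hypothetical helper: rank stored chunks against a query embedding
// and keep the `k` best matches
const topKChunks = (
  store: StoredChunk[],
  queryEmbedding: number[],
  k = 5
): StoredChunk[] =>
  store
    .map((chunk) => ({
      chunk,
      score: cosineSimilarity(chunk.embedding, queryEmbedding),
    }))
    .sort((a, b) => b.score - a.score)
    .slice(0, k)
    .map(({ chunk }) => chunk);

// Build the index once at startup; retrieval then runs against the
// in-memory store the call below populates
await indexMarkdownFiles();
```

Because the store lives in process memory, retrieval is a plain linear scan; the JSON files written to disk act only as a cache so unchanged documents are not re-embedded on the next run.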

MCP directory API

We provide all the information about MCP servers via our MCP API.

```bash
curl -X GET 'https://glama.ai/api/mcp/v1/servers/aymericzip/intlayer'
```
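For programmatic access, a minimal TypeScript sketch of the same request (assuming an ES-module context with a global `fetch`, e.g. Node 18+; the response schema is not documented here, so the result is left untyped):

```typescript
// Fetch this server's metadata from the MCP directory API.
// The response shape is not documented here, so it is kept as `unknown`.
const url = 'https://glama.ai/api/mcp/v1/servers/aymericzip/intlayer';

const response = await fetch(url);
if (!response.ok) {
  throw new Error(`MCP API request failed: ${response.status}`);
}

const server: unknown = await response.json();
console.log(server);
```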

If you have feedback or need assistance with the MCP directory API, please join our Discord server.