folder-mcp

Overview Schema Related Servers Score Discussions

schema.ts•14.8 KiB

/** * SQLite-vec Database Schema * * Defines the complete database schema for embedding storage using SQLite-vec. * This schema supports per-folder databases with full document and chunk tracking. */ import { readFileSync, existsSync } from 'fs'; import { join, dirname } from 'path'; import { fileURLToPath } from 'url'; // Read schema version from VERSION.json file // IMPORTANT: Uses multiple fallback paths to handle different execution contexts: // 1. Relative to this file's location (for compiled dist/) // 2. process.cwd() (for development with tsx) // 3. Hardcoded fallback (v4 for current schema) function getSchemaVersion(): number { // Try paths relative to this file's location (works in compiled dist) const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); // Possible locations for VERSION.json const possiblePaths = [ join(__dirname, '..', '..', '..', '..', 'VERSION.json'), // From dist/src/infrastructure/embeddings/sqlite-vec/ join(__dirname, '..', '..', '..', '..', '..', 'VERSION.json'), // One more level up join(process.cwd(), 'VERSION.json'), // CWD (development) join(process.cwd(), 'dist', 'VERSION.json'), // CWD/dist ]; for (const versionPath of possiblePaths) { try { if (existsSync(versionPath)) { const versionData = JSON.parse(readFileSync(versionPath, 'utf-8')); return versionData.dbSchemaVersion || 4; } } catch { // Continue to next path } } // Fallback: Version 4 is the current schema (vec0 virtual tables from Sprint 7.5) // This matches the dbSchemaVersion in VERSION.json to prevent false schema mismatches console.warn('Could not read VERSION.json from any known path, using default schema version 4'); return 4; } // Schema version from VERSION.json (or fallback) // Version 4: Migrated to vec0 virtual tables (Sprint 7.5) export const SCHEMA_VERSION = getSchemaVersion(); /** * Core document tracking table * Stores metadata about indexed files * Document embeddings now stored in document_embeddings vec0 table (Sprint 7.5) */ export const DOCUMENTS_TABLE = ` CREATE TABLE IF NOT EXISTS documents ( id INTEGER PRIMARY KEY AUTOINCREMENT, file_path TEXT UNIQUE NOT NULL, fingerprint TEXT NOT NULL, file_size INTEGER NOT NULL, mime_type TEXT, last_modified TIMESTAMP NOT NULL, last_indexed TIMESTAMP DEFAULT CURRENT_TIMESTAMP, needs_reindex INTEGER DEFAULT 0, -- Document-level semantic metadata document_keywords TEXT, -- JSON array of {text, score} objects keywords_extracted INTEGER DEFAULT 0, -- Flag: 1 if keywords have been extracted document_processing_ms INTEGER -- Time taken for document-level processing );`; /** * Text chunks from documents * Each document is split into multiple chunks for embedding * Enhanced with semantic metadata for AI agent navigation */ export const CHUNKS_TABLE = ` CREATE TABLE IF NOT EXISTS chunks ( id INTEGER PRIMARY KEY AUTOINCREMENT, document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE, chunk_index INTEGER NOT NULL, content TEXT NOT NULL, start_offset INTEGER NOT NULL, end_offset INTEGER NOT NULL, token_count INTEGER, -- Semantic metadata for AI agent navigation key_phrases TEXT, -- JSON array of extracted key phrases readability_score REAL, -- Flesch Reading Ease score (0-100) semantic_processed INTEGER DEFAULT 0, -- Flag: 1 if semantic extraction succeeded semantic_timestamp INTEGER, -- Unix timestamp of semantic processing UNIQUE(document_id, chunk_index) );`; /** * Vec0 virtual table for chunk embeddings * Uses implicit rowid for identity, chunk_id as metadata column for filtering * Dimension is injected dynamically based on the embedding model */ export const CHUNK_EMBEDDINGS_TABLE_TEMPLATE = ` CREATE VIRTUAL TABLE IF NOT EXISTS chunk_embeddings USING vec0( chunk_id INTEGER, embedding FLOAT32[\${dimension}] );`; /** * Vec0 virtual table for document embeddings * Uses implicit rowid for identity, document_id as metadata column for filtering * Dimension matches chunk embeddings (same model) */ export const DOCUMENT_EMBEDDINGS_TABLE_TEMPLATE = ` CREATE VIRTUAL TABLE IF NOT EXISTS document_embeddings USING vec0( document_id INTEGER, embedding FLOAT32[\${dimension}] );`; /** * Configuration tracking for the embedding model used * Ensures consistency and detects model changes */ export const EMBEDDING_CONFIG_TABLE = ` CREATE TABLE IF NOT EXISTS embedding_config ( id INTEGER PRIMARY KEY CHECK (id = 1), model_name TEXT NOT NULL, model_dimension INTEGER NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP );`; /** * File processing states table * Tracks processing outcomes for intelligent scanning decisions */ export const FILE_STATES_TABLE = ` CREATE TABLE IF NOT EXISTS file_states ( file_path TEXT PRIMARY KEY, content_hash TEXT NOT NULL, processing_state TEXT NOT NULL, last_attempt INTEGER NOT NULL, success_timestamp INTEGER, failure_reason TEXT, attempt_count INTEGER DEFAULT 1, chunk_count INTEGER, created_at INTEGER DEFAULT (strftime('%s', 'now')), updated_at INTEGER DEFAULT (strftime('%s', 'now')) );`; // REMOVED: folder_semantic_summary table (Phase 5 cleanup) // Using pure runtime aggregation instead of caching for consistency /** * Schema version tracking table * Used to detect when database needs to be rebuilt due to schema changes */ export const SCHEMA_VERSION_TABLE = ` CREATE TABLE IF NOT EXISTS schema_version ( id INTEGER PRIMARY KEY CHECK (id = 1), version INTEGER NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP );`; /** * Database performance indexes */ export const INDEXES = [ 'CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(file_path);', 'CREATE INDEX IF NOT EXISTS idx_documents_fingerprint ON documents(fingerprint);', 'CREATE INDEX IF NOT EXISTS idx_chunks_document ON chunks(document_id);', 'CREATE INDEX IF NOT EXISTS idx_documents_needs_reindex ON documents(needs_reindex);', 'CREATE INDEX IF NOT EXISTS idx_chunks_content_search ON chunks(content);', 'CREATE INDEX IF NOT EXISTS idx_file_states_hash ON file_states(content_hash);', 'CREATE INDEX IF NOT EXISTS idx_file_states_state ON file_states(processing_state);', 'CREATE INDEX IF NOT EXISTS idx_file_states_last_attempt ON file_states(last_attempt);', // Semantic indexes for Sprint 10 'CREATE INDEX IF NOT EXISTS idx_chunks_semantic_processed ON chunks(semantic_processed);', // Document-level semantic indexes for Sprint 11 'CREATE INDEX IF NOT EXISTS idx_documents_keywords_extracted ON documents(keywords_extracted);' ]; /** * Create vec0 virtual tables with correct dimension * Injects dimension into template strings */ function createChunkEmbeddingsTable(dimension: number): string { return CHUNK_EMBEDDINGS_TABLE_TEMPLATE.replace('${dimension}', dimension.toString()); } function createDocumentEmbeddingsTable(dimension: number): string { return DOCUMENT_EMBEDDINGS_TABLE_TEMPLATE.replace('${dimension}', dimension.toString()); } /** * All table creation statements in dependency order * Vec0 virtual tables created with dynamic dimension from model config */ export function getAllTableStatements(embeddingDimension: number): string[] { return [ DOCUMENTS_TABLE, CHUNKS_TABLE, createChunkEmbeddingsTable(embeddingDimension), createDocumentEmbeddingsTable(embeddingDimension), EMBEDDING_CONFIG_TABLE, FILE_STATES_TABLE, SCHEMA_VERSION_TABLE, ...INDEXES ]; } /** * Validation queries to check database integrity */ export const VALIDATION_QUERIES = { checkTables: ` SELECT name FROM sqlite_master WHERE type='table' AND name IN ('documents', 'chunks', 'chunk_embeddings', 'document_embeddings', 'embedding_config', 'file_states') ORDER BY name; `, checkIndexes: ` SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%' ORDER BY name; `, checkForeignKeys: 'PRAGMA foreign_key_check;', checkIntegrity: 'PRAGMA integrity_check;' }; /** * Common queries for database operations */ export const QUERIES = { // Document operations insertDocument: ` INSERT OR REPLACE INTO documents (file_path, fingerprint, file_size, mime_type, last_modified, last_indexed) VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP) `, getDocument: 'SELECT * FROM documents WHERE file_path = ?', deleteDocument: 'DELETE FROM documents WHERE file_path = ?', markForReindex: 'UPDATE documents SET needs_reindex = 1 WHERE file_path = ?', // Chunk operations insertChunk: ` INSERT INTO chunks (document_id, chunk_index, content, start_offset, end_offset, token_count, key_phrases, readability_score, semantic_processed, semantic_timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) `, getChunksByDocument: 'SELECT * FROM chunks WHERE document_id = ? ORDER BY chunk_index', deleteChunksByDocument: 'DELETE FROM chunks WHERE document_id = ?', // Embedding operations - vec0 tables (BLOB created via vec_f32() in application code) insertChunkEmbedding: 'INSERT INTO chunk_embeddings (chunk_id, embedding) VALUES (?, ?)', insertDocumentEmbedding: 'INSERT INTO document_embeddings (document_id, embedding) VALUES (?, ?)', deleteChunkEmbeddingsByDocument: ` DELETE FROM chunk_embeddings WHERE chunk_id IN ( SELECT c.id FROM chunks c WHERE c.document_id = ? ) `, deleteDocumentEmbedding: 'DELETE FROM document_embeddings WHERE document_id = ?', // Search operations - returns metadata only for lazy loading similaritySearch: ` SELECT c.id as chunk_id, c.chunk_index, d.file_path, d.mime_type, c.start_offset, c.end_offset, c.token_count, c.key_phrases, c.readability_score, (c.id * 0.1) as distance FROM chunk_embeddings e JOIN chunks c ON e.chunk_id = c.id JOIN documents d ON c.document_id = d.id ORDER BY c.id ASC LIMIT ? `, // Batch content retrieval for lazy loading getChunksContent: ` SELECT c.id as chunk_id, c.content, c.chunk_index, d.file_path, c.key_phrases FROM chunks c JOIN documents d ON c.document_id = d.id WHERE c.id IN (/*PLACEHOLDER*/) `, // Configuration operations getConfig: 'SELECT * FROM embedding_config WHERE id = 1', insertConfig: ` INSERT OR REPLACE INTO embedding_config (id, model_name, model_dimension, updated_at) VALUES (1, ?, ?, CURRENT_TIMESTAMP) `, // Statistics getDocumentCount: 'SELECT COUNT(*) as count FROM documents', getChunkCount: 'SELECT COUNT(*) as count FROM chunks', getChunkEmbeddingCount: 'SELECT COUNT(*) as count FROM chunk_embeddings', getDocumentEmbeddingCount: 'SELECT COUNT(*) as count FROM document_embeddings', getDatabaseSize: 'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()', // Document-level semantic operations (Sprint 11/7.5) updateDocumentKeywords: ` UPDATE documents SET document_keywords = ?, keywords_extracted = 1 WHERE id = ? `, updateDocumentProcessingTime: ` UPDATE documents SET document_processing_ms = ? WHERE id = ? `, getDocumentSemantics: ` SELECT document_keywords, keywords_extracted FROM documents WHERE id = ? `, getDocumentsNeedingSemantics: ` SELECT id, file_path FROM documents WHERE keywords_extracted = 0 `, checkDocumentEmbeddingExists: ` SELECT 1 FROM document_embeddings WHERE document_id = ? LIMIT 1 `, // Aggregation query for building folder summaries aggregateSemanticData: ` SELECT COUNT(DISTINCT d.id) as document_count, COUNT(c.id) as chunk_count, AVG(c.readability_score) as avg_readability, GROUP_CONCAT(c.key_phrases) as all_key_phrases FROM documents d JOIN chunks c ON d.id = c.document_id WHERE c.semantic_processed = 1 `, // Phase 5: Runtime path parsing for subfolder support getSubfoldersWithSemanticData: ` WITH path_analysis AS ( SELECT d.file_path, d.id, CASE WHEN d.file_path LIKE ? || '/%' THEN SUBSTR(d.file_path, LENGTH(?) + 2) ELSE d.file_path END as relative_path FROM documents d WHERE d.file_path LIKE ? || '/%' ), subfolder_extraction AS ( SELECT CASE WHEN relative_path LIKE '%/%' THEN '/' || SUBSTR(relative_path, 1, INSTR(relative_path, '/') - 1) ELSE '/' END as subfolder_path, relative_path, id, file_path FROM path_analysis ), subfolder_stats AS ( SELECT se.subfolder_path, COUNT(DISTINCT se.id) as document_count, AVG(c.readability_score) as avg_readability, GROUP_CONCAT(DISTINCT json_extract(c.key_phrases, '$[0].text')) as key_phrases FROM subfolder_extraction se JOIN chunks c ON se.id = (SELECT id FROM documents WHERE file_path = se.file_path) WHERE c.semantic_processed = 1 AND se.subfolder_path != '/' GROUP BY se.subfolder_path ) SELECT subfolder_path, document_count, ROUND(avg_readability, 1) as avg_readability, key_phrases FROM subfolder_stats WHERE document_count > 0 ORDER BY subfolder_path `, getDocumentsInSubfolder: ` SELECT d.*, json_group_array( json_object( 'key_phrase', json_extract(c.key_phrases, '$[0].text'), 'readability', c.readability_score ) ) as semantic_data FROM documents d JOIN chunks c ON d.id = c.document_id WHERE d.file_path LIKE ? || '%' AND c.semantic_processed = 1 GROUP BY d.id ORDER BY d.last_modified DESC LIMIT ? OFFSET ? ` };

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/okets/folder-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

schema.ts•14.8 KiB