// RAGServer implementation with MCP tools
import { randomUUID } from 'node:crypto'
import { readFile, readdir, unlink } from 'node:fs/promises'
import { basename, extname, join, resolve } from 'node:path'
import { Server } from '@modelcontextprotocol/sdk/server/index.js'
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
import {
CallToolRequestSchema,
ErrorCode,
ListToolsRequestSchema,
McpError,
} from '@modelcontextprotocol/sdk/types.js'
import { SemanticChunker } from '../chunker/index.js'
import { Embedder } from '../embedder/index.js'
import { parseHtml } from '../parser/html-parser.js'
import { DocumentParser, SUPPORTED_EXTENSIONS } from '../parser/index.js'
import { extractMarkdownTitle, extractTxtTitle } from '../parser/title-extractor.js'
import { type VectorChunk, VectorStore } from '../vectordb/index.js'
import { formatErrorMessage } from './error-utils.js'
import {
type ContentFormat,
extractSourceFromPath,
generateMetaJsonPath,
generateRawDataPath,
isRawDataPath,
loadMetaJson,
saveMetaJson,
saveRawData,
} from './raw-data-utils.js'
import { toolDefinitions } from './tool-definitions.js'
import type {
DeleteFileInput,
FileEntry,
IngestDataInput,
IngestFileInput,
IngestResult,
ListFilesResult,
QueryDocumentsInput,
QueryResult,
RAGServerConfig,
SourceEntry,
} from './types.js'
/**
 * RAG server compliant with the MCP Protocol.
 *
 * Exposes six tools over stdio transport: query_documents, ingest_file,
 * ingest_data, delete_file, list_files, and status. Documents are parsed,
 * semantically chunked, embedded, and stored in a hybrid (vector + BM25)
 * vector store.
 */
export class RAGServer {
  private readonly server: Server
  private readonly vectorStore: VectorStore
  private readonly embedder: Embedder
  private readonly chunker: SemanticChunker
  private readonly parser: DocumentParser
  // Root of the vector database; raw-data files (from ingest_data) live under it
  private readonly dbPath: string
  // Directory scanned by list_files for ingestable documents
  private readonly baseDir: string
  // Used by handleListFiles filter to exclude system-managed directories
  private readonly excludePaths: string[]

  constructor(config: RAGServerConfig) {
    this.dbPath = config.dbPath
    this.baseDir = config.baseDir
    // Trailing '/' ensures prefix matching only excludes paths *inside* these
    // directories, not sibling paths that merely share the prefix.
    this.excludePaths = [`${resolve(config.dbPath)}/`, `${resolve(config.cacheDir)}/`]
    this.server = new Server(
      { name: 'rag-mcp-server', version: '1.0.0' },
      { capabilities: { tools: {} } }
    )
    // Component initialization.
    // Only pass quality filter settings if they are defined, so VectorStore's
    // own defaults apply when a setting is absent from the config.
    const vectorStoreConfig: ConstructorParameters<typeof VectorStore>[0] = {
      dbPath: config.dbPath,
      tableName: 'chunks',
    }
    if (config.maxDistance !== undefined) {
      vectorStoreConfig.maxDistance = config.maxDistance
    }
    if (config.grouping !== undefined) {
      vectorStoreConfig.grouping = config.grouping
    }
    if (config.hybridWeight !== undefined) {
      vectorStoreConfig.hybridWeight = config.hybridWeight
    }
    if (config.maxFiles !== undefined) {
      vectorStoreConfig.maxFiles = config.maxFiles
    }
    this.vectorStore = new VectorStore(vectorStoreConfig)
    this.embedder = new Embedder({
      modelPath: config.modelName,
      batchSize: 16,
      cacheDir: config.cacheDir,
    })
    this.chunker = new SemanticChunker()
    this.parser = new DocumentParser({
      baseDir: config.baseDir,
      maxFileSize: config.maxFileSize,
    })
    this.setupHandlers()
  }

  /**
   * Set up MCP handlers: the static tool list and the tool-call dispatcher.
   */
  private setupHandlers(): void {
    // Tool list
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: toolDefinitions,
    }))
    // Tool invocation
    this.server.setRequestHandler(
      CallToolRequestSchema,
      async (request: { params: { name: string; arguments?: unknown } }) => {
        switch (request.params.name) {
          case 'query_documents':
            return await this.handleQueryDocuments(
              request.params.arguments as unknown as QueryDocumentsInput
            )
          case 'ingest_file':
            return await this.handleIngestFile(
              request.params.arguments as unknown as IngestFileInput
            )
          case 'ingest_data':
            return await this.handleIngestData(
              request.params.arguments as unknown as IngestDataInput
            )
          case 'delete_file':
            return await this.handleDeleteFile(
              request.params.arguments as unknown as DeleteFileInput
            )
          case 'list_files':
            return await this.handleListFiles()
          case 'status':
            return await this.handleStatus()
          default:
            // Use the MCP-standard error code for an unrecognized tool name
            // instead of a generic Error, so clients receive a proper
            // JSON-RPC MethodNotFound response.
            throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`)
        }
      }
    )
  }

  /**
   * Initialization: opens/creates the vector store table.
   * Must be called before run().
   */
  async initialize(): Promise<void> {
    await this.vectorStore.initialize()
    // stderr is used for logging because stdout carries the MCP protocol
    console.error('RAGServer initialized')
  }

  /**
   * query_documents tool handler.
   *
   * Embeds the query, runs hybrid search (vector + BM25 keyword matching),
   * and returns results as a JSON text payload. For raw-data files (created
   * by ingest_data) the original source identifier is restored into each hit.
   */
  async handleQueryDocuments(
    args: QueryDocumentsInput
  ): Promise<{ content: [{ type: 'text'; text: string }] }> {
    try {
      // Generate query embedding
      const queryVector = await this.embedder.embed(args.query)
      // Hybrid search (vector + BM25 keyword matching).
      // `||` (not `??`) is deliberate: a falsy limit such as 0 falls back to 10.
      const searchResults = await this.vectorStore.search(queryVector, args.query, args.limit || 10)
      // Format results with source restoration for raw-data files
      const results: QueryResult[] = searchResults.map((result) => {
        const queryResult: QueryResult = {
          filePath: result.filePath,
          chunkIndex: result.chunkIndex,
          text: result.text,
          score: result.score,
          fileTitle: result.fileTitle ?? null,
        }
        // Restore source for raw-data files (ingested via ingest_data)
        if (isRawDataPath(result.filePath)) {
          const source = extractSourceFromPath(result.filePath)
          if (source) {
            queryResult.source = source
          }
        }
        return queryResult
      })
      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(results, null, 2),
          },
        ],
      }
    } catch (error) {
      console.error('Failed to query documents:', error)
      throw error
    }
  }

  /**
   * ingest_file tool handler (re-ingestion support, transaction processing,
   * rollback capability).
   *
   * Flow: parse -> chunk -> (fail fast on 0 chunks) -> embed -> backup
   * existing chunks -> delete existing -> insert new -> roll back backup on
   * insert failure. The 0-chunk check runs BEFORE the delete so a failed
   * re-ingest never destroys previously ingested data.
   */
  async handleIngestFile(
    args: IngestFileInput
  ): Promise<{ content: [{ type: 'text'; text: string }] }> {
    let backup: VectorChunk[] | null = null
    try {
      // Parse file (with header/footer filtering for PDFs).
      // For raw-data files (from ingest_data), read directly without validation
      // since the path is internally generated and content is already processed.
      const isPdf = args.filePath.toLowerCase().endsWith('.pdf')
      let text: string
      let title: string | null = null
      if (isRawDataPath(args.filePath)) {
        // Raw-data files: skip validation, read directly; title comes from the
        // .meta.json sidecar written by ingest_data (null for legacy data).
        text = await readFile(args.filePath, 'utf-8')
        const meta = await loadMetaJson(args.filePath)
        title = meta?.title ?? null
        console.error(`Read raw-data file: ${args.filePath} (${text.length} characters)`)
      } else if (isPdf) {
        const result = await this.parser.parsePdf(args.filePath, this.embedder)
        text = result.content
        title = result.title || null
      } else {
        const result = await this.parser.parseFile(args.filePath)
        text = result.content
        title = result.title || null
      }
      // Split text into semantic chunks
      const chunks = await this.chunker.chunkText(text, this.embedder)
      // Fail-fast: Prevent data loss when chunking produces 0 chunks.
      // This check must happen BEFORE delete to preserve existing data on re-ingest.
      if (chunks.length === 0) {
        throw new McpError(
          ErrorCode.InvalidParams,
          `No chunks generated from file: ${args.filePath}. The file may be empty or all content was filtered (minimum 50 characters required). Existing data has been preserved.`
        )
      }
      // Generate embeddings for final chunks
      const embeddings = await this.embedder.embedBatch(chunks.map((chunk) => chunk.text))
      // Create backup (if existing data exists)
      try {
        const existingFiles = await this.vectorStore.listFiles()
        const existingFile = existingFiles.find((file) => file.filePath === args.filePath)
        if (existingFile && existingFile.chunkCount > 0) {
          // Backup existing data (retrieve via search).
          // NOTE(review): this backup is best-effort — it is capped at 20
          // search hits and stores a dummy vector, so a restored file may be
          // partial and its vectors inaccurate until re-ingested.
          const queryVector = embeddings[0] || []
          if (queryVector.length > 0) {
            const allChunks = await this.vectorStore.search(queryVector, undefined, 20) // Retrieve max 20 items
            backup = allChunks
              .filter((chunk) => chunk.filePath === args.filePath)
              .map((chunk) => ({
                id: randomUUID(),
                filePath: chunk.filePath,
                chunkIndex: chunk.chunkIndex,
                text: chunk.text,
                vector: queryVector, // Use dummy vector since actual vector cannot be retrieved
                metadata: chunk.metadata,
                fileTitle: chunk.fileTitle ?? null,
                timestamp: new Date().toISOString(),
              }))
          }
          console.error(`Backup created: ${backup?.length || 0} chunks for ${args.filePath}`)
        }
      } catch (error) {
        // Backup creation failure is warning only (for new files)
        console.warn('Failed to create backup (new file?):', error)
      }
      // Delete existing data
      await this.vectorStore.deleteChunks(args.filePath)
      console.error(`Deleted existing chunks for: ${args.filePath}`)
      // Create vector chunks
      const timestamp = new Date().toISOString()
      const vectorChunks: VectorChunk[] = chunks.map((chunk, index) => {
        const embedding = embeddings[index]
        if (!embedding) {
          throw new Error(`Missing embedding for chunk ${index}`)
        }
        return {
          id: randomUUID(),
          filePath: args.filePath,
          chunkIndex: chunk.index,
          text: chunk.text,
          vector: embedding,
          metadata: {
            // basename() is cross-platform, unlike split('/') which fails on
            // Windows-style separators
            fileName: basename(args.filePath),
            fileSize: text.length,
            fileType: args.filePath.split('.').pop() || '',
          },
          fileTitle: title || null,
          timestamp,
        }
      })
      // Insert vectors (transaction processing)
      try {
        await this.vectorStore.insertChunks(vectorChunks)
        console.error(`Inserted ${vectorChunks.length} chunks for: ${args.filePath}`)
        // Delete backup on success
        backup = null
      } catch (insertError) {
        // Rollback on error
        if (backup && backup.length > 0) {
          console.error('Ingestion failed, rolling back...', insertError)
          try {
            await this.vectorStore.insertChunks(backup)
            console.error(`Rollback completed: ${backup.length} chunks restored`)
          } catch (rollbackError) {
            console.error('Rollback failed:', rollbackError)
            throw new Error(
              `Failed to ingest file and rollback failed: ${(insertError as Error).message}`
            )
          }
        }
        throw insertError
      }
      // Result
      const result: IngestResult = {
        filePath: args.filePath,
        chunkCount: chunks.length,
        timestamp,
        fileTitle: title || null,
      }
      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(result, null, 2),
          },
        ],
      }
    } catch (error) {
      // Re-throw McpError as-is to preserve error code
      if (error instanceof McpError) {
        console.error('Failed to ingest file:', error.message)
        throw error
      }
      const errorMessage = formatErrorMessage(error)
      console.error('Failed to ingest file:', errorMessage)
      throw new Error(`Failed to ingest file: ${errorMessage}`)
    }
  }

  /**
   * ingest_data tool handler.
   * Saves raw content to raw-data directory and calls handleIngestFile internally.
   *
   * For HTML content:
   * - Parses HTML and extracts main content using Readability
   * - Converts to Markdown for better chunking
   * - Saves as .md file
   */
  async handleIngestData(
    args: IngestDataInput
  ): Promise<{ content: [{ type: 'text'; text: string }] }> {
    try {
      let contentToSave = args.content
      let formatToSave: ContentFormat = args.metadata.format
      let title: string | null = null
      // Per-format title extraction and content preparation
      if (args.metadata.format === 'html') {
        console.error(`Parsing HTML from: ${args.metadata.source}`)
        const { content: markdown, title: htmlTitle } = await parseHtml(
          args.content,
          args.metadata.source
        )
        if (!markdown.trim()) {
          throw new Error(
            'Failed to extract content from HTML. The page may have no readable content.'
          )
        }
        title = htmlTitle || null
        contentToSave = markdown
        formatToSave = 'markdown' // Save as .md file
        console.error(`Converted HTML to Markdown: ${markdown.length} characters`)
      } else if (args.metadata.format === 'markdown') {
        // A filename-derived title is no better than null, so discard it
        const result = extractMarkdownTitle(args.content, args.metadata.source)
        title = result.source !== 'filename' ? result.title : null
      } else {
        // text format
        const result = extractTxtTitle(args.content, args.metadata.source)
        title = result.source !== 'filename' ? result.title : null
      }
      // Save content to raw-data directory
      const rawDataPath = await saveRawData(
        this.dbPath,
        args.metadata.source,
        contentToSave,
        formatToSave
      )
      // Save metadata sidecar (.meta.json) alongside the raw-data file.
      // Records the ORIGINAL format, even when HTML was converted to Markdown.
      await saveMetaJson(rawDataPath, {
        title,
        source: args.metadata.source,
        format: args.metadata.format,
      })
      console.error(`Saved raw data: ${args.metadata.source} -> ${rawDataPath}`)
      // Call existing ingest_file internally with rollback on failure
      try {
        return await this.handleIngestFile({ filePath: rawDataPath })
      } catch (ingestError) {
        // Rollback: delete the raw-data file and .meta.json if ingest fails.
        // Each unlink gets its own try/catch so a failure deleting one file
        // does not skip deletion of the other and leave it orphaned on disk.
        let rollbackOk = true
        try {
          await unlink(rawDataPath)
        } catch {
          rollbackOk = false
        }
        try {
          await unlink(generateMetaJsonPath(rawDataPath))
        } catch {
          rollbackOk = false
        }
        if (rollbackOk) {
          console.error(`Rolled back raw-data file: ${rawDataPath}`)
        } else {
          console.warn(`Failed to rollback raw-data file: ${rawDataPath}`)
        }
        throw ingestError
      }
    } catch (error) {
      const errorMessage = formatErrorMessage(error)
      console.error('Failed to ingest data:', errorMessage)
      throw new Error(`Failed to ingest data: ${errorMessage}`)
    }
  }

  /**
   * list_files tool handler.
   * Scans BASE_DIR for supported files and cross-references with ingested documents.
   */
  async handleListFiles(): Promise<{ content: [{ type: 'text'; text: string }] }> {
    try {
      // Get all ingested entries from the vector store
      const ingested = await this.vectorStore.listFiles()
      const ingestedMap = new Map(ingested.map((f) => [f.filePath, f]))
      // Scan BASE_DIR recursively for supported files.
      // Errors propagate to the outer catch: if readdir fails, ingest_file and
      // delete_file won't work either, so surfacing the error is appropriate.
      const entries = await readdir(this.baseDir, { recursive: true, withFileTypes: true })
      const baseDirFiles = entries
        .filter((e) => e.isFile() && SUPPORTED_EXTENSIONS.has(extname(e.name).toLowerCase()))
        .map((e) => {
          // parentPath is the Node 21+ name; path is the deprecated Node 20 alias
          // biome-ignore lint/suspicious/noExplicitAny: parentPath not yet in @types/node@20
          const dir = (e as any).parentPath ?? e.path
          return join(dir, e.name)
        })
        .filter((filePath) => !this.excludePaths.some((ep) => filePath.startsWith(ep)))
        .sort()
      const baseDirSet = new Set(baseDirFiles)
      // Files in BASE_DIR with ingestion status
      const files: FileEntry[] = baseDirFiles.map((filePath) => {
        const entry = ingestedMap.get(filePath)
        return entry
          ? { filePath, ingested: true, chunkCount: entry.chunkCount, timestamp: entry.timestamp }
          : { filePath, ingested: false }
      })
      // Content ingested via ingest_data (web pages, clipboard, etc.) plus any
      // orphaned DB entries whose files no longer exist on disk
      const sources: SourceEntry[] = ingested
        .filter((f) => !baseDirSet.has(f.filePath))
        .map((f) => {
          if (isRawDataPath(f.filePath)) {
            const source = extractSourceFromPath(f.filePath)
            if (source) return { source, chunkCount: f.chunkCount, timestamp: f.timestamp }
          }
          // Fallback: raw-data entry whose source can't be recovered, or an
          // orphaned plain-file entry — report it by path
          return { filePath: f.filePath, chunkCount: f.chunkCount, timestamp: f.timestamp }
        })
      const result: ListFilesResult = { baseDir: this.baseDir, files, sources }
      return {
        content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
      }
    } catch (error) {
      console.error('Failed to list files:', error)
      throw error
    }
  }

  /**
   * status tool handler (Phase 1: basic implementation).
   * Returns the vector store's status report as JSON text.
   */
  async handleStatus(): Promise<{ content: [{ type: 'text'; text: string }] }> {
    try {
      const status = await this.vectorStore.getStatus()
      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(status, null, 2),
          },
        ],
      }
    } catch (error) {
      console.error('Failed to get status:', error)
      throw error
    }
  }

  /**
   * delete_file tool handler.
   * Deletes chunks from VectorDB and physical raw-data files.
   * Supports both filePath (for ingest_file) and source (for ingest_data);
   * source takes precedence when both are given.
   */
  async handleDeleteFile(
    args: DeleteFileInput
  ): Promise<{ content: [{ type: 'text'; text: string }] }> {
    try {
      let targetPath: string
      let skipValidation = false
      if (args.source) {
        // Generate raw-data path from source (extension is always .md).
        // Internal path generation is secure, skip baseDir validation.
        targetPath = generateRawDataPath(this.dbPath, args.source, 'markdown')
        skipValidation = true
      } else if (args.filePath) {
        targetPath = args.filePath
      } else {
        throw new Error('Either filePath or source must be provided')
      }
      // Only validate user-provided filePath (not internally generated paths)
      if (!skipValidation) {
        await this.parser.validateFilePath(targetPath)
      }
      // Delete chunks from vector database
      await this.vectorStore.deleteChunks(targetPath)
      // Also delete physical raw-data file if applicable
      if (isRawDataPath(targetPath)) {
        try {
          await unlink(targetPath)
          console.error(`Deleted raw-data file: ${targetPath}`)
        } catch {
          console.warn(`Could not delete raw-data file (may not exist): ${targetPath}`)
        }
        const metaPath = generateMetaJsonPath(targetPath)
        try {
          await unlink(metaPath)
          console.error(`Deleted meta.json: ${metaPath}`)
        } catch {
          // .meta.json may not exist for old data, silently ignore
        }
      }
      // Return success message
      const result = {
        filePath: targetPath,
        deleted: true,
        timestamp: new Date().toISOString(),
      }
      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(result, null, 2),
          },
        ],
      }
    } catch (error) {
      const errorMessage = formatErrorMessage(error)
      console.error('Failed to delete file:', errorMessage)
      throw new Error(`Failed to delete file: ${errorMessage}`)
    }
  }

  /**
   * Start the server on stdio transport.
   * Blocks until the transport connects; logging goes to stderr because
   * stdout is reserved for the MCP protocol stream.
   */
  async run(): Promise<void> {
    const transport = new StdioServerTransport()
    await this.server.connect(transport)
    console.error('RAGServer running on stdio transport')
  }
}