#!/usr/bin/env node
/**
* MCP Document Access Server - Phase 2
* Provides iterative document access tools to LLMs for token-efficient exploration
*
* Solves the token limit problem:
* - 300K line document = 4.5M tokens (impossible)
* - MCP iterative access = ~16K tokens (achievable)
* - 281x token efficiency improvement
*
* MCP Tools Provided:
* 1. list_document_sections - Get hierarchical document structure
* 2. read_section - Read specific section with context
* 3. read_lines - Read specific line range
* 4. search_content - Search for keywords/patterns
* 5. loda_search - Token-optimized search (LODA) with budget awareness
*
* Usage:
* # Stdio mode (for MCP clients)
* node document_access_mcp_server.js --mode=stdio
*
* # HTTP mode (for testing/debugging)
* node document_access_mcp_server.js --mode=http --port=49400
*/
const fs = require('fs');
const path = require('path');
const readline = require('readline');
// LODA (LLM-Optimized Document Access) module
const { LodaSearchHandler } = require('./loda/loda_search_handler');
// Document staging directory (where LLM can access documents).
// Resolved relative to this script's own directory (__dirname), not the
// process working directory.
const STAGING_DIR = path.join(__dirname, 'staging');
// Ensure staging directory exists: it is created at module load when missing,
// so this file has a filesystem side effect as soon as it is loaded.
if (!fs.existsSync(STAGING_DIR)) {
fs.mkdirSync(STAGING_DIR, { recursive: true });
}
/**
 * Parse the heading structure of a markdown document (deterministic, no LLM needed).
 *
 * A heading is a line made of 1-6 `#` characters, whitespace, then text. A
 * section runs from its heading line up to the line before the next heading of
 * the same or a higher level (same or fewer `#`), or to the end of the document.
 *
 * Section ids are numbered over ALL headings in document order, before the
 * `maxDepth` filter is applied. An id obtained from a depth-limited listing
 * therefore identifies the same heading when the document is parsed again with
 * a different depth.
 *
 * Both LF and CRLF line endings are accepted.
 *
 * @param {string} documentPath - Path of the file to read (decoded as UTF-8).
 * @param {number} [maxDepth=6] - Deepest heading level included in `sections`.
 * @returns {{sections: Array<{id: string, header: string, level: number,
 *   startLine: number, endLine: number, lineCount: number}>,
 *   totalLines: number, totalSections: number, fileName: string}}
 *   Line numbers are 1-indexed and inclusive.
 * @throws {Error} Whatever `fs.readFileSync` throws when the file cannot be read.
 */
function parseDocumentStructure(documentPath, maxDepth = 6) {
  const content = fs.readFileSync(documentPath, 'utf8');
  // Split on CRLF as well as LF: a trailing '\r' would stop the heading pattern
  // from matching, because `.` does not match line terminators.
  const lines = content.split(/\r?\n/);
  const lastIndex = lines.length - 1;

  const headings = [];
  // Headings whose section is still open. Levels strictly increase toward the
  // top of the stack, so one pass over the lines closes every section.
  const openHeadings = [];

  lines.forEach((line, index) => {
    // A `#` run followed by whitespace ends open sections even when no heading
    // text follows it (such a line does not start a section of its own).
    const marker = /^(#{1,6})\s+/.exec(line);
    if (!marker) {
      return;
    }
    const level = marker[1].length;
    while (openHeadings.length > 0 && openHeadings[openHeadings.length - 1].level >= level) {
      openHeadings.pop().endIndex = index - 1;
    }

    const heading = /^(#{1,6})\s+(.+)$/.exec(line);
    if (!heading) {
      return;
    }
    const entry = {
      id: `section-${headings.length}`,
      header: heading[2],
      level,
      startIndex: index,
      endIndex: lastIndex, // stays at end-of-document unless closed above
    };
    headings.push(entry);
    openHeadings.push(entry);
  });

  const sections = headings
    .filter((entry) => entry.level <= maxDepth)
    .map((entry) => ({
      id: entry.id,
      header: entry.header,
      level: entry.level,
      startLine: entry.startIndex + 1, // 1-indexed for user display
      endLine: entry.endIndex + 1,
      lineCount: entry.endIndex - entry.startIndex + 1,
    }));

  return {
    sections,
    totalLines: lines.length,
    totalSections: sections.length,
    fileName: path.basename(documentPath),
  };
}
/**
 * Return one section of a document, optionally with surrounding lines.
 *
 * The document is parsed with parseDocumentStructure (default depth) to locate
 * the section by id, then read again to extract the text.
 *
 * @param {string} documentPath - Path of the file to read.
 * @param {string} sectionId - Id as reported by parseDocumentStructure.
 * @param {boolean} [includeContext=true] - Whether to return neighbouring lines.
 * @param {number} [contextLines=20] - Lines of context on each side.
 * @returns {{section: {id: string, header: string, level: number, content: string,
 *   lineRange: number[], contextBefore: (string|null), contextAfter: (string|null)}}}
 *   The context fields are null when `includeContext` is falsy.
 * @throws {Error} When no section has the given id.
 */
function readSection(documentPath, sectionId, includeContext = true, contextLines = 20) {
  const { sections } = parseDocumentStructure(documentPath);
  const found = sections.find((candidate) => candidate.id === sectionId);
  if (!found) {
    throw new Error(`Section ${sectionId} not found`);
  }

  const { id, header, level, startLine, endLine } = found;
  const allLines = fs.readFileSync(documentPath, 'utf8').split('\n');
  const firstIndex = startLine - 1; // section bounds are 1-indexed, inclusive

  let before = null;
  let after = null;
  if (includeContext) {
    const windowStart = Math.max(0, firstIndex - contextLines);
    const windowEnd = Math.min(allLines.length, endLine + contextLines);
    before = allLines.slice(windowStart, firstIndex).join('\n');
    after = allLines.slice(endLine, windowEnd).join('\n');
  }

  return {
    section: {
      id,
      header,
      level,
      content: allLines.slice(firstIndex, endLine).join('\n'),
      lineRange: [startLine, endLine],
      contextBefore: before,
      contextAfter: after,
    },
  };
}
/**
 * Read an inclusive, 1-indexed line range from a document.
 *
 * Both bounds must be integers. Missing or non-numeric bounds are rejected:
 * comparisons against `undefined`/`NaN` are always false, so without the
 * integer check such input would slip through the range test and `slice`
 * would return the entire document.
 *
 * @param {string} documentPath - Path of the file to read (decoded as UTF-8).
 * @param {number} startLine - First line to return (1-indexed, inclusive).
 * @param {number} endLine - Last line to return (1-indexed, inclusive).
 * @returns {{content: string, lineRange: number[], lineCount: number}}
 * @throws {Error} When the bounds are not integers, are out of the document's
 *   range, or `startLine` is greater than `endLine`.
 */
function readLines(documentPath, startLine, endLine) {
  // The file is read first because the error message reports its line count.
  const lines = fs.readFileSync(documentPath, 'utf8').split('\n');

  const isValidRange =
    Number.isInteger(startLine) &&
    Number.isInteger(endLine) &&
    startLine >= 1 &&
    endLine <= lines.length &&
    startLine <= endLine;
  if (!isValidRange) {
    throw new Error(`Invalid line range: ${startLine}-${endLine} (document has ${lines.length} lines)`);
  }

  const selectedLines = lines.slice(startLine - 1, endLine);
  return {
    content: selectedLines.join('\n'),
    lineRange: [startLine, endLine],
    lineCount: selectedLines.length,
  };
}
/**
 * Search a document line by line with a case-insensitive regular expression.
 *
 * The document structure is parsed once, before the scan, and used to label
 * each match with the header of the first listed section containing the line.
 *
 * @param {string} documentPath - Path of the file to read (decoded as UTF-8).
 * @param {string} query - Pattern source handed to `new RegExp`.
 * @param {number} [maxResults=10] - Scanning stops once this many lines matched.
 * @returns {{query: string, matches: Array<{line: number, content: string,
 *   context: string, section: string}>, totalMatches: number}}
 *   `line` is 1-indexed; `context` holds up to 3 lines either side of the match;
 *   `totalMatches` is the number of returned matches (capped by `maxResults`).
 * @throws {SyntaxError} When `query` is not a valid regular expression.
 */
function searchContent(documentPath, query, maxResults = 10) {
  const content = fs.readFileSync(documentPath, 'utf8');
  const lines = content.split('\n');
  const structure = parseDocumentStructure(documentPath);

  // Deliberately no `g` flag: a global regex keeps `lastIndex` between
  // `.test()` calls, so a hit on one line made the next line be searched from
  // that offset and genuine matches were skipped.
  // NOTE(review): `query` is caller-supplied and compiled as-is, so a
  // pathological pattern can be slow (ReDoS) — consider limits if exposed.
  const pattern = new RegExp(query, 'i');

  const matches = [];
  for (let i = 0; i < lines.length && matches.length < maxResults; i++) {
    if (!pattern.test(lines[i])) {
      continue;
    }
    const lineNumber = i + 1;
    // Context window: 3 lines before and after, clamped to the document.
    const contextStart = Math.max(0, i - 3);
    const contextEnd = Math.min(lines.length, i + 4);
    const section = structure.sections.find(
      (s) => lineNumber >= s.startLine && lineNumber <= s.endLine
    );
    matches.push({
      line: lineNumber,
      content: lines[i],
      context: lines.slice(contextStart, contextEnd).join('\n'),
      section: section ? section.header : 'Unknown',
    });
  }

  return {
    query,
    matches,
    totalMatches: matches.length,
  };
}
// LODA search handler used by the loda_search tool. It is constructed with
// parseDocumentStructure as its argument; how the handler uses that function is
// defined in ./loda/loda_search_handler and is not visible in this file.
const lodaHandler = new LodaSearchHandler(parseDocumentStructure);
/**
 * MCP tool registry, keyed by tool name.
 *
 * Each entry has:
 *  - `name` / `description` / `inputSchema`: advertised to clients as-is;
 *  - `handler(args)`: resolves `args.documentPath` with resolveStagingPath,
 *    applies defaults for optional arguments and delegates to the matching
 *    implementation function. Handlers throw on bad input.
 *
 * `contextLines` uses `??` so that an explicit 0 (no context lines) is honored
 * instead of being replaced by the default.
 */
const MCP_TOOLS = {
  list_document_sections: {
    name: 'list_document_sections',
    description: 'Get hierarchical structure of document sections (headings, line ranges, metadata)',
    inputSchema: {
      type: 'object',
      properties: {
        documentPath: {
          type: 'string',
          description: 'Path to document in staging directory (e.g., "document.md" or "/staging/document.md")'
        },
        depth: {
          type: 'number',
          description: 'Maximum heading depth to return (1-6, default: 3)',
          default: 3
        }
      },
      required: ['documentPath']
    },
    handler: (args) => {
      const docPath = resolveStagingPath(args.documentPath);
      // Valid depths are 1-6, so any falsy value falls back to the default.
      const depth = args.depth || 3;
      return parseDocumentStructure(docPath, depth);
    }
  },
  read_section: {
    name: 'read_section',
    description: 'Read specific section with optional context lines before/after',
    inputSchema: {
      type: 'object',
      properties: {
        documentPath: {
          type: 'string',
          description: 'Path to document in staging directory'
        },
        sectionId: {
          type: 'string',
          description: 'Section ID from list_document_sections (e.g., "section-0")'
        },
        includeContext: {
          type: 'boolean',
          description: 'Include context lines before/after section',
          default: true
        },
        contextLines: {
          type: 'number',
          description: 'Number of context lines to include',
          default: 20
        }
      },
      required: ['documentPath', 'sectionId']
    },
    handler: (args) => {
      const docPath = resolveStagingPath(args.documentPath);
      return readSection(
        docPath,
        args.sectionId,
        // Context is on unless the caller passes exactly `false`.
        args.includeContext !== false,
        // Only a missing/null value takes the default; 0 is a valid request.
        args.contextLines ?? 20
      );
    }
  },
  read_lines: {
    name: 'read_lines',
    description: 'Read specific line range from document (1-indexed)',
    inputSchema: {
      type: 'object',
      properties: {
        documentPath: {
          type: 'string',
          description: 'Path to document in staging directory'
        },
        startLine: {
          type: 'number',
          description: 'Start line number (1-indexed, inclusive)'
        },
        endLine: {
          type: 'number',
          description: 'End line number (1-indexed, inclusive)'
        }
      },
      required: ['documentPath', 'startLine', 'endLine']
    },
    handler: (args) => {
      const docPath = resolveStagingPath(args.documentPath);
      return readLines(docPath, args.startLine, args.endLine);
    }
  },
  search_content: {
    name: 'search_content',
    description: 'Search for keywords/patterns in document using regex',
    inputSchema: {
      type: 'object',
      properties: {
        documentPath: {
          type: 'string',
          description: 'Path to document in staging directory'
        },
        query: {
          type: 'string',
          description: 'Search query (regex pattern supported)'
        },
        maxResults: {
          type: 'number',
          description: 'Maximum number of results to return',
          default: 10
        }
      },
      required: ['documentPath', 'query']
    },
    handler: (args) => {
      const docPath = resolveStagingPath(args.documentPath);
      return searchContent(docPath, args.query, args.maxResults || 10);
    }
  },
  // LODA-MCP-COMP-02: Token-optimized search tool
  loda_search: {
    name: 'loda_search',
    description: 'Token-optimized document search using LODA (LLM-Optimized Document Access). Returns relevant sections within optional token budget. Uses Bloom filters for O(1) section elimination and caching for 10x+ speedup.',
    inputSchema: {
      type: 'object',
      properties: {
        documentPath: {
          type: 'string',
          description: 'Path to document (absolute or relative to staging directory)'
        },
        query: {
          type: 'string',
          description: 'Search query (keywords or phrase)'
        },
        contextBudget: {
          type: 'number',
          description: 'Maximum tokens to return (null/omit for unlimited)'
        },
        maxSections: {
          type: 'number',
          description: 'Maximum sections to return (default: 5)',
          default: 5
        }
      },
      required: ['documentPath', 'query']
    },
    handler: (args) => {
      const docPath = resolveStagingPath(args.documentPath);
      // NOTE(review): a budget of 0 is forwarded as null (documented above as
      // "unlimited") — confirm that is intended.
      return lodaHandler.search(docPath, args.query, {
        contextBudget: args.contextBudget || null,
        maxSections: args.maxSections || 5
      });
    }
  }
};
/**
 * Resolve a client-supplied document path to a file on disk.
 *
 * Accepted forms:
 *  - "/staging/<name>": the "/staging/" prefix is stripped and the remainder
 *    is resolved inside STAGING_DIR;
 *  - any other absolute path: used unchanged;
 *  - a relative path: joined onto STAGING_DIR.
 *
 * NOTE(review): absolute paths and "../" segments are NOT confined to
 * STAGING_DIR, so a caller can name any file readable by this process. That is
 * kept because loda_search documents absolute paths, but it should be reviewed
 * before exposing the server to untrusted clients.
 *
 * @param {string} documentPath - Path as received from the client.
 * @returns {string} Path of an existing regular file.
 * @throws {Error} When `documentPath` is not a non-empty string, when nothing
 *   exists at the resolved location, or when it is not a regular file.
 */
function resolveStagingPath(documentPath) {
  if (typeof documentPath !== 'string' || documentPath.length === 0) {
    throw new Error('documentPath must be a non-empty string');
  }

  const requested = documentPath.startsWith('/staging/')
    ? documentPath.replace('/staging/', '')
    : documentPath;
  const resolved = path.isAbsolute(requested)
    ? requested
    : path.join(STAGING_DIR, requested);

  if (!fs.existsSync(resolved)) {
    throw new Error(`Document not found: ${requested} (resolved to: ${resolved})`);
  }
  // A directory passes the existence check but cannot be read as a document;
  // reject it here instead of failing later with EISDIR.
  if (!fs.statSync(resolved).isFile()) {
    throw new Error(`Document is not a file: ${requested} (resolved to: ${resolved})`);
  }
  return resolved;
}
/**
 * Execute an MCP tool by name and wrap the outcome in a result envelope.
 *
 * NOTE(review): the handler is called synchronously and its return value is
 * placed in the envelope unchanged; if a handler returned a Promise it would
 * not be awaited here — confirm all handlers are synchronous.
 *
 * @param {string} toolName - Key of the tool in MCP_TOOLS.
 * @param {object} args - Arguments passed to the tool's handler unchanged.
 * @returns {{success: true, result: *}|{success: false, error: string}}
 *   A failure envelope is returned when the handler throws.
 * @throws {Error} When `toolName` is not a registered tool.
 */
function executeTool(toolName, args) {
  // Own-property check: a plain lookup would also find inherited keys such as
  // "toString" or "constructor" and then fail with an unrelated
  // "tool.handler is not a function" error.
  const isRegistered = Object.prototype.hasOwnProperty.call(MCP_TOOLS, toolName);
  if (!isRegistered) {
    throw new Error(`Unknown tool: ${toolName}`);
  }

  const tool = MCP_TOOLS[toolName];
  try {
    return {
      success: true,
      result: tool.handler(args),
    };
  } catch (error) {
    return {
      success: false,
      error: error.message,
    };
  }
}
/**
 * Run the server over stdin/stdout.
 *
 * Each input line is parsed as one JSON request and answered with exactly one
 * JSON line on stdout:
 *  - method "tools/list": the advertised tool descriptors;
 *  - method "tools/call": the envelope returned by executeTool for
 *    `params.name` / `params.arguments`;
 *  - any other method, or any error while handling the line: a
 *    `{ success: false, error }` object.
 * Startup diagnostics go to stderr so they do not mix with responses.
 */
function startStdioServer() {
  const lineReader = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    terminal: false
  });

  console.error('[MCP Server] Started in stdio mode');
  console.error(`[MCP Server] Staging directory: ${STAGING_DIR}`);
  console.error('[MCP Server] Available tools:', Object.keys(MCP_TOOLS).join(', '));

  // Builds the reply object for one parsed request; may throw on malformed input.
  const buildReply = (request) => {
    switch (request.method) {
      case 'tools/list':
        return {
          tools: Object.values(MCP_TOOLS).map(({ name, description, inputSchema }) => ({
            name,
            description,
            inputSchema
          }))
        };
      case 'tools/call': {
        const { name, arguments: toolArgs } = request.params;
        return executeTool(name, toolArgs);
      }
      default:
        return {
          success: false,
          error: `Unknown method: ${request.method}`
        };
    }
  };

  lineReader.on('line', (line) => {
    try {
      const reply = buildReply(JSON.parse(line));
      console.log(JSON.stringify(reply));
    } catch (error) {
      // Covers invalid JSON, missing params and unknown tool names alike.
      console.log(JSON.stringify({
        success: false,
        error: error.message
      }));
    }
  });
}
/**
 * Run the server as a small JSON-over-HTTP API (for testing/debugging).
 *
 * Routes:
 *  - GET  /tools         -> `{ tools: [...] }` descriptors of every tool;
 *  - POST /tools/<name>  -> executes the tool with the JSON request body as
 *    its arguments; 200 with the executeTool envelope, or 400 with
 *    `{ success: false, error }` when the body is not valid JSON or
 *    executeTool throws;
 *  - GET  /health        -> status, staging directory and tool names;
 *  - anything else       -> 404 `{ error: 'Not found' }`.
 *
 * NOTE(review): every response carries `Access-Control-Allow-Origin: *`, and
 * `listen` is called without a host argument. Combined with the file-reading
 * tools this deserves a review before running on a shared machine.
 *
 * @param {number} [port=49400] - TCP port to listen on.
 */
function startHttpServer(port = 49400) {
  const http = require('http');

  // Writes the status line first, then the serialized payload.
  const sendJson = (res, status, payload) => {
    res.writeHead(status);
    res.end(JSON.stringify(payload));
  };

  const server = http.createServer((req, res) => {
    // Enable CORS
    res.setHeader('Access-Control-Allow-Origin', '*');
    res.setHeader('Content-Type', 'application/json');

    const { method, url } = req;

    if (method === 'GET' && url === '/tools') {
      const tools = Object.values(MCP_TOOLS).map(({ name, description, inputSchema }) => ({
        name,
        description,
        inputSchema
      }));
      sendJson(res, 200, { tools });
      return;
    }

    if (method === 'POST' && url.startsWith('/tools/')) {
      const toolName = url.replace('/tools/', '');
      let body = '';
      req.on('data', (chunk) => {
        body += chunk;
      });
      req.on('end', () => {
        try {
          const toolArgs = JSON.parse(body);
          sendJson(res, 200, executeTool(toolName, toolArgs));
        } catch (error) {
          sendJson(res, 400, {
            success: false,
            error: error.message
          });
        }
      });
      return;
    }

    if (method === 'GET' && url === '/health') {
      sendJson(res, 200, {
        status: 'healthy',
        stagingDir: STAGING_DIR,
        tools: Object.keys(MCP_TOOLS)
      });
      return;
    }

    sendJson(res, 404, { error: 'Not found' });
  });

  server.listen(port, () => {
    const banner = [
      `[MCP Server] HTTP mode listening on port ${port}`,
      `[MCP Server] Staging directory: ${STAGING_DIR}`,
      `[MCP Server] Health: http://localhost:${port}/health`,
      `[MCP Server] Tools: http://localhost:${port}/tools`
    ];
    for (const message of banner) {
      console.log(message);
    }
  });
}
// Parse command-line arguments.
const args = process.argv.slice(2);

/**
 * Read the value of a command-line option from `args`.
 *
 * Both `--name=value` and `--name value` are accepted; the `=` form wins when
 * both are present. Without the second form, `--mode http --port 49400` would
 * be ignored and the server would silently start in stdio mode.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string|undefined} The raw value, or undefined when not given.
 */
const readCliOption = (name) => {
  const inline = args.find((arg) => arg.startsWith(`--${name}=`));
  if (inline !== undefined) {
    return inline.split('=')[1];
  }
  const flagIndex = args.indexOf(`--${name}`);
  const next = flagIndex === -1 ? undefined : args[flagIndex + 1];
  // A following token that is itself a flag is not a value.
  return next !== undefined && !next.startsWith('-') ? next : undefined;
};

// Empty or missing values fall back to the defaults.
const mode = readCliOption('mode') || 'stdio';
const port = Number.parseInt(readCliOption('port'), 10) || 49400;

if (args.includes('--help') || args.includes('-h')) {
  console.log(`
MCP Document Access Server
Usage:
node document_access_mcp_server.js [options]
Options:
--mode=<mode> Server mode: stdio or http (default: stdio)
--port=<port> HTTP port (default: 49400, only for http mode)
--help, -h Show this help message
Modes:
stdio Standard I/O mode for MCP clients (production)
http HTTP REST API mode for testing/debugging
Examples:
# Stdio mode (for MCP integration)
node document_access_mcp_server.js --mode=stdio
# HTTP mode for testing
node document_access_mcp_server.js --mode=http --port=49400
Staging Directory:
${STAGING_DIR}
Place documents here for LLM access via MCP tools.
Available Tools:
- list_document_sections: Get document structure
- read_section: Read specific section with context
- read_lines: Read specific line range
- search_content: Search for keywords/patterns
- loda_search: Token-optimized search (LODA) with budget awareness
`);
  process.exit(0);
}

// Start server in appropriate mode; anything other than "http" runs stdio.
if (mode === 'http') {
  startHttpServer(port);
} else {
  startStdioServer();
}