MCP Power - Knowledge Search Server

Overview Schema Related Servers Score Discussions

mcpower
src
config

datasets.ts

datasets.ts•9.88 KiB

import { readdir, readFile, access, stat } from 'fs/promises';
import { join, resolve, isAbsolute, dirname } from 'path';
import { Dataset, DatasetManifestSchema, DatasetError, DatasetWithStatus } from '../types/dataset.js';
import { logger } from '../logger.js';
import { pythonBridge } from '../bridge/pythonBridge.js';

/**
 * Security constants for dataset loading
 */
const MAX_MANIFEST_SIZE = 10 * 1024; // 10KB
// NOTE(review): not referenced anywhere in this file; load() only scans direct
// subdirectories of the datasets directory.
const MAX_RECURSION_DEPTH = 3; // Limit directory traversal depth

/**
 * Maps a manifest loading failure to a coarse error category used in logs.
 *
 * Error types and codes are checked first because message text is unreliable:
 * a missing file surfaces from Node as "ENOENT: no such file or directory",
 * which does not contain the phrase "not found". The message-substring checks
 * are kept as a fallback for the plain `Error`s thrown inside loadManifest.
 *
 * @param error - Value caught while loading a manifest
 * @returns 'json_parse_error', 'validation_error', 'file_not_found', or 'unknown'
 */
function classifyLoadError(error: unknown): string {
  if (!(error instanceof Error)) {
    return 'unknown';
  }

  // JSON.parse reports malformed input as a SyntaxError.
  if (error instanceof SyntaxError) {
    return 'json_parse_error';
  }

  // Presumably DatasetManifestSchema is a Zod schema, whose parse() throws an
  // error named 'ZodError' — TODO confirm against ../types/dataset.js.
  if (error.name === 'ZodError') {
    return 'validation_error';
  }

  // Filesystem errors from fs/promises carry a string `code` property.
  const code = 'code' in error ? (error as { code?: unknown }).code : undefined;
  if (code === 'ENOENT') {
    return 'file_not_found';
  }

  if (error.message.includes('JSON')) {
    return 'json_parse_error';
  }
  if (error.message.includes('not found')) {
    return 'file_not_found';
  }
  if (error.message.includes('Required')) {
    return 'validation_error';
  }
  return 'unknown';
}

/**
 * DatasetRegistry - Manages dataset discovery, loading, and validation
 *
 * Scans the datasets directory, validates each manifest, and maintains a
 * registry of available datasets. Failures of individual datasets are logged
 * and recorded instead of being thrown, so one bad dataset does not prevent
 * the others from loading.
 *
 * @example
 * ```typescript
 * const registry = new DatasetRegistry('./datasets');
 * await registry.load();
 * const dataset = registry.get('my-dataset');
 * const stats = registry.getStats();
 * ```
 */
export class DatasetRegistry {
  private readonly datasets: Map<string, DatasetWithStatus> = new Map();
  private loadErrors: DatasetError[] = [];
  private readonly datasetsPath: string;

  /**
   * Creates a new DatasetRegistry instance
   *
   * @param datasetsPath - Absolute or relative path to datasets directory (default: './datasets').
   *   Relative paths are resolved against the current working directory.
   */
  constructor(datasetsPath: string = './datasets') {
    this.datasetsPath = resolve(datasetsPath);
  }

  /**
   * Loads all datasets from the datasets directory
   *
   * Looks for a manifest.json in every direct subdirectory, validates it,
   * verifies the referenced files exist, and registers the dataset. Loading
   * continues when an individual dataset fails (graceful degradation).
   *
   * Each call starts from an empty registry, so calling load() again rescans
   * the directory instead of accumulating duplicate errors or stale datasets.
   *
   * Security: Limited to direct subdirectories only (max depth 1) to prevent
   * directory traversal attacks and excessive filesystem operations.
   *
   * @throws Never throws - per-manifest failures are logged and tracked in the
   *   loadErrors array; a failure to read the datasets directory itself is
   *   logged only.
   */
  async load(): Promise<void> {
    // Reset state so repeated calls are idempotent.
    this.datasets.clear();
    this.loadErrors = [];

    logger.info({ datasetsPath: this.datasetsPath }, 'Starting dataset discovery');

    try {
      // Check if datasets directory exists
      await access(this.datasetsPath);
    } catch {
      logger.warn({ datasetsPath: this.datasetsPath }, 'Datasets directory does not exist, creating empty registry');
      return;
    }

    try {
      // Phase 6: Only scan direct subdirectories (T046 - Security hardening)
      // Depth limited to 1 level to prevent directory traversal attacks
      const entries = await readdir(this.datasetsPath, { withFileTypes: true });
      const directories = entries.filter(entry => entry.isDirectory());

      logger.info({ count: directories.length }, 'Found dataset directories');

      // Manifests are loaded one at a time, in directory-listing order, so that
      // when two manifests declare the same id the later one deterministically wins.
      for (const dir of directories) {
        const manifestPath = join(this.datasetsPath, dir.name, 'manifest.json');
        await this.loadManifest(manifestPath);
      }

      const successCount = this.listReady().length;
      const errorCount = this.loadErrors.length;

      logger.info(
        { total: directories.length, success: successCount, errors: errorCount },
        'Dataset discovery completed'
      );
    } catch (error) {
      logger.error({ error, datasetsPath: this.datasetsPath }, 'Failed to scan datasets directory');
    }
  }

  /**
   * Load and validate a single dataset manifest
   *
   * Steps: enforce the manifest size limit, parse the JSON, validate it against
   * DatasetManifestSchema, resolve the index/metadata paths, check that both
   * exist, and ask the Python bridge to validate the index. Any failure other
   * than index validation is recorded in loadErrors and logged; the method
   * itself never rejects.
   *
   * @param manifestPath - Path to the manifest.json file to load
   */
  private async loadManifest(manifestPath: string): Promise<void> {
    let datasetId: string | undefined;

    try {
      // Phase 6: Validate manifest file size (T046 - Security hardening)
      const stats = await stat(manifestPath);
      if (stats.size > MAX_MANIFEST_SIZE) {
        throw new Error(
          `Manifest file exceeds maximum size (${stats.size} > ${MAX_MANIFEST_SIZE} bytes). ` +
            `This limit prevents DoS attacks from large manifest files.`
        );
      }

      // Read manifest file
      const manifestContent = await readFile(manifestPath, 'utf-8');
      const manifestData: unknown = JSON.parse(manifestContent);

      // Validate against schema
      const dataset = DatasetManifestSchema.parse(manifestData);
      datasetId = dataset.id;

      // Relative paths in the manifest are resolved against the directory that
      // contains the manifest; absolute paths are only normalized.
      const manifestDir = dirname(resolve(manifestPath));
      const resolvePath = (pathValue: string): string =>
        isAbsolute(pathValue) ? resolve(pathValue) : resolve(join(manifestDir, pathValue));

      const resolvedDataset: Dataset = {
        ...dataset,
        index: resolvePath(dataset.index),
        metadata: resolvePath(dataset.metadata)
      };

      // Verify index directory exists
      try {
        await access(resolvedDataset.index);
      } catch {
        throw new Error(`Index directory not found: ${resolvedDataset.index}`);
      }

      // Verify metadata file exists
      try {
        await access(resolvedDataset.metadata);
      } catch {
        throw new Error(`Metadata file not found: ${resolvedDataset.metadata}`);
      }

      // Phase 5: Validate FAISS index using Python bridge (T038)
      try {
        const validationResult = await pythonBridge.validateIndex(resolvedDataset.index);

        if (validationResult.status === 'error') {
          throw new Error(`FAISS index validation failed: ${validationResult.error || 'Unknown error'}`);
        }

        logger.debug(
          {
            datasetId: dataset.id,
            indexPath: resolvedDataset.index,
            validation: validationResult
          },
          'FAISS index validated successfully'
        );
      } catch (validationError) {
        // Validation failure is deliberately non-fatal: the dataset is still
        // registered below (with status 'ready') so it can be listed, but
        // searches against it may fail.
        const errorMsg = validationError instanceof Error ? validationError.message : String(validationError);
        logger.warn(
          {
            datasetId: dataset.id,
            indexPath: resolvedDataset.index,
            error: errorMsg
          },
          'FAISS index validation failed, dataset may not be searchable'
        );
      }

      // Surface id collisions; the later manifest still replaces the earlier one.
      if (this.datasets.has(dataset.id)) {
        logger.warn(
          { datasetId: dataset.id, manifestPath: resolve(manifestPath) },
          'Duplicate dataset id, replacing previously registered dataset'
        );
      }

      // Register dataset as ready
      this.datasets.set(dataset.id, {
        ...resolvedDataset,
        status: 'ready'
      });

      logger.info(
        {
          datasetId: dataset.id,
          name: dataset.name,
          indexPath: resolvedDataset.index,
          metadataPath: resolvedDataset.metadata
        },
        'Dataset loaded successfully'
      );
    } catch (error) {
      // Phase 5: Enhanced error logging with full context (SC-003)
      const errorMessage = error instanceof Error ? error.message : String(error);

      // Determine error type for better diagnostics
      const errorType = classifyLoadError(error);

      this.loadErrors.push({
        manifestPath,
        error: errorMessage,
        timestamp: new Date()
      });

      // Enhanced structured logging with full paths and context
      logger.error(
        {
          datasetId: datasetId || 'unknown',
          manifestPath: resolve(manifestPath), // Full absolute path
          errorType,
          error: errorMessage,
          datasetsDirectory: this.datasetsPath
        },
        'Failed to load dataset manifest'
      );
    }
  }

  /**
   * Retrieves a dataset by its unique identifier
   *
   * @param id - Dataset identifier (as specified in manifest.json)
   * @returns Dataset with status, or undefined if not found
   *
   * @example
   * ```typescript
   * const dataset = registry.get('my-docs');
   * if (dataset && dataset.status === 'ready') {
   *   // Use dataset
   * }
   * ```
   */
  get(id: string): DatasetWithStatus | undefined {
    return this.datasets.get(id);
  }

  /**
   * Checks if a dataset exists and is ready for use
   *
   * @param id - Dataset identifier
   * @returns true if dataset exists and status is 'ready'
   *
   * @example
   * ```typescript
   * if (registry.has('my-docs')) {
   *   // Dataset is available for searching
   * }
   * ```
   */
  has(id: string): boolean {
    return this.datasets.get(id)?.status === 'ready';
  }

  /**
   * Lists all registered datasets, whatever their status
   *
   * Manifests that failed to load are not registered; they are reported by
   * getErrors() instead.
   *
   * @returns Array of all registered datasets with their status
   */
  list(): DatasetWithStatus[] {
    return Array.from(this.datasets.values());
  }

  /**
   * Lists only datasets whose status is 'ready'
   *
   * Use this for presenting available datasets to users.
   *
   * @returns Array of ready datasets
   */
  listReady(): DatasetWithStatus[] {
    return this.list().filter(d => d.status === 'ready');
  }

  /**
   * Retrieves all dataset loading errors for diagnostics
   *
   * Returns the per-manifest failures recorded by the most recent load(),
   * such as unreadable or oversized manifests, schema validation failures,
   * and missing index or metadata files.
   *
   * @returns A copy of the recorded errors, so callers cannot mutate the
   *   registry's internal list
   */
  getErrors(): DatasetError[] {
    return [...this.loadErrors];
  }

  /**
   * Gets registry statistics for monitoring and diagnostics
   *
   * @returns Statistics object where `total` is registered datasets plus
   *   failed manifests, `ready` is the number of datasets with status 'ready',
   *   and `errors` is the number of failed manifests
   *
   * @example
   * ```typescript
   * const stats = registry.getStats();
   * console.log(`${stats.ready}/${stats.total} datasets ready, ${stats.errors} errors`);
   * ```
   */
  getStats(): { total: number; ready: number; errors: number } {
    const ready = this.listReady().length;
    const errors = this.loadErrors.length;

    return {
      total: this.datasets.size + errors,
      ready,
      errors
    };
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wspotter/mcpower'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

datasets.ts•9.88 KiB