import fs from "node:fs/promises";
import path from "node:path";
import type { Document, ProgressCallback } from "../../types";
import { logger } from "../../utils/logger";
import { FileFetcher } from "../fetcher";
import type { RawContent } from "../fetcher/types";
import { PipelineFactory } from "../pipelines/PipelineFactory";
import type { ContentPipeline } from "../pipelines/types";
import type { ScraperOptions, ScraperProgress } from "../types";
import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy";

/**
 * LocalFileStrategy handles crawling and scraping of local files and folders using file:// URLs.
 *
 * All files with a MIME type of `text/*` are processed. This includes HTML, Markdown, plain text,
 * and source code files such as `.js`, `.ts`, `.tsx`, `.css`, etc. Binary files, PDFs, images,
 * and other non-text formats are ignored.
 *
 * Supports include/exclude filters and percent-encoded paths.
 */
export class LocalFileStrategy extends BaseScraperStrategy {
  private readonly fileFetcher = new FileFetcher();
  private readonly pipelines: ContentPipeline[];

  constructor() {
    super();
    this.pipelines = PipelineFactory.createStandardPipelines();
  }

  canHandle(url: string): boolean {
    return url.startsWith("file://");
  }

  protected async processItem(
    item: QueueItem,
    options: ScraperOptions,
    _progressCallback?: ProgressCallback<ScraperProgress>,
    _signal?: AbortSignal,
  ): Promise<{ document?: Document; links?: string[] }> {
    // Parse the file URL properly to handle both file:// and file:/// formats
    let filePath = item.url.replace(/^file:\/\/\/?/, "");
    filePath = decodeURIComponent(filePath);
    // Ensure absolute path on Unix-like systems (if not already absolute)
    if (!filePath.startsWith("/") && process.platform !== "win32") {
      filePath = `/${filePath}`;
    }
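    // Illustrative mapping on a Unix-like system (derived from the normalization above):
    //   "file:///tmp/docs%20v2/readme.md" -> "/tmp/docs v2/readme.md"
    //   "file://tmp/docs/readme.md"       -> "/tmp/docs/readme.md"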

    const stats = await fs.stat(filePath);
    if (stats.isDirectory()) {
      const contents = await fs.readdir(filePath);
      // Only return links that pass shouldProcessUrl
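      // (e.g. a directory "/docs" containing "intro.md" yields the link "file:///docs/intro.md";
      // links returned here are presumably re-queued by the base strategy)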
      const links = contents
        .map((name) => `file://${path.join(filePath, name)}`)
        .filter((url) => this.shouldProcessUrl(url, options));
      return { links };
    }

    logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
    const rawContent: RawContent = await this.fileFetcher.fetch(item.url);
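
    // Select the first pipeline that reports it can process this MIME type; if none
    // matches, the file is skipped with a warning below.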
    let processed: Awaited<ReturnType<ContentPipeline["process"]>> | undefined;
    for (const pipeline of this.pipelines) {
      if (pipeline.canProcess(rawContent)) {
        logger.debug(
          `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`,
        );
        processed = await pipeline.process(rawContent, options, this.fileFetcher);
        break;
      }
    }

    if (!processed) {
      logger.warn(
        `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`,
      );
      return { document: undefined, links: [] };
    }

    for (const err of processed.errors) {
      logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
    }

    return {
      document: {
        content: typeof processed.textContent === "string" ? processed.textContent : "",
        contentType: rawContent.mimeType,
        metadata: {
          url: rawContent.source,
          title:
            typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
          library: options.library,
          version: options.version,
        },
      } satisfies Document,
    };
  }

  /**
   * Clean up resources used by this strategy, specifically the pipeline browser instances.
   */
  async cleanup(): Promise<void> {
    await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
  }
}
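
// Usage sketch (illustrative, not part of the original file). Only the methods defined
// above are exercised; driving processItem() is assumed to be the responsibility of
// BaseScraperStrategy's own entry point:
//
//   const strategy = new LocalFileStrategy();
//   strategy.canHandle("file:///home/user/docs"); // true
//   strategy.canHandle("https://example.com");    // false
//   // ...run the scrape through the BaseScraperStrategy API...
//   await strategy.cleanup(); // closes any pipeline browser instances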