# server.py
# standard libraries
import os
from pathlib import Path
from typing import Annotated
from pydantic import Field
from sys import argv
# third-party libraries
from mcp.server.fastmcp import FastMCP
from utils.prompts import get_instructions
from utils.path_files import list_documents_by_folder
from utils.markitdown import get_markdown_content
from utils.ocr import get_scanned_document_text
from utils.max_tokens import tokenizer
from dotenv import load_dotenv
# Load environment variables from the .env file sitting next to this script.
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), ".env"))

# Path to the Poppler binaries used for PDF rendering during OCR.
# NOTE(review): os.getenv returns None when POPPLER_PATH is unset, and
# Path(None) raises TypeError at import time — confirm .env always defines it.
POPPLER_PATH = Path(os.getenv("POPPLER_PATH"))

# CLI arguments: argv[1] = root folder served to clients; argv[2] = token
# budget applied when truncating extracted text. A missing argument raises
# IndexError at startup.
ROOT_PATH = str(argv[1])
MAX_TOKENS = int(argv[2])

# MCP server instance; the tools below register themselves via @mcp.tool.
mcp = FastMCP(
    name="Local Doc MCP",
    instructions=get_instructions(),
)
@mcp.tool(
    name="list_documents",
    description="List all documents in the specified root path.",
)
def list_documents() -> dict:
    """
    List documents in the given root path.

    Returns:
        dict: On success, a dictionary with:
            - root_path: the configured root directory
            - subpaths: documents grouped by folder (from
              list_documents_by_folder)
        On failure, {'error': <message>}.
    """
    try:
        return {
            'root_path': ROOT_PATH,
            'subpaths': list_documents_by_folder(ROOT_PATH)
        }
    except Exception as error:
        # BUG FIX: the original used {"error", f"..."} — a *set* literal,
        # not a dict — violating the declared dict return contract.
        return {"error": f"Error searching for documents: {error}"}
@mcp.tool(
    name="load_documents",
    description="Load and extract text from a document, converting it to markdown.",
)
def load_documents(
    relative_path: Annotated[str, Field(description="The relative path to the document from root_path")],
    document_name: Annotated[str, Field(description="The name and extension of the document (file.extension). If a pdf file cannot be converted to markdown, it have to be processed as a scanned document.")],
) -> dict:
    """
    Load and extract text from a document, converting it to markdown.

    Args:
        relative_path (str): The relative path to the document from root_path
        document_name (str): The name of the document file

    Returns:
        dict: On success: 'paths' (input/derived path info), 'usage' (token
        count, truncation flag, limit) and 'response' with the extracted
        'document' text. On failure: an 'error' entry (with 'paths' context
        when the failure happens after path validation).
    """
    try:
        # Construct and validate the file path.
        file_path = Path(ROOT_PATH) / relative_path / document_name
        # Security: relative_path/document_name come from the client; reject
        # any path that escapes ROOT_PATH via '..' or absolute components.
        if not file_path.resolve().is_relative_to(Path(ROOT_PATH).resolve()):
            raise PermissionError(f"Path escapes root directory: {file_path}")
        if not file_path.exists():
            raise FileNotFoundError(f"Document not found: {file_path}")
    except Exception as error:
        # FIX: message previously said "Error processing markdown file" even
        # though nothing has been processed yet — this is a lookup failure.
        return {"error": f"Error locating document: {error}"}
    try:
        # Convert the document to markdown text.
        text_content = get_markdown_content(file_path)
        # Enforce the configured token budget; tokenizer reports the original
        # token count and whether truncation occurred.
        output_text, original_token_count, was_truncated = tokenizer(
            text_content,
            MAX_TOKENS
        )
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "usage": {
                "tokens": original_token_count,
                "truncated": was_truncated,
                "max_tokens": MAX_TOKENS,
            },
            "response": {
                "document": output_text
            }
        }
    except Exception as error:
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "error": {
                "message": f"Error processing markdown file: {error}"
            }
        }
@mcp.tool(
    name="load_scanned_document",
    description="Load and extract text from a scanned document using Tesseract OCR.",
)
def load_scanned_document(
    relative_path: Annotated[str, Field(description="The relative path to the scanned document from root_path")],
    document_name: Annotated[str, Field(description="The name of the scanned document (file.pdf)")],
) -> dict:
    """
    Load and extract text from a scanned document using tesseract OCR.

    Args:
        relative_path (str): The relative path to the document from root_path
        document_name (str): The name of the document file

    Returns:
        dict: On success: 'paths', 'usage', 'processing_time' (OCR seconds)
        and 'response' with the extracted 'document' text. On failure: an
        'error' entry (with 'paths' context when the failure happens after
        path validation).
    """
    try:
        # Construct and validate the file path.
        file_path = Path(ROOT_PATH) / relative_path / document_name
        # Security: relative_path/document_name come from the client; reject
        # any path that escapes ROOT_PATH via '..' or absolute components.
        if not file_path.resolve().is_relative_to(Path(ROOT_PATH).resolve()):
            raise PermissionError(f"Path escapes root directory: {file_path}")
        if not file_path.exists():
            raise FileNotFoundError(f"Document not found: {file_path}")
    except Exception as error:
        # FIX: message previously said "markdown file" — copy-paste from the
        # markdown tool; this is a scanned-document lookup failure.
        return {"error": f"Error locating scanned document: {error}"}
    try:
        # Run OCR; returns a dict with 'content' and 'processing_time'.
        text_content = get_scanned_document_text(str(file_path), poppler_path=str(POPPLER_PATH))
        # Enforce the configured token budget on the OCR output.
        output_text, original_token_count, was_truncated = tokenizer(
            text_content['content'],
            MAX_TOKENS
        )
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "usage": {
                "tokens": original_token_count,
                "truncated": was_truncated,
                "max_tokens": MAX_TOKENS,
            },
            "processing_time": {
                "seconds": text_content['processing_time']
            },
            "response": {
                "document": output_text,
            }
        }
    except Exception as error:
        # Include path context in the error payload, consistent with
        # load_documents. FIX: message previously said "markdown file".
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "error": {
                "message": f"Error processing scanned document: {error}"
            }
        }
def _main() -> None:
    """Start the MCP server on the stdio transport (blocks until shutdown)."""
    mcp.run(transport='stdio')


if __name__ == "__main__":
    _main()