# server.py
# standard libraries
import os
from pathlib import Path
from typing import Annotated
from pydantic import Field
from sys import argv
# third-party libraries
from mcp.server.fastmcp import FastMCP
from utils.prompts import get_instructions
from utils.path_files import list_documents_by_folder
from utils.markitdown import get_markdown_content
from utils.ocr import get_scanned_document_text
from utils.max_tokens import tokenizer
from dotenv import load_dotenv
# Load environment variables from the .env file sitting next to this script.
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), ".env"))

# Path to the Poppler binaries used for PDF rendering during OCR.
# NOTE(review): os.getenv returns None when POPPLER_PATH is unset, and
# Path(None) raises TypeError at import time — confirm .env always defines it.
POPPLER_PATH = Path(os.getenv("POPPLER_PATH"))

# CLI arguments: argv[1] = root folder served to clients; argv[2] = token
# budget applied when truncating extracted text. A missing argument raises
# IndexError at startup.
ROOT_PATH = str(argv[1])
MAX_TOKENS = int(argv[2])

# MCP server instance; the tools below register themselves via @mcp.tool.
mcp = FastMCP(
    name="Local Doc MCP",
    instructions=get_instructions(),
)
@mcp.tool(
    name="list_documents",
    description="List all documents in the specified root path.",
)
def list_documents() -> dict:
    """
    List documents in the given root path.

    Returns:
        dict: On success, a dictionary with:
            - root_path: the configured root directory
            - subpaths: documents grouped by folder (from
              list_documents_by_folder)
        On failure, {'error': <message>}.
    """
    try:
        return {
            'root_path': ROOT_PATH,
            'subpaths': list_documents_by_folder(ROOT_PATH)
        }
    except Exception as error:
        # BUG FIX: the original used {"error", f"..."} — a *set* literal,
        # not a dict — violating the declared dict return contract.
        return {"error": f"Error searching for documents: {error}"}
@mcp.tool(
    name="load_documents",
    description="Load and extract text from a document, converting it to markdown.",
)
def load_documents(
    relative_path: Annotated[str, Field(description="The relative path to the document from root_path")],
    document_name: Annotated[str, Field(description="The name and extension of the document (file.extension). If a pdf file cannot be converted to markdown, it have to be processed as a scanned document.")],
) -> dict:
    """
    Load and extract text from a document, converting it to markdown.

    Args:
        relative_path (str): The relative path to the document from root_path
        document_name (str): The name of the document file

    Returns:
        dict: On success: 'paths' (input/derived path info), 'usage' (token
        count, truncation flag, limit) and 'response' with the extracted
        'document' text. On failure: an 'error' entry (with 'paths' context
        when the failure happens after path validation).
    """
    try:
        # Construct and validate the file path.
        file_path = Path(ROOT_PATH) / relative_path / document_name
        # Security: relative_path/document_name come from the client; reject
        # any path that escapes ROOT_PATH via '..' or absolute components.
        if not file_path.resolve().is_relative_to(Path(ROOT_PATH).resolve()):
            raise PermissionError(f"Path escapes root directory: {file_path}")
        if not file_path.exists():
            raise FileNotFoundError(f"Document not found: {file_path}")
    except Exception as error:
        # FIX: message previously said "Error processing markdown file" even
        # though nothing has been processed yet — this is a lookup failure.
        return {"error": f"Error locating document: {error}"}
    try:
        # Convert the document to markdown text.
        text_content = get_markdown_content(file_path)
        # Enforce the configured token budget; tokenizer reports the original
        # token count and whether truncation occurred.
        output_text, original_token_count, was_truncated = tokenizer(
            text_content,
            MAX_TOKENS
        )
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "usage": {
                "tokens": original_token_count,
                "truncated": was_truncated,
                "max_tokens": MAX_TOKENS,
            },
            "response": {
                "document": output_text
            }
        }
    except Exception as error:
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "error": {
                "message": f"Error processing markdown file: {error}"
            }
        }
@mcp.tool(
    name="load_scanned_document",
    description="Load and extract text from a scanned document using Tesseract OCR.",
)
def load_scanned_document(
    relative_path: Annotated[str, Field(description="The relative path to the scanned document from root_path")],
    document_name: Annotated[str, Field(description="The name of the scanned document (file.pdf)")],
) -> dict:
    """
    Load and extract text from a scanned document using tesseract OCR.

    Args:
        relative_path (str): The relative path to the document from root_path
        document_name (str): The name of the document file

    Returns:
        dict: On success: 'paths', 'usage', 'processing_time' (OCR seconds)
        and 'response' with the extracted 'document' text. On failure: an
        'error' entry (with 'paths' context when the failure happens after
        path validation).
    """
    try:
        # Construct and validate the file path.
        file_path = Path(ROOT_PATH) / relative_path / document_name
        # Security: relative_path/document_name come from the client; reject
        # any path that escapes ROOT_PATH via '..' or absolute components.
        if not file_path.resolve().is_relative_to(Path(ROOT_PATH).resolve()):
            raise PermissionError(f"Path escapes root directory: {file_path}")
        if not file_path.exists():
            raise FileNotFoundError(f"Document not found: {file_path}")
    except Exception as error:
        # FIX: message previously said "markdown file" — copy-paste from the
        # markdown tool; this is a scanned-document lookup failure.
        return {"error": f"Error locating scanned document: {error}"}
    try:
        # Run OCR; returns a dict with 'content' and 'processing_time'.
        text_content = get_scanned_document_text(str(file_path), poppler_path=str(POPPLER_PATH))
        # Enforce the configured token budget on the OCR output.
        output_text, original_token_count, was_truncated = tokenizer(
            text_content['content'],
            MAX_TOKENS
        )
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "usage": {
                "tokens": original_token_count,
                "truncated": was_truncated,
                "max_tokens": MAX_TOKENS,
            },
            "processing_time": {
                "seconds": text_content['processing_time']
            },
            "response": {
                "document": output_text,
            }
        }
    except Exception as error:
        # Include path context in the error payload, consistent with
        # load_documents. FIX: message previously said "markdown file".
        return {
            "paths": {
                "root_path": ROOT_PATH,
                "relative_path": relative_path,
                "document_name": document_name,
                "file_path": str(file_path)
            },
            "error": {
                "message": f"Error processing scanned document: {error}"
            }
        }
def _main() -> None:
    """Start the MCP server on the stdio transport (blocks until shutdown)."""
    mcp.run(transport='stdio')


if __name__ == "__main__":
    _main()