process_document

Instructions

Sends a document to Unstructured for processing and returns the content of the document. Args: filepath: path to the document

Input Schema

Table | JSON Schema

Name	Required	Description	Default
`filepath`	Yes	Path to the document

Implementation Reference

doc_processor.py:83-132 (handler)
The core handler function for the 'process_document' tool, decorated with @mcp.tool() for automatic registration in FastMCP. It validates the file, checks supported extensions, partitions the document using UnstructuredClient API, saves elements as JSON, converts to formatted text using json_to_text helper, and returns the result.
# File extensions accepted before a document is sent for partitioning.
# Built once at import time instead of on every tool call.
_SUPPORTED_EXTENSIONS = frozenset({
    ".abw", ".bmp", ".csv", ".cwk", ".dbf", ".dif", ".doc", ".docm", ".docx",
    ".dot", ".dotm", ".eml", ".epub", ".et", ".eth", ".fods", ".gif", ".heic",
    ".htm", ".html", ".hwp", ".jpeg", ".jpg", ".md", ".mcw", ".mw", ".odt",
    ".org", ".p7s", ".pages", ".pbd", ".pdf", ".png", ".pot", ".potm", ".ppt",
    ".pptm", ".pptx", ".prn", ".rst", ".rtf", ".sdp", ".sgl", ".svg", ".sxg",
    ".tiff", ".txt", ".tsv", ".uof", ".uos1", ".uos2", ".web", ".webp", ".wk2",
    ".xls", ".xlsb", ".xlsm", ".xlsx", ".xlw", ".xml", ".zabw",
})


@mcp.tool()
async def process_document(ctx: Context, filepath: str) -> str:
    """
    Sends a document to process with Unstructured and returns the content of
    the document.

    The partitioned elements are also written as JSON to
    ``PROCESSED_FILES_FOLDER/<file basename>.json``; the returned text is
    produced from that JSON file by ``json_to_text``.

    Args:
        filepath: path to the document

    Returns:
        The formatted document content, or a human-readable error message when
        the path is not a file, the extension is not in the supported set, or
        an exception occurs while reading or processing the file.
    """
    if not os.path.isfile(filepath):
        return "File does not exist"

    # Check if the file extension is supported (case-insensitive).
    _, ext = os.path.splitext(filepath)
    if ext.lower() not in _SUPPORTED_EXTENSIONS:
        return "File extension not supported by Unstructured"

    client = ctx.request_context.lifespan_context.client
    file_basename = os.path.basename(filepath)

    os.makedirs(PROCESSED_FILES_FOLDER, exist_ok=True)

    try:
        # Keep the handle open only for the duration of the request so it is
        # always closed, including when partitioning raises.
        with open(filepath, "rb") as document:
            req = operations.PartitionRequest(
                partition_parameters=shared.PartitionParameters(
                    files=shared.Files(
                        content=document,
                        file_name=filepath,
                    ),
                    strategy=shared.Strategy.AUTO,
                ),
            )
            res = client.general.partition(request=req)

        element_dicts = list(res.elements)
        json_elements = json.dumps(element_dicts, indent=2)

        output_json_file_path = os.path.join(
            PROCESSED_FILES_FOLDER, f"{file_basename}.json"
        )
        with open(output_json_file_path, "w") as file:
            file.write(json_elements)

        return json_to_text(output_json_file_path)
    except Exception as e:
        # Tool boundary: report the failure to the caller as text rather than
        # letting the exception escape.
        return f"The following exception happened during file processing: {e}"
doc_processor.py:53-80 (helper)
Helper function to convert Unstructured JSON output to formatted HTML/text string, used by process_document.
def json_to_text(file_path) -> str:
    """Render a JSON file of document elements as one HTML-flavoured string.

    The file must contain a JSON list of objects. Each object's ``type``
    selects the markup wrapped around its stripped ``text``:

    * ``Title`` -> ``<h1> ...</h1><br>``
    * ``Header`` -> ``<h2> ...</h2><br/>``
    * ``NarrativeText`` / ``UncategorizedText`` -> ``<p>...</p>``
    * ``ListItem`` -> ``<li>...</li>``
    * ``PageNumber`` -> ``Page number: ...``
    * ``Table`` -> the ``text_as_html`` value from ``metadata`` (empty string
      when absent); the element's own text is not used
    * anything else -> the stripped text unchanged

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The rendered fragments joined by single spaces.
    """

    def render(record):
        # Read all three fields up front, in the same order for every record.
        body = record.get("text", "").strip()
        kind = record.get("type", "")
        extra = record.get("metadata", {})

        if kind == "Title":
            return f"<h1> {body}</h1><br>"
        if kind == "Header":
            return f"<h2> {body}</h2><br/>"
        if kind in ("NarrativeText", "UncategorizedText"):
            return f"<p>{body}</p>"
        if kind == "ListItem":
            return f"<li>{body}</li>"
        if kind == "PageNumber":
            return f"Page number: {body}"
        if kind == "Table":
            # Keep the table as HTML
            return extra.get("text_as_html", "")
        return body

    with open(file_path, 'r') as handle:
        records = json.load(handle)

    return " ".join(render(record) for record in records)

Unstructured Document Processor MCP

Instructions

Input Schema

Implementation Reference

Other Tools

Latest Blog Posts

MCP directory API