process_url_file
Extract text and tables from documents at URLs into structured markdown and HTML formats using optimized OCR processing.
Instructions
Process a file from a URL.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
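| arguments | Yes | Tool arguments object: `url` and `file_type` (`image` or `pdf`) are required; `table_format`, `extract_header`, `extract_footer`, and `include_images` are optional. | |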
Implementation Reference
- src/mcp_mistral_ocr_opt/main.py:400-436 (handler) MCP tool registration and handler function for "process_url_file". Validates input arguments and calls the underlying ocr_processor logic.
@app.tool("process_url_file") async def process_url_file(arguments: Dict[str, Any]) -> List[TextContent]: """Process a file from a URL.""" url = arguments.get("url") file_type = arguments.get("file_type") if not url: raise McpError(ErrorData(code=INVALID_PARAMS, message="url is required")) if not file_type: raise McpError(ErrorData(code=INVALID_PARAMS, message="file_type is required")) if file_type not in ["image", "pdf"]: raise McpError( ErrorData( code=INVALID_PARAMS, message="file_type must be either 'image' or 'pdf'" ) ) try: result = await ocr_processor.process_url_file( url=url, file_type=file_type, table_format=arguments.get("table_format"), extract_header=arguments.get("extract_header", False), extract_footer=arguments.get("extract_footer", False), include_images=arguments.get("include_images", False), ) return [ TextContent( type="text", text=json.dumps(result, indent=2, ensure_ascii=False) ) ] except Exception as e: raise McpError( ErrorData(code=INTERNAL_ERROR, message=f"Error processing URL: {str(e)}") ) - The core implementation of the OCR processing for files provided via URL using the Mistral SDK.
async def process_url_file( self, url: str, file_type: str, table_format: Optional[str] = None, extract_header: bool = False, extract_footer: bool = False, include_images: bool = False, ) -> Dict[str, Any]: """Process a file from a URL using Mistral's OCR capabilities. Args: url: URL of the file to process file_type: Type of file: 'image' or 'pdf' table_format: Table formatting option (null, markdown, html) extract_header: Extract document headers extract_footer: Extract document footers include_images: Include base64 images in output Returns: Dictionary with result and metadata """ if file_type not in ["image", "pdf"]: raise ValueError("file_type must be either 'image' or 'pdf'") # Get client from pool client_pool = await self._ensure_client_pool() client = await client_pool.get_client() try: # Build OCR parameters ocr_params = { "model": self.config.model, "document": { "type": "image_url" if file_type == "image" else "document_url", f"{'image' if file_type == 'image' else 'document'}_url": url, }, } # Add optional parameters (use defaults from config if not specified) final_table_format = table_format or self.config.default_table_format final_extract_header = extract_header or self.config.default_extract_header final_extract_footer = extract_footer or self.config.default_extract_footer final_include_images = include_images or self.config.default_include_images if final_table_format and final_table_format != "null": ocr_params["table_format"] = final_table_format if final_extract_header: ocr_params["extract_header"] = True if final_extract_footer: ocr_params["extract_footer"] = True if final_include_images: ocr_params["include_image_base64"] = True # Process the document response = await asyncio.to_thread(client.ocr.process, **ocr_params) # Convert response to JSON result = json.loads(self._process_response(response)) # Extract filename from URL parsed_url = urlparse(url) source_name = Path(parsed_url.path).stem or "url_document" # Save result to output directory 
result_path = self._save_result(result, source_name) # Add metadata to result result["_metadata"] = { "source_url": url, "output_file": str(result_path),