create_batch_job
Process large document sets efficiently by creating batch OCR jobs to extract text and tables into structured formats.
Instructions
Create a batch processing job for large file sets.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| arguments | Yes | Object containing a required `patterns` array (file glob patterns) plus optional OCR settings (`table_format`, `extract_header`, `extract_footer`, `include_images`, `use_inline`). | — |
Implementation Reference
- src/mcp_mistral_ocr_opt/main.py:439-510 (handler) — The implementation of the `create_batch_job` tool, which processes file batches based on the provided patterns.
@app.tool("create_batch_job") async def create_batch_job(arguments: Dict[str, Any]) -> List[TextContent]: """Create a batch processing job for large file sets.""" patterns = arguments.get("patterns") if not patterns or not isinstance(patterns, list): raise McpError( ErrorData(code=INVALID_PARAMS, message="patterns array is required") ) try: files = await discover_files( directory=config.ocr_dir, patterns=patterns, ) if not files: return [ TextContent( type="text", text=json.dumps( {"message": "No files found matching patterns"}, indent=2 ), ) ] batch_proc = await get_batch_processor() use_inline = arguments.get( "use_inline", len(files) < config.inline_batch_threshold ) if use_inline: # Inline batch requests = await batch_proc.prepare_inline_batch( files=files, table_format=arguments.get("table_format"), extract_header=arguments.get("extract_header", False), extract_footer=arguments.get("extract_footer", False), include_images=arguments.get("include_images", False), ) job_id = await batch_proc.process_inline_batch(requests) result = { "batch_type": "inline", "job_id": job_id, "files_queued": len(files), "message": f"Inline batch job created with {len(files)} files. Use check_batch_status to monitor progress.", } else: # File batch batch_file_id = await batch_proc.prepare_file_batch( files=files, table_format=arguments.get("table_format"), extract_header=arguments.get("extract_header", False), extract_footer=arguments.get("extract_footer", False), include_images=arguments.get("include_images", False), ) job_id = await batch_proc.process_file_batch(batch_file_id) result = { "batch_type": "file", "job_id": job_id, "batch_file_id": batch_file_id, "files_queued": len(files), "message": f"File batch job created with {len(files)} files. Use check_batch_status to monitor progress.", } return [ TextContent( type="text", text=json.dumps(result, indent=2, ensure_ascii=False) ) ]