Skip to main content
Glama
snussik
by snussik

create_batch_job

Process large document sets efficiently by creating batch OCR jobs to extract text and tables into structured formats.

Instructions

Create a batch processing job for large file sets.

Input Schema

Table: JSON Schema

| Name      | Required | Description | Default |
| --------- | -------- | ----------- | ------- |
| arguments | Yes      |             |         |

Implementation Reference

  • The implementation of the create_batch_job tool, which processes file batches based on provided patterns.
    @app.tool("create_batch_job")
    async def create_batch_job(arguments: Dict[str, Any]) -> List[TextContent]:
        """Create a batch processing job for large file sets."""
        patterns = arguments.get("patterns")
        if not patterns or not isinstance(patterns, list):
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="patterns array is required")
            )
    
        try:
            files = await discover_files(
                directory=config.ocr_dir,
                patterns=patterns,
            )
    
            if not files:
                return [
                    TextContent(
                        type="text",
                        text=json.dumps(
                            {"message": "No files found matching patterns"}, indent=2
                        ),
                    )
                ]
    
            batch_proc = await get_batch_processor()
    
            use_inline = arguments.get(
                "use_inline", len(files) < config.inline_batch_threshold
            )
    
            if use_inline:
                # Inline batch
                requests = await batch_proc.prepare_inline_batch(
                    files=files,
                    table_format=arguments.get("table_format"),
                    extract_header=arguments.get("extract_header", False),
                    extract_footer=arguments.get("extract_footer", False),
                    include_images=arguments.get("include_images", False),
                )
                job_id = await batch_proc.process_inline_batch(requests)
    
                result = {
                    "batch_type": "inline",
                    "job_id": job_id,
                    "files_queued": len(files),
                    "message": f"Inline batch job created with {len(files)} files. Use check_batch_status to monitor progress.",
                }
            else:
                # File batch
                batch_file_id = await batch_proc.prepare_file_batch(
                    files=files,
                    table_format=arguments.get("table_format"),
                    extract_header=arguments.get("extract_header", False),
                    extract_footer=arguments.get("extract_footer", False),
                    include_images=arguments.get("include_images", False),
                )
                job_id = await batch_proc.process_file_batch(batch_file_id)
    
                result = {
                    "batch_type": "file",
                    "job_id": job_id,
                    "batch_file_id": batch_file_id,
                    "files_queued": len(files),
                    "message": f"File batch job created with {len(files)} files. Use check_batch_status to monitor progress.",
                }
    
            return [
                TextContent(
                    type="text", text=json.dumps(result, indent=2, ensure_ascii=False)
                )
            ]

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/snussik/mcp_mistral_ocr_opt'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.