validate_document_schema

Name	Required	Description	Default
`document`	Yes	Document object to validate against knowledge base schema format

src/elasticsearch/sub_servers/elasticsearch_document.py:347-373 (handler)

The FastMCP tool handler for 'validate_document_schema'. This async function validates the input document using validate_document_structure from document_schema.py and formats success/error responses with guidance.

@app.tool(
    description="Validate document structure against knowledge base schema and provide formatting guidance",
    tags={"elasticsearch", "validation", "document", "schema"}
)
async def validate_document_schema(
    document: Annotated[
        Dict[str, Any],
        Field(description="Document object to validate against knowledge base schema format")]
) -> str:
    """Run schema validation on ``document`` and report the outcome as text.

    Returns a success message (including the validated document rendered as
    JSON and duplicate-avoidance advice) when validation passes, the output
    of ``format_validation_error`` when a ``DocumentValidationError`` is
    raised, and a short generic error line for any other exception.
    """
    try:
        checked_doc = validate_document_structure(document)
        # Serialisation stays inside the try so a non-serialisable value is
        # reported through the generic error path below.
        rendered = json.dumps(checked_doc, indent=2, ensure_ascii=False)
        sections = [
            "✅ Document validation successful!\n\n",
            f"Validated document:\n{rendered}\n\n",
            "Document is ready to be indexed.\n\n",
            "🚨 **RECOMMENDED: Check for Duplicates First**:\n",
            " 🔍 **Use index_document**: Built-in AI-powered duplicate detection\n",
            " 🔄 **Update instead of duplicate**: Modify existing documents when possible\n",
            " 📏 **Content length check**: If < 1000 chars, store in 'content' field directly\n",
            " 📁 **File creation**: Only for truly long content that needs separate storage\n",
            " 🎯 **Quality over quantity**: Prevent knowledge base bloat through smart reuse",
        ]
        return "".join(sections)
    except DocumentValidationError as validation_exc:
        return format_validation_error(validation_exc)
    except Exception as unexpected_exc:
        # Tool boundary: surface the problem to the caller as text.
        return "❌ Validation error: " + str(unexpected_exc)

src/elasticsearch/document_schema.py:168-277 (schema)

Core schema validation logic. Loads document_schema and document_validation from config.json, performs strict field checks, type validation, content checks, priority/source_type validation, etc. Called by the tool handler.

def _string_list_errors(values: Any, field_name: str, item_label: str) -> list:
    """Return error messages for a field that must be a list of non-empty strings."""
    if not isinstance(values, (list, tuple)):
        # Iterating a non-list would either crash (e.g. int) or silently
        # walk the characters of a string, so report it explicitly.
        return [f"Field '{field_name}' must be a list of non-empty strings"]
    return [
        f"{item_label} at index {i} must be a non-empty string"
        for i, item in enumerate(values)
        if not isinstance(item, str) or not item.strip()
    ]


def validate_document_structure(document: Dict[str, Any], base_directory: str = None, is_knowledge_doc: bool = True) -> Dict[str, Any]:
    """
    Validate document structure against schema with strict mode support.

    Args:
        document: Document to validate
        base_directory: Base directory for relative path conversion
            (accepted for interface compatibility; not used by this function)
        is_knowledge_doc: Whether this is a knowledge base document (default: True)

    Returns:
        The document that was passed in, unchanged, once all checks pass

    Raises:
        DocumentValidationError: If validation fails. All problems found in a
            validation phase are reported together, joined with "; ".
    """
    if not isinstance(document, dict):
        raise DocumentValidationError(
            f"Validation failed: Document must be an object (dict), got {type(document).__name__}"
        )

    errors = []
    validation_config = load_validation_config()
    document_schema = load_document_schema()

    # Extra fields are rejected only when strict validation is on AND extra
    # fields are explicitly disallowed.
    reject_extra_fields = (
        validation_config.get("strict_schema_validation", False)
        and not validation_config.get("allow_extra_fields", True)
    )

    if reject_extra_fields:
        if is_knowledge_doc:
            allowed_fields = set(document_schema["required_fields"])
            extra_fields = set(document.keys()) - allowed_fields
            if extra_fields:
                errors.append(f"Extra fields not allowed in strict mode: {', '.join(sorted(extra_fields))}. Allowed fields: {', '.join(sorted(allowed_fields))}")
        else:
            # There is no predefined schema for custom documents, so strict
            # mode rejects them outright.
            errors.append("Strict schema validation is enabled. Extra fields are not allowed for custom documents.")

    # Required fields are checked only for knowledge base documents.
    # NOTE(review): the "required_fields_only" setting previously selected
    # between two identical loops, so it has no effect on this check.
    if is_knowledge_doc:
        errors.extend(
            f"Missing required field: {field}"
            for field in document_schema["required_fields"]
            if field not in document
        )

    # Stop early: the detailed checks below assume the structure is complete.
    if errors:
        raise DocumentValidationError("Validation failed: " + "; ".join(errors))

    if is_knowledge_doc:
        # Validate field types
        for field, expected_type in document_schema["field_types"].items():
            if field in document and not isinstance(document[field], expected_type):
                errors.append(f"Field '{field}' must be of type {expected_type.__name__}, got {type(document[field]).__name__}")

        # Reject whitespace-only content. A wrongly typed value is left to the
        # type check above instead of crashing on .strip().
        # NOTE(review): an empty string "" is falsy and therefore still skips
        # this check, as before - confirm whether that is intended.
        content = document.get("content")
        if content and isinstance(content, str) and not content.strip():
            errors.append("Content cannot be empty or contain only whitespace")

        # Validate priority values
        if document.get("priority") not in document_schema["priority_values"]:
            errors.append(f"Priority must be one of {document_schema['priority_values']}, got '{document.get('priority')}'")

        # Validate source_type
        if document.get("source_type") not in document_schema["source_types"]:
            errors.append(f"Source type must be one of {document_schema['source_types']}, got '{document.get('source_type')}'")

        # Validate ID format. fullmatch is used because "$" with re.match
        # also matches just before a trailing newline, which let IDs such as
        # "abc\n" through.
        doc_id = document.get("id")
        if doc_id and (not isinstance(doc_id, str) or not re.fullmatch(r'[a-zA-Z0-9_-]+', doc_id)):
            errors.append("ID must contain only alphanumeric characters, hyphens, and underscores")

        # Validate timestamp format
        last_modified = document.get("last_modified")
        if last_modified:
            timestamp_ok = isinstance(last_modified, str)
            if timestamp_ok:
                try:
                    # fromisoformat historically rejects a trailing "Z".
                    datetime.fromisoformat(last_modified.replace('Z', '+00:00'))
                except ValueError:
                    timestamp_ok = False
            if not timestamp_ok:
                errors.append("last_modified must be in ISO 8601 format (e.g., '2025-01-04T10:30:00Z')")

        # Validate list-of-string fields (only when present and non-empty)
        for field_name, item_label in (
            ("tags", "Tag"),
            ("related", "Related document ID"),
            ("key_points", "Key point"),
        ):
            values = document.get(field_name)
            if values:
                errors.extend(_string_list_errors(values, field_name, item_label))

    if errors:
        raise DocumentValidationError("Validation failed: " + "; ".join(errors))

    return document

src/elasticsearch/document_schema.py:378-407 (helper)

Helper function to format validation errors with detailed guidance, example document structure, and schema requirements. Used by the tool handler.

def format_validation_error(error: DocumentValidationError, context: str = "general") -> str:
    """
    Render a validation failure as a user-facing help message.

    The message contains the error text, a summary of the schema
    requirements, and an example document rendered as JSON.

    Args:
        error: The validation error to report
        context: Passed to get_example_document to choose the example

    Returns:
        The assembled multi-line message
    """
    sample_doc = get_example_document(context)
    schema = load_document_schema()

    parts = [
        f"❌ Document validation failed!\n\n{str(error)}\n\n",
        "📋 Required fields and format:\n",
        # Requirements summary
        f"• Required fields: {', '.join(schema['required_fields'])}\n",
        f"• Priority values: {', '.join(schema['priority_values'])}\n",
        f"• Source types: {', '.join(schema['source_types'])}\n",
        "• ID format: alphanumeric, hyphens, underscores only\n",
        "• Timestamp format: ISO 8601 (YYYY-MM-DDTHH:MM:SSZ)\n\n",
        # Example document
        "📄 Example document format:\n",
        json.dumps(sample_doc, indent=2, ensure_ascii=False),
    ]
    return "".join(parts)

src/elasticsearch/document_schema.py:15-17 (helper)

Custom exception class for document validation failures, raised by validate_document_structure and caught by the tool handler.

class DocumentValidationError(Exception):
    """Signals that a document did not pass schema validation."""

src/elasticsearch/sub_servers/elasticsearch_document.py:347-373 (registration)

The @app.tool decorator registers 'validate_document_schema' as an MCP tool in the FastMCP app instance.

@app.tool(
    description="Validate document structure against knowledge base schema and provide formatting guidance",
    tags={"elasticsearch", "validation", "document", "schema"}
)
async def validate_document_schema(
    document: Annotated[
        Dict[str, Any],
        Field(description="Document object to validate against knowledge base schema format")]
) -> str:
    """Run schema validation on ``document`` and report the outcome as text.

    Returns a success message (including the validated document rendered as
    JSON and duplicate-avoidance advice) when validation passes, the output
    of ``format_validation_error`` when a ``DocumentValidationError`` is
    raised, and a short generic error line for any other exception.
    """
    try:
        checked_doc = validate_document_structure(document)
        # Serialisation stays inside the try so a non-serialisable value is
        # reported through the generic error path below.
        rendered = json.dumps(checked_doc, indent=2, ensure_ascii=False)
        sections = [
            "✅ Document validation successful!\n\n",
            f"Validated document:\n{rendered}\n\n",
            "Document is ready to be indexed.\n\n",
            "🚨 **RECOMMENDED: Check for Duplicates First**:\n",
            " 🔍 **Use index_document**: Built-in AI-powered duplicate detection\n",
            " 🔄 **Update instead of duplicate**: Modify existing documents when possible\n",
            " 📏 **Content length check**: If < 1000 chars, store in 'content' field directly\n",
            " 📁 **File creation**: Only for truly long content that needs separate storage\n",
            " 🎯 **Quality over quantity**: Prevent knowledge base bloat through smart reuse",
        ]
        return "".join(sections)
    except DocumentValidationError as validation_exc:
        return format_validation_error(validation_exc)
    except Exception as unexpected_exc:
        # Tool boundary: surface the problem to the caller as text.
        return "❌ Validation error: " + str(unexpected_exc)

Agent Knowledge MCP

Instructions

Input Schema

Implementation Reference

Other Tools

Latest Blog Posts

MCP directory API