"""
Generic document management for MCP Server vector store
Handles any type of document with flexible categorization and metadata
"""
import re
from datetime import datetime
from typing import Any
from ...mcp import mcp
from .vector_store import vector_store_add, vector_store_info, vector_store_search
# Document categories and types
DOCUMENT_CATEGORIES = {
"technical": ["documentation", "manual", "guide", "reference", "api"],
"business": ["policy", "procedure", "compliance", "governance", "strategy"],
"educational": ["tutorial", "course", "lesson", "training", "workshop"],
"legal": ["contract", "agreement", "terms", "privacy", "regulation"],
"research": ["paper", "study", "analysis", "report", "whitepaper"],
"general": ["misc", "other", "uncategorized"],
}
SOURCE_TYPES = ["pdf", "web", "file", "api", "manual"]
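# A classification sketch (hypothetical example): a REST API reference fetched
# from a website would be ingested with category="technical", doc_type="api",
# and source_type="web"; unrecognized categories fall back to "general".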
@mcp.tool(
name="document_ingest",
description="Ingest any document into the vector store with flexible metadata",
)
async def document_ingest(
content: str | list[str],
document_title: str,
source_type: str = "manual",
category: str = "general",
doc_type: str = "documentation",
tags: list[str] | None = None,
metadata: dict[str, Any] | None = None,
collection_name: str = "documents",
chunk_size: int = 1200,
overlap_size: int = 200,
) -> dict[str, Any]:
"""
Ingest any document content into the vector store
Args:
content: Document content (string or list of strings)
document_title: Human-readable title for the document
source_type: Type of source (pdf, web, file, api, manual)
category: Document category (technical, business, educational, legal, research, general)
doc_type: Specific document type (documentation, manual, guide, etc.)
tags: Optional list of tags for categorization
metadata: Additional custom metadata
collection_name: Vector store collection name
chunk_size: Size of text chunks for embedding
overlap_size: Overlap between chunks to maintain context
Returns:
Dictionary with ingestion results
"""
try:
# Validate inputs
if not content or not document_title:
return {
"success": False,
"error": "Content and document title are required",
}
# Convert content to string if it's a list
if isinstance(content, list):
text_content = "\n\n".join(content)
else:
text_content = content
        # Coerce unrecognized categories and source types to safe defaults
        if category not in DOCUMENT_CATEGORIES:
            category = "general"
        if source_type not in SOURCE_TYPES:
            source_type = "manual"
# Create chunks
chunks = create_smart_chunks(text_content, chunk_size, overlap_size)
# Prepare base metadata (ChromaDB requires scalar values)
base_metadata = {
"document_title": document_title,
"source_type": source_type,
"category": category,
"doc_type": doc_type,
"tags": ",".join(tags or []), # Convert list to comma-separated string
"ingested_at": datetime.now().isoformat(),
"content_length": len(text_content),
**(metadata or {}),
}
# Create chunk metadata and IDs
safe_title = create_safe_document_id(document_title)
        # Each chunk's metadata records its position within the document
chunk_metadatas = [
{
**base_metadata,
"chunk_index": i,
"total_chunks": len(chunks),
"chunk_size": len(chunk),
}
for i, chunk in enumerate(chunks)
]
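        # Note: chunk IDs derive from the title alone, so re-ingesting a
        # document with the same title yields the same IDs and may overwrite
        # or collide with earlier chunks, depending on the vector store.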
chunk_ids = [f"{safe_title}_chunk_{i:04d}" for i in range(len(chunks))]
# Add to vector store
result = await vector_store_add(
documents=chunks,
ids=chunk_ids,
metadatas=chunk_metadatas,
collection_name=collection_name,
)
if result["success"]:
result.update(
{
"document_title": document_title,
"category": category,
"doc_type": doc_type,
"total_text_length": len(text_content),
"chunks_created": len(chunks),
"safe_document_id": safe_title,
}
)
return result
except Exception as e:
return {"success": False, "error": f"Document ingestion failed: {str(e)}"}
@mcp.tool(
name="document_search",
description="Search documents with category and type filtering",
)
async def document_search(
query: str,
category: str | None = None,
doc_type: str | None = None,
tags: list[str] | None = None,
collection_name: str = "documents",
n_results: int = 5,
) -> dict[str, Any]:
"""
Search documents with filtering by category, type, and tags
Args:
query: Search query text
category: Filter by document category
doc_type: Filter by document type
tags: Filter by tags (documents must have at least one matching tag)
collection_name: Vector store collection name
n_results: Maximum number of results to return
Returns:
Dictionary with search results
"""
try:
# Perform initial search
search_result = await vector_store_search(
query=query,
n_results=n_results * 3, # Get more results for filtering
collection_name=collection_name,
include_distances=True,
)
if not search_result["success"]:
return search_result
# Filter results based on criteria
filtered_results = []
for result in search_result["results"]:
metadata = result["metadata"]
# Category filter
if category and metadata.get("category") != category:
continue
# Doc type filter
if doc_type and metadata.get("doc_type") != doc_type:
continue
# Tags filter
if tags:
result_tags_str = metadata.get("tags", "")
result_tags = result_tags_str.split(",") if result_tags_str else []
if not any(tag in result_tags for tag in tags):
continue
# Add additional fields for display
result["category"] = metadata.get("category", "unknown")
result["doc_type"] = metadata.get("doc_type", "unknown")
tags_str = metadata.get("tags", "")
result["tags"] = tags_str.split(",") if tags_str else []
result["document_title"] = metadata.get("document_title", "Untitled")
filtered_results.append(result)
if len(filtered_results) >= n_results:
break
return {
"success": True,
"query": query,
"filters": {"category": category, "doc_type": doc_type, "tags": tags},
"results": filtered_results,
"total_found": len(filtered_results),
"searched_collection": collection_name,
}
except Exception as e:
return {"success": False, "error": f"Document search failed: {str(e)}"}
@mcp.tool(
name="document_list",
description="List all documents in the vector store with metadata",
)
async def document_list(
collection_name: str = "documents",
category: str | None = None,
doc_type: str | None = None,
) -> dict[str, Any]:
"""
List all documents with their metadata
Args:
collection_name: Vector store collection name
category: Filter by category
doc_type: Filter by document type
Returns:
Dictionary with document list
"""
try:
# Get collection info
info_result = await vector_store_info(collection_name)
if not info_result["success"]:
return info_result
        # Workaround: fetch chunks through a broad search, capped at 100, so
        # large collections may be listed incompletely. A real implementation
        # would enumerate the collection directly.
        chunk_count = info_result["current_collection"]["count"]
        all_docs_result = await vector_store_search(
            query="",  # Empty query returns an unfocused sample of chunks
            n_results=min(max(chunk_count, 1), 100),  # Never request zero results
            collection_name=collection_name,
        )
if not all_docs_result["success"]:
return all_docs_result
# Group documents by title and extract unique documents
documents = {}
for result in all_docs_result["results"]:
metadata = result["metadata"]
doc_title = metadata.get("document_title", "Untitled")
# Apply filters
if category and metadata.get("category") != category:
continue
if doc_type and metadata.get("doc_type") != doc_type:
continue
if doc_title not in documents:
tags_str = metadata.get("tags", "")
tags_list = tags_str.split(",") if tags_str else []
documents[doc_title] = {
"title": doc_title,
"category": metadata.get("category", "unknown"),
"doc_type": metadata.get("doc_type", "unknown"),
"source_type": metadata.get("source_type", "unknown"),
"tags": tags_list,
"ingested_at": metadata.get("ingested_at", "unknown"),
"total_chunks": metadata.get("total_chunks", 1),
"content_length": metadata.get("content_length", 0),
}
return {
"success": True,
"collection": collection_name,
"filters": {"category": category, "doc_type": doc_type},
"documents": list(documents.values()),
"total_documents": len(documents),
"total_chunks": info_result["current_collection"]["count"],
}
except Exception as e:
return {"success": False, "error": f"Document listing failed: {str(e)}"}
@mcp.tool(
name="document_categories",
description="Get available document categories and types",
)
async def document_categories() -> dict[str, Any]:
"""
Get available document categories and types for classification
Returns:
Dictionary with available categories and types
"""
return {
"success": True,
"categories": DOCUMENT_CATEGORIES,
"source_types": SOURCE_TYPES,
"usage": {
"categories": "High-level classification of document purpose",
"doc_types": "Specific type within each category",
"source_types": "How the document was ingested",
"tags": "Custom labels for flexible categorization",
},
}
def create_smart_chunks(text: str, chunk_size: int, overlap_size: int) -> list[str]:
"""Create simple text chunks with sentence boundary detection"""
if len(text) <= chunk_size:
return [text]
chunks = []
start = 0
while start < len(text):
end = start + chunk_size
if end >= len(text):
chunks.append(text[start:].strip())
break
        # Prefer to cut at the last sentence ending past the window's midpoint
chunk_text = text[start:end]
if (sentence_end := chunk_text.rfind(". ")) > chunk_size * 0.5:
chunks.append(text[start : start + sentence_end + 1].strip())
start = start + sentence_end + 1 - overlap_size
else:
chunks.append(chunk_text)
start = end - overlap_size
return [chunk for chunk in chunks if chunk.strip()]
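# Worked example of the boundary rule: with chunk_size=40 and overlap_size=10,
# a window whose last ". " falls past the midpoint (position 20) is cut there;
# otherwise the cut is a hard 40 characters:
#
#     chunks = create_smart_chunks("x" * 30 + ". " + "y" * 30, 40, 10)
#     # chunks[0] == "x" * 30 + "."  (cut at the sentence boundary)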
def create_safe_document_id(title: str) -> str:
"""Create a safe ID from document title"""
# Convert to lowercase and replace spaces/special chars with underscores
safe_id = re.sub(r"[^a-zA-Z0-9_-]", "_", title.lower().strip())
# Remove multiple consecutive underscores
safe_id = re.sub(r"_+", "_", safe_id)
# Remove leading/trailing underscores
safe_id = safe_id.strip("_")
# Limit length
if len(safe_id) > 50:
safe_id = safe_id[:50].rstrip("_")
return safe_id or "untitled_document"
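# Worked examples of ID derivation (lowercase, replace special characters,
# collapse underscores, trim, cap at 50 characters):
#
#     create_safe_document_id("MCP Server: Setup Guide (v2)!")
#     # -> "mcp_server_setup_guide_v2"
#     create_safe_document_id("!!!")
#     # -> "untitled_document"  (everything stripped; fallback applied)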