Skip to main content
Glama

RAG Document Server

by jaimeferj
document_processor.py3.44 kB
"""Document processor for handling text and markdown files.""" import hashlib from pathlib import Path from typing import Optional import aiofiles class DocumentProcessor: """Process and extract content from documents.""" SUPPORTED_EXTENSIONS = {".txt", ".md", ".mdx"} @staticmethod def is_supported(file_path: str | Path) -> bool: """Check if the file extension is supported.""" path = Path(file_path) return path.suffix.lower() in DocumentProcessor.SUPPORTED_EXTENSIONS @staticmethod async def read_file(file_path: str | Path) -> str: """Read file content asynchronously.""" async with aiofiles.open(file_path, "r", encoding="utf-8") as f: return await f.read() @staticmethod def generate_doc_id(content: str, filename: str) -> str: """Generate a unique document ID based on content and filename.""" hash_input = f"{filename}:{content}" return hashlib.sha256(hash_input.encode()).hexdigest()[:16] @staticmethod def extract_metadata(file_path: str | Path, base_path: Optional[str | Path] = None) -> dict[str, str]: """ Extract metadata from the file. Args: file_path: Path to the document file base_path: Optional base path to extract relative path from Returns: Dictionary with metadata including filesystem path """ path = Path(file_path).resolve() # Extract filesystem path structure path_structure = None if base_path: base = Path(base_path).resolve() try: # Get relative path from base rel_path = path.relative_to(base) # Convert to breadcrumb format (without filename) parts = list(rel_path.parts[:-1]) # Exclude filename if parts: path_structure = " > ".join(parts) except ValueError: # File is not relative to base_path, skip path structure pass return { "filename": path.name, "extension": path.suffix.lower(), "file_type": "text" if path.suffix == ".txt" else "markdown", "path_structure": path_structure, } async def process_document( self, file_path: str | Path, content: Optional[str] = None, base_path: Optional[str | Path] = None ) -> dict[str, str]: """ Process a document and return its content with metadata. Args: file_path: Path to the document file content: Optional pre-loaded content (if None, will read from file) base_path: Optional base path to extract relative path structure from Returns: Dictionary with 'content', 'doc_id', and metadata fields """ if not self.is_supported(file_path): raise ValueError( f"Unsupported file type. Supported: {self.SUPPORTED_EXTENSIONS}" ) # Read content if not provided if content is None: content = await self.read_file(file_path) # Extract metadata with path structure metadata = self.extract_metadata(file_path, base_path) doc_id = self.generate_doc_id(content, metadata["filename"]) return { "doc_id": doc_id, "content": content, **metadata, }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jaimeferj/mcp-rag-docs'

If you have feedback or need assistance with the MCP directory API, please join our Discord server