Skip to main content
Glama

RAG Document Server

by jaimeferj
text_chunker.py — 3.24 kB
"""Text chunking utility for splitting documents into smaller pieces.""" import re from typing import List class TextChunker: """Split text into overlapping chunks for better retrieval.""" def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200): """ Initialize the text chunker. Args: chunk_size: Maximum size of each chunk in characters chunk_overlap: Number of overlapping characters between chunks """ self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap def split_text(self, text: str) -> List[str]: """ Split text into chunks with overlap. Args: text: The text to split Returns: List of text chunks """ if not text or len(text) == 0: return [] # If text is shorter than chunk size, return as single chunk if len(text) <= self.chunk_size: return [text] chunks = [] start = 0 prev_start = -1 while start < len(text): # Calculate end position end = start + self.chunk_size # If this is not the last chunk, try to break at sentence boundary if end < len(text): # Look for sentence endings near the chunk boundary chunk_text = text[start:end] # Try to find the last sentence ending sentence_endings = [ m.end() for m in re.finditer(r'[.!?]\s+', chunk_text) ] if sentence_endings: # Use the last sentence ending as the break point end = start + sentence_endings[-1] else: # If no sentence ending, try to break at word boundary remaining_text = text[start:end] last_space = remaining_text.rfind(' ') if last_space > 0: end = start + last_space # Extract chunk chunk = text[start:end].strip() if chunk: chunks.append(chunk) # Move start position with overlap new_start = end - self.chunk_overlap # Ensure we make progress even with small chunks if new_start <= prev_start: new_start = end prev_start = start start = new_start return chunks def chunk_with_metadata( self, text: str, doc_id: str, metadata: dict = None ) -> List[dict]: """ Split text into chunks with metadata. 
Args: text: The text to split doc_id: Document identifier metadata: Additional metadata to include Returns: List of dictionaries with chunk text and metadata """ chunks = self.split_text(text) metadata = metadata or {} return [ { "text": chunk, "doc_id": doc_id, "chunk_index": idx, "total_chunks": len(chunks), **metadata, } for idx, chunk in enumerate(chunks) ]

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jaimeferj/mcp-rag-docs'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.