Skip to main content
Glama
cbcoutinho

Nextcloud MCP Server

by cbcoutinho
document_chunker.py3.57 kB
"""Document chunking for large texts using LangChain text splitters.""" import logging from dataclasses import dataclass from langchain_text_splitters import RecursiveCharacterTextSplitter logger = logging.getLogger(__name__) @dataclass class ChunkWithPosition: """A text chunk with its character position in the original document.""" text: str start_offset: int # Character position where chunk starts end_offset: int # Character position where chunk ends (exclusive) page_number: int | None = None # Page number for PDF chunks (optional) metadata: dict | None = None # Additional processor-specific metadata (optional) class DocumentChunker: """Chunk large documents for optimal embedding using LangChain text splitters. Uses RecursiveCharacterTextSplitter which preserves semantic boundaries by splitting on sentence and paragraph boundaries before resorting to character-level splitting. """ def __init__(self, chunk_size: int = 2048, overlap: int = 200): """ Initialize document chunker. Args: chunk_size: Number of characters per chunk (default: 2048) overlap: Number of overlapping characters between chunks (default: 200) """ self.chunk_size = chunk_size self.overlap = overlap # Initialize LangChain RecursiveCharacterTextSplitter # Uses hierarchical splitting to preserve semantic boundaries: # - Paragraphs (\n\n) # - Sentences (. ! ?) # - Words (spaces) # - Characters (last resort) # This prevents mid-sentence splitting while maintaining semantic coherence self.splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=overlap, add_start_index=True, # Enable position tracking strip_whitespace=True, ) async def chunk_text(self, content: str) -> list[ChunkWithPosition]: """ Split text into overlapping chunks with position tracking. Uses LangChain's RecursiveCharacterTextSplitter to create chunks that preserve semantic boundaries by splitting at paragraphs and sentences before resorting to word or character-level splitting. 
This ensures sentences are kept intact. Preserves character positions for each chunk to enable precise document retrieval. Args: content: Text content to chunk Returns: List of chunks with their character positions in the original content """ import anyio # Handle empty content - return single empty chunk for backward compatibility if not content: return [ChunkWithPosition(text="", start_offset=0, end_offset=0)] # Run CPU-bound text splitting in thread pool to avoid blocking event loop docs = await anyio.to_thread.run_sync( # type: ignore[attr-defined] self.splitter.create_documents, [content], ) # Convert LangChain Documents to ChunkWithPosition objects chunks = [ ChunkWithPosition( text=doc.page_content, start_offset=doc.metadata.get("start_index", 0), end_offset=doc.metadata.get("start_index", 0) + len(doc.page_content), ) for doc in docs ] logger.debug( f"Chunked document into {len(chunks)} chunks " f"(chunk_size={self.chunk_size}, overlap={self.overlap})" ) return chunks

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cbcoutinho/nextcloud-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.