"""
Document chunking with overlap and metadata preservation.
"""

import re
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import tiktoken

from app.config import settings


@dataclass
class TextChunk:
"""Represents a chunk of text with metadata."""
content: str
chunk_index: int
start_char: int
end_char: int
page_number: Optional[int] = None
token_count: int = 0


class DocumentChunker:
"""
Chunks documents into smaller pieces with overlap.
Uses tiktoken for accurate token counting.
"""

    def __init__(
self,
chunk_size: int = settings.CHUNK_SIZE,
chunk_overlap: int = settings.CHUNK_OVERLAP,
min_chunk_size: int = settings.MIN_CHUNK_SIZE
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.min_chunk_size = min_chunk_size
# Use cl100k_base encoding (same as GPT-4, good general purpose)
self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
"""Count tokens in text."""
return len(self.encoding.encode(text))

    def _split_into_sentences(self, text: str) -> List[str]:
"""Split text into sentences while preserving structure."""
# Split on sentence boundaries but keep delimiters
sentence_endings = r'(?<=[.!?])\s+'
sentences = re.split(sentence_endings, text)
return [s.strip() for s in sentences if s.strip()]

    def _split_into_paragraphs(self, text: str) -> List[str]:
"""Split text into paragraphs."""
paragraphs = re.split(r'\n\s*\n', text)
return [p.strip() for p in paragraphs if p.strip()]

    def _page_for_position(
        self,
        page_numbers: Optional[Dict[int, int]],
        position: int
    ) -> Optional[int]:
        """Return the page number covering a character position, if known."""
        if not page_numbers:
            return None
        page_num = None
        # page_numbers maps each page's starting character offset to its page
        # number, so the last start at or before `position` wins.
        for pos, page in sorted(page_numbers.items()):
            if pos <= position:
                page_num = page
            else:
                break
        return page_num

    def chunk_text(
self,
text: str,
page_numbers: Optional[Dict[int, int]] = None # char_position -> page_number
) -> List[TextChunk]:
"""
        Chunk text into smaller pieces with overlap.

        Args:
            text: The text to chunk
            page_numbers: Optional mapping of character positions to page numbers

        Returns:
List of TextChunk objects
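
        Example (illustrative):
            A page_numbers mapping of {0: 1, 1523: 2} means page 1 starts at
            character 0 and page 2 starts at character 1523.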
"""
        if not text.strip():
            return []
        chunks = []
        current_chunk = ""
        current_start = 0
        current_end = 0
        chunk_index = 0
        # First, split into paragraphs for natural boundaries
        paragraphs = self._split_into_paragraphs(text)
        search_from = 0
        for para in paragraphs:
            # Locate the paragraph's true offset in the original text so the
            # recorded character offsets stay accurate even though paragraphs
            # were stripped and separators vary in length.
            para_start = text.find(para, search_from)
            if para_start == -1:
                para_start = search_from
            para_end = para_start + len(para)
            search_from = para_end
            para_tokens = self.count_tokens(para)
            current_tokens = self.count_tokens(current_chunk)
            # If adding this paragraph would exceed the chunk size, flush the
            # current chunk. Note that a single paragraph larger than
            # chunk_size becomes one oversized chunk; it is never split.
            if current_tokens + para_tokens > self.chunk_size and current_chunk:
                # Save the current chunk only if it meets the minimum size;
                # undersized chunks are discarded apart from the overlap below.
                if current_tokens >= self.min_chunk_size:
                    chunks.append(TextChunk(
                        content=current_chunk.strip(),
                        chunk_index=chunk_index,
                        start_char=current_start,
                        end_char=current_end,
                        page_number=self._page_for_position(page_numbers, current_start),
                        token_count=current_tokens
                    ))
                    chunk_index += 1
                # Start the new chunk with trailing-sentence overlap
                overlap_text = self._get_overlap_text(current_chunk)
                current_chunk = overlap_text + "\n\n" + para if overlap_text else para
                current_start = para_start - len(overlap_text) if overlap_text else para_start
            else:
                # Accumulate the paragraph into the current chunk
                if current_chunk:
                    current_chunk += "\n\n" + para
                else:
                    current_chunk = para
                    current_start = para_start
            current_end = para_end
        # Don't forget the last chunk
        if current_chunk and self.count_tokens(current_chunk) >= self.min_chunk_size:
            chunks.append(TextChunk(
                content=current_chunk.strip(),
                chunk_index=chunk_index,
                start_char=current_start,
                end_char=current_end,
                page_number=self._page_for_position(page_numbers, current_start),
                token_count=self.count_tokens(current_chunk)
            ))
        return chunks

    def _get_overlap_text(self, text: str) -> str:
"""Get the overlap text from the end of a chunk."""
sentences = self._split_into_sentences(text)
if not sentences:
return ""
overlap = ""
tokens = 0
# Work backwards through sentences
for sentence in reversed(sentences):
sentence_tokens = self.count_tokens(sentence)
if tokens + sentence_tokens <= self.chunk_overlap:
overlap = sentence + " " + overlap if overlap else sentence
tokens += sentence_tokens
else:
break
return overlap.strip()

    def create_chunk_metadata(
self,
chunk: TextChunk,
tenant_id: str, # CRITICAL: Multi-tenant isolation
kb_id: str,
user_id: str,
file_name: str,
file_type: str,
total_chunks: int,
document_id: Optional[str] = None
) -> Dict[str, Any]:
"""Create metadata dictionary for a chunk."""
chunk_id = f"{tenant_id}_{kb_id}_{file_name}_{chunk.chunk_index}_{uuid.uuid4().hex[:8]}"
return {
"tenant_id": tenant_id, # CRITICAL: Multi-tenant isolation
"kb_id": kb_id,
"user_id": user_id,
"file_name": file_name,
"file_type": file_type,
"chunk_id": chunk_id,
"chunk_index": chunk.chunk_index,
"page_number": chunk.page_number,
"total_chunks": total_chunks,
"token_count": chunk.token_count,
"document_id": document_id, # Track original document
"created_at": datetime.utcnow().isoformat()
}


# Global chunker instance
chunker = DocumentChunker()
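

# Minimal usage sketch (illustrative, not part of the pipeline): exercises
# chunking end to end with made-up tenant/KB identifiers. Assumes app.config
# is importable, as the module-level import above already requires.
if __name__ == "__main__":
    sample = (
        "Tokenization is the first step. Each chunk keeps a few trailing "
        "sentences from its predecessor as overlap.\n\n"
        "Paragraph boundaries are the preferred split points, so related "
        "sentences tend to stay together in a single chunk."
    )
    demo = DocumentChunker(chunk_size=40, chunk_overlap=10, min_chunk_size=5)
    demo_chunks = demo.chunk_text(sample)
    for c in demo_chunks:
        meta = demo.create_chunk_metadata(
            chunk=c,
            tenant_id="tenant-demo",  # illustrative identifiers only
            kb_id="kb-demo",
            user_id="user-demo",
            file_name="sample.txt",
            file_type="txt",
            total_chunks=len(demo_chunks),
        )
        print(
            f"chunk {c.chunk_index}: {c.token_count} tokens "
            f"[{c.start_char}:{c.end_char}] id={meta['chunk_id']}"
        )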