Log Analyzer MCP Server

chunking_utils.py•6.18 KiB

""" Text chunking utilities for log and code files Supports character, word, and line-based chunking with memory and streaming modes """ import os from typing import List, Generator, Tuple, TextIO from collections import deque from config import Config LOG_EXTENSIONS = Config.LOG_EXTENSIONS def validate_chunker(size: int, overlap: int) -> None: """ Validate chunking parameters Args: size: Chunk size overlap: Overlap between chunks Raises: ValueError: If parameters are invalid """ if size <= 0: raise ValueError("chunk_size must be > 0") if overlap < 0: raise ValueError("overlap must be >= 0") if overlap >= size: raise ValueError("overlap must be < chunk_size") def iter_local_logs(folder_path: str) -> Generator[Tuple[str, str], None, None]: """Yield (relative_path, absolute_path) for each eligible log under folder_path.""" base = os.path.abspath(os.path.expanduser(folder_path)) for root, _, files in os.walk(base): for f in files: if f.lower().endswith(LOG_EXTENSIONS): abs_path = os.path.join(root, f) rel_path = os.path.relpath(abs_path, base) yield rel_path, abs_path def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]: """ Simple text chunking by characters Args: text: Text to chunk chunk_size: Size of each chunk overlap: Overlap between chunks Returns: List of text chunks """ chunks = [] start = 0 while start < len(text): end = min(start + chunk_size, len(text)) chunks.append(text[start:end]) start += chunk_size - overlap return chunks # --------------------------------------------------------------------------- # Memory-based chunking (loads entire text into memory) # --------------------------------------------------------------------------- def chunks_chars_mem(text: str, size: int, overlap: int) -> List[str]: """ Chunk text by characters in memory Args: text: Text to chunk size: Chunk size in characters overlap: Overlap between chunks Returns: List of text chunks """ validate_chunker(size, overlap) step = size - overlap out = [] i = 
0 n = len(text) while i < n: out.append(text[i:i+size]) if i + size >= n: break i += step return out def chunks_words_mem(text: str, size: int, overlap: int) -> List[str]: """ Chunk text by words in memory Args: text: Text to chunk size: Chunk size in words overlap: Overlap between chunks Returns: List of text chunks """ validate_chunker(size, overlap) words = text.split() step = size - overlap out = [] i, n = 0, len(words) while i < n: out.append(" ".join(words[i:i+size])) if i + size >= n: break i += step return out def chunks_lines_mem(text: str, size: int, overlap: int) -> List[str]: """ Chunk text by lines in memory Args: text: Text to chunk size: Chunk size in lines overlap: Overlap between chunks Returns: List of text chunks """ validate_chunker(size, overlap) lines = text.splitlines(keepends=True) step = size - overlap out = [] i, n = 0, len(lines) while i < n: out.append("".join(lines[i:i+size])) if i + size >= n: break i += step return out # --------------------------------------------------------------------------- # Streaming chunking (for large files) # --------------------------------------------------------------------------- def stream_chunks_chars(f: TextIO, size: int, overlap: int, read_size: int = 65536) -> Generator[str, None, None]: """ Stream chunks by characters from file Args: f: File object to read from size: Chunk size in characters overlap: Overlap between chunks read_size: Buffer size for reading Yields: Text chunks """ validate_chunker(size, overlap) step = size - overlap buf = "" while True: data = f.read(read_size) if not data: break buf += data while len(buf) >= size: yield buf[:size] buf = buf[step:] if buf: yield buf def stream_chunks_words(f: TextIO, size: int, overlap: int, read_size: int = 65536) -> Generator[str, None, None]: """ Stream chunks by words from file Args: f: File object to read from size: Chunk size in words overlap: Overlap between chunks read_size: Buffer size for reading Yields: Text chunks """ 
validate_chunker(size, overlap) step = size - overlap token_buf: List[str] = [] carry = "" while True: data = f.read(read_size) if not data: break data = carry + data parts = data.split() ends_with_space = bool(data) and data[-1].isspace() if not ends_with_space: if parts: carry = parts.pop() else: carry = data else: carry = "" token_buf.extend(parts) while len(token_buf) >= size: chunk_tokens = token_buf[:size] yield " ".join(chunk_tokens) token_buf = token_buf[step:] if carry: token_buf.append(carry) if token_buf: yield " ".join(token_buf) def stream_chunks_lines(f: TextIO, size: int, overlap: int) -> Generator[str, None, None]: """ Stream chunks by lines from file Args: f: File object to read from size: Chunk size in lines overlap: Overlap between chunks Yields: Text chunks """ validate_chunker(size, overlap) step = size - overlap window: deque[str] = deque() for line in f: window.append(line) if len(window) == size: yield "".join(window) for _ in range(step): if window: window.popleft() if window: yield "".join(window)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/suriya-ML/log-checker-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

chunking_utils.py•6.18 KiB