We provide all the information about MCP servers via our MCP API.
curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'
If you have feedback or need assistance with the MCP directory API, please join our Discord server.
"""Chunker using LangChain's MarkdownHeaderTextSplitter."""
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from ..models import Chunk
from .chunker import Chunker
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class LangChainChunker(Chunker):
    """Chunk markdown by headers, then by size, using LangChain text splitters.

    The text is first divided on markdown headers with
    ``MarkdownHeaderTextSplitter``. Any resulting section longer than
    ``chunk_size`` characters is divided again with
    ``RecursiveCharacterTextSplitter``. If that pipeline raises,
    :meth:`chunk` falls back to character-based splitting of the whole text.
    """

    def __init__(
        self,
        headers_to_split_on: Optional[List[Tuple[str, str]]] = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        cache_dir: Optional[str] = None,
        min_chunk_size: int = 0,
    ):
        """Initialize the LangChain chunker.

        Args:
            headers_to_split_on: List of (header_tag, header_name) tuples.
                When omitted (or empty), ``#``, ``##`` and ``###`` are used.
            chunk_size: Maximum size of each chunk.
            chunk_overlap: Overlap between chunks.
            cache_dir: Optional cache directory, forwarded to the base class.
            min_chunk_size: Minimum size for chunks (0 = disabled), forwarded
                to the base class.

        Raises:
            ImportError: If ``langchain_text_splitters`` cannot be imported.
        """
        super().__init__(cache_dir=cache_dir, min_chunk_size=min_chunk_size)
        self.headers_to_split_on = headers_to_split_on or [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Imported here rather than at module level; only the import itself is
        # guarded so that configuration errors raised by the splitter
        # constructors are not mistaken for a missing dependency.
        try:
            from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
        except ImportError as e:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "LangChain text splitters not available. Install with: pip install pdfkb-mcp[langchain]"
            ) from e

        self.header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=self.headers_to_split_on)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    def chunk(self, markdown_content: str, metadata: Dict[str, Any]) -> List[Chunk]:
        """Chunk markdown content using LangChain's splitters.

        Each chunk's metadata is built in layers: chunker settings first, then
        the header metadata of the section it came from, then ``metadata``.
        Later layers override earlier ones on key collisions.

        Args:
            markdown_content: Markdown text to chunk.
            metadata: Document metadata merged into every chunk. ``None`` is
                treated as an empty mapping.

        Returns:
            List of Chunk objects; empty when the content is empty or
            whitespace-only.

        Raises:
            RuntimeError: If both the header-based path and the fallback fail.
        """
        try:
            if not markdown_content or not markdown_content.strip():
                logger.warning("Empty markdown content provided to chunker")
                return []

            document_metadata = metadata or {}
            chunks: List[Chunk] = []
            for section_text, header_metadata in self._split_by_headers(markdown_content):
                chunk_text = section_text.strip()
                if not chunk_text:  # Skip empty chunks
                    continue

                chunk_metadata = {
                    "chunk_strategy": "langchain_markdown_header",
                    "headers_to_split_on": self.headers_to_split_on,
                    "chunk_size": self.chunk_size,
                    "chunk_overlap": self.chunk_overlap,
                    "created_at": datetime.now(timezone.utc).isoformat(),
                }
                # Add header metadata if available
                if header_metadata:
                    chunk_metadata.update(header_metadata)
                # Add any provided metadata
                chunk_metadata.update(document_metadata)

                # Index by position in the emitted list so that skipped empty
                # pieces do not leave gaps in chunk_index.
                chunks.append(Chunk(text=chunk_text, chunk_index=len(chunks), metadata=chunk_metadata))

            # Apply minimum chunk size filtering. _filter_small_chunks is not
            # defined in this class; presumably inherited from Chunker.
            chunks = self._filter_small_chunks(chunks)
            logger.info("Created %d chunks using LangChain MarkdownHeaderTextSplitter", len(chunks))
            return chunks
        except Exception as e:
            # The error is swallowed in favour of the fallback, so log it with
            # its traceback; otherwise the root cause would be lost.
            logger.exception("Failed to chunk markdown content with LangChain: %s", e)
            # Fallback to simple text splitting
            return self._fallback_chunk(markdown_content, metadata)

    def _split_by_headers(self, markdown_content: str) -> List[Tuple[str, Dict[str, Any]]]:
        """Split markdown into (text, header_metadata) pairs.

        Sections produced by the header splitter are kept whole when they are
        at most ``chunk_size`` characters long; longer sections are split
        further, and every piece keeps the header metadata of its section.
        """
        pieces: List[Tuple[str, Dict[str, Any]]] = []
        for split in self.header_splitter.split_text(markdown_content):
            if hasattr(split, "page_content"):
                # LangChain Document object
                section_text = split.page_content
                header_metadata = getattr(split, "metadata", {})
            else:
                # Plain string
                section_text = str(split)
                header_metadata = {}

            if len(section_text) <= self.chunk_size:
                pieces.append((section_text, header_metadata))
            else:
                pieces.extend((sub_text, header_metadata) for sub_text in self.text_splitter.split_text(section_text))
        return pieces

    def _fallback_chunk(self, text: str, metadata: Dict[str, Any]) -> List[Chunk]:
        """Fallback chunking method using only RecursiveCharacterTextSplitter.

        Args:
            text: Text content to chunk.
            metadata: Document metadata merged into every chunk. ``None`` is
                treated as an empty mapping.

        Returns:
            List of Chunk objects.

        Raises:
            RuntimeError: If splitting or chunk construction fails; the
                original exception is chained as the cause.
        """
        try:
            logger.warning("Using fallback chunking method")
            document_metadata = metadata or {}
            chunks: List[Chunk] = []
            for piece in self.text_splitter.split_text(text):
                chunk_text = piece.strip()
                if not chunk_text:
                    continue

                chunk_metadata = {
                    "chunk_strategy": "langchain_fallback",
                    "chunk_size": self.chunk_size,
                    "chunk_overlap": self.chunk_overlap,
                    "created_at": datetime.now(timezone.utc).isoformat(),
                }
                chunk_metadata.update(document_metadata)

                # Same indexing rule as chunk(): contiguous over emitted chunks.
                chunks.append(Chunk(text=chunk_text, chunk_index=len(chunks), metadata=chunk_metadata))

            # Apply minimum chunk size filtering
            chunks = self._filter_small_chunks(chunks)
            logger.info("Created %d chunks using fallback method", len(chunks))
            return chunks
        except Exception as e:
            # The traceback travels with the chained RuntimeError, so a plain
            # error record is enough here.
            logger.error("Fallback chunking also failed: %s", e)
            raise RuntimeError(f"Failed to chunk text: {e}") from e