"""Base classes for chunking strategies.
This module defines the core abstractions:
- Chunk: A dataclass representing a text segment with metadata
- ChunkingStrategy: Abstract base class for all chunking strategies
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, Generator, List, Optional
@dataclass
class Chunk:
"""Represents a document chunk with rich metadata.
Attributes:
text: The actual text content of this chunk
metadata: Dictionary containing contextual information:
- ticker: Stock symbol (e.g., "AAPL")
- doc_id: SEC Accession Number
- form: Filing type (e.g., "10-K", "6-K")
- item: Item identifier (e.g., "Item 1A", "6-K-attachment")
- item_name: Human-readable item name (e.g., "Risk Factors")
- filing_date: Date of the filing
- chunk_index: Sequential index within the document
- is_table: Whether this chunk contains tabular data
- char_count: Character count of the text
- source: Source of the chunk (e.g., "structured", "attachment", "fallback")
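
    Example (a minimal illustrative sketch; __post_init__ fills in the
    defaults shown above):

        >>> c = Chunk(text="Revenue grew 10%.")
        >>> c.metadata["char_count"]
        17
        >>> c.metadata["item"]
        'unknown'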
"""
text: str
metadata: Dict[str, Any] = field(default_factory=dict)
    def __post_init__(self):
        """Fill in defaults for missing metadata keys and derive char_count."""
        defaults = {
            "ticker": "",
            "doc_id": "",
            "form": "",
            "item": "unknown",
            "item_name": "Unknown",
            "filing_date": "",
            "chunk_index": 0,
            "is_table": False,
            "source": "unknown",
        }
        for key, value in defaults.items():
            self.metadata.setdefault(key, value)
        # char_count is always derived from the actual text, so any provided
        # value is overwritten rather than defaulted
        self.metadata["char_count"] = len(self.text) if self.text else 0
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary format for JSON serialization."""
return {
"type": "chunk",
"text": self.text,
"metadata": self.metadata,
}
class ChunkingStrategy(ABC):
"""Abstract base class for document chunking strategies.
Each strategy is responsible for:
1. extract(): Extracting processable content from a Filing
2. chunk(): Breaking the content into semantic chunks
Different filing types (10-K, 6-K, 8-K, etc.) have different structures
and require different extraction/chunking logic.
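
    A minimal illustrative sketch of a concrete strategy (SimpleStrategy is
    hypothetical, not part of this module, and assumes the Filing object
    exposes a text() accessor):

        class SimpleStrategy(ChunkingStrategy):
            @property
            def supported_forms(self) -> List[str]:
                return ["8-K"]

            def extract(self, filing) -> Optional[str]:
                return filing.text()  # assumed Filing accessor

            def chunk(self, content, filing, ticker, items=None):
                yield from self._fallback_chunk(content or "", filing, ticker)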
"""
    # Target size, in characters, for fallback paragraph-based chunking
TARGET_CHUNK_SIZE = 4000
@property
@abstractmethod
def supported_forms(self) -> List[str]:
"""Return list of form types this strategy handles (e.g., ["10-K", "10-Q"])."""
pass
@abstractmethod
def extract(self, filing) -> Optional[str]:
"""Extract content from a Filing for chunking.
        For structured filings (10-K, 10-Q): returns None; chunk() then relies
        on edgartools' ChunkedDocument for structured parsing.

        For attachment-based filings (6-K): returns the extracted attachment
        content as text.
Args:
filing: An edgartools Filing object
Returns:
Optional[str]: Extracted text content, or None if using structured parsing
"""
pass
@abstractmethod
def chunk(
self,
content: Optional[str],
filing,
ticker: str,
items: Optional[List[str]] = None,
) -> Generator[Chunk, None, None]:
"""Chunk the content into semantic segments.
Args:
content: Content from extract() (may be None for structured filings)
filing: Original Filing object for metadata
ticker: Stock ticker symbol
items: Optional list of items to filter (e.g., ["Item 1A", "Item 7"])
Yields:
Chunk objects with text and metadata
"""
pass
def process(
self,
filing,
ticker: str,
items: Optional[List[str]] = None,
) -> Generator[Chunk, None, None]:
"""Main entry point: extract then chunk.
This is a template method that orchestrates the extract/chunk flow.
Args:
filing: An edgartools Filing object
ticker: Stock ticker symbol
items: Optional list of items to filter
Yields:
Chunk objects
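
        Example (illustrative; strategy is a concrete subclass instance and
        filing an edgartools Filing object):

            for chunk in strategy.process(filing, ticker="AAPL", items=["Item 1A"]):
                print(chunk.metadata["item"], chunk.metadata["char_count"])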
"""
content = self.extract(filing)
yield from self.chunk(content, filing, ticker, items)
def _fallback_chunk(
self,
text: str,
filing,
ticker: str,
item: str = "fallback",
item_name: str = "Fallback Chunking",
source: str = "fallback",
) -> Generator[Chunk, None, None]:
"""Fallback paragraph-based chunking.
Used when structured parsing fails or is not available.
Splits text by double newlines and groups into target-sized chunks.
Args:
text: Full text to chunk
filing: Filing object for metadata
ticker: Stock ticker
item: Item label to use
item_name: Human-readable item name
source: Source identifier
Yields:
Chunk objects
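
        Example (illustrative; reuses the hypothetical strategy and filing
        from process() above):

            chunks = list(strategy._fallback_chunk(
                raw_text, filing, ticker="AAPL",
                item="6-K-attachment", item_name="Attachment",
                source="attachment",
            ))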
"""
if not text or not text.strip():
return
paragraphs = text.split("\n\n")
current_chunk = ""
chunk_index = 0
        def _metadata(index: int) -> Dict[str, Any]:
            """Shared metadata for every fallback chunk."""
            return {
                "ticker": ticker,
                "doc_id": getattr(filing, "accession_no", ""),
                "form": getattr(filing, "form", ""),
                "item": item,
                "item_name": item_name,
                "filing_date": str(getattr(filing, "filing_date", "")),
                "chunk_index": index,
                "is_table": False,
                "source": source,
            }

        for para in paragraphs:
            para = para.strip()
            if not para:
                continue
            # Emit the accumulated chunk if adding this paragraph would
            # exceed the target size
            if len(current_chunk) + len(para) > self.TARGET_CHUNK_SIZE and current_chunk:
                yield Chunk(text=current_chunk.strip(), metadata=_metadata(chunk_index))
                chunk_index += 1
                current_chunk = para
            else:
                current_chunk = current_chunk + "\n\n" + para if current_chunk else para

        # Emit the final partial chunk
        if current_chunk.strip():
            yield Chunk(text=current_chunk.strip(), metadata=_metadata(chunk_index))