Skip to main content
Glama
twentyf_strategy.py (3.83 kB)
"""Chunking strategy for Form 20-F (Foreign Private Issuer Annual Report). This strategy handles the unique characteristics of 20-F filings: 1. Different Item structure than 10-K (Item 3.D vs Item 1A). 2. "Shell" filings (like BABA) that primarily incorporate by reference. 3. Hybrid content (structured items + potential attachments). """ from typing import Generator, List, Optional from src.server.domain.chunking.base import Chunk from src.server.domain.chunking.strategies.tenk_strategy import TenKStrategy from src.server.domain.sec_filing_schema import get_filing_schema from src.server.utils.logger import logger class TwentyFStrategy(TenKStrategy): """Strategy for 20-F filings with smart fallback.""" @property def supported_forms(self) -> List[str]: return ["20-F"] def chunk( self, content: Optional[str], filing, ticker: str, items: Optional[List[str]] = None, ) -> Generator[Chunk, None, None]: """ Smart chunking for 20-F: 1. Try structured parsing (ChunkedDocument). 2. Check if the result is "thin" (e.g., just reference statements). 3. If thin, fallback to full markdown or attachment search. """ schema = get_filing_schema(filing.form) items_to_extract = items if items else schema.default_items # 1. Try structured chunking first (Standard 10-K logic) chunked_doc = self._get_chunked_document(filing) chunks_yielded = 0 total_chars = 0 if chunked_doc: # Buffer chunks to analyze quality before yielding buffered_chunks = [] try: for chunk in self._chunk_from_dataframe( chunked_doc, filing, ticker, items_to_extract, schema ): buffered_chunks.append(chunk) total_chars += len(chunk.text) chunks_yielded = len(buffered_chunks) logger.info(f"📊 20-F Structured parse: {chunks_yielded} chunks, {total_chars} chars") except Exception as e: logger.warning(f"20-F structured parsing failed: {e}") chunks_yielded = 0 # 2. Quality Check: Is this a "Shell" filing? 
# Threshold: < 20000 chars suggests it might just be "Incorporated by reference" statements # A full 20-F is usually > 100k chars. is_thin_content = total_chars < 20000 if chunks_yielded > 0 and not is_thin_content: # Good structured content, yield it yield from buffered_chunks return # 3. Fallback Logic if is_thin_content: logger.info(f"⚠️ 20-F content thin ({total_chars} chars). Likely 'Incorporation by Reference'. Switching to fallback.") # If we have some structured chunks, yield them first (they might contain useful metadata/signatures) if buffered_chunks: yield from buffered_chunks # Fallback 1: Try to find "Annual Report" attachment (like 6-K logic) # TODO: Implement attachment search if needed. For now, Markdown fallback is safer for 20-F. # Fallback 2: Full Markdown (Valuecell style) # This captures "unlabeled" text that might have been missed by Item filtering logger.info("🔄 Using Markdown fallback to capture full content...") yield from self._chunk_from_markdown(filing, ticker) else: # No structured content at all logger.warning("ChunkedDocument not available for 20-F, using markdown fallback") yield from self._chunk_from_markdown(filing, ticker)

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/huweihua123/stock-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.