"""Metadata normalizer for S3 Vectors STRING_LIST expansion
This module provides functions to normalize metadata values for S3 Vectors storage.
All string values are converted to STRING_LIST arrays with smart expansion for
flexible element-level matching.
The normalizer:
- Expands strings into searchable arrays with component parts
- Supports comma-separated values, word splitting, year extraction
- Limits arrays to 10 items (AWS STRING_LIST limit)
- Ensures consistent lowercase values for filtering
"""
import re
from typing import Any
# AWS STRING_LIST maximum items
MAX_ARRAY_ITEMS = 10
# Maximum length for individual string values (S3 Vectors limit protection)
MAX_VALUE_LENGTH = 100
def expand_to_searchable_array(value: str, min_word_length: int = 3) -> list[str]:
"""
Expand a string value into a searchable array with component parts.
Includes the original value plus word components for flexible matching.
For example: "chicago, illinois" -> ["chicago, illinois", "chicago", "illinois"]
Args:
value: The string value to expand.
min_word_length: Minimum length for word components (default 3 to skip initials).
Returns:
List of searchable terms, limited to 10 items, each truncated to MAX_VALUE_LENGTH.
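
    Example (doctest illustrating the expansion rules above):
        >>> expand_to_searchable_array("chicago, illinois")
        ['chicago, illinois', 'chicago', 'illinois']
        >>> expand_to_searchable_array("2016-01-15")
        ['2016-01-15', '2016']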
"""
if not value or not value.strip():
return []
# Truncate input value to prevent excessively long strings
value = value.strip().lower()[:MAX_VALUE_LENGTH]
items = {value} # Always include original
    # Add comma-separated parts as separate searchable elements
if "," in value:
for part in value.split(","):
part = part.strip()
if part and len(part) >= min_word_length:
items.add(part)
    # Split on spaces (treating commas as spaces) for single-word components;
    # str.split() already discards surrounding whitespace, so no extra strip is needed
    for word in value.replace(",", " ").split():
        if len(word) >= min_word_length:
            items.add(word)
    # Extract the first 4-digit year in the range 1800-2099 from date-like
    # strings (e.g., "2016-01-15" -> "2016"); only the first match is added
year_match = re.search(r"\b(1[89]\d{2}|20\d{2})\b", value)
if year_match:
items.add(year_match.group(1))
# Prioritize: original value first, then sorted remaining items
result = [value]
remaining = sorted(items - {value})
result.extend(remaining)
return result[:MAX_ARRAY_ITEMS]
def normalize_metadata_for_s3(metadata: dict[str, Any]) -> dict[str, Any]:
"""
Normalize metadata values for S3 Vectors storage.
All string values are converted to STRING_LIST arrays with smart expansion
for flexible element-level matching. This allows searching for individual
components (e.g., "chicago" matches "chicago, illinois").
Expansion rules:
- Original value always included
- Comma-separated parts added as separate elements
- Word components added (if >= 3 chars to skip initials)
- Years extracted from date strings
- Limited to 10 items per array (AWS limit)
Args:
metadata: Raw metadata dictionary.
Returns:
Normalized metadata with STRING_LIST arrays for all string fields.
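
    Example (illustrative doctest; key order follows the input dict):
        >>> normalize_metadata_for_s3({"city": "chicago, illinois", "count": 3})
        {'city': ['chicago, illinois', 'chicago', 'illinois'], 'count': ['3']}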
"""
normalized = {}
for key, value in metadata.items():
if value is None:
continue
        if isinstance(value, list):
            # Already a list - expand each string item and flatten. Non-string
            # items are normalized with str().lower().strip() so falsy values
            # like 0 or False are preserved rather than dropped.
            expanded = set()
            for item in value:
                if item is None:
                    continue
                if isinstance(item, str):
                    expanded.update(expand_to_searchable_array(item))
                else:
                    expanded.add(str(item).lower().strip())
            if expanded:
                # Keep original items first (deduplicated, in input order),
                # then the additional expansions. Skip None and empty strings;
                # truncate to match expand_to_searchable_array's length cap.
                original_items = []
                for v in value:
                    if v is None:
                        continue
                    s = str(v).lower().strip()[:MAX_VALUE_LENGTH]
                    if s and s not in original_items:
                        original_items.append(s)
                additional = sorted(expanded - set(original_items))
                normalized[key] = (original_items + additional)[:MAX_ARRAY_ITEMS]
elif isinstance(value, str):
expanded = expand_to_searchable_array(value)
if expanded:
normalized[key] = expanded
elif isinstance(value, bool):
            # Booleans stay as-is (not arrays); this branch must precede the
            # int/float branch because bool is a subclass of int
normalized[key] = value
elif isinstance(value, (int, float)):
# Numbers become single-item arrays for consistency
normalized[key] = [str(value)]
else:
expanded = expand_to_searchable_array(str(value))
if expanded:
normalized[key] = expanded
return normalized
# Default core metadata keys to preserve when reducing metadata
# This is a superset covering documents, media, and images
DEFAULT_CORE_METADATA_KEYS = frozenset(
{
"content_type",
"document_id",
"document_type",
"duration_seconds",
"filename",
"main_topic",
"segment_index",
"timestamp_end",
"timestamp_start",
}
)
def reduce_metadata(
metadata: dict[str, Any],
reduction_level: int = 1,
core_keys: frozenset[str] | None = None,
) -> dict[str, Any]:
"""
Reduce metadata size by removing non-core keys or truncating values.
Used when metadata is too large for ingestion APIs. Progressively strips
non-essential fields while preserving core identifiers and search keys.
Args:
metadata: Original metadata dict.
reduction_level: Aggressiveness of reduction:
1 = keep all keys (no reduction)
2 = truncate arrays to 3 items
3 = core keys only
core_keys: Set of keys to always preserve. Defaults to DEFAULT_CORE_METADATA_KEYS.
Returns:
Reduced metadata dict.
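
    Example (illustrative doctest; "filename" is a core key, "people" is not):
        >>> meta = {"filename": "a.pdf", "people": ["alice", "bob", "carol", "dave"]}
        >>> reduce_metadata(meta, reduction_level=2)
        {'filename': 'a.pdf', 'people': ['alice', 'bob', 'carol']}
        >>> reduce_metadata(meta, reduction_level=3)
        {'filename': 'a.pdf'}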
"""
if core_keys is None:
core_keys = DEFAULT_CORE_METADATA_KEYS
reduced = {}
for key, value in metadata.items():
# Level 3: Only keep core keys
if reduction_level >= 3 and key not in core_keys:
continue
# Core keys always kept as-is
if key in core_keys:
reduced[key] = value
continue
# Level 2+: Truncate arrays to 3 items, keep scalars
if reduction_level >= 2:
if isinstance(value, list):
reduced[key] = value[:3]
else:
reduced[key] = value # Preserve scalar values
else:
reduced[key] = value
return reduced
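

# Minimal self-check sketch: running the module directly executes the doctest
# examples embedded in the docstrings above (standard-library doctest only).
if __name__ == "__main__":
    import doctest

    doctest.testmod()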