Skip to main content
Glama
config.py2.87 kB
"""Configuration for govinfo.gov bulk data ingestion.""" import logging from pathlib import Path import os # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Base URL for govinfo bulk data BASE_URL = "https://www.govinfo.gov/bulkdata" # Congress sessions to process (113th to 119th) CONGRESS_SESSIONS = list(range(113, 120)) # Document types to download DOCUMENT_TYPES = [ "BILLS", "BILLSTATUS", "PLAW", # Public and Private Laws "STATUTE", "FR", # Federal Register "CREC", # Congressional Record ] # Output directory (can be overridden by environment variable) OUTPUT_DIR = Path(os.getenv("GOVINFO_DATA_DIR", "govinfo_data")) # Performance settings WORKERS = int(os.getenv("GOVINFO_WORKERS", "10")) # Number of parallel downloads RATE_LIMIT = int(os.getenv("GOVINFO_RATE_LIMIT", "10")) # Requests per second CHUNK_SIZE = 1024 * 1024 # 1MB chunks for streaming downloads # Timeout for HTTP requests (in seconds) REQUEST_TIMEOUT = 30 # Retry settings MAX_RETRIES = 3 RETRY_DELAY = 5 # seconds # Validation settings VALIDATE_XML = os.getenv("GOVINFO_VALIDATE_XML", "true").lower() == "true" # Logging configuration LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") LOG_FILE = "ingestion.log" # File patterns for different document types FILE_PATTERNS = { "BILLS": "BILLS-{congress}{bill_type}{bill_number}{version}.xml", "BILLSTATUS": "BILLSTATUS-{congress}{bill_type}{bill_number}.xml", "PLAW": "PLAW-{congress}publ{pub_number}.xml", "STATUTE": "STATUTE-{volume}pg{page}.xml", "FR": "FR-{year}-{month}-{day}.xml", "CREC": "CREC-{year}-{month}-{day}.xml", } # XSD schemas for validation (relative to project root) SCHEMA_PATHS = { "BILLS": "schemas/bill.xsd", "BILLSTATUS": "schemas/billstatus.xsd", "PLAW": "schemas/plaw.xsd", "STATUTE": "schemas/statute.xsd", "FR": "schemas/fr.xsd", "CREC": "schemas/crec.xsd", } def get_document_path(congress: int, doc_type: str, **kwargs) -> Path: """ Get the output path for a document. Args: congress: The congress number (e.g., 115) doc_type: Document type (e.g., 'BILLS', 'PLAW') **kwargs: Additional format arguments for the filename pattern Returns: Path where the document should be saved """ doc_dir = OUTPUT_DIR / str(congress) / doc_type doc_dir.mkdir(parents=True, exist_ok=True) if doc_type in FILE_PATTERNS: try: filename = FILE_PATTERNS[doc_type].format(congress=congress, **kwargs) return doc_dir / filename except (KeyError, IndexError) as e: logger.warning(f"Error formatting filename for {doc_type}: {e}") # Fallback naming if pattern formatting fails or no pattern exists return doc_dir / f"{doc_type}_{kwargs.get('id', 'unknown')}.xml"

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cbwinslow/opendiscourse_mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server