#!/usr/bin/env python3
"""
Ingest all GovInfo bulkdata XML across configured congress sessions and document types.
This script uses the async ingestion API to traverse govinfo.gov/bulkdata using
XML/JSON listing endpoints, applies rate limiting and retries, and writes per-doc-type
manifests and failure logs under the output directory.
Defaults come from scripts/ingestion/config.py, but can be overridden via CLI.
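
Example (illustrative only; the script path and doc-type values depend on your
checkout and on DOCUMENT_TYPES in config.py):
    python scripts/run_full_ingestion.py --congress 117 118 --doc-types BILLS --workers 8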
"""

import argparse
import asyncio
import logging
import sys
from pathlib import Path

from scripts.ingestion import ingest_all_congresses
from scripts.ingestion.config import (
    CONGRESS_SESSIONS,
    DOCUMENT_TYPES,
    OUTPUT_DIR,
    WORKERS,
    LOG_LEVEL,
)


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments."""
parser = argparse.ArgumentParser(
description="Ingest all GovInfo bulkdata XML across configured congresses and document types"
)
parser.add_argument(
"--congress",
type=int,
nargs="+",
help="Specific congress numbers to process (default: all from config)",
)
parser.add_argument(
"--doc-types",
nargs="+",
choices=DOCUMENT_TYPES,
help=f"Document types to process (default: all: {', '.join(DOCUMENT_TYPES)})",
)
parser.add_argument(
"--output",
type=Path,
default=OUTPUT_DIR,
help=f"Output directory (default: {OUTPUT_DIR})",
)
parser.add_argument(
"--workers",
type=int,
default=WORKERS,
help=f"Number of parallel downloads (default: {WORKERS})",
)
parser.add_argument(
"--log-level",
default=LOG_LEVEL,
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
help=f"Logging level (default: {LOG_LEVEL})",
)
    return parser.parse_args()


def main() -> int:
    """Run the full GovInfo ingestion and print a summary table; returns an exit code."""
args = parse_args()
# Configure logging
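    # logging.basicConfig accepts level names as strings (e.g. "INFO"), so the
    # CLI value can be passed through directly.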
logging.basicConfig(
level=args.log_level,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Determine targets
congresses = args.congress if args.congress else CONGRESS_SESSIONS
doc_types = args.doc_types if args.doc_types else DOCUMENT_TYPES
logging.info(
"Starting full GovInfo ingestion | Congresses: %s | DocTypes: %s | Workers: %s | Output: %s",
congresses,
doc_types,
args.workers,
args.output,
)
# Run ingestion
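    # ingest_all_congresses is driven via asyncio.run, so it must return an
    # awaitable; per the module docstring it is assumed to handle rate limiting
    # and retries internally.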
results = asyncio.run(
ingest_all_congresses(
congresses=congresses,
doc_types=doc_types,
output_dir=args.output,
workers=args.workers,
)
)
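    # `results` is consumed below as a nested mapping of
    # {congress: {doc_type: document_count}}; this shape is inferred from how the
    # summary table uses it, not from ingest_all_congresses' documentation.
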
    # Print a concise summary table.
    print("\n=== GovInfo Ingestion Summary ===\n")
    # Column widths: pad each column by two characters.
    congress_width = len("Congress") + 2
    type_width = max(15, max(len(t) for t in doc_types) + 2)
    header = f"{'Congress':<{congress_width}} " + " ".join(
        f"{t:<{type_width - 1}}" for t in doc_types
    )
    print(header)
    print("-" * len(header))
    for congress in sorted(results.keys()):
        # Use the correct ordinal suffix (101st, 102nd, 103rd, 111th, ...).
        if congress % 100 in (11, 12, 13):
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(congress % 10, "th")
        row = [f"{congress}{suffix}".ljust(congress_width)]
        for t in doc_types:
            row.append(str(results[congress].get(t, 0)).ljust(type_width - 1))
        print(" ".join(row))
    print(
        f"\nNote: per-doc-type manifests and failure logs are written under: "
        f"{args.output}/<congress>/<doctype>/"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())