Skip to main content
Glama
ingest_all_govinfo.py (3.46 kB)
#!/usr/bin/env python3
"""
Ingest all GovInfo bulkdata XML across configured congress sessions and document types.

This script uses the async ingestion API to traverse govinfo.gov/bulkdata using
XML/JSON listing endpoints, applies rate limiting and retries, and writes
per-doc-type manifests and failure logs under the output directory.

Defaults come from scripts/ingestion/config.py, but can be overridden via CLI.
"""
import asyncio
import logging
from pathlib import Path
from typing import Optional

from scripts.ingestion import ingest_all_congresses
from scripts.ingestion.config import (
    CONGRESS_SESSIONS,
    DOCUMENT_TYPES,
    OUTPUT_DIR,
    WORKERS,
    LOG_LEVEL,
)


def _ordinal(n: int) -> str:
    """Return *n* with its English ordinal suffix ("1st", "2nd", "103rd", "111th").

    Fixes the previous f"{n}th" formatting, which printed "101th", "102th", etc.
    """
    # 11-13 take "th" despite ending in 1/2/3 ("11th", not "11st").
    if 10 <= n % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def parse_args():
    """Parse command-line options, defaulting to values from scripts/ingestion/config.py."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Ingest all GovInfo bulkdata XML across configured congresses and document types"
    )
    parser.add_argument(
        "--congress",
        type=int,
        nargs="+",
        help="Specific congress numbers to process (default: all from config)",
    )
    parser.add_argument(
        "--doc-types",
        nargs="+",
        choices=DOCUMENT_TYPES,
        help=f"Document types to process (default: all: {', '.join(DOCUMENT_TYPES)})",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=OUTPUT_DIR,
        help=f"Output directory (default: {OUTPUT_DIR})",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=WORKERS,
        help=f"Number of parallel downloads (default: {WORKERS})",
    )
    parser.add_argument(
        "--log-level",
        default=LOG_LEVEL,
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help=f"Logging level (default: {LOG_LEVEL})",
    )
    return parser.parse_args()


def main() -> int:
    """Run the full ingestion and print a per-congress/per-doc-type summary table.

    Returns:
        Process exit code (0 on success).
    """
    args = parse_args()

    # Configure logging
    logging.basicConfig(
        level=args.log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Determine targets: CLI overrides win, otherwise fall back to config defaults.
    congresses = args.congress if args.congress else CONGRESS_SESSIONS
    doc_types = args.doc_types if args.doc_types else DOCUMENT_TYPES

    logging.info(
        "Starting full GovInfo ingestion | Congresses: %s | DocTypes: %s | Workers: %s | Output: %s",
        congresses,
        doc_types,
        args.workers,
        args.output,
    )

    # Run ingestion. `results` maps congress number -> {doc_type: count}
    # (shape inferred from the summary loop below -- confirm against
    # ingest_all_congresses if it changes).
    results = asyncio.run(
        ingest_all_congresses(
            congresses=congresses,
            doc_types=doc_types,
            output_dir=args.output,
            workers=args.workers,
        )
    )

    # Print concise summary
    print("\n=== GovInfo Ingestion Summary ===\n")

    # Column widths: sized from the full DOCUMENT_TYPES list so columns line up
    # regardless of which subset was requested.
    congress_width = max(10, len("Congress") + 2)
    type_width = max(15, max(len(t) for t in DOCUMENT_TYPES) + 2)

    header = f"{'Congress':<{congress_width}} " + " ".join(
        f"{t:<{type_width-1}}" for t in doc_types
    )
    print(header)
    print("-" * len(header))

    for congress in sorted(results.keys()):
        # _ordinal() gives the correct suffix (e.g. "101st", not "101th").
        row = [_ordinal(congress).ljust(congress_width)]
        for t in doc_types:
            row.append(str(results[congress].get(t, 0)).ljust(type_width - 1))
        print(" ".join(row))

    print("\nNote: Per-doc-type manifests and failures are written under: {}/<congress>/<doctype>/".format(args.output))
    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main())

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cbwinslow/opendiscourse_mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.