#!/usr/bin/env python3
"""
GC InfoBase Data Ingestion Cloud Run Job
This job imports departmental performance and spending data from GC InfoBase
via the Open Canada portal. It's designed to be run as a Cloud Run job on a
monthly schedule or on-demand.
Features:
- Downloads ~30MB CSV data from Open Canada portal
- Imports departmental results (performance indicators, targets, actuals)
- Imports program expenditures and FTE data
- Creates Department nodes and relationships
- Full refresh of all GC InfoBase data
Environment variables:
- NEO4J_URI: Neo4j connection URI (default: bolt://10.128.0.3:7687)
- NEO4J_USERNAME: Neo4j username (default: neo4j)
- NEO4J_PASSWORD: Neo4j password (required)
"""
import sys
import os
from fedmcp_pipeline.utils.neo4j_client import Neo4jClient
from fedmcp_pipeline.utils.progress import logger
from fedmcp_pipeline.ingest.gc_infobase import ingest_gc_infobase_data
def main():
    """Entry point for the GC InfoBase ingestion Cloud Run job.

    Reads the Neo4j connection settings from the environment
    (NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD), opens a client, calls
    ``ingest_gc_infobase_data`` with a batch size of 5000 and logs the
    counts it returns under the ``departmental_results``,
    ``program_expenditures`` and ``departments`` keys.

    The process exits with status 1 when NEO4J_PASSWORD is missing or
    empty, or when anything inside the ingestion step raises. Once the
    client has been created it is always closed before the function
    leaves.
    """
    banner = "=" * 80

    logger.info(banner)
    logger.info("GC INFOBASE DATA INGESTION CLOUD RUN JOB - STARTING")
    logger.info(banner)
    print()

    # Connection settings; only the password has no fallback value.
    uri = os.environ.get('NEO4J_URI', 'bolt://10.128.0.3:7687')
    username = os.environ.get('NEO4J_USERNAME', 'neo4j')
    password = os.environ.get('NEO4J_PASSWORD')

    # Guard clause: refuse to continue without credentials.
    if not password:
        logger.error("NEO4J_PASSWORD environment variable not set!")
        sys.exit(1)

    logger.info(f"Connecting to Neo4j at {uri}...")
    neo4j = Neo4jClient(uri=uri, user=username, password=password)

    try:
        logger.info("Running GC InfoBase data ingestion...")
        logger.info("This will:")
        planned_steps = (
            " - Download ~30MB CSV data from Open Canada portal",
            " - Import departmental performance results",
            " - Import program expenditures and FTE data",
            " - Create Department nodes and relationships",
            " - Full refresh of all GC InfoBase data",
        )
        for step in planned_steps:
            logger.info(step)
        print()

        # The actual work: download the data and load it into Neo4j.
        stats = ingest_gc_infobase_data(neo4j, batch_size=5000)
        print()

        # Summarize what the ingestion reported.
        logger.success(f"✅ Successfully imported {stats['departmental_results']:,} departmental results")
        logger.success(f"✅ Successfully imported {stats['program_expenditures']:,} program expenditures")
        logger.success(f"✅ Created {stats['departments']:,} department nodes")

        logger.info(banner)
        logger.info("GC INFOBASE DATA INGESTION CLOUD RUN JOB - COMPLETED")
        logger.info(banner)
        print()
    except Exception as exc:
        # Top-level boundary: log, dump the traceback, and signal failure
        # to the job runner through a non-zero exit status.
        logger.error(f"GC InfoBase data ingestion job failed: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    finally:
        # Runs on success and on the sys.exit(1) path alike.
        neo4j.close()
# Run the job only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()