#!/usr/bin/env python3
"""
Entry point script for news ingestion Cloud Run job.
Fetches Canadian political news from RSS feeds, extracts entity mentions
(MPs, bills, committees), and creates activity feed items for bookmarked entities.
"""
import os
import sys
import argparse
import logging
# Add packages to path
sys.path.insert(0, '/app/packages/data-pipeline')
sys.path.insert(0, '/app/packages/fedmcp/src')
from fedmcp_pipeline.ingest.news import NewsIngestionPipeline
# Set up root logging at INFO level with a timestamp / logger name / severity
# prefix on every record.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used throughout this script, named after the current module.
logger = logging.getLogger(__name__)
def _non_negative_int(value):
    """Argparse ``type`` callback: parse *value* as an integer that is >= 0.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is negative.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer value: {value!r}") from None
    if number < 0:
        raise argparse.ArgumentTypeError(
            f"must be a non-negative integer, got {number}"
        )
    return number


def _parse_args(argv=None):
    """Build the CLI parser and parse *argv* (argparse uses ``sys.argv[1:]`` when None)."""
    parser = argparse.ArgumentParser(
        description='Ingest Canadian political news from RSS feeds'
    )
    parser.add_argument(
        '--source',
        type=str,
        help='Specific news source ID to fetch (e.g., cbc, globe). '
             'If not specified, fetches all active sources.',
    )
    parser.add_argument(
        '--limit',
        # A negative "maximum" is meaningless, so reject it at parse time
        # instead of forwarding it to the pipeline.
        type=_non_negative_int,
        default=50,
        help='Maximum articles to process per source (default: 50)',
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Parse and extract entities without storing to database',
    )
    return parser.parse_args(argv)


def _log_run_summary(stats):
    """Log the counters from the pipeline's stats mapping; missing keys are shown as 0."""
    banner = "=" * 60
    counters = (
        ("Sources processed", 'sources_processed'),
        ("Articles fetched", 'articles_fetched'),
        ("New articles", 'new_articles'),
        ("Entities extracted", 'entities_extracted'),
        ("Activity items created", 'activity_items_created'),
        ("Errors", 'errors'),
    )
    logger.info(banner)
    logger.info("INGESTION COMPLETE")
    logger.info(banner)
    for label, key in counters:
        logger.info("%s: %s", label, stats.get(key, 0))
    logger.info(banner)


def main(argv=None):
    """Run the news ingestion job from the command line.

    Parses the CLI options, reads connection settings from the environment
    (NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, SUPABASE_URL,
    SUPABASE_SERVICE_ROLE_KEY), runs ``NewsIngestionPipeline`` and logs the
    statistics it returns.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default), argparse reads ``sys.argv[1:]``.

    Exit status:
        1 if NEO4J_PASSWORD is unset or empty, if the pipeline reports one or
        more errors, or if an exception is raised while running it.
        2 (raised by argparse) if the arguments are invalid.
        Missing Supabase settings only produce a warning; the job continues.
    """
    args = _parse_args(argv)

    # Connection settings come from the environment; only the Neo4j URI and
    # username have defaults.
    neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
    neo4j_username = os.environ.get('NEO4J_USERNAME', 'neo4j')
    neo4j_password = os.environ.get('NEO4J_PASSWORD')
    supabase_url = os.environ.get('SUPABASE_URL')
    supabase_key = os.environ.get('SUPABASE_SERVICE_ROLE_KEY')

    if not neo4j_password:
        logger.error("NEO4J_PASSWORD environment variable is required")
        sys.exit(1)
    if not supabase_url or not supabase_key:
        logger.warning("Supabase credentials not set - activity items will not be created")

    banner = "=" * 60
    logger.info(banner)
    logger.info("NEWS INGESTION PIPELINE")
    logger.info(banner)
    logger.info("Neo4j URI: %s", neo4j_uri)
    logger.info("Source filter: %s", args.source or 'all active sources')
    logger.info("Article limit per source: %s", args.limit)
    logger.info("Dry run: %s", args.dry_run)
    logger.info(banner)

    try:
        pipeline = NewsIngestionPipeline(
            neo4j_uri=neo4j_uri,
            neo4j_username=neo4j_username,
            neo4j_password=neo4j_password,
            supabase_url=supabase_url,
            supabase_key=supabase_key,
        )
        stats = pipeline.run(
            source_id=args.source,
            limit_per_source=args.limit,
            dry_run=args.dry_run,
        )
        _log_run_summary(stats)

        # Signal partial failure to the job runner through the exit status.
        # SystemExit does not derive from Exception, so the handler below
        # does not intercept this exit.
        error_count = stats.get('errors', 0)
        if error_count > 0:
            logger.warning("Completed with %s errors", error_count)
            sys.exit(1)
        logger.info("News ingestion completed successfully")
    except Exception as e:
        # Top-level boundary: log the traceback and fail the job.
        logger.exception("Fatal error during news ingestion: %s", e)
        sys.exit(1)
# Run the ingestion job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()