"""
Opposition Leader Ingestion Module.
Standalone module for ingesting the Opposition Leader role from House Officers data.
This is separate from parliament.py to avoid fedmcp dependencies in jobs that don't need them.
"""
import csv
import io
import re
from datetime import datetime, timezone
from typing import Optional

import requests

from ..utils.neo4j_client import Neo4jClient
from ..utils.progress import logger
# Base URL of the House of Commons proactive-disclosure pages for House Officers.
_HOUSE_OFFICERS_BASE_URL = "https://www.ourcommons.ca/proactivedisclosure/en/house-officers"

# Link to the summary-expenditures CSV; the captured group is the report UUID.
_HOUSE_OFFICERS_CSV_LINK_RE = re.compile(
    r'/house-officers/([a-f0-9\-]{36})/summary-expenditures/csv'
)

# Honorifics removed from officer names before matching. 'Right Hon. ' must
# come before 'Hon. ' so the longer prefix is removed as a whole.
_HONORIFICS = ('Right Hon. ', 'Hon. ', 'Dr. ', 'Mr. ', 'Mrs. ', 'Ms. ')

# Timeout (seconds) applied to every HTTP request made by this module.
_HTTP_TIMEOUT_SECONDS = 30

# How many quarters to look back (8 quarters = 2 years).
_QUARTERS_TO_TRY = 8


def _current_fiscal_quarter(now):
    """Return ``(fiscal_year, quarter)`` for the datetime *now*.

    The fiscal year runs April 1 to March 31 and is labelled by the calendar
    year in which it ends (FY2025-26 is 2026).
    Q1 = Apr-Jun, Q2 = Jul-Sep, Q3 = Oct-Dec, Q4 = Jan-Mar.
    """
    fiscal_year = now.year + 1 if now.month >= 4 else now.year
    # Shift months so that April is 0, then bucket into groups of three.
    quarter = ((now.month - 4) % 12) // 3 + 1
    return fiscal_year, quarter


def _recent_quarters(fiscal_year, quarter, count):
    """Return *count* ``(fiscal_year, quarter)`` pairs, newest first."""
    quarters = []
    fy, q = fiscal_year, quarter
    for _ in range(count):
        quarters.append((fy, q))
        q -= 1
        if q < 1:
            q = 4
            fy -= 1
    return quarters


def _fetch_house_officers_csv(quarters) -> Optional[str]:
    """Return the text of the first available House Officers CSV, or None.

    Each quarter's page is fetched to discover the CSV link; quarters whose
    page or CSV is unavailable (non-200, no link, network error, undecodable
    or empty body) are logged and skipped.
    """
    for fy, q in quarters:
        page_url = f"{_HOUSE_OFFICERS_BASE_URL}/{fy}/{q}"
        logger.info(f"Trying House Officers page: {page_url}")
        try:
            page_response = requests.get(page_url, timeout=_HTTP_TIMEOUT_SECONDS)
            if page_response.status_code != 200:
                logger.info(f"Page returned {page_response.status_code}, trying next...")
                continue
            # Extract CSV UUID from the page
            link = _HOUSE_OFFICERS_CSV_LINK_RE.search(page_response.text)
            if not link:
                logger.info("Could not find CSV link on page, trying next...")
                continue
            csv_url = f"{_HOUSE_OFFICERS_BASE_URL}/{link.group(1)}/summary-expenditures/csv"
            logger.info(f"Fetching CSV: {csv_url}")
            csv_response = requests.get(csv_url, timeout=_HTTP_TIMEOUT_SECONDS)
            if csv_response.status_code != 200:
                logger.info(f"CSV returned {csv_response.status_code}, trying next...")
                continue
            # 'utf-8-sig' drops a leading byte-order mark if one is present.
            csv_data = csv_response.content.decode('utf-8-sig')
        except (requests.RequestException, UnicodeDecodeError) as e:
            logger.warning(f"Error fetching FY{fy} Q{q}: {e}")
            continue
        if not csv_data.strip():
            # An empty body is as useless as a missing one; keep looking.
            logger.info("CSV was empty, trying next...")
            continue
        logger.info(f"Found House Officers data for FY{fy} Q{q}")
        return csv_data
    return None


def _find_opposition_leader_candidates(csv_data):
    """Return the rows of *csv_data* whose role is "Leader, Official Opposition".

    "Leader, Other Opposition Party" rows do not contain the search phrase,
    and "House Leader" rows are excluded explicitly - we want the party
    leader. The file may list several people (current and former), so all
    matches are returned as dicts with 'name', 'role' and 'caucus' keys.
    Rows without a name are dropped because they cannot be matched to an MP.
    """
    candidates = []
    for row in csv.DictReader(io.StringIO(csv_data)):
        # DictReader fills missing fields of short rows with None, so the
        # default of row.get() is not enough to guarantee a string.
        role = (row.get('Role') or '').strip()
        role_lower = role.lower()
        if 'leader, official opposition' not in role_lower or 'house leader' in role_lower:
            continue
        name = (row.get('Name') or '').strip()
        if not name:
            continue
        candidates.append({
            'name': name,
            'role': role,
            'caucus': (row.get('Caucus') or '').strip(),
        })
    return candidates


def _leader_priority(candidate):
    """Sort key that puts the current leader first and known former leaders last.

    Pierre Poilievre became Conservative leader in Sep 2022 (current as of
    Jan 2026). Andrew Scheer and Erin O'Toole are former leaders who may still
    appear in historical expense data. Unknown names get middle priority.
    NOTE(review): these names are hard-coded and must be updated when the
    leadership changes.
    """
    name_lower = candidate['name'].lower()
    if 'poilievre' in name_lower:
        return 0
    if 'scheer' in name_lower:
        return 2
    if "o'toole" in name_lower:
        return 3
    return 1


def _split_officer_name(raw_name):
    """Return ``(full_name, display_name, first_name, last_name)`` for *raw_name*.

    Honorifics are removed first. House Officers names are usually either
    "Last, First" or "First Last"; ``full_name`` is the cleaned name as given
    and ``display_name`` is always in "First Last" order, so both can be tried
    for an exact match.
    """
    name = raw_name
    for honorific in _HONORIFICS:
        name = name.replace(honorific, '')
    name = name.strip()
    if ',' in name:
        parts = name.split(',')
        last_name = parts[0].strip()
        first_name = parts[1].strip()
        display_name = f"{first_name} {last_name}".strip()
    else:
        parts = name.split()
        first_name = parts[0] if parts else ''
        last_name = parts[-1] if len(parts) > 1 else ''
        display_name = name
    return name, display_name, first_name, last_name


def _match_candidate_to_mp(neo4j_client, candidate):
    """Return the best current-MP record for *candidate*, or None if no match.

    The record has 'mp_id', 'mp_name' and 'person_id' keys as returned by the
    Cypher query below.
    """
    full_name, display_name, first_name, last_name = _split_officer_name(candidate['name'])
    logger.info(f"Attempting to match: '{candidate['name']}' -> first_name='{first_name}', last_name='{last_name}'")
    if not full_name:
        # Nothing left after removing honorifics; an empty name must not be
        # sent to the query, where it could match MPs with empty fields.
        logger.info(f"  No current MP match for {candidate['name']}, trying next candidate...")
        return None

    match_query = """
    MATCH (mp:MP)
    WHERE mp.current = true
      AND (
        // Exact name match, in either the given form or "First Last" order
        toLower(mp.name) = toLower($full_name)
        OR toLower(mp.name) = toLower($display_name)
        // Match on family name equal to the last name
        OR ($last_name <> '' AND mp.family_name IS NOT NULL
            AND toLower(mp.family_name) = toLower($last_name))
        // Match on name containing last name (for compound names)
        OR (toLower(mp.name) CONTAINS toLower($last_name) AND size($last_name) > 4)
      )
    RETURN mp.id AS mp_id, mp.name AS mp_name, mp.parl_mp_id AS person_id
    ORDER BY
      CASE
        WHEN toLower(mp.name) = toLower($full_name)
          OR toLower(mp.name) = toLower($display_name) THEN 1
        WHEN mp.family_name IS NOT NULL AND toLower(mp.family_name) = toLower($last_name) THEN 2
        ELSE 3
      END
    LIMIT 5
    """
    matches = neo4j_client.run_query(match_query, {
        'full_name': full_name,
        'display_name': display_name,
        'last_name': last_name,
    })
    if not matches:
        logger.info(f"  No current MP match for {candidate['name']}, trying next candidate...")
        return None

    # Log all matches for debugging
    for m in matches:
        logger.info(f"  Match: {m['mp_name']} (id={m['mp_id']}, person_id={m['person_id']})")
    # Results are ordered best-first by the query.
    return matches[0]


def ingest_opposition_leader(neo4j_client: Neo4jClient) -> int:
    """
    Ingest Opposition Leader from House Officers expense data.

    The Opposition Leader is not included in the Cabinet XML, so we fetch it
    from the House Officers quarterly expense data which includes role information.
    Uses name matching to link to existing MP nodes.

    The current fiscal quarter is tried first, then earlier quarters going
    back up to 2 years, so data is found even if the current quarter has not
    been published yet. On success, Opposition Leader roles recorded for any
    other person are deleted and a Role node is merged and linked to the MP.

    Args:
        neo4j_client: Neo4j client instance

    Returns:
        Number of roles created (0 or 1)
    """
    logger.info("Fetching Opposition Leader from House Officers data...")

    fiscal_year, quarter = _current_fiscal_quarter(datetime.now(timezone.utc))
    csv_data = _fetch_house_officers_csv(
        _recent_quarters(fiscal_year, quarter, _QUARTERS_TO_TRY)
    )
    if not csv_data:
        logger.warning("Could not find House Officers data in any recent quarter")
        return 0

    candidates = _find_opposition_leader_candidates(csv_data)
    if not candidates:
        logger.warning("Opposition Leader not found in House Officers data")
        return 0

    logger.info(f"Found {len(candidates)} Opposition Leader candidate(s)")
    for c in candidates:
        logger.info(f"  Candidate: {c['name']}")

    # Sort candidates to prioritize the CURRENT leader
    candidates.sort(key=_leader_priority)
    logger.info("Candidates after sorting by current leadership:")
    for c in candidates:
        logger.info(f"  {c['name']}")

    # Try to match each candidate to a current MP, prefer the first match
    matched_mp = None
    for candidate in candidates:
        matched_mp = _match_candidate_to_mp(neo4j_client, candidate)
        if matched_mp:
            logger.info(f"Matched to current MP: {matched_mp['mp_name']} (person_id: {matched_mp['person_id']})")
            break

    if not matched_mp:
        logger.warning("Could not match any Opposition Leader candidate to a current MP in Neo4j")
        return 0

    person_id = matched_mp['person_id']
    if person_id is None:
        # Without an id the cleanup comparison matches nothing and the role
        # could not be linked to the MP, so stop before writing anything.
        logger.warning(f"Matched MP {matched_mp['mp_name']} has no parl_mp_id; skipping Opposition Leader role")
        return 0

    # First, clean up any existing Opposition Leader roles for OTHER MPs
    # (e.g., if role was previously created for wrong person)
    cleanup_query = """
    MATCH (role:Role)
    WHERE role.role_type = 'Opposition Leader'
      AND role.person_id <> $person_id
    DETACH DELETE role
    """
    neo4j_client.run_query(cleanup_query, {'person_id': person_id})
    logger.info("Cleaned up any old Opposition Leader roles")

    # Create Role node with MERGE (idempotent)
    role_id = f"opposition-leader-{person_id}"
    create_query = """
    MERGE (role:Role {id: $role_id})
    SET role.title = 'Leader of the Official Opposition',
        role.role_type = 'Opposition Leader',
        role.is_current = true,
        role.person_id = $person_id,
        role.updated_at = datetime()
    WITH role
    MATCH (mp:MP {parl_mp_id: $person_id})
    MERGE (mp)-[:HOLDS_ROLE]->(role)
    RETURN role.id AS role_id, mp.name AS mp_name, mp.id AS mp_slug
    """
    result = neo4j_client.run_query(create_query, {
        'role_id': role_id,
        'person_id': person_id,
    })
    if not (result and result[0].get('mp_name')):
        logger.warning(f"Role created but MP relationship may be missing for person_id={person_id}")
        return 0

    logger.success(f"✅ Created Opposition Leader role for {result[0]['mp_name']} (slug: {result[0]['mp_slug']})")

    # Verify the role was created correctly. This is informational only: the
    # role counts as created even when verification finds nothing.
    verify_query = """
    MATCH (mp:MP)-[:HOLDS_ROLE]->(role:Role {id: $role_id})
    MATCH (mp)-[:MEMBER_OF]->(party:Party)
    RETURN mp.name AS mp_name, role.role_type AS role_type, role.is_current AS is_current,
           party.code AS party_code
    """
    verify_result = neo4j_client.run_query(verify_query, {'role_id': role_id})
    if verify_result:
        v = verify_result[0]
        logger.info(f"Verified: {v['mp_name']} holds {v['role_type']} (is_current={v['is_current']}, party={v['party_code']})")
    else:
        logger.warning("Could not verify role creation - MP may be missing MEMBER_OF relationship")
    return 1