
FedMCP - Federal Parliamentary Information

import_november_hansard_from_xml.py (6.55 kB)
#!/usr/bin/env python3
"""
Import November 2025 Hansard using OurCommonsHansardClient (XML from DocumentViewer).

This script:
1. Finds November 2025 debate dates from OpenParliament API
2. Fetches Hansard XML from ourcommons.ca DocumentViewer
3. Parses XML into Document and Statement nodes
4. Imports to Neo4j production database
"""

import os
import sys
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Any

# Add packages to path
sys.path.insert(0, str(Path(__file__).parent / "packages" / "fedmcp" / "src"))
sys.path.insert(0, str(Path(__file__).parent / "packages" / "data-pipeline"))

from fedmcp.clients.openparliament import OpenParliamentClient
from fedmcp.clients.ourcommons import OurCommonsHansardClient
from fedmcp_pipeline.utils.neo4j_client import Neo4jClient
from fedmcp_pipeline.utils.progress import logger


def find_november_debate_urls(op_client: OpenParliamentClient) -> List[Dict[str, Any]]:
    """Find all debate URLs for November 2025."""
    logger.info("Finding November 2025 debates from OpenParliament API...")

    debates = []
    for debate in op_client.list_debates():
        debate_date = debate.get("date", "")
        # Filter for November 2025
        if debate_date >= "2025-11-01" and debate_date < "2025-12-01":
            debates.append({
                "date": debate_date,
                "url": debate.get("url"),
                "hansard_url": debate.get("hansard_url"),
            })

    debates.sort(key=lambda x: x["date"])
    logger.info(f"Found {len(debates)} debates in November 2025")
    return debates


def parse_hansard_xml(hansard_client: OurCommonsHansardClient, slug: str) -> Any:
    """Fetch and parse Hansard XML for a sitting."""
    try:
        sitting = hansard_client.get_sitting(slug, parse=True)
        return sitting
    except Exception as e:
        logger.warning(f"Failed to parse Hansard for {slug}: {e}")
        return None


def import_sitting_to_neo4j(neo4j: Neo4jClient, sitting: Any, date: str) -> Dict[str, int]:
    """Import a parsed Hansard sitting to Neo4j."""
    stats = {"documents": 0, "statements": 0}

    if not sitting or not sitting.sections:
        logger.warning(f"No content for sitting on {date}")
        return stats

    # Create Document node
    document_id = f"hansard-{date}"
    document_data = [{
        "id": document_id,
        "date": date,
        "session_id": "45-1",  # Current parliament session
        "document_type": "D",  # Debates
        "public": True,
        "source": "ourcommons_xml",
        "updated_at": datetime.utcnow().isoformat(),
    }]
    neo4j.batch_merge_nodes("Document", document_data, merge_keys=["id"])
    stats["documents"] = 1
    logger.info(f"  Created Document: {document_id}")

    # Create Statement nodes from speeches
    statements_data = []
    stmt_counter = 0

    for section in sitting.sections:
        h1 = section.heading_en or ""
        for speech in section.speeches:
            stmt_counter += 1
            statement_id = f"{document_id}-stmt-{stmt_counter}"
            statements_data.append({
                "id": statement_id,
                "document_id": document_id,
                "time": f"{date}T{speech.time}" if speech.time else f"{date}T12:00:00",
                "who_en": speech.speaker_en or "",
                "who_fr": speech.speaker_fr or "",
                "content_en": "\n\n".join(speech.paragraphs_en) if speech.paragraphs_en else "",
                "content_fr": "\n\n".join(speech.paragraphs_fr) if speech.paragraphs_fr else "",
                "h1_en": h1,
                "h2_en": speech.topic_en or "",
                "statement_type": "speech",
                "wordcount": len(" ".join(speech.paragraphs_en).split()) if speech.paragraphs_en else 0,
                "procedural": False,
                "updated_at": datetime.utcnow().isoformat(),
            })

    if statements_data:
        neo4j.batch_merge_nodes("Statement", statements_data, merge_keys=["id"], batch_size=1000)
        stats["statements"] = len(statements_data)
        logger.info(f"  Created {len(statements_data)} statements")

    # Create PART_OF relationships
    if statements_data:
        rel_query = """
        MATCH (d:Document {id: $doc_id})
        MATCH (s:Statement)
        WHERE s.document_id = $doc_id AND NOT exists((s)-[:PART_OF]->())
        MERGE (s)-[:PART_OF]->(d)
        """
        neo4j.run_query(rel_query, {"doc_id": document_id})
        logger.info("  Linked statements to document")

    return stats


def main():
    logger.info("=" * 80)
    logger.info("NOVEMBER 2025 HANSARD IMPORT (from XML)")
    logger.info("=" * 80)

    # Initialize clients
    logger.info("Initializing clients...")
    op_client = OpenParliamentClient()
    hansard_client = OurCommonsHansardClient()

    neo4j_uri = os.getenv("NEO4J_URI", "bolt://10.128.0.3:7687")
    neo4j_user = os.getenv("NEO4J_USERNAME", "neo4j")
    neo4j_password = os.getenv("NEO4J_PASSWORD", "canadagpt2024")
    neo4j = Neo4jClient(uri=neo4j_uri, user=neo4j_user, password=neo4j_password)
    neo4j.test_connection()

    try:
        # Find November debates
        debates = find_november_debate_urls(op_client)
        if not debates:
            logger.warning("No November 2025 debates found!")
            return

        # Import each debate
        total_docs = 0
        total_stmts = 0

        for debate in debates:
            logger.info(f"\nProcessing {debate['date']}...")

            # Extract Hansard slug from URL
            hansard_url = debate.get("hansard_url", "")
            if not hansard_url:
                logger.warning(f"  No Hansard URL for {debate['date']}")
                continue

            # Parse "latest/hansard" or date-based slug
            slug = hansard_url.split("/en/")[-1] if "/en/" in hansard_url else "latest/hansard"

            # Fetch and parse XML
            sitting = parse_hansard_xml(hansard_client, slug)
            if sitting:
                stats = import_sitting_to_neo4j(neo4j, sitting, debate["date"])
                total_docs += stats["documents"]
                total_stmts += stats["statements"]

        # Summary
        logger.info("=" * 80)
        logger.success("✅ NOVEMBER 2025 IMPORT COMPLETE")
        logger.info(f"Documents created: {total_docs}")
        logger.info(f"Statements created: {total_stmts}")
        logger.info("=" * 80)

    finally:
        neo4j.close()


if __name__ == "__main__":
    main()
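
A quick way to sanity-check an import like this is to count the Statement nodes linked to each November Document. The sketch below is illustrative only: it uses the official neo4j Python driver rather than the project's Neo4jClient wrapper, and the only things taken from the script above are the node labels, the PART_OF relationship, the document_id/date properties, and the NEO4J_* environment variables; everything else is an assumption.

#!/usr/bin/env python3
"""Sketch: verify Hansard import counts (assumes the schema created by the import script)."""
import os

from neo4j import GraphDatabase  # official driver, standing in for the project's Neo4jClient

uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")          # placeholder default, not the production host
auth = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", ""))

# Count statements per November 2025 Document via the PART_OF relationship.
query = """
MATCH (d:Document)
WHERE d.date >= '2025-11-01' AND d.date < '2025-12-01'
OPTIONAL MATCH (s:Statement)-[:PART_OF]->(d)
RETURN d.id AS document, d.date AS date, count(s) AS statements
ORDER BY d.date
"""

with GraphDatabase.driver(uri, auth=auth) as driver, driver.session() as session:
    for record in session.run(query):
        print(f"{record['date']}  {record['document']}  {record['statements']} statements")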

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/northernvariables/FedMCP'
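
The same endpoint can be called from code. A minimal sketch in Python using the requests library; the URL is the one shown above, and the response is printed as raw JSON rather than assuming any particular field layout:

import requests

# Fetch the FedMCP server entry from the Glama MCP directory API.
url = "https://glama.ai/api/mcp/v1/servers/northernvariables/FedMCP"
response = requests.get(url, timeout=30)
response.raise_for_status()

# Print the parsed JSON as-is; no specific response fields are assumed here.
print(response.json())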

If you have feedback or need assistance with the MCP directory API, please join our Discord server.