"""Unit tests for MP name matching utilities.
These functions are critical for linking parliamentary data (Hansard statements,
votes, committee testimony) to the correct MP nodes in Neo4j.
"""
import pytest
import sys
from pathlib import Path
# Put the parent of this file's directory on sys.path so the fedmcp_pipeline import below resolves
sys.path.insert(0, str(Path(__file__).parent.parent))
from fedmcp_pipeline.ingest.written_questions import (
normalize_name,
match_mp_name,
NICKNAME_MAPPING,
)
class TestNormalizeName:
    """Tests for the normalize_name function.

    Pins the normalized form these tests expect: accents stripped,
    lowercase, periods/commas removed, whitespace collapsed and trimmed,
    hyphens kept, and empty/None input mapped to an empty string.
    """

    def test_removes_accents(self):
        """Should remove diacritics/accents from characters."""
        assert normalize_name("Café") == "cafe"
        assert normalize_name("Élisabeth") == "elisabeth"
        assert normalize_name("François") == "francois"
        assert normalize_name("José") == "jose"
        assert normalize_name("Müller") == "muller"
        assert normalize_name("Björk") == "bjork"

    def test_handles_french_accents(self):
        """Should handle common French Canadian accented names."""
        assert normalize_name("André Bélanger") == "andre belanger"
        assert normalize_name("Hélène Côté") == "helene cote"
        assert normalize_name("René Lévesque") == "rene levesque"
        assert normalize_name("Gérald Tremblay") == "gerald tremblay"
        assert normalize_name("Montréal") == "montreal"

    def test_lowercase(self):
        """Should convert to lowercase."""
        assert normalize_name("JOHN SMITH") == "john smith"
        assert normalize_name("John Smith") == "john smith"
        assert normalize_name("jOhN sMiTh") == "john smith"

    def test_removes_punctuation(self):
        """Should remove periods and commas."""
        assert normalize_name("Hon. John Smith") == "hon john smith"
        assert normalize_name("Rt. Hon. Justin Trudeau") == "rt hon justin trudeau"
        assert normalize_name("Smith, John") == "smith john"
        # Periods are dropped rather than replaced by spaces, so initials fuse.
        assert normalize_name("J.D. Power") == "jd power"

    def test_normalizes_whitespace(self):
        """Should collapse whitespace runs and trim leading/trailing whitespace."""
        # The inputs must contain genuine runs of whitespace; a name that is
        # already single-spaced would pass even without any normalization.
        assert normalize_name("John    Smith") == "john smith"
        # Padding on both ends plus an interior run.
        assert normalize_name("  John   Smith  ") == "john smith"
        # Tabs and newlines are treated as whitespace as well.
        assert normalize_name("John\t\nSmith") == "john smith"

    def test_empty_and_none(self):
        """Should handle empty and None input."""
        assert normalize_name("") == ""
        assert normalize_name(None) == ""

    def test_preserves_hyphens(self):
        """Should preserve hyphens in compound names."""
        assert normalize_name("Jean-Pierre") == "jean-pierre"
        assert normalize_name("Mary-Jane Watson") == "mary-jane watson"
class TestNicknameMapping:
    """Checks on the contents of the NICKNAME_MAPPING dictionary."""

    def test_common_nicknames_exist(self):
        """A handful of everyday short forms must map to their formal names."""
        expected_pairs = (
            ('bob', 'robert'),
            ('bobby', 'robert'),
            ('bill', 'william'),
            ('jim', 'james'),
            ('mike', 'michael'),
            ('tony', 'anthony'),
        )
        for short_form, formal_form in expected_pairs:
            assert NICKNAME_MAPPING.get(short_form) == formal_form

    def test_all_nicknames_lowercase(self):
        """Every key and every value in the mapping must already be lowercase."""
        for key, value in NICKNAME_MAPPING.items():
            key_is_lower = key.lower() == key
            assert key_is_lower, f"Nickname '{key}' should be lowercase"
            value_is_lower = value.lower() == value
            assert value_is_lower, f"Formal name '{value}' should be lowercase"
class TestMatchMpName:
    """Behavioural tests for match_mp_name against a small sample mapping."""

    @pytest.fixture
    def mp_mapping(self):
        """Build the normalized-name -> MP id mapping shared by these tests."""
        plain_names = {
            'pierre poilievre': 'mp-pierre-poilievre',
            'justin trudeau': 'mp-justin-trudeau',
            'mark carney': 'mp-mark-carney',
        }
        # Keys are stored without accents, as the accent tests below expect.
        french_names = {
            'francois legault': 'mp-francois-legault',
            'helene leblanc': 'mp-helene-leblanc',
        }
        # Includes one key with a hyphenated given name.
        other_names = {
            'jagmeet singh': 'mp-jagmeet-singh',
            'yves-francois blanchet': 'mp-yves-francois-blanchet',
        }
        # Formal given names that the nickname tests resolve to.
        formal_names = {
            'robert smith': 'mp-robert-smith',
            'william jones': 'mp-william-jones',
            'james wilson': 'mp-james-wilson',
        }
        # Hyphenated surname, registered both in full and by its first part.
        hyphenated_surnames = {
            'mary simon-johnson': 'mp-mary-simon-johnson',
            'mary simon': 'mp-mary-simon-johnson',
        }
        # Merge in the original key order.
        return {
            **plain_names,
            **french_names,
            **other_names,
            **formal_names,
            **hyphenated_surnames,
        }

    def test_direct_match(self, mp_mapping):
        """A name that normalizes to an existing key is found."""
        cases = [
            ('Pierre Poilievre', 'mp-pierre-poilievre'),
            ('Justin Trudeau', 'mp-justin-trudeau'),
        ]
        for spoken, mp_id in cases:
            assert match_mp_name(spoken, mp_mapping) == mp_id

    def test_case_insensitive(self, mp_mapping):
        """Letter case of the input must not affect the result."""
        cases = [
            ('PIERRE POILIEVRE', 'mp-pierre-poilievre'),
            ('pierre poilievre', 'mp-pierre-poilievre'),
            ('JUSTIN trudeau', 'mp-justin-trudeau'),
        ]
        for spoken, mp_id in cases:
            assert match_mp_name(spoken, mp_mapping) == mp_id

    def test_accent_insensitive(self, mp_mapping):
        """Accented and unaccented spellings resolve to the same MP."""
        cases = [
            ('François Legault', 'mp-francois-legault'),
            ('Francois Legault', 'mp-francois-legault'),
            ('Hélène Leblanc', 'mp-helene-leblanc'),
        ]
        for spoken, mp_id in cases:
            assert match_mp_name(spoken, mp_mapping) == mp_id

    def test_first_last_extraction(self, mp_mapping):
        """A name with a middle name falls back to first + last."""
        # Extend the fixture for this test only.
        mp_mapping.update({'john doe': 'mp-john-doe'})
        found = match_mp_name('John Middle Doe', mp_mapping)
        assert found == 'mp-john-doe'

    def test_nickname_to_formal(self, mp_mapping):
        """Short forms of a given name resolve to the formal-name entry."""
        cases = [
            # Bob / Bobby -> Robert
            ('Bob Smith', 'mp-robert-smith'),
            ('Bobby Smith', 'mp-robert-smith'),
            # Bill -> William
            ('Bill Jones', 'mp-william-jones'),
            # Jim -> James
            ('Jim Wilson', 'mp-james-wilson'),
        ]
        for spoken, mp_id in cases:
            assert match_mp_name(spoken, mp_mapping) == mp_id

    def test_no_match_returns_none(self, mp_mapping):
        """Names absent from the mapping yield None."""
        for spoken in ('Unknown Person', 'John Unknown'):
            assert match_mp_name(spoken, mp_mapping) is None

    def test_empty_input(self, mp_mapping):
        """Empty string and None are tolerated and yield None."""
        for blank in ('', None):
            assert match_mp_name(blank, mp_mapping) is None

    def test_honorific_removal(self, mp_mapping):
        """Baseline lookup of a name without any honorific."""
        # NOTE(review): only the bare name is asserted here. An input such as
        # "Hon. Pierre Poilievre" is not exercised, and whether match_mp_name
        # resolves it cannot be determined from this file.
        found = match_mp_name('Pierre Poilievre', mp_mapping)
        assert found == 'mp-pierre-poilievre'

    def test_hyphenated_surname_partial(self, mp_mapping):
        """The first part of a hyphenated surname resolves to the full entry."""
        # The fixture registers 'mary simon' as its own key, so this is a
        # direct hit on that key.
        found = match_mp_name('Mary Simon', mp_mapping)
        assert found == 'mp-mary-simon-johnson'
class TestNameMatchingEdgeCases:
    """Unusual name shapes passed through match_mp_name."""

    @pytest.fixture
    def edge_case_mapping(self):
        """Build a mapping whose keys have atypical shapes."""
        mapping = {
            # One-word name
            'cher': 'mp-cher',
            # Many-word name
            'maria guadalupe gonzalez de la torre': 'mp-maria-torre',
            # Key keeps its apostrophe
            "sean o'connor": 'mp-sean-oconnor',
            # Hyphenated given name with a multi-word surname
            'jean-claude van damme': 'mp-jc-vd',
        }
        return mapping

    def test_single_word_name(self, edge_case_mapping):
        """A one-word name is matched."""
        found = match_mp_name('Cher', edge_case_mapping)
        assert found == 'mp-cher'

    def test_apostrophe_in_name(self, edge_case_mapping):
        """A name containing an apostrophe matches the key that keeps it."""
        # The fixture key is "sean o'connor", apostrophe included.
        found = match_mp_name("Sean O'Connor", edge_case_mapping)
        assert found == 'mp-sean-oconnor'

    def test_whitespace_only(self, edge_case_mapping):
        """Input consisting solely of whitespace yields None."""
        found = match_mp_name(' ', edge_case_mapping)
        assert found is None
class TestIntegration:
    """Scenario-style tests using a mapping shaped like real MP data."""

    @pytest.fixture
    def realistic_mp_mapping(self):
        """Build a mapping that resembles real Canadian MPs."""
        party_leaders = {
            'pierre poilievre': 'mp-pierre-poilievre',
            'jagmeet singh': 'mp-jagmeet-singh',
            # Registered both with and without the hyphen.
            'yves-francois blanchet': 'mp-yves-francois-blanchet',
            'yves francois blanchet': 'mp-yves-francois-blanchet',
            'elizabeth may': 'mp-elizabeth-may',
        }
        # Keys are stored with accents already removed.
        unaccented_french = {
            'steven guilbeault': 'mp-steven-guilbeault',
            'melanie joly': 'mp-melanie-joly',
            'francois-philippe champagne': 'mp-fp-champagne',
        }
        # 'mike chong' is its own key; there is no 'bob morrissey' key.
        name_variants = {
            'robert morrissey': 'mp-robert-morrissey',
            'michael chong': 'mp-michael-chong',
            'mike chong': 'mp-michael-chong',
        }
        return {**party_leaders, **unaccented_french, **name_variants}

    def test_hansard_speaker_names(self, realistic_mp_mapping):
        """Names in the forms used for Hansard speakers."""
        cases = [
            # Plain first + last
            ('Pierre Poilievre', 'mp-pierre-poilievre'),
            # Accented spelling
            ('Mélanie Joly', 'mp-melanie-joly'),
            # Short given name
            ('Mike Chong', 'mp-michael-chong'),
            # Formal given name
            ('Michael Chong', 'mp-michael-chong'),
        ]
        for spoken, mp_id in cases:
            assert match_mp_name(spoken, realistic_mp_mapping) == mp_id

    def test_vote_ballot_names(self, realistic_mp_mapping):
        """Names in the forms used on vote ballots."""
        cases = [
            # Formal name, as ballots often use
            ('Robert Morrissey', 'mp-robert-morrissey'),
            # Nickname form; the fixture holds only the 'robert' key
            ('Bob Morrissey', 'mp-robert-morrissey'),
        ]
        for spoken, mp_id in cases:
            assert match_mp_name(spoken, realistic_mp_mapping) == mp_id

    def test_committee_witness_names(self, realistic_mp_mapping):
        """Names in the forms used in committee records."""
        # Committee records may carry honorifics, but only the bare name is
        # asserted here.
        found = match_mp_name('Elizabeth May', realistic_mp_mapping)
        assert found == 'mp-elizabeth-may'