diff --git a/explorer.py b/explorer.py index 597ce2b..d42fafc 100644 --- a/explorer.py +++ b/explorer.py @@ -84,7 +84,6 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset( "CDA", "SP", "ChristenUnie", - "CU", # alias for ChristenUnie "SGP", "Volt", "DENK", @@ -94,6 +93,17 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset( } ) +# Normalize variant party names to canonical display names in CURRENT_PARLIAMENT_PARTIES +_PARTY_NORMALIZE: dict[str, str] = { + "Nieuw Sociaal Contract": "NSC", + "CU": "ChristenUnie", + "GL": "GroenLinks-PvdA", + "GroenLinks": "GroenLinks-PvdA", + "PvdA": "GroenLinks-PvdA", + "Gündoğan": "GroenLinks-PvdA", # briefly sat with GL-PvdA faction + "Lid Keijzer": "BBB", # Keijzer left CDA, joined BBB cabinet +} + # --------------------------------------------------------------------------- # Cached loaders @@ -239,17 +249,20 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]: ).fetchall() party_counts: Dict[str, Dict[str, int]] = {} for mp_name, party, n in party_rows: - party_counts.setdefault(mp_name, {})[party] = n + # Normalize variant names to canonical party names + canonical = _PARTY_NORMALIZE.get(party, party) + party_counts.setdefault(mp_name, {})[canonical] = ( + party_counts.setdefault(mp_name, {}).get(canonical, 0) + n + ) mp_party: Dict[str, str] = { mp: max(counts, key=counts.__getitem__) for mp, counts in party_counts.items() } - # Individual MP vectors from current_parliament + # Individual MP vectors from current_parliament (all mp rows are now individual MPs) rows = con.execute( "SELECT entity_id, vector FROM svd_vectors " - "WHERE entity_type='mp' AND window_id='current_parliament' " - "AND entity_id LIKE '%,%'" + "WHERE entity_type='mp' AND window_id='current_parliament'" ).fetchall() party_vecs: Dict[str, list] = {} @@ -994,20 +1007,17 @@ def build_svd_components_tab(db_path: str) -> None: # Produced by per-axis analysis of all 10 unique top motions (zero cross-axis overlap). SVD_THEMES: dict[int, dict[str, str]] = { 1: { - "label": "Constructief coalitiebeleid versus radicale PVV-migratiepolitiek", + "label": "Links-rechts hoofdas: progressief versus conservatief-nationalistisch", "explanation": ( - "Deze as scheidt brede, constructieve coalitiemoties (CDA, VVD, NSC, CU, D66, GL-PvdA) " - "over uiteenlopende beleidsterreinen — van zorg en defensie tot multilateralisme en " - "digitale inclusie — van de harde PVV-motie die onmiddellijke uitzetting van Syriërs " - "en Oekraïense mannen eist. De positieve pool vertegenwoordigt het brede parlementaire " - "midden dat procedureel en inhoudelijk samenwerkt, terwijl de negatieve pool de " - "isolationistische en radicaal-nationalistische positie van de PVV symboliseert. " - "Deze dimensie is politiek betekenisvol omdat zij de kloof blootlegt tussen mainstream " - "coalitiepolitiek en populistisch-nationalistisch obstructionisme op het vlak van migratie." + "De dominante dimensie van het parlement: partijen aan de linkerkant (PvdD, GL-PvdA, " + "DENK, SP) stemmen progressief — voor sociale voorzieningen, klimaat, internationale " + "solidariteit — terwijl partijen aan de rechterkant (PVV, NSC, BBB, SGP) inzetten op " + "nationaal belang, migratiebeheer en conservatieve waarden. Dit is de klassieke " + "links-rechts tegenstelling die het meeste verschil in stemgedrag verklaart." ), - "positive_pole": "Breed coalitiebeleid: zorg, defensie, multilateralisme, inclusie", - "negative_pole": "Radicale PVV-eis tot onmiddellijke uitzetting migranten", - "flip": True, + "positive_pole": "Nationalistisch-conservatief: PVV, NSC, BBB, SGP", + "negative_pole": "Progressief-links: PvdD, GL-PvdA, DENK, SP", + "flip": False, }, 2: { "label": "Nationalistisch migratiebeleid versus progressief internationaal solidariteitsdenken", diff --git a/pipeline/svd_pipeline.py b/pipeline/svd_pipeline.py index 13fcbb6..d9f680f 100644 --- a/pipeline/svd_pipeline.py +++ b/pipeline/svd_pipeline.py @@ -46,6 +46,119 @@ VOTE_MAP = { "Blanco": 0.0, } +# Mapping from short party names (as they appear in party-level vote rows) +# to canonical party names in mp_metadata. Parties not listed here are either +# already matching or are skipped (no valid mp_metadata coverage). +_PARTY_NAME_MAP = { + "NSC": "Nieuw Sociaal Contract", + "Gündogan": "Gündoğan", + "Keijzer": "Lid Keijzer", + # Pre-merger: both GroenLinks and PvdA votes map to the merged faction + "GroenLinks": "GroenLinks-PvdA", + "PvdA": "GroenLinks-PvdA", + # Omtzigt initially sat alone before founding NSC + "Omtzigt": "Nieuw Sociaal Contract", +} + +# Party names for which we have no usable mp_metadata (tiny noise, skip expansion) +_SKIP_PARTIES = {"Brinkman", "Bontes", "Krol", "Van Kooten-Arissen"} + + +def _build_expanded_rows( + db_path: str, start_date: str, end_date: str +) -> List[Tuple[int, str, str, str]]: + """Build vote rows expanding party-level votes to individual MPs. + + For motions that have only party-level vote records (mp_name is a party code, + not a 'Lastname, F.' individual), each party vote is expanded to all individual + MPs of that party who were active on the motion date (via mp_metadata). + + For motions that already have individual MP records, those rows are kept as-is. + + Returns list of (motion_id, mp_name, vote, date_str) tuples. + """ + conn = duckdb.connect(db_path, read_only=True) + try: + # Load all vote rows for the window + vote_rows = conn.execute( + "SELECT motion_id, mp_name, vote, date FROM mp_votes " + "WHERE date BETWEEN ? AND ?", + (start_date, end_date), + ).fetchall() + + # Load mp_metadata (name, party, van, tot_en_met) + meta_rows = conn.execute( + "SELECT mp_name, party, van, tot_en_met FROM mp_metadata" + ).fetchall() + finally: + conn.close() + + if not vote_rows: + return [] + + # Build mp_metadata lookup: canonical_party -> list of (mp_name, van, tot_en_met) + from collections import defaultdict + import datetime + + party_to_mps: Dict[str, List[Tuple]] = defaultdict(list) + for mp_name, party, van, tot_en_met in meta_rows: + if party and mp_name: + party_to_mps[party].append((mp_name, van, tot_en_met)) + + def get_active_mps(canonical_party: str, motion_date) -> List[str]: + """Return MP names active in canonical_party on motion_date.""" + result = [] + for mp_name, van, tot_en_met in party_to_mps.get(canonical_party, []): + if van is None or van > motion_date: + continue + if tot_en_met is not None and tot_en_met < motion_date: + continue + result.append(mp_name) + return result + + # Group rows by motion_id, separate individual vs party rows + from collections import defaultdict as _dd + + motion_individual: Dict[int, List] = _dd(list) + motion_party: Dict[int, List] = _dd(list) + + for motion_id, mp_name, vote, date in vote_rows: + mid = int(motion_id) + # Individual MPs have comma in name (e.g. "Bergkamp, V.A.") + if "," in str(mp_name): + motion_individual[mid].append((mp_name, vote, date)) + else: + motion_party[mid].append((mp_name, vote, date)) + + # Build the final expanded rows + expanded: List[Tuple[int, str, str, str]] = [] + + all_motion_ids = set(motion_individual.keys()) | set(motion_party.keys()) + for mid in all_motion_ids: + if mid in motion_individual and motion_individual[mid]: + # Motion already has individual MP rows — use them directly, skip party rows + for mp_name, vote, date in motion_individual[mid]: + expanded.append((mid, mp_name, vote, str(date))) + else: + # Party-only motion — expand each party row to individual MPs + for party_name, vote, date in motion_party[mid]: + if party_name in _SKIP_PARTIES: + continue + canonical = _PARTY_NAME_MAP.get(party_name, party_name) + active_mps = get_active_mps(canonical, date) + if not active_mps: + _logger.debug( + "No active MPs found for party %s (canonical: %s) on %s", + party_name, + canonical, + date, + ) + continue + for mp_name in active_mps: + expanded.append((mid, mp_name, vote, str(date))) + + return expanded + def _safe_k(mat: np.ndarray, k: int) -> int: """Return a safe k for svds: must be < min(mat.shape).""" @@ -162,21 +275,18 @@ def compute_svd_for_window( Opens the DB in read-only mode (allows concurrent parallel workers). Does NOT write to the DB — caller is responsible for persisting results. + Party-level vote rows are expanded to individual MP rows using mp_metadata + so that the vote matrix contains only individual MPs (no party aggregates). + This prevents the block-diagonal structure that causes SVD axes to be disjoint. + Returns dict with keys: window_id, k_used, mp_rows, motion_rows where *_rows are List[Tuple[entity_type, entity_id, vector, model]] """ empty = {"window_id": window_id, "k_used": 0, "mp_rows": [], "motion_rows": []} - # Read vote matrix using a read-only connection — safe to run in parallel. - conn = duckdb.connect(db_path, read_only=True) - try: - rows = conn.execute( - "SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?", - (start_date, end_date), - ).fetchall() - finally: - conn.close() + # Build expanded rows: party votes → individual MP votes + rows = _build_expanded_rows(db_path, start_date, end_date) if not rows: return empty @@ -191,7 +301,7 @@ def compute_svd_for_window( mp_index = {name: i for i, name in enumerate(mp_names)} motion_index = {mid: j for j, mid in enumerate(motion_ids)} - for motion_id, mp_name, vote in rows: + for motion_id, mp_name, vote, _date in rows: i = mp_index[mp_name] j = motion_index[int(motion_id)] val = VOTE_MAP.get(