fix(svd): expand party votes to individual MPs before SVD computation

The mp_votes table contains both party-aggregate rows (e.g. 'PVV', 'NSC') and individual MP rows (e.g. 'Aardema, M.'). Running SVD on both together creates a block-diagonal vote matrix where party codes and individual MPs occupy disjoint SVD dimensions — causing dim 0 to be zero for all 421 MPs. Fix: _build_expanded_rows() converts every party-level vote to individual MP votes using mp_metadata date ranges (active MPs on motion date). Motions that already have individual MP records are kept as-is. A party name mapping handles NSC/Nieuw Sociaal Contract and other canonical name variants. Results for current_parliament: 517 individual MPs, all 8732 motions covered, dim 0 std=23.1 (was 0.0 for all MPs). PVV/NSC/BBB on positive end, SP/GL/PvdD on negative end — matches expected left-right political axis. All 11 annual windows (2016-2026) re-run with the new pipeline.
4 months ago · 6cb89126a7
parent a7517bb6ae
commit 6cb89126a7
2 changed files with 147 additions and 27 deletions
--- a/explorer.py
+++ b/explorer.py
@ -84,7 +84,6 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
        "CDA",
        "SP",
        "ChristenUnie",
-        "CU",  # alias for ChristenUnie
        "SGP",
        "Volt",
        "DENK",
@ -94,6 +93,17 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
    }
 )

+# Normalize variant party names to canonical display names in CURRENT_PARLIAMENT_PARTIES.
+# Keys are the variant spellings, values the canonical name they collapse into.
+# Lookups use `_PARTY_NORMALIZE.get(name, name)`, so a name that is not listed
+# here is used unchanged.
+_PARTY_NORMALIZE: dict[str, str] = {
+    "Nieuw Sociaal Contract": "NSC",
+    "CU": "ChristenUnie",
+    "GL": "GroenLinks-PvdA",
+    "GroenLinks": "GroenLinks-PvdA",
+    "PvdA": "GroenLinks-PvdA",
+    "Gündoğan": "GroenLinks-PvdA",  # briefly sat with GL-PvdA faction
+    "Lid Keijzer": "BBB",  # Keijzer left CDA, joined BBB cabinet
+}
+

 # ---------------------------------------------------------------------------
 # Cached loaders
@ -239,17 +249,20 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]:
        ).fetchall()
        party_counts: Dict[str, Dict[str, int]] = {}
        for mp_name, party, n in party_rows:
-            party_counts.setdefault(mp_name, {})[party] = n
+            # Normalize variant names to canonical party names
+            canonical = _PARTY_NORMALIZE.get(party, party)
+            party_counts.setdefault(mp_name, {})[canonical] = (
+                party_counts.setdefault(mp_name, {}).get(canonical, 0) + n
+            )
        mp_party: Dict[str, str] = {
            mp: max(counts, key=counts.__getitem__)
            for mp, counts in party_counts.items()
        }

-        # Individual MP vectors from current_parliament
+        # Individual MP vectors from current_parliament (all mp rows are now individual MPs)
        rows = con.execute(
            "SELECT entity_id, vector FROM svd_vectors "
-            "WHERE entity_type='mp' AND window_id='current_parliament' "
-            "AND entity_id LIKE '%,%'"
+            "WHERE entity_type='mp' AND window_id='current_parliament'"
        ).fetchall()

        party_vecs: Dict[str, list] = {}
@ -994,20 +1007,17 @@ def build_svd_components_tab(db_path: str) -> None:
    # Produced by per-axis analysis of all 10 unique top motions (zero cross-axis overlap).
    SVD_THEMES: dict[int, dict[str, str]] = {
        1: {
-            "label": "Constructief coalitiebeleid versus radicale PVV-migratiepolitiek",
+            "label": "Links-rechts hoofdas: progressief versus conservatief-nationalistisch",
            "explanation": (
-                "Deze as scheidt brede, constructieve coalitiemoties (CDA, VVD, NSC, CU, D66, GL-PvdA) "
-                "over uiteenlopende beleidsterreinen — van zorg en defensie tot multilateralisme en "
-                "digitale inclusie — van de harde PVV-motie die onmiddellijke uitzetting van Syriërs "
-                "en Oekraïense mannen eist. De positieve pool vertegenwoordigt het brede parlementaire "
-                "midden dat procedureel en inhoudelijk samenwerkt, terwijl de negatieve pool de "
-                "isolationistische en radicaal-nationalistische positie van de PVV symboliseert. "
-                "Deze dimensie is politiek betekenisvol omdat zij de kloof blootlegt tussen mainstream "
-                "coalitiepolitiek en populistisch-nationalistisch obstructionisme op het vlak van migratie."
+                "De dominante dimensie van het parlement: partijen aan de linkerkant (PvdD, GL-PvdA, "
+                "DENK, SP) stemmen progressief — voor sociale voorzieningen, klimaat, internationale "
+                "solidariteit — terwijl partijen aan de rechterkant (PVV, NSC, BBB, SGP) inzetten op "
+                "nationaal belang, migratiebeheer en conservatieve waarden. Dit is de klassieke "
+                "links-rechts tegenstelling die het meeste verschil in stemgedrag verklaart."
            ),
-            "positive_pole": "Breed coalitiebeleid: zorg, defensie, multilateralisme, inclusie",
-            "negative_pole": "Radicale PVV-eis tot onmiddellijke uitzetting migranten",
-            "flip": True,
+            "positive_pole": "Nationalistisch-conservatief: PVV, NSC, BBB, SGP",
+            "negative_pole": "Progressief-links: PvdD, GL-PvdA, DENK, SP",
+            "flip": False,
        },
        2: {
            "label": "Nationalistisch migratiebeleid versus progressief internationaal solidariteitsdenken",
--- a/pipeline/svd_pipeline.py
+++ b/pipeline/svd_pipeline.py
@ -46,6 +46,119 @@ VOTE_MAP = {
    "Blanco": 0.0,
 }

+# Mapping from short party names (as they appear in party-level vote rows)
+# to canonical party names in mp_metadata. A name that is not listed here is
+# looked up in mp_metadata unchanged; if that lookup yields no MP active on the
+# vote date, the party row is skipped (and logged at debug level) during
+# expansion.
+_PARTY_NAME_MAP = {
+    "NSC": "Nieuw Sociaal Contract",
+    # Key and value differ only in the "g" / "ğ" spelling of the name.
+    "Gündogan": "Gündoğan",
+    "Keijzer": "Lid Keijzer",
+    # Pre-merger: both GroenLinks and PvdA votes map to the merged faction
+    "GroenLinks": "GroenLinks-PvdA",
+    "PvdA": "GroenLinks-PvdA",
+    # Omtzigt initially sat alone before founding NSC
+    "Omtzigt": "Nieuw Sociaal Contract",
+}
+
+# Party names for which we have no usable mp_metadata (tiny noise, skip expansion).
+# Membership is tested against the raw name from the vote row, before
+# _PARTY_NAME_MAP is applied, so entries must use the vote-row spelling.
+_SKIP_PARTIES = {"Brinkman", "Bontes", "Krol", "Van Kooten-Arissen"}
+
+
+def _build_expanded_rows(
+    db_path: str, start_date: str, end_date: str
+) -> List[Tuple[int, str, str, str]]:
+    """Build vote rows with party-level votes expanded to individual MPs.
+
+    Rows of ``mp_votes`` whose ``mp_name`` contains a comma are treated as
+    individual MPs (e.g. "Bergkamp, V.A."); every other row is treated as a
+    party-level vote.
+
+    * A motion with at least one individual row keeps only its individual
+      rows; party-level rows for that motion are dropped.
+    * A motion with only party-level rows has each row expanded to every MP
+      that ``mp_metadata`` lists for the (canonical) party with a membership
+      interval covering the vote date. Parties in ``_SKIP_PARTIES`` and
+      parties without any active MP on that date contribute no rows.
+
+    Args:
+        db_path: Path of the DuckDB database; it is opened read-only.
+        start_date: Inclusive lower bound for ``mp_votes.date``.
+        end_date: Inclusive upper bound for ``mp_votes.date``.
+
+    Returns:
+        List of ``(motion_id, mp_name, vote, date_str)`` tuples, or an empty
+        list when the window contains no vote rows.
+    """
+    # Function-scope import (as before) so the helper does not depend on the
+    # module-level import block.
+    from collections import defaultdict
+
+    conn = duckdb.connect(db_path, read_only=True)
+    try:
+        # All vote rows that fall inside the window.
+        vote_rows = conn.execute(
+            "SELECT motion_id, mp_name, vote, date FROM mp_votes "
+            "WHERE date BETWEEN ? AND ?",
+            (start_date, end_date),
+        ).fetchall()
+
+        # Membership intervals: (name, party, van, tot_en_met).
+        meta_rows = conn.execute(
+            "SELECT mp_name, party, van, tot_en_met FROM mp_metadata"
+        ).fetchall()
+    finally:
+        conn.close()
+
+    if not vote_rows:
+        return []
+
+    # canonical party -> [(mp_name, van, tot_en_met), ...]
+    party_to_mps: Dict[str, List[Tuple]] = defaultdict(list)
+    for mp_name, party, van, tot_en_met in meta_rows:
+        if party and mp_name:
+            party_to_mps[party].append((mp_name, van, tot_en_met))
+
+    # Many motions share the same (party, date) pair; resolve each pair once.
+    active_cache: Dict[Tuple, List[str]] = {}
+
+    def get_active_mps(canonical_party: str, motion_date) -> List[str]:
+        """Return the MPs of canonical_party whose interval covers motion_date."""
+        key = (canonical_party, motion_date)
+        cached = active_cache.get(key)
+        if cached is None:
+            # NOTE(review): assumes van / tot_en_met are directly comparable
+            # with the vote date (same column type) — confirm against schema.
+            names = [
+                name
+                for name, van, tot_en_met in party_to_mps.get(canonical_party, ())
+                if van is not None
+                and van <= motion_date
+                and (tot_en_met is None or tot_en_met >= motion_date)
+            ]
+            # An MP with two overlapping metadata rows for the same party
+            # would otherwise be emitted twice; keep first-seen order.
+            cached = active_cache[key] = list(dict.fromkeys(names))
+        return cached
+
+    # Split the rows per motion into individual-MP rows and party rows.
+    motion_individual: Dict[int, List] = defaultdict(list)
+    motion_party: Dict[int, List] = defaultdict(list)
+    for motion_id, mp_name, vote, date in vote_rows:
+        # Individual MPs have a comma in their name (e.g. "Bergkamp, V.A.").
+        bucket = motion_individual if "," in str(mp_name) else motion_party
+        bucket[int(motion_id)].append((mp_name, vote, date))
+
+    expanded: List[Tuple[int, str, str, str]] = []
+    for mid in set(motion_individual) | set(motion_party):
+        individual = motion_individual.get(mid)
+        if individual:
+            # Motion already has individual MP rows: use them, ignore party rows.
+            expanded.extend(
+                (mid, mp_name, vote, str(date)) for mp_name, vote, date in individual
+            )
+            continue
+
+        # Party-only motion: expand every party row to its active MPs.
+        for party_name, vote, date in motion_party[mid]:
+            if party_name in _SKIP_PARTIES:
+                continue
+            canonical = _PARTY_NAME_MAP.get(party_name, party_name)
+            active_mps = get_active_mps(canonical, date)
+            if not active_mps:
+                _logger.debug(
+                    "No active MPs found for party %s (canonical: %s) on %s",
+                    party_name,
+                    canonical,
+                    date,
+                )
+                continue
+            date_str = str(date)
+            expanded.extend((mid, mp_name, vote, date_str) for mp_name in active_mps)
+
+    return expanded
+

 def _safe_k(mat: np.ndarray, k: int) -> int:
    """Return a safe k for svds: must be < min(mat.shape)."""
@ -162,21 +275,18 @@ def compute_svd_for_window(
    Opens the DB in read-only mode (allows concurrent parallel workers).
    Does NOT write to the DB — caller is responsible for persisting results.

+    Party-level vote rows are expanded to individual MP rows using mp_metadata
+    so that the vote matrix contains only individual MPs (no party aggregates).
+    This prevents the block-diagonal structure that causes SVD axes to be disjoint.
+
    Returns dict with keys:
        window_id, k_used, mp_rows, motion_rows
        where *_rows are List[Tuple[entity_type, entity_id, vector, model]]
    """
    empty = {"window_id": window_id, "k_used": 0, "mp_rows": [], "motion_rows": []}

-    # Read vote matrix using a read-only connection — safe to run in parallel.
-    conn = duckdb.connect(db_path, read_only=True)
-    try:
-        rows = conn.execute(
-            "SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?",
-            (start_date, end_date),
-        ).fetchall()
-    finally:
-        conn.close()
+    # Build expanded rows: party votes → individual MP votes
+    rows = _build_expanded_rows(db_path, start_date, end_date)

    if not rows:
        return empty
@ -191,7 +301,7 @@ def compute_svd_for_window(
    mp_index = {name: i for i, name in enumerate(mp_names)}
    motion_index = {mid: j for j, mid in enumerate(motion_ids)}

-    for motion_id, mp_name, vote in rows:
+    for motion_id, mp_name, vote, _date in rows:
        i = mp_index[mp_name]
        j = motion_index[int(motion_id)]
        val = VOTE_MAP.get(