fix(svd): expand party votes to individual MPs before SVD computation

The mp_votes table contains both party-aggregate rows (e.g. 'PVV', 'NSC') and individual MP rows (e.g. 'Aardema, M.'). Running SVD on both together creates a block-diagonal vote matrix where party codes and individual MPs occupy disjoint SVD dimensions — causing dim 0 to be zero for all 421 MPs. Fix: _build_expanded_rows() converts every party-level vote to individual MP votes using mp_metadata date ranges (active MPs on motion date). Motions that already have individual MP records are kept as-is. A party name mapping handles NSC/Nieuw Sociaal Contract and other canonical name variants. Results for current_parliament: 517 individual MPs, all 8732 motions covered, dim 0 std=23.1 (was 0.0 for all MPs). PVV/NSC/BBB on positive end, SP/GL/PvdD on negative end — matches expected left-right political axis. All 11 annual windows (2016-2026) re-run with the new pipeline.
3 months ago · 6cb89126a7
parent a7517bb6ae
commit 6cb89126a7
2 changed files with 147 additions and 27 deletions
--- a/explorer.py
+++ b/explorer.py
@ -84,7 +84,6 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
        "CDA",
        "SP",
        "ChristenUnie",
        "CU",  # alias for ChristenUnie
        "SGP",
        "Volt",
        "DENK",
@ -94,6 +93,17 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
    }
 )
 # Maps variant party names to the canonical display names used in
 # CURRENT_PARLIAMENT_PARTIES. Lookups go through
 # `_PARTY_NORMALIZE.get(name, name)`, so a name that is not listed here is
 # kept unchanged. Several variants may share one canonical name, in which case
 # their per-MP vote counts are summed under that canonical name.
 _PARTY_NORMALIZE: dict[str, str] = {
    "Nieuw Sociaal Contract": "NSC",
    "CU": "ChristenUnie",
    "GL": "GroenLinks-PvdA",
    "GroenLinks": "GroenLinks-PvdA",
    "PvdA": "GroenLinks-PvdA",
    "Gündoğan": "GroenLinks-PvdA",  # briefly sat with GL-PvdA faction
    "Lid Keijzer": "BBB",  # Keijzer left CDA, joined BBB cabinet
 }
 # ---------------------------------------------------------------------------
 # Cached loaders
@ -239,17 +249,20 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]:
        ).fetchall()
        party_counts: Dict[str, Dict[str, int]] = {}
        for mp_name, party, n in party_rows:
-            party_counts.setdefault(mp_name, {})[party] = n
+            # Normalize variant names to canonical party names
            canonical = _PARTY_NORMALIZE.get(party, party)
            party_counts.setdefault(mp_name, {})[canonical] = (
                party_counts.setdefault(mp_name, {}).get(canonical, 0) + n
            )
        mp_party: Dict[str, str] = {
            mp: max(counts, key=counts.__getitem__)
            for mp, counts in party_counts.items()
        }
-        # Individual MP vectors from current_parliament
+        # Individual MP vectors from current_parliament (all mp rows are now individual MPs)
        rows = con.execute(
            "SELECT entity_id, vector FROM svd_vectors "
            "WHERE entity_type='mp' AND window_id='current_parliament'"
            "AND entity_id LIKE '%,%'"
        ).fetchall()
        party_vecs: Dict[str, list] = {}
@ -994,20 +1007,17 @@ def build_svd_components_tab(db_path: str) -> None:
    # Produced by per-axis analysis of all 10 unique top motions (zero cross-axis overlap).
    SVD_THEMES: dict[int, dict[str, str]] = {
        1: {
-            "label": "Constructief coalitiebeleid versus radicale PVV-migratiepolitiek",
+            "label": "Links-rechts hoofdas: progressief versus conservatief-nationalistisch",
            "explanation": (
-                "Deze as scheidt brede, constructieve coalitiemoties (CDA, VVD, NSC, CU, D66, GL-PvdA) "
+                "De dominante dimensie van het parlement: partijen aan de linkerkant (PvdD, GL-PvdA, "
-                "over uiteenlopende beleidsterreinen — van zorg en defensie tot multilateralisme en "
+                "DENK, SP) stemmen progressief — voor sociale voorzieningen, klimaat, internationale "
-                "digitale inclusie — van de harde PVV-motie die onmiddellijke uitzetting van Syriërs "
+                "solidariteit — terwijl partijen aan de rechterkant (PVV, NSC, BBB, SGP) inzetten op "
-                "en Oekraïense mannen eist. De positieve pool vertegenwoordigt het brede parlementaire "
+                "nationaal belang, migratiebeheer en conservatieve waarden. Dit is de klassieke "
-                "midden dat procedureel en inhoudelijk samenwerkt, terwijl de negatieve pool de "
+                "links-rechts tegenstelling die het meeste verschil in stemgedrag verklaart."
                "isolationistische en radicaal-nationalistische positie van de PVV symboliseert. "
                "Deze dimensie is politiek betekenisvol omdat zij de kloof blootlegt tussen mainstream "
                "coalitiepolitiek en populistisch-nationalistisch obstructionisme op het vlak van migratie."
            ),
-            "positive_pole": "Breed coalitiebeleid: zorg, defensie, multilateralisme, inclusie",
+            "positive_pole": "Nationalistisch-conservatief: PVV, NSC, BBB, SGP",
-            "negative_pole": "Radicale PVV-eis tot onmiddellijke uitzetting migranten",
+            "negative_pole": "Progressief-links: PvdD, GL-PvdA, DENK, SP",
-            "flip": True,
+            "flip": False,
        },
        2: {
            "label": "Nationalistisch migratiebeleid versus progressief internationaal solidariteitsdenken",
--- a/pipeline/svd_pipeline.py
+++ b/pipeline/svd_pipeline.py
@ -46,6 +46,119 @@ VOTE_MAP = {
    "Blanco": 0.0,
 }
 # Mapping from short party names (as they appear in party-level vote rows)
 # to the party names used in mp_metadata. Looked up with
 # `_PARTY_NAME_MAP.get(name, name)`, so a party that is not listed here is
 # matched against mp_metadata under its own name; if mp_metadata has no active
 # MPs for it, that vote row is dropped during expansion.
 # Note that several source names can map to the same mp_metadata party.
 _PARTY_NAME_MAP = {
    "NSC": "Nieuw Sociaal Contract",
    "Gündogan": "Gündoğan",
    "Keijzer": "Lid Keijzer",
    # Pre-merger: both GroenLinks and PvdA votes map to the merged faction
    "GroenLinks": "GroenLinks-PvdA",
    "PvdA": "GroenLinks-PvdA",
    # Omtzigt initially sat alone before founding NSC
    "Omtzigt": "Nieuw Sociaal Contract",
 }
 # Party names for which we have no usable mp_metadata (tiny noise, skip expansion).
 # These are checked against the raw vote-row name, before _PARTY_NAME_MAP is applied.
 _SKIP_PARTIES = {"Brinkman", "Bontes", "Krol", "Van Kooten-Arissen"}
 def _build_expanded_rows(
    db_path: str, start_date: str, end_date: str
 ) -> List[Tuple[int, str, str, str]]:
    """Build vote rows, expanding party-level votes to individual MPs.

    Vote rows whose ``mp_name`` contains a comma (e.g. "Bergkamp, V.A.") are
    treated as individual MP votes; all other rows are treated as party-level
    votes.

    * If a motion has at least one individual MP row, only those rows are
      returned for that motion and its party-level rows are ignored.
    * Otherwise each party-level row is expanded to one row per MP that
      mp_metadata lists for that party with ``van <= date`` and
      ``tot_en_met`` either NULL or ``>= date``. Parties in ``_SKIP_PARTIES``
      and parties with no active MPs on that date contribute no rows.

    Args:
        db_path: Path of the DuckDB database; opened read-only.
        start_date: Lower bound (inclusive) for ``mp_votes.date``.
        end_date: Upper bound (inclusive) for ``mp_votes.date``.

    Returns:
        List of ``(motion_id, mp_name, vote, date_str)`` tuples, ordered by
        ascending motion id. ``date_str`` is ``str()`` of the stored date.
        Empty list when the window contains no vote rows.
    """
    # Function-scope import, as in the original block (the module's top-level
    # imports are not visible from here).
    from collections import defaultdict

    conn = duckdb.connect(db_path, read_only=True)
    try:
        # All vote rows inside the window.
        vote_rows = conn.execute(
            "SELECT motion_id, mp_name, vote, date FROM mp_votes "
            "WHERE date BETWEEN ? AND ?",
            (start_date, end_date),
        ).fetchall()
        # Party membership periods: (mp_name, party, van, tot_en_met).
        meta_rows = conn.execute(
            "SELECT mp_name, party, van, tot_en_met FROM mp_metadata"
        ).fetchall()
    finally:
        conn.close()

    if not vote_rows:
        return []

    # party -> list of (mp_name, van, tot_en_met)
    party_to_mps: Dict[str, List[Tuple]] = defaultdict(list)
    for mp_name, party, van, tot_en_met in meta_rows:
        if party and mp_name:
            party_to_mps[party].append((mp_name, van, tot_en_met))

    # Many motions share the same (party, date) pair, so resolve each only once.
    active_cache: Dict[Tuple, List[str]] = {}

    def get_active_mps(canonical_party: str, motion_date) -> List[str]:
        """Return the distinct MP names active in canonical_party on motion_date."""
        key = (canonical_party, motion_date)
        cached = active_cache.get(key)
        if cached is not None:
            return cached
        # A dict keeps first-seen order while dropping repeats: an MP with
        # several matching membership rows must yield one vote row, not many.
        names: Dict[str, None] = {}
        # NOTE(review): assumes van / tot_en_met are directly comparable with
        # mp_votes.date (same date type) - confirm against the schema.
        for mp_name, van, tot_en_met in party_to_mps.get(canonical_party, []):
            if van is None or van > motion_date:
                continue
            if tot_en_met is not None and tot_en_met < motion_date:
                continue
            names[mp_name] = None
        result = list(names)
        active_cache[key] = result
        return result

    # Split the rows per motion into individual-MP rows and party-level rows.
    motion_individual: Dict[int, List] = defaultdict(list)
    motion_party: Dict[int, List] = defaultdict(list)
    for motion_id, mp_name, vote, date in vote_rows:
        # Individual MPs have a comma in their name (e.g. "Bergkamp, V.A.")
        is_individual = "," in str(mp_name)
        target = motion_individual if is_individual else motion_party
        target[int(motion_id)].append((mp_name, vote, date))

    expanded: List[Tuple[int, str, str, str]] = []
    # Sorted so the output order is reproducible between runs.
    for mid in sorted(set(motion_individual) | set(motion_party)):
        individual_rows = motion_individual.get(mid)
        if individual_rows:
            # Motion already has individual MP rows: use them, skip party rows.
            expanded.extend(
                (mid, mp_name, vote, str(date))
                for mp_name, vote, date in individual_rows
            )
            continue

        # Party-only motion: expand each party row to its active MPs.
        # NOTE(review): _PARTY_NAME_MAP maps several source names onto one
        # party (e.g. "GroenLinks" and "PvdA"), so one motion can still yield
        # two rows for the same MP, possibly with different votes. How the
        # caller resolves that is not visible from this function.
        for party_name, vote, date in motion_party.get(mid, []):
            if party_name in _SKIP_PARTIES:
                continue
            canonical = _PARTY_NAME_MAP.get(party_name, party_name)
            active_mps = get_active_mps(canonical, date)
            if not active_mps:
                _logger.debug(
                    "No active MPs found for party %s (canonical: %s) on %s",
                    party_name,
                    canonical,
                    date,
                )
                continue
            date_str = str(date)
            expanded.extend((mid, mp_name, vote, date_str) for mp_name in active_mps)
    return expanded
 def _safe_k(mat: np.ndarray, k: int) -> int:
    """Return a safe k for svds: must be < min(mat.shape)."""
@ -162,21 +275,18 @@ def compute_svd_for_window(
    Opens the DB in read-only mode (allows concurrent parallel workers).
    Does NOT write to the DB — caller is responsible for persisting results.
    Party-level vote rows are expanded to individual MP rows using mp_metadata
    so that the vote matrix contains only individual MPs (no party aggregates).
    This prevents the block-diagonal structure that causes SVD axes to be disjoint.
    Returns dict with keys:
        window_id, k_used, mp_rows, motion_rows
        where *_rows are List[Tuple[entity_type, entity_id, vector, model]]
    """
    empty = {"window_id": window_id, "k_used": 0, "mp_rows": [], "motion_rows": []}
-    # Read vote matrix using a read-only connection — safe to run in parallel.
+    # Build expanded rows: party votes → individual MP votes
-    conn = duckdb.connect(db_path, read_only=True)
+    rows = _build_expanded_rows(db_path, start_date, end_date)
    try:
        rows = conn.execute(
            "SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?",
            (start_date, end_date),
        ).fetchall()
    finally:
        conn.close()
    if not rows:
        return empty
@ -191,7 +301,7 @@ def compute_svd_for_window(
    mp_index = {name: i for i, name in enumerate(mp_names)}
    motion_index = {mid: j for j, mid in enumerate(motion_ids)}
-    for motion_id, mp_name, vote in rows:
+    for motion_id, mp_name, vote, _date in rows:
        i = mp_index[mp_name]
        j = motion_index[int(motion_id)]
        val = VOTE_MAP.get(