fix(svd): expand party votes to individual MPs before SVD computation

The mp_votes table contains both party-aggregate rows (e.g. 'PVV', 'NSC')
and individual MP rows (e.g. 'Aardema, M.'). Running SVD on both together
creates a block-diagonal vote matrix where party codes and individual MPs
occupy disjoint SVD dimensions — causing dim 0 to be zero for all 421 MPs.

Fix: _build_expanded_rows() converts every party-level vote to individual
MP votes using mp_metadata date ranges (active MPs on motion date). Motions
that already have individual MP records are kept as-is. A party name mapping
handles NSC/Nieuw Sociaal Contract and other canonical name variants.

Results for current_parliament: 517 individual MPs, all 8732 motions covered,
dim 0 std=23.1 (was 0.0 for all MPs). PVV/NSC/BBB on positive end, SP/GL/PvdD
on negative end — matches expected left-right political axis.

All 11 annual windows (2016-2026) re-run with the new pipeline.
main
Sven Geboers 1 month ago
parent a7517bb6ae
commit 6cb89126a7
  1. 42
      explorer.py
  2. 130
      pipeline/svd_pipeline.py

@ -84,7 +84,6 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
"CDA", "CDA",
"SP", "SP",
"ChristenUnie", "ChristenUnie",
"CU", # alias for ChristenUnie
"SGP", "SGP",
"Volt", "Volt",
"DENK", "DENK",
@ -94,6 +93,17 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
} }
) )
# Normalize variant party names to canonical display names in CURRENT_PARLIAMENT_PARTIES
_PARTY_NORMALIZE: dict[str, str] = {
"Nieuw Sociaal Contract": "NSC",
"CU": "ChristenUnie",
"GL": "GroenLinks-PvdA",
"GroenLinks": "GroenLinks-PvdA",
"PvdA": "GroenLinks-PvdA",
"Gündoğan": "GroenLinks-PvdA", # briefly sat with GL-PvdA faction
"Lid Keijzer": "BBB", # Keijzer left CDA, joined BBB cabinet
}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Cached loaders # Cached loaders
@ -239,17 +249,20 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]:
).fetchall() ).fetchall()
party_counts: Dict[str, Dict[str, int]] = {} party_counts: Dict[str, Dict[str, int]] = {}
for mp_name, party, n in party_rows: for mp_name, party, n in party_rows:
party_counts.setdefault(mp_name, {})[party] = n # Normalize variant names to canonical party names
canonical = _PARTY_NORMALIZE.get(party, party)
party_counts.setdefault(mp_name, {})[canonical] = (
party_counts.setdefault(mp_name, {}).get(canonical, 0) + n
)
mp_party: Dict[str, str] = { mp_party: Dict[str, str] = {
mp: max(counts, key=counts.__getitem__) mp: max(counts, key=counts.__getitem__)
for mp, counts in party_counts.items() for mp, counts in party_counts.items()
} }
# Individual MP vectors from current_parliament # Individual MP vectors from current_parliament (all mp rows are now individual MPs)
rows = con.execute( rows = con.execute(
"SELECT entity_id, vector FROM svd_vectors " "SELECT entity_id, vector FROM svd_vectors "
"WHERE entity_type='mp' AND window_id='current_parliament'" "WHERE entity_type='mp' AND window_id='current_parliament'"
"AND entity_id LIKE '%,%'"
).fetchall() ).fetchall()
party_vecs: Dict[str, list] = {} party_vecs: Dict[str, list] = {}
@ -994,20 +1007,17 @@ def build_svd_components_tab(db_path: str) -> None:
# Produced by per-axis analysis of all 10 unique top motions (zero cross-axis overlap). # Produced by per-axis analysis of all 10 unique top motions (zero cross-axis overlap).
SVD_THEMES: dict[int, dict[str, str]] = { SVD_THEMES: dict[int, dict[str, str]] = {
1: { 1: {
"label": "Constructief coalitiebeleid versus radicale PVV-migratiepolitiek", "label": "Links-rechts hoofdas: progressief versus conservatief-nationalistisch",
"explanation": ( "explanation": (
"Deze as scheidt brede, constructieve coalitiemoties (CDA, VVD, NSC, CU, D66, GL-PvdA) " "De dominante dimensie van het parlement: partijen aan de linkerkant (PvdD, GL-PvdA, "
"over uiteenlopende beleidsterreinen — van zorg en defensie tot multilateralisme en " "DENK, SP) stemmen progressief — voor sociale voorzieningen, klimaat, internationale "
"digitale inclusie — van de harde PVV-motie die onmiddellijke uitzetting van Syriërs " "solidariteit — terwijl partijen aan de rechterkant (PVV, NSC, BBB, SGP) inzetten op "
"en Oekraïense mannen eist. De positieve pool vertegenwoordigt het brede parlementaire " "nationaal belang, migratiebeheer en conservatieve waarden. Dit is de klassieke "
"midden dat procedureel en inhoudelijk samenwerkt, terwijl de negatieve pool de " "links-rechts tegenstelling die het meeste verschil in stemgedrag verklaart."
"isolationistische en radicaal-nationalistische positie van de PVV symboliseert. "
"Deze dimensie is politiek betekenisvol omdat zij de kloof blootlegt tussen mainstream "
"coalitiepolitiek en populistisch-nationalistisch obstructionisme op het vlak van migratie."
), ),
"positive_pole": "Breed coalitiebeleid: zorg, defensie, multilateralisme, inclusie", "positive_pole": "Nationalistisch-conservatief: PVV, NSC, BBB, SGP",
"negative_pole": "Radicale PVV-eis tot onmiddellijke uitzetting migranten", "negative_pole": "Progressief-links: PvdD, GL-PvdA, DENK, SP",
"flip": True, "flip": False,
}, },
2: { 2: {
"label": "Nationalistisch migratiebeleid versus progressief internationaal solidariteitsdenken", "label": "Nationalistisch migratiebeleid versus progressief internationaal solidariteitsdenken",

@ -46,6 +46,119 @@ VOTE_MAP = {
"Blanco": 0.0, "Blanco": 0.0,
} }
# Mapping from short party names (as they appear in party-level vote rows)
# to canonical party names in mp_metadata. Parties not listed here are either
# already matching or are skipped (no valid mp_metadata coverage).
_PARTY_NAME_MAP = {
"NSC": "Nieuw Sociaal Contract",
"Gündogan": "Gündoğan",
"Keijzer": "Lid Keijzer",
# Pre-merger: both GroenLinks and PvdA votes map to the merged faction
"GroenLinks": "GroenLinks-PvdA",
"PvdA": "GroenLinks-PvdA",
# Omtzigt initially sat alone before founding NSC
"Omtzigt": "Nieuw Sociaal Contract",
}
# Party names for which we have no usable mp_metadata (tiny noise, skip expansion)
_SKIP_PARTIES = {"Brinkman", "Bontes", "Krol", "Van Kooten-Arissen"}
def _build_expanded_rows(
    db_path: str, start_date: str, end_date: str
) -> List[Tuple[int, str, str, str]]:
    """Build vote rows in which party-level votes are expanded to individual MPs.

    The mp_votes rows whose date lies in [start_date, end_date] are grouped
    per motion. A row counts as an individual-MP row when its mp_name contains
    a comma (e.g. "Bergkamp, V.A."); every other row is treated as a
    party-level row.

    * If a motion has at least one individual-MP row, only those rows are
      returned for it; the motion's party-level rows are ignored.
    * Otherwise each party-level row is replaced by one row per MP of that
      party who, according to mp_metadata, was seated on the row's date
      (van <= date <= tot_en_met, with a NULL tot_en_met meaning open-ended).
      Party labels are first translated through _PARTY_NAME_MAP; labels in
      _SKIP_PARTIES, and labels for which no seated MP is found, yield no rows.

    Args:
        db_path: Path of the DuckDB database; opened read-only.
        start_date: Inclusive lower bound for mp_votes.date.
        end_date: Inclusive upper bound for mp_votes.date.

    Returns:
        A list of (motion_id, mp_name, vote, date_str) tuples, ordered by
        ascending motion_id, where date_str is str() of the stored date.
        Empty when the window contains no votes.
    """
    # Imported locally because the surrounding module is not known to import
    # it at file level.
    from collections import defaultdict

    conn = duckdb.connect(db_path, read_only=True)
    try:
        vote_rows = conn.execute(
            "SELECT motion_id, mp_name, vote, date FROM mp_votes "
            "WHERE date BETWEEN ? AND ?",
            (start_date, end_date),
        ).fetchall()
        meta_rows = conn.execute(
            "SELECT mp_name, party, van, tot_en_met FROM mp_metadata"
        ).fetchall()
    finally:
        conn.close()

    if not vote_rows:
        return []

    # party name (as stored in mp_metadata) -> [(mp_name, van, tot_en_met), ...]
    party_to_mps: Dict[str, List[Tuple]] = defaultdict(list)
    for mp_name, party, van, tot_en_met in meta_rows:
        if party and mp_name:
            party_to_mps[party].append((mp_name, van, tot_en_met))

    def _active_mps(canonical_party: str, motion_date) -> List[str]:
        """Return the distinct MPs seated for canonical_party on motion_date."""
        # NOTE(review): assumes van / tot_en_met are comparable with the date
        # values stored in mp_votes.date — confirm the column types match.
        seated = [
            name
            for name, van, tot_en_met in party_to_mps.get(canonical_party, ())
            if van is not None
            and van <= motion_date
            and (tot_en_met is None or tot_en_met >= motion_date)
        ]
        # An MP with several matching metadata rows (e.g. one term ending and
        # the next starting on the same day) must still cast a single vote.
        return list(dict.fromkeys(seated))

    # Split the rows per motion into individual-MP rows and party-level rows.
    individual_by_motion: Dict[int, List[Tuple]] = defaultdict(list)
    party_by_motion: Dict[int, List[Tuple]] = defaultdict(list)
    for motion_id, mp_name, vote, date in vote_rows:
        bucket = individual_by_motion if "," in str(mp_name) else party_by_motion
        bucket[int(motion_id)].append((mp_name, vote, date))

    expanded: List[Tuple[int, str, str, str]] = []
    # Sorted so the output order does not depend on set iteration order.
    for mid in sorted(individual_by_motion.keys() | party_by_motion.keys()):
        individual_rows = individual_by_motion.get(mid)
        if individual_rows:
            # Individual records exist: they are authoritative, party rows
            # for this motion are deliberately not used.
            expanded.extend(
                (mid, mp_name, vote, str(date))
                for mp_name, vote, date in individual_rows
            )
            continue

        # NOTE(review): several party labels can map to the same canonical
        # party (e.g. "GroenLinks" and "PvdA"), so one motion may produce more
        # than one row for the same MP; the consumer decides which one wins —
        # confirm that is intended.
        for party_name, vote, date in party_by_motion.get(mid, ()):
            if party_name in _SKIP_PARTIES:
                continue
            canonical = _PARTY_NAME_MAP.get(party_name, party_name)
            active_mps = _active_mps(canonical, date)
            if not active_mps:
                _logger.debug(
                    "No active MPs found for party %s (canonical: %s) on %s",
                    party_name,
                    canonical,
                    date,
                )
                continue
            expanded.extend(
                (mid, mp_name, vote, str(date)) for mp_name in active_mps
            )

    return expanded
def _safe_k(mat: np.ndarray, k: int) -> int: def _safe_k(mat: np.ndarray, k: int) -> int:
"""Return a safe k for svds: must be < min(mat.shape).""" """Return a safe k for svds: must be < min(mat.shape)."""
@ -162,21 +275,18 @@ def compute_svd_for_window(
Opens the DB in read-only mode (allows concurrent parallel workers). Opens the DB in read-only mode (allows concurrent parallel workers).
Does NOT write to the DB caller is responsible for persisting results. Does NOT write to the DB caller is responsible for persisting results.
Party-level vote rows are expanded to individual MP rows using mp_metadata
so that the vote matrix contains only individual MPs (no party aggregates).
This prevents the block-diagonal structure that causes SVD axes to be disjoint.
Returns dict with keys: Returns dict with keys:
window_id, k_used, mp_rows, motion_rows window_id, k_used, mp_rows, motion_rows
where *_rows are List[Tuple[entity_type, entity_id, vector, model]] where *_rows are List[Tuple[entity_type, entity_id, vector, model]]
""" """
empty = {"window_id": window_id, "k_used": 0, "mp_rows": [], "motion_rows": []} empty = {"window_id": window_id, "k_used": 0, "mp_rows": [], "motion_rows": []}
# Read vote matrix using a read-only connection — safe to run in parallel. # Build expanded rows: party votes → individual MP votes
conn = duckdb.connect(db_path, read_only=True) rows = _build_expanded_rows(db_path, start_date, end_date)
try:
rows = conn.execute(
"SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?",
(start_date, end_date),
).fetchall()
finally:
conn.close()
if not rows: if not rows:
return empty return empty
@ -191,7 +301,7 @@ def compute_svd_for_window(
mp_index = {name: i for i, name in enumerate(mp_names)} mp_index = {name: i for i, name in enumerate(mp_names)}
motion_index = {mid: j for j, mid in enumerate(motion_ids)} motion_index = {mid: j for j, mid in enumerate(motion_ids)}
for motion_id, mp_name, vote in rows: for motion_id, mp_name, vote, _date in rows:
i = mp_index[mp_name] i = mp_index[mp_name]
j = motion_index[int(motion_id)] j = motion_index[int(motion_id)]
val = VOTE_MAP.get( val = VOTE_MAP.get(

Loading…
Cancel
Save