fix(svd): expand party votes to individual MPs before SVD computation

The mp_votes table contains both party-aggregate rows (e.g. 'PVV', 'NSC')
and individual MP rows (e.g. 'Aardema, M.'). Running SVD on both together
creates a block-diagonal vote matrix where party codes and individual MPs
occupy disjoint SVD dimensions — causing dim 0 to be zero for all 421 MPs.

Fix: _build_expanded_rows() converts every party-level vote to individual
MP votes using mp_metadata date ranges (active MPs on motion date). Motions
that already have individual MP records are kept as-is. A party name mapping
handles NSC/Nieuw Sociaal Contract and other canonical name variants.

Results for current_parliament: 517 individual MPs, all 8732 motions covered,
dim 0 std=23.1 (was 0.0 for all MPs). PVV/NSC/BBB on positive end, SP/GL/PvdD
on negative end — matches expected left-right political axis.

All 11 annual windows (2016-2026) re-run with the new pipeline.
main
Sven Geboers 1 month ago
parent a7517bb6ae
commit 6cb89126a7
  1. 44
      explorer.py
  2. 130
      pipeline/svd_pipeline.py

@ -84,7 +84,6 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
"CDA",
"SP",
"ChristenUnie",
"CU", # alias for ChristenUnie
"SGP",
"Volt",
"DENK",
@ -94,6 +93,17 @@ CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset(
}
)
# Normalize variant party names to canonical display names in CURRENT_PARLIAMENT_PARTIES.
# Keys are variant spellings; values are the canonical names. Callers in this file
# look names up with `_PARTY_NORMALIZE.get(party, party)`, so a name that is not
# listed here is passed through unchanged.
_PARTY_NORMALIZE: dict[str, str] = {
"Nieuw Sociaal Contract": "NSC",
"CU": "ChristenUnie",
# Short alias and both separate party names all fold into the combined faction name.
"GL": "GroenLinks-PvdA",
"GroenLinks": "GroenLinks-PvdA",
"PvdA": "GroenLinks-PvdA",
# Single-member entries that are attributed to a larger faction.
"Gündoğan": "GroenLinks-PvdA", # briefly sat with GL-PvdA faction
"Lid Keijzer": "BBB", # Keijzer left CDA, joined BBB cabinet
}
# ---------------------------------------------------------------------------
# Cached loaders
@ -239,17 +249,20 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]:
).fetchall()
party_counts: Dict[str, Dict[str, int]] = {}
for mp_name, party, n in party_rows:
party_counts.setdefault(mp_name, {})[party] = n
# Normalize variant names to canonical party names
canonical = _PARTY_NORMALIZE.get(party, party)
party_counts.setdefault(mp_name, {})[canonical] = (
party_counts.setdefault(mp_name, {}).get(canonical, 0) + n
)
mp_party: Dict[str, str] = {
mp: max(counts, key=counts.__getitem__)
for mp, counts in party_counts.items()
}
# Individual MP vectors from current_parliament
# Individual MP vectors from current_parliament (all mp rows are now individual MPs)
rows = con.execute(
"SELECT entity_id, vector FROM svd_vectors "
"WHERE entity_type='mp' AND window_id='current_parliament' "
"AND entity_id LIKE '%,%'"
"WHERE entity_type='mp' AND window_id='current_parliament'"
).fetchall()
party_vecs: Dict[str, list] = {}
@ -994,20 +1007,17 @@ def build_svd_components_tab(db_path: str) -> None:
# Produced by per-axis analysis of all 10 unique top motions (zero cross-axis overlap).
SVD_THEMES: dict[int, dict[str, str]] = {
1: {
"label": "Constructief coalitiebeleid versus radicale PVV-migratiepolitiek",
"label": "Links-rechts hoofdas: progressief versus conservatief-nationalistisch",
"explanation": (
"Deze as scheidt brede, constructieve coalitiemoties (CDA, VVD, NSC, CU, D66, GL-PvdA) "
"over uiteenlopende beleidsterreinen — van zorg en defensie tot multilateralisme en "
"digitale inclusie — van de harde PVV-motie die onmiddellijke uitzetting van Syriërs "
"en Oekraïense mannen eist. De positieve pool vertegenwoordigt het brede parlementaire "
"midden dat procedureel en inhoudelijk samenwerkt, terwijl de negatieve pool de "
"isolationistische en radicaal-nationalistische positie van de PVV symboliseert. "
"Deze dimensie is politiek betekenisvol omdat zij de kloof blootlegt tussen mainstream "
"coalitiepolitiek en populistisch-nationalistisch obstructionisme op het vlak van migratie."
"De dominante dimensie van het parlement: partijen aan de linkerkant (PvdD, GL-PvdA, "
"DENK, SP) stemmen progressief — voor sociale voorzieningen, klimaat, internationale "
"solidariteit — terwijl partijen aan de rechterkant (PVV, NSC, BBB, SGP) inzetten op "
"nationaal belang, migratiebeheer en conservatieve waarden. Dit is de klassieke "
"links-rechts tegenstelling die het meeste verschil in stemgedrag verklaart."
),
"positive_pole": "Breed coalitiebeleid: zorg, defensie, multilateralisme, inclusie",
"negative_pole": "Radicale PVV-eis tot onmiddellijke uitzetting migranten",
"flip": True,
"positive_pole": "Nationalistisch-conservatief: PVV, NSC, BBB, SGP",
"negative_pole": "Progressief-links: PvdD, GL-PvdA, DENK, SP",
"flip": False,
},
2: {
"label": "Nationalistisch migratiebeleid versus progressief internationaal solidariteitsdenken",

@ -46,6 +46,119 @@ VOTE_MAP = {
"Blanco": 0.0,
}
# Mapping from short party names (as they appear in party-level vote rows)
# to canonical party names in mp_metadata. Parties not listed here are either
# already matching or are skipped (no valid mp_metadata coverage).
# _build_expanded_rows looks names up with `.get(party_name, party_name)`, so an
# unlisted name is used as-is when searching mp_metadata.
_PARTY_NAME_MAP = {
"NSC": "Nieuw Sociaal Contract",
# Spelling fix only: plain "g" in the vote rows -> "ğ" in the mapped name.
"Gündogan": "Gündoğan",
"Keijzer": "Lid Keijzer",
# Pre-merger: both GroenLinks and PvdA votes map to the merged faction
"GroenLinks": "GroenLinks-PvdA",
"PvdA": "GroenLinks-PvdA",
# Omtzigt initially sat alone before founding NSC
"Omtzigt": "Nieuw Sociaal Contract",
}
# Party names for which we have no usable mp_metadata (tiny noise, skip expansion).
# Party-level vote rows carrying one of these names are dropped before expansion.
_SKIP_PARTIES = {"Brinkman", "Bontes", "Krol", "Van Kooten-Arissen"}
def _build_expanded_rows(
    db_path: str, start_date: str, end_date: str
) -> List[Tuple[int, str, str, str]]:
    """Load a window's votes and expand party-level votes to individual MPs.

    Reads every ``mp_votes`` row whose ``date`` lies between ``start_date`` and
    ``end_date`` (SQL ``BETWEEN``, so both bounds are inclusive) together with
    the full ``mp_metadata`` table over a read-only connection, then hands both
    result sets to :func:`_expand_party_votes` for the actual expansion.

    Args:
        db_path: Path of the DuckDB database to open (read-only).
        start_date: Lower bound of the vote window.
        end_date: Upper bound of the vote window.

    Returns:
        List of ``(motion_id, mp_name, vote, date_str)`` tuples. Empty when the
        window contains no vote rows.
    """
    conn = duckdb.connect(db_path, read_only=True)
    try:
        vote_rows = conn.execute(
            "SELECT motion_id, mp_name, vote, date FROM mp_votes "
            "WHERE date BETWEEN ? AND ?",
            (start_date, end_date),
        ).fetchall()
        meta_rows = conn.execute(
            "SELECT mp_name, party, van, tot_en_met FROM mp_metadata"
        ).fetchall()
    finally:
        # Always release the connection, even if one of the queries fails.
        conn.close()

    if not vote_rows:
        return []
    return _expand_party_votes(vote_rows, meta_rows)


def _expand_party_votes(
    vote_rows: List[Tuple], meta_rows: List[Tuple]
) -> List[Tuple[int, str, str, str]]:
    """Expand party-level vote rows to one row per active MP (no DB access).

    Rules applied per motion:

    * A vote row whose ``mp_name`` contains a comma (e.g. ``"Bergkamp, V.A."``)
      is an individual MP row; any other row is a party-level row.
    * If a motion has at least one individual row, only its individual rows are
      emitted and its party-level rows are ignored.
    * Otherwise each party-level row is copied once for every MP of that party
      who was active on the row's date. The party name is first mapped through
      ``_PARTY_NAME_MAP``; names in ``_SKIP_PARTIES`` are dropped, and parties
      with no active MP on that date are skipped with a debug log entry.

    An MP counts as active when ``van <= date`` and ``tot_en_met`` is either
    ``None`` or ``>= date``. Metadata rows without a ``van`` value never match.

    NOTE(review): assumes ``date``, ``van`` and ``tot_en_met`` come back from
    the database as mutually comparable, hashable values (e.g. all
    ``datetime.date``) - TODO confirm against the schema.

    Args:
        vote_rows: ``(motion_id, mp_name, vote, date)`` rows from ``mp_votes``.
        meta_rows: ``(mp_name, party, van, tot_en_met)`` rows from
            ``mp_metadata``.

    Returns:
        List of ``(motion_id, mp_name, vote, date_str)`` tuples, where
        ``motion_id`` is converted with ``int()`` and the date with ``str()``.
    """
    from collections import defaultdict

    # canonical party -> [(mp_name, van, tot_en_met), ...]
    party_to_mps: Dict[str, List[Tuple]] = defaultdict(list)
    for mp_name, party, van, tot_en_met in meta_rows:
        if party and mp_name:
            party_to_mps[party].append((mp_name, van, tot_en_met))

    # Many motions share a voting date, so remember the answer per (party, date)
    # instead of rescanning the party's member list for every party vote row.
    active_cache: Dict[Tuple, List[str]] = {}

    def get_active_mps(canonical_party: str, motion_date) -> List[str]:
        """Return MP names active in canonical_party on motion_date."""
        key = (canonical_party, motion_date)
        members = active_cache.get(key)
        if members is None:
            members = [
                name
                for name, van, tot_en_met in party_to_mps.get(canonical_party, [])
                if van is not None
                and van <= motion_date
                and (tot_en_met is None or tot_en_met >= motion_date)
            ]
            active_cache[key] = members
        return members

    # Split the rows per motion into individual-MP rows and party-level rows.
    individual_rows: Dict[int, List[Tuple]] = defaultdict(list)
    party_rows: Dict[int, List[Tuple]] = defaultdict(list)
    for motion_id, mp_name, vote, date in vote_rows:
        # Individual MPs have a comma in their name (e.g. "Bergkamp, V.A.").
        bucket = individual_rows if "," in str(mp_name) else party_rows
        bucket[int(motion_id)].append((mp_name, vote, date))

    expanded: List[Tuple[int, str, str, str]] = []
    for mid in set(individual_rows) | set(party_rows):
        recorded = individual_rows.get(mid)
        if recorded:
            # Motion already has individual MP rows: use them, ignore party rows.
            expanded.extend(
                (mid, mp_name, vote, str(date)) for mp_name, vote, date in recorded
            )
            continue

        # Party-only motion: fan each party row out to its active MPs.
        for party_name, vote, date in party_rows[mid]:
            if party_name in _SKIP_PARTIES:
                continue
            canonical = _PARTY_NAME_MAP.get(party_name, party_name)
            active_mps = get_active_mps(canonical, date)
            if not active_mps:
                _logger.debug(
                    "No active MPs found for party %s (canonical: %s) on %s",
                    party_name,
                    canonical,
                    date,
                )
                continue
            date_str = str(date)
            expanded.extend((mid, mp_name, vote, date_str) for mp_name in active_mps)

    return expanded
def _safe_k(mat: np.ndarray, k: int) -> int:
"""Return a safe k for svds: must be < min(mat.shape)."""
@ -162,21 +275,18 @@ def compute_svd_for_window(
Opens the DB in read-only mode (allows concurrent parallel workers).
Does NOT write to the DB — the caller is responsible for persisting results.
Party-level vote rows are expanded to individual MP rows using mp_metadata
so that the vote matrix contains only individual MPs (no party aggregates).
This prevents the block-diagonal structure that causes SVD axes to be disjoint.
Returns dict with keys:
window_id, k_used, mp_rows, motion_rows
where *_rows are List[Tuple[entity_type, entity_id, vector, model]]
"""
empty = {"window_id": window_id, "k_used": 0, "mp_rows": [], "motion_rows": []}
# Read vote matrix using a read-only connection — safe to run in parallel.
conn = duckdb.connect(db_path, read_only=True)
try:
rows = conn.execute(
"SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?",
(start_date, end_date),
).fetchall()
finally:
conn.close()
# Build expanded rows: party votes → individual MP votes
rows = _build_expanded_rows(db_path, start_date, end_date)
if not rows:
return empty
@ -191,7 +301,7 @@ def compute_svd_for_window(
mp_index = {name: i for i, name in enumerate(mp_names)}
motion_index = {mid: j for j, mid in enumerate(motion_ids)}
for motion_id, mp_name, vote in rows:
for motion_id, mp_name, vote, _date in rows:
i = mp_index[mp_name]
j = motion_index[int(motion_id)]
val = VOTE_MAP.get(

Loading…
Cancel
Save