@ -46,6 +46,119 @@ VOTE_MAP = {
" Blanco " : 0.0 ,
}
# Mapping from short party names (as they appear in party-level vote rows)
# to the canonical party names used in mp_metadata.
# _build_expanded_rows looks names up with .get(name, name), so a name that is
# not listed here is passed through unchanged; it then only expands if
# mp_metadata contains a party with exactly that name.
_PARTY_NAME_MAP = {
" NSC " : " Nieuw Sociaal Contract " ,
" Gündogan " : " Gündoğan " ,
" Keijzer " : " Lid Keijzer " ,
# Pre-merger: both GroenLinks and PvdA votes map to the merged faction.
# NOTE(review): two source names sharing one canonical target means a single
# motion can expand to the same MP twice — confirm this is intended.
" GroenLinks " : " GroenLinks-PvdA " ,
" PvdA " : " GroenLinks-PvdA " ,
# Omtzigt initially sat alone before founding NSC
" Omtzigt " : " Nieuw Sociaal Contract " ,
}
# Party names for which we have no usable mp_metadata (tiny noise).
# _build_expanded_rows checks this set before consulting _PARTY_NAME_MAP and
# drops matching party rows without expanding them.
_SKIP_PARTIES = { " Brinkman " , " Bontes " , " Krol " , " Van Kooten-Arissen " }
def _build_expanded_rows(
    db_path: str, start_date: str, end_date: str
) -> List[Tuple[int, str, str, str]]:
    """Build vote rows, expanding party-level votes to individual MPs.

    Rows of ``mp_votes`` whose ``date`` lies between ``start_date`` and
    ``end_date`` (inclusive, SQL ``BETWEEN``) are loaded and grouped per motion.

    * If a motion has at least one individual-MP row (``mp_name`` contains a
      comma, e.g. "Bergkamp, V.A."), only its individual rows are returned;
      any party rows of that motion are dropped.
    * Otherwise every party row is expanded to one row per MP that
      ``mp_metadata`` lists for that party and whose ``van``/``tot_en_met``
      interval covers the row's date. Party names in ``_SKIP_PARTIES`` are
      skipped; other names are translated through ``_PARTY_NAME_MAP`` first.
      A party with no active MPs is logged at debug level and skipped.

    Args:
        db_path: Path of the DuckDB database; opened read-only.
        start_date: Lower bound of the window (bound as a query parameter).
        end_date: Upper bound of the window (bound as a query parameter).

    Returns:
        List of ``(motion_id, mp_name, vote, date_str)`` tuples, where
        ``date_str`` is ``str()`` of the stored date value. Empty list when
        the window contains no vote rows.
    """
    # Function-scope import: the file's top-level imports are not guaranteed
    # to provide defaultdict.
    from collections import defaultdict

    conn = duckdb.connect(db_path, read_only=True)
    try:
        # All vote rows for the window.
        vote_rows = conn.execute(
            " SELECT motion_id, mp_name, vote, date FROM mp_votes "
            " WHERE date BETWEEN ? AND ? ",
            (start_date, end_date),
        ).fetchall()
        # Full membership table: (mp_name, party, van, tot_en_met).
        meta_rows = conn.execute(
            " SELECT mp_name, party, van, tot_en_met FROM mp_metadata "
        ).fetchall()
    finally:
        conn.close()

    if not vote_rows:
        return []

    # canonical party -> [(mp_name, van, tot_en_met), ...]
    party_to_mps: Dict[str, List[Tuple]] = defaultdict(list)
    for mp_name, party, van, tot_en_met in meta_rows:
        if party and mp_name:
            party_to_mps[party].append((mp_name, van, tot_en_met))

    def get_active_mps(canonical_party: str, motion_date) -> List[str]:
        """Return MP names active in canonical_party on motion_date."""
        # NOTE(review): assumes van / tot_en_met are directly comparable with
        # the mp_votes date value (same type) — confirm against the schema.
        active = []
        for name, van, tot_en_met in party_to_mps.get(canonical_party, []):
            # A membership without a start date, or one starting after the
            # motion, does not count.
            if van is None or van > motion_date:
                continue
            # An open end date (None) means the membership is still running.
            if tot_en_met is not None and tot_en_met < motion_date:
                continue
            active.append(name)
        return active

    # Split rows per motion into individual-MP rows and party rows.
    motion_individual: Dict[int, List] = defaultdict(list)
    motion_party: Dict[int, List] = defaultdict(list)
    for motion_id, mp_name, vote, date in vote_rows:
        mid = int(motion_id)
        # Individual MPs carry a comma in their name ("Bergkamp, V.A.");
        # party codes do not. Test for the bare comma so the check does not
        # depend on the whitespace around it.
        bucket = motion_individual if "," in str(mp_name) else motion_party
        bucket[mid].append((mp_name, vote, date))

    expanded: List[Tuple[int, str, str, str]] = []
    for mid in set(motion_individual) | set(motion_party):
        individual_rows = motion_individual.get(mid)
        if individual_rows:
            # Motion already has individual MP rows — use them, ignore party rows.
            expanded.extend(
                (mid, mp_name, vote, str(date))
                for mp_name, vote, date in individual_rows
            )
            continue

        # Party-only motion — expand each party row to its active MPs.
        # NOTE(review): several short names map to the same canonical party in
        # _PARTY_NAME_MAP; if two of them vote on one motion, each MP of that
        # party gets two rows (possibly with different votes). Left as-is
        # because resolving it needs a policy decision.
        for party_name, vote, date in motion_party[mid]:
            if party_name in _SKIP_PARTIES:
                continue
            canonical = _PARTY_NAME_MAP.get(party_name, party_name)
            active_mps = get_active_mps(canonical, date)
            if not active_mps:
                _logger.debug(
                    " No active MPs found for party %s (canonical: %s ) on %s ",
                    party_name,
                    canonical,
                    date,
                )
                continue
            expanded.extend(
                (mid, mp_name, vote, str(date)) for mp_name in active_mps
            )
    return expanded
def _safe_k ( mat : np . ndarray , k : int ) - > int :
""" Return a safe k for svds: must be < min(mat.shape). """
@ -162,21 +275,18 @@ def compute_svd_for_window(
Opens the DB in read - only mode ( allows concurrent parallel workers ) .
Does NOT write to the DB — caller is responsible for persisting results .
Party - level vote rows are expanded to individual MP rows using mp_metadata
so that the vote matrix contains only individual MPs ( no party aggregates ) .
This prevents the block - diagonal structure that causes SVD axes to be disjoint .
Returns dict with keys :
window_id , k_used , mp_rows , motion_rows
where * _rows are List [ Tuple [ entity_type , entity_id , vector , model ] ]
"""
empty = { " window_id " : window_id , " k_used " : 0 , " mp_rows " : [ ] , " motion_rows " : [ ] }
# Read vote matrix using a read-only connection — safe to run in parallel.
conn = duckdb . connect ( db_path , read_only = True )
try :
rows = conn . execute (
" SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ? " ,
( start_date , end_date ) ,
) . fetchall ( )
finally :
conn . close ( )
# Build expanded rows: party votes → individual MP votes
rows = _build_expanded_rows ( db_path , start_date , end_date )
if not rows :
return empty
@ -191,7 +301,7 @@ def compute_svd_for_window(
mp_index = { name : i for i , name in enumerate ( mp_names ) }
motion_index = { mid : j for j , mid in enumerate ( motion_ids ) }
for motion_id , mp_name , vote in rows :
for motion_id , mp_name , vote , _date in rows :
i = mp_index [ mp_name ]
j = motion_index [ int ( motion_id ) ]
val = VOTE_MAP . get (