"""Validate SVD themes against actual party positions.

This hook detects when SVD axis themes no longer match the actual party
positions from the SVD vectors. Themes are derived from top motion analysis,
but party positions reflect voting on all motions — they can diverge when the
SVD is recomputed or voting patterns shift.

Primary check: canonical right-wing parties (PVV, FVD, JA21, SGP) must appear
on the RIGHT side of all axes (per repo convention).

Secondary check: theme pole labels should match actual party positions.

Usage:
    uv run python scripts/validate_svd_themes.py --db data/motions.db

Returns exit code 1 if any axis has divergent party positions.
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from collections import defaultdict
from typing import Dict, List, Tuple

logger = logging.getLogger("validate_svd_themes")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


def _ensure_repo_on_path() -> None:
    """Put the current directory first on ``sys.path`` exactly once.

    The ``analysis`` package is imported relative to the working directory.
    Inserting only when ``"."`` is not already the first entry keeps the same
    import priority as an unconditional insert, without growing ``sys.path``
    on every loader call.
    """
    if sys.path[:1] != ["."]:
        sys.path.insert(0, ".")


def _load_canonical_parties():
    """Import canonical party sets from analysis.config (single source of truth).

    Returns:
        ``(CANONICAL_RIGHT, CANONICAL_LEFT)`` — note the right set comes first.
    """
    _ensure_repo_on_path()
    from analysis.config import CANONICAL_LEFT, CANONICAL_RIGHT

    return CANONICAL_RIGHT, CANONICAL_LEFT


def _load_party_normalize():
    """Import party name normalization map from analysis.config."""
    _ensure_repo_on_path()
    from analysis.config import _PARTY_NORMALIZE

    return _PARTY_NORMALIZE


def load_party_positions(
    db_path: str, window_id: str = "current_parliament"
) -> Tuple[Dict[str, Dict[int, float]], Dict[str, List[float]]]:
    """Load per-party average SVD scores per component from svd_vectors.

    Args:
        db_path: Path to the DuckDB database file (opened read-only).
        window_id: Value matched against ``svd_vectors.window_id``.

    Returns:
        A 2-tuple ``(party_positions, party_avg_vectors)``:

        * ``party_positions`` — ``{party: {component: avg_score}}`` where
          component is 1-indexed.
        * ``party_avg_vectors`` — ``{party: [avg_score, ...]}``, the full
          0-indexed average vector used for flip computation.
    """
    import duckdb

    con = duckdb.connect(database=db_path, read_only=True)
    try:
        # Get MP → party mapping (with normalization)
        meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
        party_normalize = _load_party_normalize()
        mp_party = {
            name: party_normalize.get(party, party) for name, party in meta_rows
        }

        # Load MP vectors
        rows = con.execute(
            "SELECT entity_id, vector FROM svd_vectors "
            "WHERE window_id = ? AND entity_type = 'mp'",
            [window_id],
        ).fetchall()

        # Aggregate by party; MPs without a (truthy) party are skipped.
        party_vectors: Dict[str, List[List[float]]] = defaultdict(list)
        for entity_id, vector_json in rows:
            party = mp_party.get(entity_id)
            if party:
                party_vectors[party].append(json.loads(vector_json))

        # Compute averages per component and full average vectors.
        party_positions: Dict[str, Dict[int, float]] = {}
        party_avg_vectors: Dict[str, List[float]] = {}
        for party, vectors in party_vectors.items():
            # Every list here has at least one entry (lists are only created
            # on append), so the division is safe.
            n = len(vectors)
            # zip(*vectors) stops at the shortest vector, so ragged input is
            # truncated consistently instead of raising IndexError depending
            # on which vector happens to come first.
            avg_vec = [sum(column) / n for column in zip(*vectors)]
            party_avg_vectors[party] = avg_vec
            party_positions[party] = {
                comp_idx + 1: score for comp_idx, score in enumerate(avg_vec)
            }

        return party_positions, party_avg_vectors
    finally:
        con.close()


def load_themes() -> Dict[int, Dict[str, str]]:
    """Load SVD_THEMES from analysis.config."""
    _ensure_repo_on_path()
    from analysis.config import SVD_THEMES

    return SVD_THEMES


def _scores_for_component(
    party_positions: Dict[str, Dict[int, float]],
    parties: frozenset,
    comp: int,
) -> Dict[str, float]:
    """Return ``{party: score}`` for the parties that have a score on ``comp``."""
    return {
        party: party_positions[party][comp]
        for party in parties
        if party in party_positions and comp in party_positions[party]
    }


def check_canonical_right_on_right(
    party_positions: Dict[str, Dict[int, float]],
    party_avg_vectors: Dict[str, List[float]],
    themes: Dict[int, Dict[str, str]],
    canonical_right: frozenset,
    canonical_left: frozenset,
    num_components: int = 10,
) -> List[Dict]:
    """Check that canonical right-wing parties appear on the right side after flip.

    For each component, verifies that canonical right parties have higher
    average scores than canonical left parties AFTER applying the flip
    mechanism (which negates scores when flip=True).

    Args:
        party_positions: ``{party: {component: avg_score}}``, 1-indexed.
        party_avg_vectors: ``{party: [avg_score, ...]}`` full average vectors.
        themes: Unused here; kept so the signature stays stable for callers.
        canonical_right: Party names expected on the right side.
        canonical_left: Party names expected on the left side.
        num_components: Components ``1..num_components`` are checked.

    Returns:
        List of divergence reports. Each has ``component`` and ``issue``;
        ``issue`` is ``"missing_canonical_party_data"`` when either side has
        no score for the component, or ``"canonical_right_not_on_right"``.
    """
    from analysis.svd_labels import compute_flip_direction

    # compute_flip_direction expects party_scores[party] to be a list of
    # scores for all components. The mapping does not depend on the component,
    # so build it once rather than on every iteration.
    scores_dict = {
        p: party_avg_vectors[p]
        for p in canonical_right | canonical_left
        if p in party_avg_vectors
    }

    divergences: List[Dict] = []
    for comp in range(1, num_components + 1):
        right_by_party = _scores_for_component(party_positions, canonical_right, comp)
        left_by_party = _scores_for_component(party_positions, canonical_left, comp)

        if not right_by_party or not left_by_party:
            divergences.append(
                {
                    "component": comp,
                    "issue": "missing_canonical_party_data",
                    "right_found": [p for p in canonical_right if p in party_positions],
                    "left_found": [p for p in canonical_left if p in party_positions],
                }
            )
            continue

        right_mean = sum(right_by_party.values()) / len(right_by_party)
        left_mean = sum(left_by_party.values()) / len(left_by_party)

        flip = compute_flip_direction(comp, scores_dict)

        # Post-flip: if flip=True, scores are negated
        post_flip_right = -right_mean if flip else right_mean
        post_flip_left = -left_mean if flip else left_mean

        if post_flip_right < post_flip_left:
            divergences.append(
                {
                    "component": comp,
                    "issue": "canonical_right_not_on_right",
                    "flip": flip,
                    "raw_right_mean": round(right_mean, 4),
                    "raw_left_mean": round(left_mean, 4),
                    "post_flip_right": round(post_flip_right, 4),
                    "post_flip_left": round(post_flip_left, 4),
                    "diff": round(post_flip_right - post_flip_left, 4),
                    # Report exactly the parties that contributed to the means;
                    # a party lacking this component must not raise KeyError.
                    "right_scores": {
                        p: round(score, 4) for p, score in right_by_party.items()
                    },
                    "left_scores": {
                        p: round(score, 4) for p, score in left_by_party.items()
                    },
                }
            )

    return divergences


def check_config_flip_consistency(
    party_avg_vectors: Dict[str, List[float]],
    themes: Dict[int, Dict[str, str]],
    canonical_right: frozenset,
    canonical_left: frozenset,
    num_components: int = 10,
) -> List[Dict]:
    """Check that config flip values match computed flip directions.

    The runtime uses compute_flip_direction() to determine flips, but config
    also stores a flip value. If they disagree, the config is stale or wrong.

    Args:
        party_avg_vectors: ``{party: [avg_score, ...]}`` full average vectors.
        themes: Theme config; ``themes[comp]["flip"]`` defaults to ``False``
            when the component or the key is absent.
        canonical_right: Party names expected on the right side.
        canonical_left: Party names expected on the left side.
        num_components: Components ``1..num_components`` are checked.

    Returns:
        List of ``"config_flip_mismatch"`` reports, one per disagreeing
        component.
    """
    from analysis.svd_labels import compute_flip_direction

    # Only canonical parties take part in the flip computation.
    scores_dict = {
        p: v
        for p, v in party_avg_vectors.items()
        if p in canonical_right or p in canonical_left
    }

    mismatches: List[Dict] = []
    for comp in range(1, num_components + 1):
        config_flip = themes.get(comp, {}).get("flip", False)
        computed_flip = compute_flip_direction(comp, scores_dict)
        if config_flip != computed_flip:
            mismatches.append(
                {
                    "component": comp,
                    "issue": "config_flip_mismatch",
                    "config_flip": config_flip,
                    "computed_flip": computed_flip,
                }
            )

    return mismatches


def check_theme_consistency(
    party_positions: Dict[str, Dict[int, float]],
    themes: Dict[int, Dict[str, str]],
    canonical_right: frozenset,
    canonical_left: frozenset,
) -> List[Dict]:
    """Check that theme pole labels are consistent with actual party positions.

    Note: positive_pole/negative_pole describe the SVD axis orientation.
    Labels are derived at runtime from flip direction.

    Returns list of divergence reports. Currently always empty: the check is
    intentionally a no-op (see the comment in the body).
    """
    # This check is inherently noisy because pole text mentions parties that
    # may not be in canonical sets. Skip for now — the canonical right-on-right
    # check is the primary validation.
    return []


def main() -> int:
    """Run all checks and print a report.

    Returns:
        ``1`` if any check reported a divergence, otherwise ``0``.
    """
    parser = argparse.ArgumentParser(
        description="Validate SVD themes against actual party positions"
    )
    parser.add_argument(
        "--db", default="data/motions.db", help="Path to motions database"
    )
    parser.add_argument(
        "--window", default="current_parliament", help="Window ID to validate"
    )
    parser.add_argument(
        "--components", type=int, default=10, help="Number of components to check"
    )
    args = parser.parse_args()

    logger.info("Loading party positions from %s (window=%s)", args.db, args.window)
    party_positions, party_avg_vectors = load_party_positions(args.db, args.window)

    logger.info("Loading SVD themes from analysis.config")
    themes = load_themes()

    logger.info("Loading canonical party sets from analysis.config")
    canonical_right, canonical_left = _load_canonical_parties()

    # Check 1: Canonical right-wing parties on right side (after flip)
    logger.info("Checking canonical right-wing party positions (post-flip)")
    canonical_divergences = check_canonical_right_on_right(
        party_positions,
        party_avg_vectors,
        themes,
        canonical_right,
        canonical_left,
        args.components,
    )

    # Check 2: Config flip vs computed flip consistency
    logger.info("Checking config flip vs computed flip consistency")
    flip_mismatches = check_config_flip_consistency(
        party_avg_vectors, themes, canonical_right, canonical_left, args.components
    )

    # Check 3: Theme pole label consistency
    logger.info("Checking theme pole label consistency")
    theme_divergences = check_theme_consistency(
        party_positions, themes, canonical_right, canonical_left
    )

    all_divergences = canonical_divergences + flip_mismatches + theme_divergences

    if all_divergences:
        print(f"\n{'=' * 60}")
        print(f"FOUND {len(all_divergences)} DIVERGENCE(S)")
        print(f"{'=' * 60}")

        for d in all_divergences:
            comp = d["component"]
            theme_label = themes.get(comp, {}).get("label", f"Component {comp}")
            print(f"\n--- Component {comp}: {theme_label} ---")
            print(f" Issue: {d['issue']}")

            if d["issue"] == "canonical_right_not_on_right":
                print(f" Canonical RIGHT mean (raw): {d['raw_right_mean']:.4f}")
                print(f" Canonical LEFT mean (raw): {d['raw_left_mean']:.4f}")
                print(f" Flip applied: {d['flip']}")
                print(f" Post-flip RIGHT: {d['post_flip_right']:.4f}")
                print(f" Post-flip LEFT: {d['post_flip_left']:.4f}")
                print(f" Diff (post-flip R - L): {d['diff']:.4f}")
                print(f" Right scores: {d['right_scores']}")
                print(f" Left scores: {d['left_scores']}")
            elif d["issue"] == "missing_canonical_party_data":
                print(f" Expected right: {canonical_right}")
                print(f" Expected left: {canonical_left}")
                print(f" Found right: {d['right_found']}")
                print(f" Found left: {d['left_found']}")
            elif d["issue"] == "config_flip_mismatch":
                print(f" Config flip: {d['config_flip']}")
                print(f" Computed flip: {d['computed_flip']}")
                print(f" → Update SVD_THEMES[{comp}]['flip'] to {d['computed_flip']}")

        return 1
    else:
        print("\n✓ All SVD themes match actual party positions")
        print(" - Canonical right-wing parties on right side of all axes")
        print(" - Config flip values match computed flip directions")
        print(" - Theme pole labels consistent with party positions")
        return 0


if __name__ == "__main__":
    raise SystemExit(main())