You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
301 lines
11 KiB
301 lines
11 KiB
"""Validate SVD themes against actual party positions.
|
|
|
|
This hook detects when SVD axis themes no longer match the actual party
|
|
positions from the SVD vectors. Themes are derived from top motion analysis,
|
|
but party positions reflect voting on all motions — they can diverge when
|
|
the SVD is recomputed or voting patterns shift.
|
|
|
|
Primary check: canonical right-wing parties (PVV, FVD, JA21, SGP) must
|
|
appear on the RIGHT side of all axes (per repo convention).
|
|
|
|
Secondary check: theme pole labels should match actual party positions.
|
|
|
|
Usage:
|
|
uv run python scripts/validate_svd_themes.py --db data/motions.db
|
|
|
|
Returns exit code 1 if any axis has divergent party positions.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import sys
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple
|
|
|
|
# Configure the root handler first so every record emitted by this script
# carries a timestamp and level.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# Script-wide logger; named after the script rather than ``__name__`` so the
# name stays the same when the file is executed as ``__main__``.
logger = logging.getLogger("validate_svd_themes")
|
|
|
|
|
|
def _load_canonical_parties():
    """Import the canonical party sets from ``analysis.config``.

    ``analysis.config`` is the single source of truth for these sets. The
    current working directory is put on ``sys.path`` so the ``analysis``
    package resolves when the script is run from the repository root.

    Returns:
        A ``(CANONICAL_RIGHT, CANONICAL_LEFT)`` tuple -- note the order:
        right first, then left.
    """
    # Only add "." once; an unconditional insert would grow sys.path by one
    # entry on every call.
    if "." not in sys.path:
        sys.path.insert(0, ".")

    from analysis.config import CANONICAL_LEFT, CANONICAL_RIGHT

    return CANONICAL_RIGHT, CANONICAL_LEFT
|
|
|
|
|
|
def _load_party_normalize():
    """Import the party name normalization map from ``analysis.config``.

    The current working directory is put on ``sys.path`` so the ``analysis``
    package resolves when the script is run from the repository root.

    Returns:
        The ``_PARTY_NORMALIZE`` object exactly as defined in
        ``analysis.config``.
    """
    # Only add "." once; an unconditional insert would grow sys.path by one
    # entry on every call.
    if "." not in sys.path:
        sys.path.insert(0, ".")

    from analysis.config import _PARTY_NORMALIZE

    return _PARTY_NORMALIZE
|
|
|
|
|
|
def load_party_positions(
    db_path: str, window_id: str = "current_parliament"
) -> Tuple[Dict[str, Dict[int, float]], Dict[str, List[float]]]:
    """Load per-party average SVD scores from the ``svd_vectors`` table.

    MP names from ``mp_metadata`` are mapped to party names (passed through
    the normalization map from ``analysis.config``), the MP vectors stored
    for ``window_id`` are grouped by party, and every group is averaged
    component-wise.

    Args:
        db_path: Path of the DuckDB database; it is opened read-only.
        window_id: Value matched against ``svd_vectors.window_id``.

    Returns:
        A ``(party_positions, party_avg_vectors)`` tuple:

        * ``party_positions`` -- ``{party: {component: avg_score}}`` where
          ``component`` is 1-indexed.
        * ``party_avg_vectors`` -- ``{party: [avg_score, ...]}``, the same
          averages as plain 0-indexed lists, used for the flip computation.
    """
    import duckdb

    party_normalize = _load_party_normalize()

    # Hold the connection only while fetching rows; the aggregation below is
    # pure Python and does not need the database.
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
        rows = con.execute(
            "SELECT entity_id, vector FROM svd_vectors "
            "WHERE window_id = ? AND entity_type = 'mp'",
            [window_id],
        ).fetchall()
    finally:
        con.close()

    # MP name -> party, with unknown party names passed through unchanged.
    mp_party = {name: party_normalize.get(party, party) for name, party in meta_rows}

    # Group the MP vectors by party; MPs without a (non-empty) party are skipped.
    party_vectors: Dict[str, List[List[float]]] = defaultdict(list)
    for entity_id, vector_json in rows:
        party = mp_party.get(entity_id)
        if party:
            party_vectors[party].append(json.loads(vector_json))

    party_positions: Dict[str, Dict[int, float]] = {}
    party_avg_vectors: Dict[str, List[float]] = {}
    for party, vectors in party_vectors.items():
        # Every entry holds at least one vector (entries are only created by
        # the append above), so the division is safe.
        n = len(vectors)
        # zip(*vectors) walks the vectors column by column. Should vectors
        # differ in length, only the components present in all of them are
        # averaged instead of raising IndexError.
        avg_vec = [sum(column) / n for column in zip(*vectors)]
        party_avg_vectors[party] = avg_vec
        party_positions[party] = dict(enumerate(avg_vec, start=1))

    return party_positions, party_avg_vectors
|
|
|
|
|
|
def load_themes() -> Dict[int, Dict[str, str]]:
    """Load ``SVD_THEMES`` from ``analysis.config``.

    The current working directory is put on ``sys.path`` so the ``analysis``
    package resolves when the script is run from the repository root.

    Returns:
        The ``SVD_THEMES`` object exactly as defined in ``analysis.config``.
    """
    # Only add "." once; an unconditional insert would grow sys.path by one
    # entry on every call.
    if "." not in sys.path:
        sys.path.insert(0, ".")

    from analysis.config import SVD_THEMES

    return SVD_THEMES
|
|
|
|
|
|
def check_canonical_right_on_right(
    party_positions: Dict[str, Dict[int, float]],
    party_avg_vectors: Dict[str, List[float]],
    themes: Dict[int, Dict[str, str]],
    canonical_right: frozenset,
    canonical_left: frozenset,
    num_components: int = 10,
) -> List[Dict]:
    """Check that canonical right-wing parties end up on the right after flip.

    For every component ``1..num_components`` the mean score of the canonical
    right parties is compared with the mean score of the canonical left
    parties AFTER applying the flip returned by
    ``analysis.svd_labels.compute_flip_direction`` (scores are negated when
    the flip is true). A component is reported when the post-flip right mean
    is strictly lower than the post-flip left mean.

    Args:
        party_positions: ``{party: {component: avg_score}}``, 1-indexed.
        party_avg_vectors: ``{party: [avg_score, ...]}``, handed to
            ``compute_flip_direction``.
        themes: Not used by this check; kept so existing callers keep working.
        canonical_right: Party names expected on the right side.
        canonical_left: Party names expected on the left side.
        num_components: Number of components to check, starting at 1.

    Returns:
        A list of divergence reports (dicts). Each has ``"component"`` and
        ``"issue"``; ``issue`` is ``"missing_canonical_party_data"`` when one
        side has no score for the component, or
        ``"canonical_right_not_on_right"`` with the raw/post-flip means and
        the per-party scores.
    """
    from analysis.svd_labels import compute_flip_direction

    # Per the original note, compute_flip_direction expects each party's full
    # list of scores. The mapping does not depend on the component, so it is
    # built once instead of once per loop iteration.
    scores_dict = {
        p: party_avg_vectors[p]
        for p in canonical_right | canonical_left
        if p in party_avg_vectors
    }

    def scores_for(parties, comp):
        # Only parties that actually have a score for this component. The same
        # filtered mapping feeds both the means and the report, so the report
        # can no longer hit a KeyError for a party lacking the component.
        return {
            p: party_positions[p][comp]
            for p in parties
            if p in party_positions and comp in party_positions[p]
        }

    divergences = []

    for comp in range(1, num_components + 1):
        right_scores = scores_for(canonical_right, comp)
        left_scores = scores_for(canonical_left, comp)

        if not right_scores or not left_scores:
            divergences.append(
                {
                    "component": comp,
                    "issue": "missing_canonical_party_data",
                    "right_found": [p for p in canonical_right if p in party_positions],
                    "left_found": [p for p in canonical_left if p in party_positions],
                }
            )
            continue

        right_mean = sum(right_scores.values()) / len(right_scores)
        left_mean = sum(left_scores.values()) / len(left_scores)

        flip = compute_flip_direction(comp, scores_dict)

        # Post-flip: if flip is true, scores are negated.
        post_flip_right = -right_mean if flip else right_mean
        post_flip_left = -left_mean if flip else left_mean

        if post_flip_right < post_flip_left:
            divergences.append(
                {
                    "component": comp,
                    "issue": "canonical_right_not_on_right",
                    "flip": flip,
                    "raw_right_mean": round(right_mean, 4),
                    "raw_left_mean": round(left_mean, 4),
                    "post_flip_right": round(post_flip_right, 4),
                    "post_flip_left": round(post_flip_left, 4),
                    "diff": round(post_flip_right - post_flip_left, 4),
                    "right_scores": {p: round(s, 4) for p, s in right_scores.items()},
                    "left_scores": {p: round(s, 4) for p, s in left_scores.items()},
                }
            )

    return divergences
|
|
|
|
|
|
def check_theme_consistency(
    party_positions: Dict[str, Dict[int, float]],
    themes: Dict[int, Dict[str, str]],
    canonical_right: frozenset,
    canonical_left: frozenset,
) -> List[Dict]:
    """Secondary check on theme pole labels -- currently a no-op.

    The intent is to verify that the parties named in each theme's
    ``left_pole`` / ``right_pole`` text really sit on that side. Those poles
    describe the SEMANTIC left/right after flip, not the political spectrum.

    The check is deliberately disabled: pole texts mention parties outside
    the canonical sets, which makes the comparison too noisy. The canonical
    right-on-right check is the primary validation.

    Returns:
        Always an empty list; none of the arguments are inspected.
    """
    no_divergences: List[Dict] = []
    return no_divergences
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point for the SVD theme validation.

    Parses ``--db``, ``--window`` and ``--components``, loads the party
    positions, the themes and the canonical party sets, runs both checks and
    prints a report for every divergence found.

    Returns:
        1 when at least one divergence was reported, otherwise 0.
    """
    cli = argparse.ArgumentParser(
        description="Validate SVD themes against actual party positions"
    )
    cli.add_argument("--db", default="data/motions.db", help="Path to motions database")
    cli.add_argument(
        "--window", default="current_parliament", help="Window ID to validate"
    )
    cli.add_argument(
        "--components", type=int, default=10, help="Number of components to check"
    )
    args = cli.parse_args()

    logger.info("Loading party positions from %s (window=%s)", args.db, args.window)
    party_positions, party_avg_vectors = load_party_positions(args.db, args.window)

    logger.info("Loading SVD themes from analysis.config")
    themes = load_themes()

    logger.info("Loading canonical party sets from analysis.config")
    canonical_right, canonical_left = _load_canonical_parties()

    # Check 1: canonical right-wing parties on the right side (after flip).
    logger.info("Checking canonical right-wing party positions (post-flip)")
    canonical_divergences = check_canonical_right_on_right(
        party_positions,
        party_avg_vectors,
        themes,
        canonical_right,
        canonical_left,
        args.components,
    )

    # Check 2: theme pole label consistency.
    logger.info("Checking theme pole label consistency")
    theme_divergences = check_theme_consistency(
        party_positions, themes, canonical_right, canonical_left
    )

    all_divergences = canonical_divergences + theme_divergences

    # Happy path first: nothing diverged.
    if not all_divergences:
        print("\n✓ All SVD themes match actual party positions")
        print(" - Canonical right-wing parties on right side of all axes")
        print(" - Theme pole labels consistent with party positions")
        return 0

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"FOUND {len(all_divergences)} DIVERGENCE(S)")
    print(f"{rule}")

    for report in all_divergences:
        comp = report["component"]
        issue = report["issue"]
        # Fall back to a generic label when the component has no theme entry.
        theme_label = themes.get(comp, {}).get("label", f"Component {comp}")
        print(f"\n--- Component {comp}: {theme_label} ---")
        print(f" Issue: {issue}")

        if issue == "canonical_right_not_on_right":
            print(f" Canonical RIGHT mean (raw): {report['raw_right_mean']:.4f}")
            print(f" Canonical LEFT mean (raw): {report['raw_left_mean']:.4f}")
            print(f" Flip applied: {report['flip']}")
            print(f" Post-flip RIGHT: {report['post_flip_right']:.4f}")
            print(f" Post-flip LEFT: {report['post_flip_left']:.4f}")
            print(f" Diff (post-flip R - L): {report['diff']:.4f}")
            print(f" Right scores: {report['right_scores']}")
            print(f" Left scores: {report['left_scores']}")
        elif issue == "theme_pole_mismatch":
            print(f" Label: {report.get('label', '')}")
            print(f" Left pole: {report['left_pole']}")
            print(f" Right pole: {report['right_pole']}")
            print(f" Left mean: {report['left_mean']:.4f} ({report['left_parties']})")
            print(f" Right mean: {report['right_mean']:.4f} ({report['right_parties']})")
            print(f" Diff (left - right): {report['diff']:.4f}")
        elif issue == "missing_canonical_party_data":
            print(f" Expected right: {canonical_right}")
            print(f" Expected left: {canonical_left}")
            print(f" Found right: {report['right_found']}")
            print(f" Found left: {report['left_found']}")

    return 1
|
|
|
|
|
|
# Script entry point: the integer returned by main() becomes the process
# exit status (sys.exit raises SystemExit with that value).
if __name__ == "__main__":
    sys.exit(main())
|
|
|