# motief/scripts/validate_svd_themes.py

"""Validate SVD themes against actual party positions.

This hook detects when SVD axis themes no longer match the actual party
positions from the SVD vectors. Themes are derived from top motion analysis,
but party positions reflect voting on all motions — they can diverge when
the SVD is recomputed or voting patterns shift.

Primary check: canonical right-wing parties (PVV, FVD, JA21, SGP) must
appear on the RIGHT side of all axes (per repo convention).

Secondary check: theme pole labels should match actual party positions.

Usage:
    uv run python scripts/validate_svd_themes.py --db data/motions.db

Returns exit code 1 if any axis has divergent party positions.
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from collections import defaultdict
from typing import Dict, List, Tuple

# Root logging is configured once at import time: timestamped, INFO and above.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
logger = logging.getLogger("validate_svd_themes")

# Canonical party sets (must match analysis.config).
# CANONICAL_RIGHT: parties expected on the right-hand side of every axis.
CANONICAL_RIGHT = frozenset(("PVV", "FVD", "JA21", "SGP"))
# CANONICAL_LEFT: the reference group the right-hand parties are compared to.
CANONICAL_LEFT = frozenset(
    (
        "SP",
        "PvdA",
        "GL",
        "GroenLinks",
        "GroenLinks-PvdA",
        "DENK",
        "PvdD",
        "Volt",
    )
)


def load_party_positions(
    db_path: str, window_id: str = "current_parliament"
) -> Tuple[Dict[str, Dict[int, float]], Dict[str, List[float]]]:
    """Load per-party average SVD scores for one window.

    Reads the MP -> party mapping from ``mp_metadata`` and the JSON-encoded
    MP vectors from ``svd_vectors`` (rows with ``entity_type = 'mp'`` and the
    given ``window_id``), then averages the vectors of all MPs of each party.
    MPs whose ``entity_id`` has no (non-empty) party in ``mp_metadata`` are
    ignored.

    Args:
        db_path: Path of the DuckDB database; it is opened read-only.
        window_id: Value matched against ``svd_vectors.window_id``.

    Returns:
        A ``(party_positions, party_avg_vectors)`` tuple:

        * ``party_positions`` -- ``{party: {component: avg_score}}`` where
          ``component`` is 1-indexed.
        * ``party_avg_vectors`` -- ``{party: [avg_score, ...]}``, the same
          averages as a plain 0-indexed list (used for flip computation).
    """
    import duckdb

    con = duckdb.connect(database=db_path, read_only=True)
    try:
        # MP name -> party; svd_vectors.entity_id is looked up in this mapping.
        meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
        mp_party = {name: party for name, party in meta_rows}

        rows = con.execute(
            "SELECT entity_id, vector FROM svd_vectors "
            "WHERE window_id = ? AND entity_type = 'mp'",
            [window_id],
        ).fetchall()

        # Group the decoded vectors by party. A group only comes into
        # existence through append(), so no group is ever empty.
        party_vectors: Dict[str, List[List[float]]] = defaultdict(list)
        for entity_id, vector_json in rows:
            party = mp_party.get(entity_id)
            if party:
                party_vectors[party].append(json.loads(vector_json))

        party_positions: Dict[str, Dict[int, float]] = {}
        party_avg_vectors: Dict[str, List[float]] = {}
        for party, vectors in party_vectors.items():
            n = len(vectors)
            # The first vector fixes the dimensionality; every vector of the
            # party is indexed up to that length.
            dim = len(vectors[0])
            avg_vec = [sum(v[i] for v in vectors) / n for i in range(dim)]
            party_avg_vectors[party] = avg_vec
            # Same numbers keyed by 1-indexed component.
            party_positions[party] = {
                idx + 1: score for idx, score in enumerate(avg_vec)
            }

        return party_positions, party_avg_vectors
    finally:
        con.close()

def load_themes() -> Dict[int, Dict[str, str]]:
    """Return the ``SVD_THEMES`` object defined in ``analysis.config``.

    ``"."`` is pushed to the front of ``sys.path`` before the import, so the
    ``analysis`` package can be resolved against the current working
    directory.
    """
    search_root = "."
    sys.path.insert(0, search_root)

    from analysis.config import SVD_THEMES as themes

    return themes


def check_canonical_right_on_right(
    party_positions: Dict[str, Dict[int, float]],
    party_avg_vectors: Dict[str, List[float]],
    themes: Dict[int, Dict[str, str]],
    num_components: int = 10,
) -> List[Dict]:
    """Check that canonical right-wing parties end up on the right after flip.

    For every component ``1..num_components`` the mean score of the
    ``CANONICAL_RIGHT`` parties is compared with the mean score of the
    ``CANONICAL_LEFT`` parties after applying the flip reported by
    ``compute_flip_direction`` (a truthy flip negates both means). A
    divergence is reported when the post-flip right mean is strictly lower
    than the post-flip left mean.

    Args:
        party_positions: ``{party: {component: avg_score}}``, 1-indexed
            components.
        party_avg_vectors: ``{party: [avg_score, ...]}``; handed to
            ``compute_flip_direction`` for the canonical parties present.
        themes: Not used by this check; kept so the signature stays stable
            for existing callers.
        num_components: Number of components to inspect, starting at 1.

    Returns:
        A list of report dicts. Each has ``"component"`` and ``"issue"``;
        ``issue`` is ``"missing_canonical_party_data"`` when one of the two
        groups has no score for the component, or
        ``"canonical_right_not_on_right"`` with the raw/post-flip means and
        the per-party scores.
    """
    from analysis.svd_labels import compute_flip_direction

    # Sorted lists make the reports (and the float summation order)
    # reproducible: iteration order of a frozenset of strings can change
    # between interpreter runs because of hash randomization.
    right_parties = sorted(CANONICAL_RIGHT)
    left_parties = sorted(CANONICAL_LEFT)

    # Full average vectors of the canonical parties that have data. This does
    # not depend on the component, so it is built once instead of per
    # iteration (compute_flip_direction expects party_scores[party] to be a
    # list of scores for all components).
    scores_dict = {
        p: party_avg_vectors[p]
        for p in right_parties + left_parties
        if p in party_avg_vectors
    }

    def scores_for(parties: List[str], comp: int) -> Dict[str, float]:
        # Only parties that actually have a score for this component. The
        # same guarded mapping feeds both the means and the report, so the
        # report can no longer raise KeyError for a party lacking `comp`.
        return {
            p: party_positions[p][comp]
            for p in parties
            if p in party_positions and comp in party_positions[p]
        }

    divergences: List[Dict] = []

    for comp in range(1, num_components + 1):
        right_scores = scores_for(right_parties, comp)
        left_scores = scores_for(left_parties, comp)

        if not right_scores or not left_scores:
            # "found" lists parties present in party_positions at all, not
            # only those with a score for this particular component.
            divergences.append(
                {
                    "component": comp,
                    "issue": "missing_canonical_party_data",
                    "right_found": [p for p in right_parties if p in party_positions],
                    "left_found": [p for p in left_parties if p in party_positions],
                }
            )
            continue

        right_mean = sum(right_scores.values()) / len(right_scores)
        left_mean = sum(left_scores.values()) / len(left_scores)

        flip = compute_flip_direction(comp, scores_dict)

        # Post-flip: if flip is truthy, scores are negated.
        post_flip_right = -right_mean if flip else right_mean
        post_flip_left = -left_mean if flip else left_mean

        if post_flip_right < post_flip_left:
            divergences.append(
                {
                    "component": comp,
                    "issue": "canonical_right_not_on_right",
                    "flip": flip,
                    "raw_right_mean": round(right_mean, 4),
                    "raw_left_mean": round(left_mean, 4),
                    "post_flip_right": round(post_flip_right, 4),
                    "post_flip_left": round(post_flip_left, 4),
                    "diff": round(post_flip_right - post_flip_left, 4),
                    "right_scores": {
                        p: round(score, 4) for p, score in right_scores.items()
                    },
                    "left_scores": {
                        p: round(score, 4) for p, score in left_scores.items()
                    },
                }
            )

    return divergences


def check_theme_consistency(
    party_positions: Dict[str, Dict[int, float]],
    themes: Dict[int, Dict[str, str]],
) -> List[Dict]:
    """Check that theme pole labels agree with the stored party positions.

    For each component in ``themes``, canonical party names are searched in
    the ``left_pole`` and ``right_pole`` texts. A divergence is reported when
    the mean score of the parties named in ``left_pole`` is strictly greater
    than the mean score of the parties named in ``right_pole``. Components
    are skipped when a pole text is empty, when a pole names no canonical
    party, or when no named party has a score for the component.

    Scores are compared exactly as stored in ``party_positions``; no flip is
    applied in this check.

    Args:
        party_positions: ``{party: {component: avg_score}}``.
        themes: ``{component: theme}`` where ``theme`` may carry
            ``"left_pole"``, ``"right_pole"`` and ``"label"`` strings.

    Returns:
        A list of ``"theme_pole_mismatch"`` report dicts (possibly empty).
    """
    # Sorted so the party lists in the report and the float summation order
    # are reproducible; iteration order of a frozenset of strings can change
    # between interpreter runs because of hash randomization.
    known_parties = sorted(CANONICAL_RIGHT | CANONICAL_LEFT)

    divergences: List[Dict] = []

    for comp, theme in themes.items():
        left_pole = theme.get("left_pole", "")
        right_pole = theme.get("right_pole", "")

        if not left_pole or not right_pole:
            continue

        # Party mentions are detected by plain substring search.
        # NOTE(review): a mention of "GroenLinks-PvdA" therefore also counts
        # as a mention of "GroenLinks" and "PvdA" -- confirm this is intended.
        left_parties = [p for p in known_parties if p in left_pole]
        right_parties = [p for p in known_parties if p in right_pole]

        if not left_parties or not right_parties:
            continue

        left_scores = [
            party_positions[p][comp]
            for p in left_parties
            if p in party_positions and comp in party_positions[p]
        ]
        right_scores = [
            party_positions[p][comp]
            for p in right_parties
            if p in party_positions and comp in party_positions[p]
        ]

        if not left_scores or not right_scores:
            continue

        left_mean = sum(left_scores) / len(left_scores)
        right_mean = sum(right_scores) / len(right_scores)

        # Left pole parties should have lower scores than right pole parties.
        if left_mean > right_mean:
            divergences.append(
                {
                    "component": comp,
                    "issue": "theme_pole_mismatch",
                    "label": theme.get("label", f"Component {comp}"),
                    # Pole texts are truncated to 80 characters for display.
                    "left_pole": left_pole[:80],
                    "right_pole": right_pole[:80],
                    "left_mean": round(left_mean, 4),
                    "right_mean": round(right_mean, 4),
                    "diff": round(left_mean - right_mean, 4),
                    "left_parties": left_parties,
                    "right_parties": right_parties,
                }
            )

    return divergences


def main() -> int:
    """Run both validation checks from the command line and print a report.

    Parses ``--db``, ``--window`` and ``--components``, loads the party
    positions and the themes, runs the canonical-right check and the theme
    pole check, and prints every divergence found.

    Returns:
        ``1`` when at least one divergence was reported, otherwise ``0``.
    """
    cli = argparse.ArgumentParser(
        description="Validate SVD themes against actual party positions"
    )
    cli.add_argument("--db", default="data/motions.db", help="Path to motions database")
    cli.add_argument(
        "--window", default="current_parliament", help="Window ID to validate"
    )
    cli.add_argument(
        "--components", type=int, default=10, help="Number of components to check"
    )
    opts = cli.parse_args()

    logger.info("Loading party positions from %s (window=%s)", opts.db, opts.window)
    positions, avg_vectors = load_party_positions(opts.db, opts.window)

    logger.info("Loading SVD themes from analysis.config")
    themes = load_themes()

    # Check 1: canonical right-wing parties on the right side (after flip).
    logger.info("Checking canonical right-wing party positions (post-flip)")
    canonical_findings = check_canonical_right_on_right(
        positions, avg_vectors, themes, opts.components
    )

    # Check 2: theme pole labels versus party positions.
    logger.info("Checking theme pole label consistency")
    theme_findings = check_theme_consistency(positions, themes)

    findings = canonical_findings + theme_findings

    # Clean run: short confirmation, success exit code.
    if not findings:
        print("\n✓ All SVD themes match actual party positions")
        print("  - Canonical right-wing parties on right side of all axes")
        print("  - Theme pole labels consistent with party positions")
        return 0

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"FOUND {len(findings)} DIVERGENCE(S)")
    print(f"{rule}")

    for finding in findings:
        comp = finding["component"]
        theme_label = themes.get(comp, {}).get("label", f"Component {comp}")
        print(f"\n--- Component {comp}: {theme_label} ---")
        print(f"  Issue: {finding['issue']}")

        # The detail lines depend on which check produced the finding.
        issue = finding["issue"]
        if issue == "canonical_right_not_on_right":
            print(f"  Canonical RIGHT mean (raw): {finding['raw_right_mean']:.4f}")
            print(f"  Canonical LEFT mean (raw):  {finding['raw_left_mean']:.4f}")
            print(f"  Flip applied: {finding['flip']}")
            print(f"  Post-flip RIGHT: {finding['post_flip_right']:.4f}")
            print(f"  Post-flip LEFT:  {finding['post_flip_left']:.4f}")
            print(f"  Diff (post-flip R - L): {finding['diff']:.4f}")
            print(f"  Right scores: {finding['right_scores']}")
            print(f"  Left scores:  {finding['left_scores']}")
        elif issue == "theme_pole_mismatch":
            print(f"  Label: {finding.get('label', '')}")
            print(f"  Left pole:  {finding['left_pole']}")
            print(f"  Right pole: {finding['right_pole']}")
            print(
                f"  Left mean:  {finding['left_mean']:.4f} ({finding['left_parties']})"
            )
            print(
                f"  Right mean: {finding['right_mean']:.4f} ({finding['right_parties']})"
            )
            print(f"  Diff (left - right): {finding['diff']:.4f}")
        elif issue == "missing_canonical_party_data":
            print(f"  Expected right: {CANONICAL_RIGHT}")
            print(f"  Expected left:  {CANONICAL_LEFT}")
            print(f"  Found right:  {finding['right_found']}")
            print(f"  Found left:   {finding['left_found']}")

    return 1


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())