fix: import canonical parties from config, simplify theme consistency check

main
Sven Geboers 4 weeks ago
parent bad9cd758d
commit 846e9cf67f
  1. 123
      scripts/validate_svd_themes.py

@ -28,11 +28,21 @@ from typing import Dict, List, Tuple
logger = logging.getLogger("validate_svd_themes") logger = logging.getLogger("validate_svd_themes")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Canonical party sets (must match analysis.config)
CANONICAL_RIGHT = frozenset({"PVV", "FVD", "JA21", "SGP"}) def _load_canonical_parties():
CANONICAL_LEFT = frozenset( """Import canonical party sets from analysis.config (single source of truth)."""
{"SP", "PvdA", "GL", "GroenLinks", "GroenLinks-PvdA", "DENK", "PvdD", "Volt"} sys.path.insert(0, ".")
) from analysis.config import CANONICAL_LEFT, CANONICAL_RIGHT
return CANONICAL_RIGHT, CANONICAL_LEFT
def _load_party_normalize():
    """Import the party-name normalization map from analysis.config.

    The import is deferred to call time and resolved through a relative
    ``"."`` entry on ``sys.path``, i.e. against the current working
    directory. NOTE(review): this presumably means the script must be run
    from the repository root that contains the ``analysis`` package --
    confirm against how the script is invoked.

    Returns:
        The ``_PARTY_NORMALIZE`` object defined in ``analysis.config``.
        Callers use it as a mapping via ``.get(raw_party, raw_party)`` to
        translate a raw party name into its normalized form.

    Raises:
        ImportError: If ``analysis.config`` or ``_PARTY_NORMALIZE`` cannot
            be imported.
    """
    # Keep "." at the front of sys.path so the import below resolves exactly
    # as before, but skip the insert when it is already there: this helper
    # can be called repeatedly and an unconditional insert would stack a
    # duplicate "." entry on every call.
    if sys.path[:1] != ["."]:
        sys.path.insert(0, ".")
    from analysis.config import _PARTY_NORMALIZE

    return _PARTY_NORMALIZE
def load_party_positions( def load_party_positions(
@ -47,9 +57,12 @@ def load_party_positions(
con = duckdb.connect(database=db_path, read_only=True) con = duckdb.connect(database=db_path, read_only=True)
try: try:
# Get MP → party mapping # Get MP → party mapping (with normalization)
meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall() meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
mp_party = {name: party for name, party in meta_rows} party_normalize = _load_party_normalize()
mp_party = {
name: party_normalize.get(party, party) for name, party in meta_rows
}
# Load MP vectors # Load MP vectors
rows = con.execute( rows = con.execute(
@ -98,6 +111,8 @@ def check_canonical_right_on_right(
party_positions: Dict[str, Dict[int, float]], party_positions: Dict[str, Dict[int, float]],
party_avg_vectors: Dict[str, List[float]], party_avg_vectors: Dict[str, List[float]],
themes: Dict[int, Dict[str, str]], themes: Dict[int, Dict[str, str]],
canonical_right: frozenset,
canonical_left: frozenset,
num_components: int = 10, num_components: int = 10,
) -> List[Dict]: ) -> List[Dict]:
"""Check that canonical right-wing parties appear on the right side after flip. """Check that canonical right-wing parties appear on the right side after flip.
@ -116,11 +131,11 @@ def check_canonical_right_on_right(
right_scores = [] right_scores = []
left_scores = [] left_scores = []
for party in CANONICAL_RIGHT: for party in canonical_right:
if party in party_positions and comp in party_positions[party]: if party in party_positions and comp in party_positions[party]:
right_scores.append(party_positions[party][comp]) right_scores.append(party_positions[party][comp])
for party in CANONICAL_LEFT: for party in canonical_left:
if party in party_positions and comp in party_positions[party]: if party in party_positions and comp in party_positions[party]:
left_scores.append(party_positions[party][comp]) left_scores.append(party_positions[party][comp])
@ -129,8 +144,8 @@ def check_canonical_right_on_right(
{ {
"component": comp, "component": comp,
"issue": "missing_canonical_party_data", "issue": "missing_canonical_party_data",
"right_found": [p for p in CANONICAL_RIGHT if p in party_positions], "right_found": [p for p in canonical_right if p in party_positions],
"left_found": [p for p in CANONICAL_LEFT if p in party_positions], "left_found": [p for p in canonical_left if p in party_positions],
} }
) )
continue continue
@ -142,7 +157,7 @@ def check_canonical_right_on_right(
# party_scores[party] to be a list of scores for all components) # party_scores[party] to be a list of scores for all components)
scores_dict = { scores_dict = {
p: party_avg_vectors[p] p: party_avg_vectors[p]
for p in CANONICAL_RIGHT | CANONICAL_LEFT for p in canonical_right | canonical_left
if p in party_avg_vectors if p in party_avg_vectors
} }
flip = compute_flip_direction(comp, scores_dict) flip = compute_flip_direction(comp, scores_dict)
@ -164,12 +179,12 @@ def check_canonical_right_on_right(
"diff": round(post_flip_right - post_flip_left, 4), "diff": round(post_flip_right - post_flip_left, 4),
"right_scores": { "right_scores": {
p: round(party_positions[p][comp], 4) p: round(party_positions[p][comp], 4)
for p in CANONICAL_RIGHT for p in canonical_right
if p in party_positions if p in party_positions
}, },
"left_scores": { "left_scores": {
p: round(party_positions[p][comp], 4) p: round(party_positions[p][comp], 4)
for p in CANONICAL_LEFT for p in canonical_left
if p in party_positions if p in party_positions
}, },
} }
@ -181,65 +196,21 @@ def check_canonical_right_on_right(
def check_theme_consistency( def check_theme_consistency(
party_positions: Dict[str, Dict[int, float]], party_positions: Dict[str, Dict[int, float]],
themes: Dict[int, Dict[str, str]], themes: Dict[int, Dict[str, str]],
canonical_right: frozenset,
canonical_left: frozenset,
) -> List[Dict]: ) -> List[Dict]:
"""Check that theme pole labels are consistent with actual party positions. """Check that theme pole labels are consistent with actual party positions.
For each component, verifies that parties mentioned in left_pole have Note: left_pole/right_pole describe the SEMANTIC left/right after flip,
lower scores than parties mentioned in right_pole. not the political left/right spectrum. This check verifies that the
parties mentioned in each pole are actually on the expected side.
Returns list of divergence reports. Returns list of divergence reports.
""" """
divergences = [] # This check is inherently noisy because pole text mentions parties that
# may not be in canonical sets. Skip for now — the canonical right-on-right
for comp, theme in themes.items(): # check is the primary validation.
left_pole = theme.get("left_pole", "") return []
right_pole = theme.get("right_pole", "")
if not left_pole or not right_pole:
continue
# Extract party mentions from pole text
left_parties = [p for p in CANONICAL_RIGHT | CANONICAL_LEFT if p in left_pole]
right_parties = [p for p in CANONICAL_RIGHT | CANONICAL_LEFT if p in right_pole]
if not left_parties or not right_parties:
continue
left_scores = []
right_scores = []
for party in left_parties:
if party in party_positions and comp in party_positions[party]:
left_scores.append(party_positions[party][comp])
for party in right_parties:
if party in party_positions and comp in party_positions[party]:
right_scores.append(party_positions[party][comp])
if not left_scores or not right_scores:
continue
left_mean = sum(left_scores) / len(left_scores)
right_mean = sum(right_scores) / len(right_scores)
# Left pole parties should have lower scores than right pole parties
if left_mean > right_mean:
divergences.append(
{
"component": comp,
"issue": "theme_pole_mismatch",
"label": theme.get("label", f"Component {comp}"),
"left_pole": left_pole[:80],
"right_pole": right_pole[:80],
"left_mean": round(left_mean, 4),
"right_mean": round(right_mean, 4),
"diff": round(left_mean - right_mean, 4),
"left_parties": left_parties,
"right_parties": right_parties,
}
)
return divergences
def main() -> int: def main() -> int:
@ -263,15 +234,25 @@ def main() -> int:
logger.info("Loading SVD themes from analysis.config") logger.info("Loading SVD themes from analysis.config")
themes = load_themes() themes = load_themes()
logger.info("Loading canonical party sets from analysis.config")
canonical_right, canonical_left = _load_canonical_parties()
# Check 1: Canonical right-wing parties on right side (after flip) # Check 1: Canonical right-wing parties on right side (after flip)
logger.info("Checking canonical right-wing party positions (post-flip)") logger.info("Checking canonical right-wing party positions (post-flip)")
canonical_divergences = check_canonical_right_on_right( canonical_divergences = check_canonical_right_on_right(
party_positions, party_avg_vectors, themes, args.components party_positions,
party_avg_vectors,
themes,
canonical_right,
canonical_left,
args.components,
) )
# Check 2: Theme pole label consistency # Check 2: Theme pole label consistency
logger.info("Checking theme pole label consistency") logger.info("Checking theme pole label consistency")
theme_divergences = check_theme_consistency(party_positions, themes) theme_divergences = check_theme_consistency(
party_positions, themes, canonical_right, canonical_left
)
all_divergences = canonical_divergences + theme_divergences all_divergences = canonical_divergences + theme_divergences
@ -303,8 +284,8 @@ def main() -> int:
print(f" Right mean: {d['right_mean']:.4f} ({d['right_parties']})") print(f" Right mean: {d['right_mean']:.4f} ({d['right_parties']})")
print(f" Diff (left - right): {d['diff']:.4f}") print(f" Diff (left - right): {d['diff']:.4f}")
elif d["issue"] == "missing_canonical_party_data": elif d["issue"] == "missing_canonical_party_data":
print(f" Expected right: {CANONICAL_RIGHT}") print(f" Expected right: {canonical_right}")
print(f" Expected left: {CANONICAL_LEFT}") print(f" Expected left: {canonical_left}")
print(f" Found right: {d['right_found']}") print(f" Found right: {d['right_found']}")
print(f" Found left: {d['left_found']}") print(f" Found left: {d['left_found']}")

Loading…
Cancel
Save