fix: import canonical parties from config, simplify theme consistency check

main
Sven Geboers 4 weeks ago
parent bad9cd758d
commit 846e9cf67f
  1. 123
      scripts/validate_svd_themes.py

@ -28,11 +28,21 @@ from typing import Dict, List, Tuple
logger = logging.getLogger("validate_svd_themes")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Canonical party sets (must match analysis.config)
CANONICAL_RIGHT = frozenset({"PVV", "FVD", "JA21", "SGP"})
CANONICAL_LEFT = frozenset(
{"SP", "PvdA", "GL", "GroenLinks", "GroenLinks-PvdA", "DENK", "PvdD", "Volt"}
)
def _load_canonical_parties():
    """Import canonical party sets from analysis.config (single source of truth).

    The import is performed at call time, after "." has been placed on
    ``sys.path``, so ``analysis`` is resolved relative to the current
    working directory.

    Returns:
        A ``(CANONICAL_RIGHT, CANONICAL_LEFT)`` tuple, right-wing set first.
    """
    # Guard the insert: inserting unconditionally would stack a duplicate
    # "." entry onto sys.path on every call.
    if "." not in sys.path:
        sys.path.insert(0, ".")
    from analysis.config import CANONICAL_LEFT, CANONICAL_RIGHT

    return CANONICAL_RIGHT, CANONICAL_LEFT
def _load_party_normalize():
    """Import party name normalization map from analysis.config.

    The import is performed at call time, after "." has been placed on
    ``sys.path``, so ``analysis`` is resolved relative to the current
    working directory.

    Returns:
        The ``_PARTY_NORMALIZE`` object exported by ``analysis.config``
        (the very same object, not a copy).
    """
    # Guard the insert: inserting unconditionally would stack a duplicate
    # "." entry onto sys.path on every call.
    if "." not in sys.path:
        sys.path.insert(0, ".")
    from analysis.config import _PARTY_NORMALIZE

    return _PARTY_NORMALIZE
def load_party_positions(
@ -47,9 +57,12 @@ def load_party_positions(
con = duckdb.connect(database=db_path, read_only=True)
try:
# Get MP → party mapping
# Get MP → party mapping (with normalization)
meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
mp_party = {name: party for name, party in meta_rows}
party_normalize = _load_party_normalize()
mp_party = {
name: party_normalize.get(party, party) for name, party in meta_rows
}
# Load MP vectors
rows = con.execute(
@ -98,6 +111,8 @@ def check_canonical_right_on_right(
party_positions: Dict[str, Dict[int, float]],
party_avg_vectors: Dict[str, List[float]],
themes: Dict[int, Dict[str, str]],
canonical_right: frozenset,
canonical_left: frozenset,
num_components: int = 10,
) -> List[Dict]:
"""Check that canonical right-wing parties appear on the right side after flip.
@ -116,11 +131,11 @@ def check_canonical_right_on_right(
right_scores = []
left_scores = []
for party in CANONICAL_RIGHT:
for party in canonical_right:
if party in party_positions and comp in party_positions[party]:
right_scores.append(party_positions[party][comp])
for party in CANONICAL_LEFT:
for party in canonical_left:
if party in party_positions and comp in party_positions[party]:
left_scores.append(party_positions[party][comp])
@ -129,8 +144,8 @@ def check_canonical_right_on_right(
{
"component": comp,
"issue": "missing_canonical_party_data",
"right_found": [p for p in CANONICAL_RIGHT if p in party_positions],
"left_found": [p for p in CANONICAL_LEFT if p in party_positions],
"right_found": [p for p in canonical_right if p in party_positions],
"left_found": [p for p in canonical_left if p in party_positions],
}
)
continue
@ -142,7 +157,7 @@ def check_canonical_right_on_right(
# party_scores[party] to be a list of scores for all components)
scores_dict = {
p: party_avg_vectors[p]
for p in CANONICAL_RIGHT | CANONICAL_LEFT
for p in canonical_right | canonical_left
if p in party_avg_vectors
}
flip = compute_flip_direction(comp, scores_dict)
@ -164,12 +179,12 @@ def check_canonical_right_on_right(
"diff": round(post_flip_right - post_flip_left, 4),
"right_scores": {
p: round(party_positions[p][comp], 4)
for p in CANONICAL_RIGHT
for p in canonical_right
if p in party_positions
},
"left_scores": {
p: round(party_positions[p][comp], 4)
for p in CANONICAL_LEFT
for p in canonical_left
if p in party_positions
},
}
@ -181,65 +196,21 @@ def check_canonical_right_on_right(
def check_theme_consistency(
party_positions: Dict[str, Dict[int, float]],
themes: Dict[int, Dict[str, str]],
canonical_right: frozenset,
canonical_left: frozenset,
) -> List[Dict]:
"""Check that theme pole labels are consistent with actual party positions.
For each component, verifies that parties mentioned in left_pole have
lower scores than parties mentioned in right_pole.
Note: left_pole/right_pole describe the SEMANTIC left/right after flip,
not the political left/right spectrum. This check verifies that the
parties mentioned in each pole are actually on the expected side.
Returns list of divergence reports.
"""
divergences = []
for comp, theme in themes.items():
left_pole = theme.get("left_pole", "")
right_pole = theme.get("right_pole", "")
# Components without both pole descriptions cannot be checked.
if not left_pole or not right_pole:
continue
# Extract party mentions from pole text
# NOTE(review): matching is by substring (`p in left_pole`), so a pole text
# containing a longer party name also counts as mentioning any shorter
# party name embedded in it.
# NOTE(review): these two lines read the module-level CANONICAL_RIGHT /
# CANONICAL_LEFT rather than the canonical_right / canonical_left
# arguments — confirm which is intended.
left_parties = [p for p in CANONICAL_RIGHT | CANONICAL_LEFT if p in left_pole]
right_parties = [p for p in CANONICAL_RIGHT | CANONICAL_LEFT if p in right_pole]
if not left_parties or not right_parties:
continue
left_scores = []
right_scores = []
# Collect the score on this component for every mentioned party that has one.
for party in left_parties:
if party in party_positions and comp in party_positions[party]:
left_scores.append(party_positions[party][comp])
for party in right_parties:
if party in party_positions and comp in party_positions[party]:
right_scores.append(party_positions[party][comp])
# Skip the component when either side has no scores (avoids dividing by zero).
if not left_scores or not right_scores:
continue
left_mean = sum(left_scores) / len(left_scores)
right_mean = sum(right_scores) / len(right_scores)
# Left pole parties should have lower scores than right pole parties
if left_mean > right_mean:
divergences.append(
{
"component": comp,
"issue": "theme_pole_mismatch",
"label": theme.get("label", f"Component {comp}"),
"left_pole": left_pole[:80],
"right_pole": right_pole[:80],
"left_mean": round(left_mean, 4),
"right_mean": round(right_mean, 4),
"diff": round(left_mean - right_mean, 4),
"left_parties": left_parties,
"right_parties": right_parties,
}
)
return divergences
# NOTE(review): the lines below follow the `return divergences` above; as laid
# out here this final `return []` appears unreachable — confirm which of the
# two bodies (full check vs. skip-and-return-empty) is meant to be live.
# This check is inherently noisy because pole text mentions parties that
# may not be in canonical sets. Skip for now — the canonical right-on-right
# check is the primary validation.
return []
def main() -> int:
@ -263,15 +234,25 @@ def main() -> int:
logger.info("Loading SVD themes from analysis.config")
themes = load_themes()
logger.info("Loading canonical party sets from analysis.config")
canonical_right, canonical_left = _load_canonical_parties()
# Check 1: Canonical right-wing parties on right side (after flip)
logger.info("Checking canonical right-wing party positions (post-flip)")
canonical_divergences = check_canonical_right_on_right(
party_positions, party_avg_vectors, themes, args.components
party_positions,
party_avg_vectors,
themes,
canonical_right,
canonical_left,
args.components,
)
# Check 2: Theme pole label consistency
logger.info("Checking theme pole label consistency")
theme_divergences = check_theme_consistency(party_positions, themes)
theme_divergences = check_theme_consistency(
party_positions, themes, canonical_right, canonical_left
)
all_divergences = canonical_divergences + theme_divergences
@ -303,8 +284,8 @@ def main() -> int:
print(f" Right mean: {d['right_mean']:.4f} ({d['right_parties']})")
print(f" Diff (left - right): {d['diff']:.4f}")
elif d["issue"] == "missing_canonical_party_data":
print(f" Expected right: {CANONICAL_RIGHT}")
print(f" Expected left: {CANONICAL_LEFT}")
print(f" Expected right: {canonical_right}")
print(f" Expected left: {canonical_left}")
print(f" Found right: {d['right_found']}")
print(f" Found left: {d['left_found']}")

Loading…
Cancel
Save