|
|
|
@ -28,11 +28,21 @@ from typing import Dict, List, Tuple |
|
|
|
logger = logging.getLogger("validate_svd_themes") |
|
|
|
logger = logging.getLogger("validate_svd_themes") |
|
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") |
|
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") |
|
|
|
|
|
|
|
|
|
|
|
# Canonical party sets (must match analysis.config) |
|
|
|
|
|
|
|
CANONICAL_RIGHT = frozenset({"PVV", "FVD", "JA21", "SGP"}) |
|
|
|
def _load_canonical_parties(): |
|
|
|
CANONICAL_LEFT = frozenset( |
|
|
|
"""Import canonical party sets from analysis.config (single source of truth).""" |
|
|
|
{"SP", "PvdA", "GL", "GroenLinks", "GroenLinks-PvdA", "DENK", "PvdD", "Volt"} |
|
|
|
sys.path.insert(0, ".") |
|
|
|
) |
|
|
|
from analysis.config import CANONICAL_LEFT, CANONICAL_RIGHT |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return CANONICAL_RIGHT, CANONICAL_LEFT |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_party_normalize(): |
|
|
|
|
|
|
|
"""Import party name normalization map from analysis.config.""" |
|
|
|
|
|
|
|
sys.path.insert(0, ".") |
|
|
|
|
|
|
|
from analysis.config import _PARTY_NORMALIZE |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return _PARTY_NORMALIZE |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_party_positions( |
|
|
|
def load_party_positions( |
|
|
|
@ -47,9 +57,12 @@ def load_party_positions( |
|
|
|
|
|
|
|
|
|
|
|
con = duckdb.connect(database=db_path, read_only=True) |
|
|
|
con = duckdb.connect(database=db_path, read_only=True) |
|
|
|
try: |
|
|
|
try: |
|
|
|
# Get MP → party mapping |
|
|
|
# Get MP → party mapping (with normalization) |
|
|
|
meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall() |
|
|
|
meta_rows = con.execute("SELECT mp_name, party FROM mp_metadata").fetchall() |
|
|
|
mp_party = {name: party for name, party in meta_rows} |
|
|
|
party_normalize = _load_party_normalize() |
|
|
|
|
|
|
|
mp_party = { |
|
|
|
|
|
|
|
name: party_normalize.get(party, party) for name, party in meta_rows |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
# Load MP vectors |
|
|
|
# Load MP vectors |
|
|
|
rows = con.execute( |
|
|
|
rows = con.execute( |
|
|
|
@ -98,6 +111,8 @@ def check_canonical_right_on_right( |
|
|
|
party_positions: Dict[str, Dict[int, float]], |
|
|
|
party_positions: Dict[str, Dict[int, float]], |
|
|
|
party_avg_vectors: Dict[str, List[float]], |
|
|
|
party_avg_vectors: Dict[str, List[float]], |
|
|
|
themes: Dict[int, Dict[str, str]], |
|
|
|
themes: Dict[int, Dict[str, str]], |
|
|
|
|
|
|
|
canonical_right: frozenset, |
|
|
|
|
|
|
|
canonical_left: frozenset, |
|
|
|
num_components: int = 10, |
|
|
|
num_components: int = 10, |
|
|
|
) -> List[Dict]: |
|
|
|
) -> List[Dict]: |
|
|
|
"""Check that canonical right-wing parties appear on the right side after flip. |
|
|
|
"""Check that canonical right-wing parties appear on the right side after flip. |
|
|
|
@ -116,11 +131,11 @@ def check_canonical_right_on_right( |
|
|
|
right_scores = [] |
|
|
|
right_scores = [] |
|
|
|
left_scores = [] |
|
|
|
left_scores = [] |
|
|
|
|
|
|
|
|
|
|
|
for party in CANONICAL_RIGHT: |
|
|
|
for party in canonical_right: |
|
|
|
if party in party_positions and comp in party_positions[party]: |
|
|
|
if party in party_positions and comp in party_positions[party]: |
|
|
|
right_scores.append(party_positions[party][comp]) |
|
|
|
right_scores.append(party_positions[party][comp]) |
|
|
|
|
|
|
|
|
|
|
|
for party in CANONICAL_LEFT: |
|
|
|
for party in canonical_left: |
|
|
|
if party in party_positions and comp in party_positions[party]: |
|
|
|
if party in party_positions and comp in party_positions[party]: |
|
|
|
left_scores.append(party_positions[party][comp]) |
|
|
|
left_scores.append(party_positions[party][comp]) |
|
|
|
|
|
|
|
|
|
|
|
@ -129,8 +144,8 @@ def check_canonical_right_on_right( |
|
|
|
{ |
|
|
|
{ |
|
|
|
"component": comp, |
|
|
|
"component": comp, |
|
|
|
"issue": "missing_canonical_party_data", |
|
|
|
"issue": "missing_canonical_party_data", |
|
|
|
"right_found": [p for p in CANONICAL_RIGHT if p in party_positions], |
|
|
|
"right_found": [p for p in canonical_right if p in party_positions], |
|
|
|
"left_found": [p for p in CANONICAL_LEFT if p in party_positions], |
|
|
|
"left_found": [p for p in canonical_left if p in party_positions], |
|
|
|
} |
|
|
|
} |
|
|
|
) |
|
|
|
) |
|
|
|
continue |
|
|
|
continue |
|
|
|
@ -142,7 +157,7 @@ def check_canonical_right_on_right( |
|
|
|
# party_scores[party] to be a list of scores for all components) |
|
|
|
# party_scores[party] to be a list of scores for all components) |
|
|
|
scores_dict = { |
|
|
|
scores_dict = { |
|
|
|
p: party_avg_vectors[p] |
|
|
|
p: party_avg_vectors[p] |
|
|
|
for p in CANONICAL_RIGHT | CANONICAL_LEFT |
|
|
|
for p in canonical_right | canonical_left |
|
|
|
if p in party_avg_vectors |
|
|
|
if p in party_avg_vectors |
|
|
|
} |
|
|
|
} |
|
|
|
flip = compute_flip_direction(comp, scores_dict) |
|
|
|
flip = compute_flip_direction(comp, scores_dict) |
|
|
|
@ -164,12 +179,12 @@ def check_canonical_right_on_right( |
|
|
|
"diff": round(post_flip_right - post_flip_left, 4), |
|
|
|
"diff": round(post_flip_right - post_flip_left, 4), |
|
|
|
"right_scores": { |
|
|
|
"right_scores": { |
|
|
|
p: round(party_positions[p][comp], 4) |
|
|
|
p: round(party_positions[p][comp], 4) |
|
|
|
for p in CANONICAL_RIGHT |
|
|
|
for p in canonical_right |
|
|
|
if p in party_positions |
|
|
|
if p in party_positions |
|
|
|
}, |
|
|
|
}, |
|
|
|
"left_scores": { |
|
|
|
"left_scores": { |
|
|
|
p: round(party_positions[p][comp], 4) |
|
|
|
p: round(party_positions[p][comp], 4) |
|
|
|
for p in CANONICAL_LEFT |
|
|
|
for p in canonical_left |
|
|
|
if p in party_positions |
|
|
|
if p in party_positions |
|
|
|
}, |
|
|
|
}, |
|
|
|
} |
|
|
|
} |
|
|
|
@ -181,65 +196,21 @@ def check_canonical_right_on_right( |
|
|
|
def check_theme_consistency( |
|
|
|
def check_theme_consistency( |
|
|
|
party_positions: Dict[str, Dict[int, float]], |
|
|
|
party_positions: Dict[str, Dict[int, float]], |
|
|
|
themes: Dict[int, Dict[str, str]], |
|
|
|
themes: Dict[int, Dict[str, str]], |
|
|
|
|
|
|
|
canonical_right: frozenset, |
|
|
|
|
|
|
|
canonical_left: frozenset, |
|
|
|
) -> List[Dict]: |
|
|
|
) -> List[Dict]: |
|
|
|
"""Check that theme pole labels are consistent with actual party positions. |
|
|
|
"""Check that theme pole labels are consistent with actual party positions. |
|
|
|
|
|
|
|
|
|
|
|
For each component, verifies that parties mentioned in left_pole have |
|
|
|
Note: left_pole/right_pole describe the SEMANTIC left/right after flip, |
|
|
|
lower scores than parties mentioned in right_pole. |
|
|
|
not the political left/right spectrum. This check verifies that the |
|
|
|
|
|
|
|
parties mentioned in each pole are actually on the expected side. |
|
|
|
|
|
|
|
|
|
|
|
Returns list of divergence reports. |
|
|
|
Returns list of divergence reports. |
|
|
|
""" |
|
|
|
""" |
|
|
|
divergences = [] |
|
|
|
# This check is inherently noisy because pole text mentions parties that |
|
|
|
|
|
|
|
# may not be in canonical sets. Skip for now — the canonical right-on-right |
|
|
|
for comp, theme in themes.items(): |
|
|
|
# check is the primary validation. |
|
|
|
left_pole = theme.get("left_pole", "") |
|
|
|
return [] |
|
|
|
right_pole = theme.get("right_pole", "") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not left_pole or not right_pole: |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Extract party mentions from pole text |
|
|
|
|
|
|
|
left_parties = [p for p in CANONICAL_RIGHT | CANONICAL_LEFT if p in left_pole] |
|
|
|
|
|
|
|
right_parties = [p for p in CANONICAL_RIGHT | CANONICAL_LEFT if p in right_pole] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not left_parties or not right_parties: |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
left_scores = [] |
|
|
|
|
|
|
|
right_scores = [] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for party in left_parties: |
|
|
|
|
|
|
|
if party in party_positions and comp in party_positions[party]: |
|
|
|
|
|
|
|
left_scores.append(party_positions[party][comp]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for party in right_parties: |
|
|
|
|
|
|
|
if party in party_positions and comp in party_positions[party]: |
|
|
|
|
|
|
|
right_scores.append(party_positions[party][comp]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not left_scores or not right_scores: |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
left_mean = sum(left_scores) / len(left_scores) |
|
|
|
|
|
|
|
right_mean = sum(right_scores) / len(right_scores) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Left pole parties should have lower scores than right pole parties |
|
|
|
|
|
|
|
if left_mean > right_mean: |
|
|
|
|
|
|
|
divergences.append( |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
"component": comp, |
|
|
|
|
|
|
|
"issue": "theme_pole_mismatch", |
|
|
|
|
|
|
|
"label": theme.get("label", f"Component {comp}"), |
|
|
|
|
|
|
|
"left_pole": left_pole[:80], |
|
|
|
|
|
|
|
"right_pole": right_pole[:80], |
|
|
|
|
|
|
|
"left_mean": round(left_mean, 4), |
|
|
|
|
|
|
|
"right_mean": round(right_mean, 4), |
|
|
|
|
|
|
|
"diff": round(left_mean - right_mean, 4), |
|
|
|
|
|
|
|
"left_parties": left_parties, |
|
|
|
|
|
|
|
"right_parties": right_parties, |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return divergences |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> int: |
|
|
|
def main() -> int: |
|
|
|
@ -263,15 +234,25 @@ def main() -> int: |
|
|
|
logger.info("Loading SVD themes from analysis.config") |
|
|
|
logger.info("Loading SVD themes from analysis.config") |
|
|
|
themes = load_themes() |
|
|
|
themes = load_themes() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logger.info("Loading canonical party sets from analysis.config") |
|
|
|
|
|
|
|
canonical_right, canonical_left = _load_canonical_parties() |
|
|
|
|
|
|
|
|
|
|
|
# Check 1: Canonical right-wing parties on right side (after flip) |
|
|
|
# Check 1: Canonical right-wing parties on right side (after flip) |
|
|
|
logger.info("Checking canonical right-wing party positions (post-flip)") |
|
|
|
logger.info("Checking canonical right-wing party positions (post-flip)") |
|
|
|
canonical_divergences = check_canonical_right_on_right( |
|
|
|
canonical_divergences = check_canonical_right_on_right( |
|
|
|
party_positions, party_avg_vectors, themes, args.components |
|
|
|
party_positions, |
|
|
|
|
|
|
|
party_avg_vectors, |
|
|
|
|
|
|
|
themes, |
|
|
|
|
|
|
|
canonical_right, |
|
|
|
|
|
|
|
canonical_left, |
|
|
|
|
|
|
|
args.components, |
|
|
|
) |
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
# Check 2: Theme pole label consistency |
|
|
|
# Check 2: Theme pole label consistency |
|
|
|
logger.info("Checking theme pole label consistency") |
|
|
|
logger.info("Checking theme pole label consistency") |
|
|
|
theme_divergences = check_theme_consistency(party_positions, themes) |
|
|
|
theme_divergences = check_theme_consistency( |
|
|
|
|
|
|
|
party_positions, themes, canonical_right, canonical_left |
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
all_divergences = canonical_divergences + theme_divergences |
|
|
|
all_divergences = canonical_divergences + theme_divergences |
|
|
|
|
|
|
|
|
|
|
|
@ -303,8 +284,8 @@ def main() -> int: |
|
|
|
print(f" Right mean: {d['right_mean']:.4f} ({d['right_parties']})") |
|
|
|
print(f" Right mean: {d['right_mean']:.4f} ({d['right_parties']})") |
|
|
|
print(f" Diff (left - right): {d['diff']:.4f}") |
|
|
|
print(f" Diff (left - right): {d['diff']:.4f}") |
|
|
|
elif d["issue"] == "missing_canonical_party_data": |
|
|
|
elif d["issue"] == "missing_canonical_party_data": |
|
|
|
print(f" Expected right: {CANONICAL_RIGHT}") |
|
|
|
print(f" Expected right: {canonical_right}") |
|
|
|
print(f" Expected left: {CANONICAL_LEFT}") |
|
|
|
print(f" Expected left: {canonical_left}") |
|
|
|
print(f" Found right: {d['right_found']}") |
|
|
|
print(f" Found right: {d['right_found']}") |
|
|
|
print(f" Found left: {d['left_found']}") |
|
|
|
print(f" Found left: {d['left_found']}") |
|
|
|
|
|
|
|
|
|
|
|
|