feat: restructure classify_axes — motion projection as primary label source

main
Sven Geboers 1 month ago
parent 62daad321e
commit 1ff280e0e3
  1. 181
      analysis/axis_classifier.py

@@ -392,74 +392,157 @@ def classify_axes(
axes: dict,
db_path: str,
) -> dict:
"""Classify compass axes by correlating per-party positions against ideology reference data.
Enriches ``axes`` with:
x_label, y_label global label (modal across annual windows)
x_quality, y_quality {window_id: float} max |r| for each window
x_interpretation {window_id: str} Dutch explanation per window
y_interpretation {window_id: str} Dutch explanation per window
Returns the original ``axes`` dict unchanged if reference data is unavailable.
"""Classify compass axes using motion projection (primary) and ideology CSV (fallback).
Motion projection path:
- Requires axes["global_mean"], axes["x_axis"], axes["y_axis"].
- Loads motion SVD vectors per window, projects onto PCA axes,
ranks top 5+5 motions, applies keyword classifier -> label.
Fallback path (unchanged):
- Pearson-r against party_ideologies.csv (left_right, progressive).
- Pearson-r against coalition_membership.csv dummy.
Enriches axes with:
x_label, y_label global modal label across annual windows
x_quality, y_quality {window_id: float} max |r|
x_interpretation {window_id: str}
y_interpretation {window_id: str}
x_top_motions, y_top_motions {window_id: {'+': [(title, date), ...], '-': [...]}}
x_label_confidence {window_id: float}
y_label_confidence {window_id: float}
"""
data_dir = Path(db_path).parent
ideology = _load_ideology(data_dir / "party_ideologies.csv")
if not ideology:
return axes # no reference data — preserve existing behaviour
coalition = _load_coalition(data_dir / "coalition_membership.csv")
# Determine whether motion projection is possible.
global_mean = axes.get("global_mean")
x_axis_arr = np.array(axes.get("x_axis", []))
y_axis_arr = np.array(axes.get("y_axis", []))
motion_path_available = (
global_mean is not None
and x_axis_arr.ndim == 1
and x_axis_arr.size > 0
and y_axis_arr.size > 0
)
if not ideology and not motion_path_available:
return axes # nothing to classify with
x_quality: Dict[str, float] = {}
y_quality: Dict[str, float] = {}
x_interpretation: Dict[str, str] = {}
y_interpretation: Dict[str, str] = {}
x_top_motions: Dict[str, Dict] = {}
y_top_motions: Dict[str, Dict] = {}
x_label_confidence: Dict[str, float] = {}
y_label_confidence: Dict[str, float] = {}
annual_x_labels: List[str] = []
annual_y_labels: List[str] = []
for wid, pos_dict in positions_by_window.items():
year = _window_year(wid)
is_current = wid == "current_parliament"
is_annual = not is_current and "-" not in wid # e.g. "2016" not "2016-Q3"
# Only use parties present in both the positions and the ideology reference.
parties = [p for p in pos_dict if p in ideology]
if len(parties) < 5:
_logger.debug(
"Skipping axis classification for %s: only %d reference parties (need 5)",
wid,
len(parties),
is_annual = wid != "current_parliament" and "-" not in wid
# ── Ideology / coalition Pearson-r (unchanged logic) ──────────────────
x_lbl_fallback: Optional[str] = None
y_lbl_fallback: Optional[str] = None
x_q = 0.0
y_q = 0.0
x_int = ""
y_int = ""
if ideology:
parties = [p for p in pos_dict if p in ideology]
if len(parties) >= 5:
party_x = [pos_dict[p][0] for p in parties]
party_y = [pos_dict[p][1] for p in parties]
ref_lr = [ideology[p]["left_right"] for p in parties]
ref_pc = [ideology[p]["progressive"] for p in parties]
if year and coalition and year in coalition:
gov_set = coalition[year]
ref_co = [1.0 if p in gov_set else -1.0 for p in parties]
else:
ref_co = [0.0] * len(parties)
r_lr_x = _pearsonr(party_x, ref_lr)
r_co_x = _pearsonr(party_x, ref_co)
r_pc_x = _pearsonr(party_x, ref_pc)
x_lbl_fallback, x_int, x_q = _assign_label(r_lr_x, r_co_x, r_pc_x, "x")
r_lr_y = _pearsonr(party_y, ref_lr)
r_co_y = _pearsonr(party_y, ref_co)
r_pc_y = _pearsonr(party_y, ref_pc)
y_lbl_fallback, y_int, y_q = _assign_label(r_lr_y, r_co_y, r_pc_y, "y")
# ── Motion projection (primary) ────────────────────────────────────────
x_lbl = x_lbl_fallback
y_lbl = y_lbl_fallback
x_conf = 0.0
y_conf = 0.0
x_tops: Dict[str, List] = {"+": [], "-": []}
y_tops: Dict[str, List] = {"+": [], "-": []}
if motion_path_available:
motion_vecs = _load_motion_vectors(db_path, wid)
if motion_vecs:
projections = _project_motions(
motion_vecs, x_axis_arr, y_axis_arr, global_mean
)
x_ids = _top_motion_ids(projections, "x", n=5)
y_ids = _top_motion_ids(projections, "y", n=5)
all_x_ids = x_ids["+"] + x_ids["-"]
all_y_ids = y_ids["+"] + y_ids["-"]
titles_map = _fetch_motion_titles(
db_path, list(set(all_x_ids + all_y_ids))
)
x_title_list = [
titles_map[mid][0] for mid in all_x_ids if mid in titles_map
]
y_title_list = [
titles_map[mid][0] for mid in all_y_ids if mid in titles_map
]
x_kw_lbl, x_conf = _classify_from_titles(x_title_list)
y_kw_lbl, y_conf = _classify_from_titles(y_title_list)
if x_kw_lbl is not None:
x_lbl = x_kw_lbl
if y_kw_lbl is not None:
y_lbl = y_kw_lbl
# Build display lists: [(title, date), ...]
for pole, ids in x_ids.items():
x_tops[pole] = [titles_map[mid] for mid in ids if mid in titles_map]
for pole, ids in y_ids.items():
y_tops[pole] = [titles_map[mid] for mid in ids if mid in titles_map]
# ── Final label resolution ────────────────────────────────────────────
# If both motion and ideology paths produced nothing, use generic fallback.
if x_lbl is None:
x_lbl = _LABELS["fallback_x"]
x_int = _INTERPRETATION_TEMPLATES["fallback"].format(
orientation="horizontale"
)
if y_lbl is None:
y_lbl = _LABELS["fallback_y"]
y_int = _INTERPRETATION_TEMPLATES["fallback"].format(
orientation="verticale"
)
continue
party_x = [pos_dict[p][0] for p in parties]
party_y = [pos_dict[p][1] for p in parties]
ref_lr = [ideology[p]["left_right"] for p in parties]
ref_pc = [ideology[p]["progressive"] for p in parties]
# Coalition dummy: +1 if in government that year, -1 otherwise.
# current_parliament and windows with no coalition data use a neutral vector.
if year and coalition and year in coalition:
gov_set = coalition[year]
ref_co = [1.0 if p in gov_set else -1.0 for p in parties]
else:
ref_co = [0.0] * len(parties) # neutral — will never exceed threshold
r_lr_x = _pearsonr(party_x, ref_lr)
r_co_x = _pearsonr(party_x, ref_co)
r_pc_x = _pearsonr(party_x, ref_pc)
x_lbl, x_int, x_q = _assign_label(r_lr_x, r_co_x, r_pc_x, "x")
r_lr_y = _pearsonr(party_y, ref_lr)
r_co_y = _pearsonr(party_y, ref_co)
r_pc_y = _pearsonr(party_y, ref_pc)
y_lbl, y_int, y_q = _assign_label(r_lr_y, r_co_y, r_pc_y, "y")
x_quality[wid] = x_q
y_quality[wid] = y_q
x_interpretation[wid] = x_int
y_interpretation[wid] = y_int
x_top_motions[wid] = x_tops
y_top_motions[wid] = y_tops
x_label_confidence[wid] = x_conf
y_label_confidence[wid] = y_conf
# Only annual windows vote on the global label (not quarterly, not current_parliament).
if is_annual:
annual_x_labels.append(x_lbl)
annual_y_labels.append(y_lbl)
@@ -476,4 +559,8 @@ def classify_axes(
enriched["y_quality"] = y_quality
enriched["x_interpretation"] = x_interpretation
enriched["y_interpretation"] = y_interpretation
enriched["x_top_motions"] = x_top_motions
enriched["y_top_motions"] = y_top_motions
enriched["x_label_confidence"] = x_label_confidence
enriched["y_label_confidence"] = y_label_confidence
return enriched

Loading…
Cancel
Save