feat: add axis classifier with party ideology reference data

classify_axes() correlates per-party PCA positions against party_ideologies.csv to assign honest dynamic labels (Links-Rechts, Coalitie-Oppositie, etc.) instead of always assuming the first PCA axis is left-right.
4 months ago · 5ec1f7af75
parent 23849c9cb6
commit 5ec1f7af75
4 changed files with 447 additions and 0 deletions
--- a/analysis/axis_classifier.py
+++ b/analysis/axis_classifier.py
@ -0,0 +1,269 @@
 """Axis classifier: correlate per-party PCA positions against ideology reference data
 to assign honest, dynamic labels to political compass axes.
 Public API: classify_axes(positions_by_window, axes, db_path) -> dict
 """
 import logging
 from collections import Counter
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import numpy as np
 _logger = logging.getLogger(__name__)
 # Module-level caches — loaded once per process lifetime.
 _ideology_cache: Optional[Dict[str, Dict[str, float]]] = None
 _coalition_cache: Optional[Dict[str, set]] = None
 # Correlation threshold above which we consider an axis "explained" by a dimension.
 _THRESHOLD = 0.65
 _LABELS = {
    "lr": "Links\u2013Rechts",
    "co": "Coalitie\u2013Oppositie",
    "pc": "Progressief\u2013Conservatief",
    "fallback_x": "Stempatroon As 1",
    "fallback_y": "Stempatroon As 2",
 }
 _INTERPRETATION_TEMPLATES = {
    "lr": "De {orientation} as weerspiegelt de klassieke links-rechts tegenstelling.",
    "co": (
        "De {orientation} as weerspiegelt stemgedrag van coalitie- versus "
        "oppositiepartijen (r={r:.2f}). Links-rechts is minder dominant dit jaar."
    ),
    "pc": "De {orientation} as weerspiegelt de progressief-conservatieve tegenstelling.",
    "fallback": (
        "De {orientation} as weerspiegelt een empirisch stempatroon "
        "zonder duidelijke ideologische richting."
    ),
 }
 def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]:
    """Load party ideology scores from CSV.
    Returns {party_name: {"left_right": float, "progressive": float}}.
    Returns {} on any error (caller should treat empty as 'skip classification').
    """
    global _ideology_cache
    if _ideology_cache is not None:
        return _ideology_cache
    result: Dict[str, Dict[str, float]] = {}
    try:
        with open(csv_path, encoding="utf-8") as fh:
            lines = fh.read().splitlines()
        header = [h.strip() for h in lines[0].split(",")]
        lr_idx = header.index("left_right")
        pc_idx = header.index("progressive")
        for line in lines[1:]:
            if not line.strip():
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) <= max(lr_idx, pc_idx):
                continue
            result[parts[0]] = {
                "left_right": float(parts[lr_idx]),
                "progressive": float(parts[pc_idx]),
            }
    except FileNotFoundError:
        _logger.warning(
            "party_ideologies.csv not found at %s — axis labels will be generic",
            csv_path,
        )
        return {}
    except Exception as exc:
        _logger.warning("Failed to load party_ideologies.csv: %s", exc)
        return {}
    _ideology_cache = result
    return result
 def _load_coalition(csv_path: Path) -> Dict[str, set]:
    """Load coalition membership from CSV.
    Returns {window_id: set_of_party_names}.
    Returns {} on any error (coalition dimension will be skipped).
    """
    global _coalition_cache
    if _coalition_cache is not None:
        return _coalition_cache
    result: Dict[str, set] = {}
    try:
        with open(csv_path, encoding="utf-8") as fh:
            lines = fh.read().splitlines()
        for line in lines[1:]:
            if not line.strip():
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 2:
                continue
            wid, party = parts[0], parts[1]
            result.setdefault(wid, set()).add(party)
    except FileNotFoundError:
        _logger.warning(
            "coalition_membership.csv not found at %s — coalition axis detection disabled",
            csv_path,
        )
        return {}
    except Exception as exc:
        _logger.warning("Failed to load coalition_membership.csv: %s", exc)
        return {}
    _coalition_cache = result
    return result
 def _window_year(window_id: str) -> Optional[str]:
    """Extract year string from window_id.
    Returns None for 'current_parliament'.
    '2016' → '2016', '2016-Q3' → '2016'.
    """
    if window_id == "current_parliament":
        return None
    return window_id.split("-")[0]
 def _pearsonr(x: List[float], y: List[float]) -> float:
    """Pearson r; returns 0.0 for degenerate input (< 3 points or zero variance)."""
    if len(x) < 3:
        return 0.0
    xa = np.array(x, dtype=float)
    ya = np.array(y, dtype=float)
    if xa.std() < 1e-12 or ya.std() < 1e-12:
        return 0.0
    return float(np.corrcoef(xa, ya)[0, 1])
 def _assign_label(
    r_lr: float,
    r_co: float,
    r_pc: float,
    axis: str,
 ) -> Tuple[str, str, float]:
    """Assign label, interpretation and quality score for one axis.
    Priority: left-right > coalition > progressive > fallback.
    Returns (label, interpretation_string, quality_score).
    """
    orientation = "horizontale" if axis == "x" else "verticale"
    fallback_label = _LABELS["fallback_x"] if axis == "x" else _LABELS["fallback_y"]
    quality = max(abs(r_lr), abs(r_co), abs(r_pc))
    if abs(r_lr) >= _THRESHOLD:
        return (
            _LABELS["lr"],
            _INTERPRETATION_TEMPLATES["lr"].format(orientation=orientation),
            quality,
        )
    if abs(r_co) >= _THRESHOLD:
        return (
            _LABELS["co"],
            _INTERPRETATION_TEMPLATES["co"].format(orientation=orientation, r=r_co),
            quality,
        )
    if abs(r_pc) >= _THRESHOLD:
        return (
            _LABELS["pc"],
            _INTERPRETATION_TEMPLATES["pc"].format(orientation=orientation),
            quality,
        )
    return (
        fallback_label,
        _INTERPRETATION_TEMPLATES["fallback"].format(orientation=orientation),
        quality,
    )
 def classify_axes(
    positions_by_window: Dict[str, Dict[str, Tuple[float, float]]],
    axes: dict,
    db_path: str,
 ) -> dict:
    """Classify compass axes by correlating per-party positions against ideology reference data.
    Enriches ``axes`` with:
        x_label, y_label       — global label (modal across annual windows)
        x_quality, y_quality   — {window_id: float} max |r| for each window
        x_interpretation       — {window_id: str} Dutch explanation per window
        y_interpretation       — {window_id: str} Dutch explanation per window
    Returns the original ``axes`` dict unchanged if reference data is unavailable.
    """
    data_dir = Path(db_path).parent
    ideology = _load_ideology(data_dir / "party_ideologies.csv")
    if not ideology:
        return axes  # no reference data — preserve existing behaviour
    coalition = _load_coalition(data_dir / "coalition_membership.csv")
    x_quality: Dict[str, float] = {}
    y_quality: Dict[str, float] = {}
    x_interpretation: Dict[str, str] = {}
    y_interpretation: Dict[str, str] = {}
    annual_x_labels: List[str] = []
    annual_y_labels: List[str] = []
    for wid, pos_dict in positions_by_window.items():
        year = _window_year(wid)
        is_current = wid == "current_parliament"
        is_annual = not is_current and "-" not in wid  # e.g. "2016" not "2016-Q3"
        # Only use parties present in both the positions and the ideology reference.
        parties = [p for p in pos_dict if p in ideology]
        if len(parties) < 5:
            _logger.debug(
                "Skipping axis classification for %s: only %d reference parties (need 5)",
                wid,
                len(parties),
            )
            continue
        party_x = [pos_dict[p][0] for p in parties]
        party_y = [pos_dict[p][1] for p in parties]
        ref_lr = [ideology[p]["left_right"] for p in parties]
        ref_pc = [ideology[p]["progressive"] for p in parties]
        # Coalition dummy: +1 if in government that year, -1 otherwise.
        # current_parliament and windows with no coalition data use a neutral vector.
        if year and coalition and year in coalition:
            gov_set = coalition[year]
            ref_co = [1.0 if p in gov_set else -1.0 for p in parties]
        else:
            ref_co = [0.0] * len(parties)  # neutral — will never exceed threshold
        r_lr_x = _pearsonr(party_x, ref_lr)
        r_co_x = _pearsonr(party_x, ref_co)
        r_pc_x = _pearsonr(party_x, ref_pc)
        x_lbl, x_int, x_q = _assign_label(r_lr_x, r_co_x, r_pc_x, "x")
        r_lr_y = _pearsonr(party_y, ref_lr)
        r_co_y = _pearsonr(party_y, ref_co)
        r_pc_y = _pearsonr(party_y, ref_pc)
        y_lbl, y_int, y_q = _assign_label(r_lr_y, r_co_y, r_pc_y, "y")
        x_quality[wid] = x_q
        y_quality[wid] = y_q
        x_interpretation[wid] = x_int
        y_interpretation[wid] = y_int
        # Only annual windows vote on the global label (not quarterly, not current_parliament).
        if is_annual:
            annual_x_labels.append(x_lbl)
            annual_y_labels.append(y_lbl)
    def _modal(labels: List[str], fallback: str) -> str:
        if not labels:
            return fallback
        return Counter(labels).most_common(1)[0][0]
    enriched = dict(axes)
    enriched["x_label"] = _modal(annual_x_labels, "Links\u2013Rechts")
    enriched["y_label"] = _modal(annual_y_labels, "Progressief\u2013Conservatief")
    enriched["x_quality"] = x_quality
    enriched["y_quality"] = y_quality
    enriched["x_interpretation"] = x_interpretation
    enriched["y_interpretation"] = y_interpretation
    return enriched
--- a/data/coalition_membership.csv
+++ b/data/coalition_membership.csv
@ -0,0 +1,51 @@
 window_id,party
 2012,VVD
 2012,PvdA
 2013,VVD
 2013,PvdA
 2014,VVD
 2014,PvdA
 2015,VVD
 2015,PvdA
 2016,VVD
 2016,PvdA
 2017,VVD
 2017,CDA
 2017,D66
 2017,ChristenUnie
 2018,VVD
 2018,CDA
 2018,D66
 2018,ChristenUnie
 2019,VVD
 2019,CDA
 2019,D66
 2019,ChristenUnie
 2020,VVD
 2020,CDA
 2020,D66
 2020,ChristenUnie
 2021,VVD
 2021,CDA
 2021,D66
 2021,ChristenUnie
 2022,VVD
 2022,D66
 2022,CDA
 2022,ChristenUnie
 2023,VVD
 2023,D66
 2023,CDA
 2023,ChristenUnie
 2024,PVV
 2024,VVD
 2024,NSC
 2024,BBB
 2025,PVV
 2025,VVD
 2025,NSC
 2025,BBB
 2026,PVV
 2026,VVD
 2026,NSC
 2026,BBB
--- a/data/party_ideologies.csv
+++ b/data/party_ideologies.csv
@ -0,0 +1,23 @@
 party,left_right,progressive
 VVD,0.65,0.10
 PvdA,-0.70,0.75
 SP,-0.90,0.50
 CDA,0.25,-0.45
 D66,-0.10,0.85
 GroenLinks,-0.70,0.90
 GL,-0.70,0.90
 GroenLinks-PvdA,-0.70,0.82
 ChristenUnie,0.10,-0.55
 SGP,0.35,-0.95
 PVV,0.90,-0.50
 DENK,-0.40,0.55
 50Plus,-0.05,-0.10
 FVD,0.90,-0.75
 PvdD,-0.60,0.85
 Volt,-0.20,0.80
 JA21,0.70,-0.30
 BBB,0.50,-0.35
 NSC,0.20,-0.20
 Nieuw Sociaal Contract,0.20,-0.20
 BVNL,0.85,-0.55
 Bij1,-0.90,0.90
--- a/tests/test_political_compass.py
+++ b/tests/test_political_compass.py
@ -365,3 +365,107 @@ def test_compute_party_discipline_empty_range(monkeypatch):
    )
    assert df.empty
 # ---------------------------------------------------------------------------
 # Tests for analysis.axis_classifier
 # ---------------------------------------------------------------------------
 import importlib
 def _fresh_classifier(monkeypatch):
    """Import axis_classifier with cleared module-level caches."""
    import analysis.axis_classifier as _cls
    monkeypatch.setattr(_cls, "_ideology_cache", None)
    monkeypatch.setattr(_cls, "_coalition_cache", None)
    return _cls
 def test_axis_label_left_right(tmp_path, monkeypatch):
    """Positions that closely correlate with left_right scores → label 'Links–Rechts'."""
    _cls = _fresh_classifier(monkeypatch)
    (tmp_path / "party_ideologies.csv").write_text(
        "party,left_right,progressive\n"
        "VVD,0.65,0.10\n"
        "PvdA,-0.70,0.75\n"
        "SP,-0.90,0.50\n"
        "PVV,0.90,-0.50\n"
        "D66,-0.10,0.85\n"
        "CDA,0.25,-0.45\n"
    )
    (tmp_path / "coalition_membership.csv").write_text("window_id,party\n")
    # X values are the party's left_right scores — perfect correlation
    positions_by_window = {
        "2022": {
            "VVD": (0.65, 0.10),
            "PvdA": (-0.70, 0.20),
            "SP": (-0.90, 0.30),
            "PVV": (0.90, -0.10),
            "D66": (-0.10, 0.40),
            "CDA": (0.25, -0.20),
        }
    }
    axes = {"x_axis": None, "y_axis": None, "method": "pca"}
    result = _cls.classify_axes(positions_by_window, axes, str(tmp_path / "motions.db"))
    assert result["x_label"] == "Links\u2013Rechts"
    assert result["x_quality"]["2022"] >= 0.65
 def test_axis_label_coalition_dominant(tmp_path, monkeypatch):
    """Positions that match coalition pattern but NOT left-right → 'Coalitie–Oppositie'."""
    _cls = _fresh_classifier(monkeypatch)
    (tmp_path / "party_ideologies.csv").write_text(
        "party,left_right,progressive\n"
        "VVD,0.65,0.10\n"
        "PvdA,-0.70,0.75\n"
        "SP,-0.90,0.50\n"
        "PVV,0.90,-0.50\n"
        "D66,-0.10,0.85\n"
        "CDA,0.25,-0.45\n"
    )
    # 2016: Rutte II coalition = VVD + PvdA
    (tmp_path / "coalition_membership.csv").write_text(
        "window_id,party\n2016,VVD\n2016,PvdA\n"
    )
    # Coalition parties (VVD + PvdA) at x ≈ +1, opposition at x ≈ -1.
    # VVD (right) and PvdA (left) are both near +1 → low left_right correlation
    # but high coalition correlation.
    positions_by_window = {
        "2016": {
            "VVD": (0.95, 0.10),
            "PvdA": (0.90, 0.20),
            "SP": (-0.85, 0.30),
            "PVV": (-0.95, -0.10),
            "D66": (-0.80, 0.40),
            "CDA": (-0.75, -0.20),
        }
    }
    axes = {"x_axis": None, "y_axis": None, "method": "pca"}
    result = _cls.classify_axes(positions_by_window, axes, str(tmp_path / "motions.db"))
    assert result["x_label"] == "Coalitie\u2013Oppositie"
    assert "coalitie" in result["x_interpretation"]["2016"].lower()
 def test_axis_classifier_missing_csv(tmp_path, monkeypatch):
    """Missing party_ideologies.csv → returns axes dict unchanged, no exception."""
    _cls = _fresh_classifier(monkeypatch)
    # No CSVs written — directory exists but files do not
    positions_by_window = {"2022": {"VVD": (1.0, 0.5), "PvdA": (-1.0, 0.3)}}
    axes = {"x_axis": None, "y_axis": None, "method": "pca"}
    result = _cls.classify_axes(positions_by_window, axes, str(tmp_path / "motions.db"))
    # Must not crash and must return the original axes dict unchanged
    assert result is axes
    assert "x_label" not in result