feat: add _classify_from_titles keyword classifier to axis_classifier

4 months ago · f8d9af7d9d
parent 6c4dd81723
commit f8d9af7d9d
2 changed files with 135 additions and 0 deletions
--- a/analysis/axis_classifier.py
+++ b/analysis/axis_classifier.py
@ -42,6 +42,95 @@ _INTERPRETATION_TEMPLATES = {
 }


+# Simple keyword-based classifier for motion titles (fallback signal).
+# A category is only reported when at least this fraction of the titles
+# contains one of its keywords.
+_KEYWORD_THRESHOLD = 0.4
+
+# Axis label -> lowercase Dutch keywords.
+#
+# Keywords are matched as plain substrings of the lowercased title, so:
+#   * a keyword also fires inside compounds ("asiel" hits "asielbeleid");
+#   * a stem such as "traditi" covers several inflections at once;
+#   * inflected forms that do NOT share the listed spelling need their own
+#     entry ("europees"/"europese", "duurzaam"/"duurzame");
+#   * the short abbreviations " eu " and " vn " are padded with spaces,
+#     presumably so they do not fire inside longer words.
+# NOTE(review): unpadded short keywords can still fire inside unrelated
+# words, e.g. "nato" is a substring of "senator" and "cao" of "cacao".
+_KEYWORDS: Dict[str, List[str]] = {
+    "Links\u2013Rechts": [
+        # economic
+        "belasting",
+        "uitkering",
+        "bijstand",
+        "minimumloon",
+        "cao",
+        "vakbond",
+        "bezuiniging",
+        "privatisering",
+        "subsidie",
+        "pensioen",
+        "aow",
+        "zorg",
+        # immigration
+        "asiel",
+        "asielaanvraag",
+        "migratie",
+        "vreemdeling",
+        "vluchtelingen",
+        "terugkeer",
+        "grenzen",
+        "opvang",
+        "statushouder",
+    ],
+    "Progressief\u2013Conservatief": [
+        # environment
+        "klimaat",
+        "stikstof",
+        "duurzaam",
+        "duurzame",  # inflected adjective; not a substring match for "duurzaam"
+        "duurzaamheid",
+        "co2",
+        "energietransitie",
+        "biodiversiteit",
+        # social
+        "euthanasie",
+        "abortus",
+        "lgbtq",
+        "transgender",
+        "diversiteit",
+        "traditi",
+        "gezin",
+        "religie",
+        "geloof",
+    ],
+    "Nationaal\u2013Internationaal": [
+        "navo",
+        "nato",
+        "europees",
+        "europese",
+        " eu ",
+        "verdrag",
+        " vn ",
+        "internationaal",
+    ],
+}
+
+
+def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
+    """Classify a list of motion titles into an axis category using keyword matching.
+
+    A title counts for a category when it contains at least one of that
+    category's keywords from ``_KEYWORDS``. It counts at most once per
+    category, but the same title may count for several categories.
+
+    Matching is a case-insensitive substring test on a normalised title:
+    every non-alphanumeric character becomes a space and the title is padded
+    with one space on each side. This lets space-delimited keywords such as
+    ``" eu "`` match at the start or end of a title and next to punctuation.
+
+    Returns (category_label, confidence) where confidence is the fraction of
+    titles containing at least one keyword from the winning category.
+    If confidence is below ``_KEYWORD_THRESHOLD`` the label is ``None`` but
+    the computed confidence is still returned. Empty input returns
+    ``(None, 0.0)``. Ties go to the category listed first in ``_KEYWORDS``.
+    """
+    if not titles:
+        return None, 0.0
+
+    counts: Dict[str, int] = {cat: 0 for cat in _KEYWORDS}
+    for title in titles:
+        # Normalise so that "de EU", "EU-top" and "(VN)" all expose the
+        # abbreviation as a space-delimited token. Alphanumeric runs are left
+        # untouched, so every plain substring keyword matches exactly as before.
+        words = "".join(ch if ch.isalnum() else " " for ch in title.lower())
+        haystack = " " + words + " "
+        for cat, keywords in _KEYWORDS.items():
+            if any(kw in haystack for kw in keywords):
+                counts[cat] += 1
+
+    # max() returns the first maximal key, i.e. dict insertion order breaks ties.
+    best_cat = max(counts, key=lambda c: counts[c])
+    confidence = counts[best_cat] / len(titles)
+
+    if confidence < _KEYWORD_THRESHOLD:
+        return None, confidence
+
+    return best_cat, confidence
+
+
 def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]:
    """Load party ideology scores from CSV.

--- a/tests/test_political_compass.py
+++ b/tests/test_political_compass.py
@ -608,3 +608,49 @@ def test_compute_2d_axes_exposes_global_mean(monkeypatch):
    _, axis_def = compute_2d_axes(db_path="dummy", window_ids=["w1"], method="pca")
    assert "global_mean" in axis_def
    assert isinstance(axis_def["global_mean"], np.ndarray)
+
+
+def test_classify_from_titles_left_right():
+    """Titles dominated by left-right keywords -> 'Links\u2013Rechts'.
+
+    Every title below contains an economic or immigration keyword, so the
+    classifier is expected to pick the left-right axis with a confidence at
+    or above the 0.4 threshold.
+    """
+    from analysis.axis_classifier import _classify_from_titles
+
+    expected_label = "Links\u2013Rechts"
+    motion_titles = [
+        "Motie over asielbeleid",
+        "Motie over minimumloon verhoging",
+        "Motie over vluchtelingen opvang",
+        "Motie over belastingverlaging",
+        "Motie over bijstandsuitkering",
+    ]
+
+    outcome = _classify_from_titles(motion_titles)
+    category, score = outcome
+
+    assert category == expected_label
+    assert score >= 0.4
+
+
+def test_classify_from_titles_progressive():
+    """Titles dominated by progressive/conservative keywords.
+
+    Expected result: the 'Progressief\u2013Conservatief' label with a
+    confidence at or above the 0.4 threshold.
+    """
+    from analysis.axis_classifier import _classify_from_titles
+
+    expected_label = "Progressief\u2013Conservatief"
+    motion_titles = [
+        "Motie over klimaatdoelstellingen",
+        "Motie over stikstofbeleid",
+        "Motie over duurzame energie",
+        "Motie over co2 uitstoot",
+        "Motie over energietransitie",
+    ]
+
+    outcome = _classify_from_titles(motion_titles)
+    category, score = outcome
+
+    assert category == expected_label
+    assert score >= 0.4
+
+
+def test_classify_from_titles_low_confidence():
+    """Mixed/irrelevant titles -> None (fallback triggered).
+
+    The confidence is still reported, and it must stay below the 0.4
+    threshold for the label to be withheld.
+    """
+    from analysis.axis_classifier import _classify_from_titles
+
+    motion_titles = [
+        "Motie over sportsubsidie",
+        "Motie over bibliotheekregeling",
+        "Motie over verkeersveiligheid",
+    ]
+
+    outcome = _classify_from_titles(motion_titles)
+    category, score = outcome
+
+    assert category is None
+    assert score < 0.4