From f8d9af7d9d7d3948a9f46294913611196b8c971c Mon Sep 17 00:00:00 2001
From: Sven Geboers
Date: Sun, 29 Mar 2026 14:57:19 +0200
Subject: [PATCH] feat: add _classify_from_titles keyword classifier to axis_classifier

---
 analysis/axis_classifier.py     | 89 +++++++++++++++++++++++++++++++++
 tests/test_political_compass.py | 46 +++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/analysis/axis_classifier.py b/analysis/axis_classifier.py
index db26314..64b5554 100644
--- a/analysis/axis_classifier.py
+++ b/analysis/axis_classifier.py
@@ -42,6 +42,95 @@ _INTERPRETATION_TEMPLATES = {
 }
 
 
+# Simple keyword-based classifier for motion titles (fallback signal)
+_KEYWORD_THRESHOLD = 0.4
+
+_KEYWORDS: Dict[str, List[str]] = {
+    "Links\u2013Rechts": [
+        # economic
+        "belasting",
+        "uitkering",
+        "bijstand",
+        "minimumloon",
+        "cao",
+        "vakbond",
+        "bezuiniging",
+        "privatisering",
+        "subsidie",
+        "pensioen",
+        "aow",
+        "zorg",
+        # immigration
+        "asiel",
+        "asielaanvraag",
+        "migratie",
+        "vreemdeling",
+        "vluchtelingen",
+        "terugkeer",
+        "grenzen",
+        "opvang",
+        "statushouder",
+    ],
+    "Progressief\u2013Conservatief": [
+        # environment
+        "klimaat",
+        "stikstof",
+        "duurzaam",
+        "duurzaamheid",
+        "co2",
+        "energietransitie",
+        "biodiversiteit",
+        # social
+        "euthanasie",
+        "abortus",
+        "lgbtq",
+        "transgender",
+        "diversiteit",
+        "traditi",
+        "gezin",
+        "religie",
+        "geloof",
+    ],
+    "Nationaal\u2013Internationaal": [
+        "navo",
+        "nato",
+        "europees",
+        "europese",
+        " eu ",
+        "verdrag",
+        " vn ",
+        "internationaal",
+    ],
+}
+
+
+def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
+    """Classify a list of motion titles into an axis category using keyword matching.
+
+    Returns (category_label, confidence) where confidence = fraction of titles
+    containing at least one keyword from the winning category.
+    Returns (None, confidence) if confidence is below _KEYWORD_THRESHOLD.
+    """
+    if not titles:
+        return None, 0.0
+
+    counts: Dict[str, int] = {cat: 0 for cat in _KEYWORDS}
+    for title in titles:
+        lower = title.lower()
+        for cat, keywords in _KEYWORDS.items():
+            if any(kw in lower for kw in keywords):
+                counts[cat] += 1
+
+    best_cat = max(counts, key=lambda c: counts[c])
+    best_count = counts[best_cat]
+    confidence = best_count / len(titles)
+
+    if confidence < _KEYWORD_THRESHOLD:
+        return None, confidence
+
+    return best_cat, confidence
+
+
 def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]:
     """Load party ideology scores from CSV.
 
diff --git a/tests/test_political_compass.py b/tests/test_political_compass.py
index 21b18d2..798d045 100644
--- a/tests/test_political_compass.py
+++ b/tests/test_political_compass.py
@@ -608,3 +608,49 @@ def test_compute_2d_axes_exposes_global_mean(monkeypatch):
     _, axis_def = compute_2d_axes(db_path="dummy", window_ids=["w1"], method="pca")
     assert "global_mean" in axis_def
     assert isinstance(axis_def["global_mean"], np.ndarray)
+
+
+def test_classify_from_titles_left_right():
+    """Titles dominated by left-right keywords -> 'Links Rechts'."""
+    from analysis.axis_classifier import _classify_from_titles
+
+    titles = [
+        "Motie over asielbeleid",
+        "Motie over minimumloon verhoging",
+        "Motie over vluchtelingen opvang",
+        "Motie over belastingverlaging",
+        "Motie over bijstandsuitkering",
+    ]
+    label, confidence = _classify_from_titles(titles)
+    assert label == "Links\u2013Rechts"
+    assert confidence >= 0.4
+
+
+def test_classify_from_titles_progressive():
+    """Titles dominated by progressive/conservative keywords -> 'Progressief Conservatief'."""
+    from analysis.axis_classifier import _classify_from_titles
+
+    titles = [
+        "Motie over klimaatdoelstellingen",
+        "Motie over stikstofbeleid",
+        "Motie over duurzame energie",
+        "Motie over co2 uitstoot",
+        "Motie over energietransitie",
+    ]
+    label, confidence = _classify_from_titles(titles)
+    assert label == "Progressief\u2013Conservatief"
+    assert confidence >= 0.4
+
+
+def test_classify_from_titles_low_confidence():
+    """Mixed/irrelevant titles -> None (fallback triggered)."""
+    from analysis.axis_classifier import _classify_from_titles
+
+    titles = [
+        "Motie over sportsubsidie",
+        "Motie over bibliotheekregeling",
+        "Motie over verkeersveiligheid",
+    ]
+    label, confidence = _classify_from_titles(titles)
+    assert label is None
+    assert confidence < 0.4