feat: add _classify_from_titles keyword classifier to axis_classifier

main
Sven Geboers 1 month ago
parent 6c4dd81723
commit f8d9af7d9d
  1. 89
      analysis/axis_classifier.py
  2. 46
      tests/test_political_compass.py

@ -42,6 +42,95 @@ _INTERPRETATION_TEMPLATES = {
}
# Simple keyword-based classifier for motion titles (fallback signal)
_KEYWORD_THRESHOLD = 0.4
_KEYWORDS: Dict[str, List[str]] = {
"Links\u2013Rechts": [
# economic
"belasting",
"uitkering",
"bijstand",
"minimumloon",
"cao",
"vakbond",
"bezuiniging",
"privatisering",
"subsidie",
"pensioen",
"aow",
"zorg",
# immigration
"asiel",
"asielaanvraag",
"migratie",
"vreemdeling",
"vluchtelingen",
"terugkeer",
"grenzen",
"opvang",
"statushouder",
],
"Progressief\u2013Conservatief": [
# environment
"klimaat",
"stikstof",
"duurzaam",
"duurzaamheid",
"co2",
"energietransitie",
"biodiversiteit",
# social
"euthanasie",
"abortus",
"lgbtq",
"transgender",
"diversiteit",
"traditi",
"gezin",
"religie",
"geloof",
],
"Nationaal\u2013Internationaal": [
"navo",
"nato",
"europees",
"europese",
" eu ",
"verdrag",
" vn ",
"internationaal",
],
}
def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
"""Classify a list of motion titles into an axis category using keyword matching.
Returns (category_label, confidence) where confidence = fraction of titles
containing at least one keyword from the winning category.
Returns (None, 0.0) if confidence is below _KEYWORD_THRESHOLD.
"""
if not titles:
return None, 0.0
counts: Dict[str, int] = {cat: 0 for cat in _KEYWORDS}
for title in titles:
lower = title.lower()
for cat, keywords in _KEYWORDS.items():
if any(kw in lower for kw in keywords):
counts[cat] += 1
best_cat = max(counts, key=lambda c: counts[c])
best_count = counts[best_cat]
confidence = best_count / len(titles)
if confidence < _KEYWORD_THRESHOLD:
return None, confidence
return best_cat, confidence
def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]:
"""Load party ideology scores from CSV.

@ -608,3 +608,49 @@ def test_compute_2d_axes_exposes_global_mean(monkeypatch):
_, axis_def = compute_2d_axes(db_path="dummy", window_ids=["w1"], method="pca")
assert "global_mean" in axis_def
assert isinstance(axis_def["global_mean"], np.ndarray)
def test_classify_from_titles_left_right():
"""Titles dominated by left-right keywords  'Links Rechts'."""
from analysis.axis_classifier import _classify_from_titles
titles = [
"Motie over asielbeleid",
"Motie over minimumloon verhoging",
"Motie over vluchtelingen opvang",
"Motie over belastingverlaging",
"Motie over bijstandsuitkering",
]
label, confidence = _classify_from_titles(titles)
assert label == "Links\u2013Rechts"
assert confidence >= 0.4
def test_classify_from_titles_progressive():
"""Titles dominated by progressive/conservative keywords -> 'Progressief Conservatief'."""
from analysis.axis_classifier import _classify_from_titles
titles = [
"Motie over klimaatdoelstellingen",
"Motie over stikstofbeleid",
"Motie over duurzame energie",
"Motie over co2 uitstoot",
"Motie over energietransitie",
]
label, confidence = _classify_from_titles(titles)
assert label == "Progressief\u2013Conservatief"
assert confidence >= 0.4
def test_classify_from_titles_low_confidence():
"""Mixed/irrelevant titles -> None (fallback triggered)."""
from analysis.axis_classifier import _classify_from_titles
titles = [
"Motie over sportsubsidie",
"Motie over bibliotheekregeling",
"Motie over verkeersveiligheid",
]
label, confidence = _classify_from_titles(titles)
assert label is None
assert confidence < 0.4

Loading…
Cancel
Save