fix: deterministic tie handling and regex matching in _classify_from_titles

main
Sven Geboers 1 month ago
parent 71e4b68926
commit 1e52a8a8cc
  1. 28
      analysis/axis_classifier.py

@ -10,6 +10,7 @@ from pathlib import Path
from typing import Dict, List, Optional, Tuple
import numpy as np
import re
_logger = logging.getLogger(__name__)
@ -103,6 +104,18 @@ _KEYWORDS: Dict[str, List[str]] = {
],
}
# Pre-compiled, case-insensitive regexes for keyword matching, one per
# category. Keywords are escaped so they match literally, and NO word
# boundaries are added because some keywords intentionally match substrings
# (e.g. 'traditi' matching 'tradities'). re.IGNORECASE makes lowercasing the
# title unnecessary during matching.
#
# Keywords that are empty after stripping are dropped: an empty alternative
# (or an entirely empty pattern, for a category without usable keywords)
# would match every title and inflate that category's count.
#
# NOTE(review): strip() also removes deliberate leading/trailing spaces from
# a keyword (e.g. padding used to approximate a word boundary) -- confirm
# that no entry in _KEYWORDS relies on such padding.
def _compile_keyword_regex(keywords: List[str]) -> "re.Pattern[str]":
    """Return a case-insensitive regex matching any non-empty keyword literally.

    Args:
        keywords: Raw keyword strings; surrounding whitespace is stripped and
            keywords that end up empty are ignored.

    Returns:
        A compiled pattern that matches when any remaining keyword occurs as
        a substring. If no keyword remains, the pattern never matches.
    """
    stripped = (kw.strip() for kw in keywords)
    alternatives = [re.escape(kw) for kw in stripped if kw]
    # "(?!)" is an empty negative lookahead, which can never match; it keeps
    # a keyword-less category from matching every title.
    return re.compile("|".join(alternatives) or "(?!)", re.IGNORECASE)


_KEYWORD_REGEXES: Dict[str, "re.Pattern[str]"] = {
    cat: _compile_keyword_regex(kws) for cat, kws in _KEYWORDS.items()
}
def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
"""Classify a list of motion titles into an axis category using keyword matching.
@ -116,19 +129,20 @@ def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
counts: Dict[str, int] = {cat: 0 for cat in _KEYWORDS}
for title in titles:
lower = title.lower()
for cat, keywords in _KEYWORDS.items():
if any(kw in lower for kw in keywords):
for cat, rx in _KEYWORD_REGEXES.items():
if rx.search(title):
counts[cat] += 1
best_cat = max(counts, key=lambda c: counts[c])
best_count = counts[best_cat]
# Determine the best category, but be deterministic on ties: if more than
# one category has the top count, return None to indicate ambiguity.
best_count = max(counts.values())
best_cats = [cat for cat, cnt in counts.items() if cnt == best_count]
confidence = best_count / len(titles)
if confidence < _KEYWORD_THRESHOLD:
if len(best_cats) != 1 or confidence < _KEYWORD_THRESHOLD:
return None, confidence
return best_cat, confidence
return best_cats[0], confidence
def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]:

Loading…
Cancel
Save