fix: deterministic tie handling and regex matching in _classify_from_titles

4 months ago · 1e52a8a8cc
parent 71e4b68926
commit 1e52a8a8cc
1 changed files with 21 additions and 7 deletions
--- a/analysis/axis_classifier.py
+++ b/analysis/axis_classifier.py
@ -10,6 +10,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple

 import numpy as np
+import re

 _logger = logging.getLogger(__name__)

@ -103,6 +104,18 @@ _KEYWORDS: Dict[str, List[str]] = {
    ],
 }

+# Pre-compiled regexes, one per category, for keyword matching. Keywords are
+# stripped and escaped but NOT wrapped in word boundaries, because some are
+# deliberate stems that match inside longer words (e.g. 'traditi' matching
+# 'tradities'). re.IGNORECASE removes the need to lowercase titles first.
+_KEYWORD_REGEXES: Dict[str, "re.Pattern[str]"] = {
+    cat: re.compile(
+        "|".join(re.escape(kw.strip()) for kw in kws),
+        re.IGNORECASE,
+    )
+    for cat, kws in _KEYWORDS.items()
+}
+

 def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
    """Classify a list of motion titles into an axis category using keyword matching.
@ -116,19 +129,20 @@ def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:

    counts: Dict[str, int] = {cat: 0 for cat in _KEYWORDS}
    for title in titles:
-        lower = title.lower()
-        for cat, keywords in _KEYWORDS.items():
-            if any(kw in lower for kw in keywords):
+        for cat, rx in _KEYWORD_REGEXES.items():
+            if rx.search(title):
                counts[cat] += 1

-    best_cat = max(counts, key=lambda c: counts[c])
-    best_count = counts[best_cat]
+    # Find the top count and every category that reaches it. A tie is treated
+    # as ambiguous: with more than one top category, None is returned below.
+    best_count = max(counts.values())
+    best_cats = [cat for cat, cnt in counts.items() if cnt == best_count]
    confidence = best_count / len(titles)

-    if confidence < _KEYWORD_THRESHOLD:
+    if len(best_cats) != 1 or confidence < _KEYWORD_THRESHOLD:
        return None, confidence

-    return best_cat, confidence
+    return best_cats[0], confidence


 def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]: