diff --git a/analysis/axis_classifier.py b/analysis/axis_classifier.py
index b0a1877..d71b932 100644
--- a/analysis/axis_classifier.py
+++ b/analysis/axis_classifier.py
@@ -10,6 +10,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple
+import re
 
 import numpy as np
 
 _logger = logging.getLogger(__name__)
 
@@ -103,6 +104,20 @@ _KEYWORDS: Dict[str, List[str]] = {
     ],
 }
 
+# Pre-compiled regexes for keyword matching. We escape keywords but do NOT add
+# word-boundaries because some keywords intentionally match substrings
+# (e.g. 'traditi' matching 'tradities'). re.IGNORECASE makes lowercasing
+# unnecessary during matching. Blank keywords are dropped, and a category
+# left with no keywords gets the never-matching pattern "(?!)": an empty
+# alternation would otherwise match every title.
+_KEYWORD_REGEXES: Dict[str, "re.Pattern[str]"] = {
+    cat: re.compile(
+        "|".join(re.escape(kw.strip()) for kw in kws if kw.strip()) or r"(?!)",
+        re.IGNORECASE,
+    )
+    for cat, kws in _KEYWORDS.items()
+}
+
 
 def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
     """Classify a list of motion titles into an axis category using keyword matching.
@@ -116,19 +131,20 @@ def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]:
 
     counts: Dict[str, int] = {cat: 0 for cat in _KEYWORDS}
     for title in titles:
-        lower = title.lower()
-        for cat, keywords in _KEYWORDS.items():
-            if any(kw in lower for kw in keywords):
+        for cat, rx in _KEYWORD_REGEXES.items():
+            if rx.search(title):
                 counts[cat] += 1
 
-    best_cat = max(counts, key=lambda c: counts[c])
-    best_count = counts[best_cat]
+    # Determine the best category, but be deterministic on ties: if more than
+    # one category has the top count, return None to indicate ambiguity.
+    best_count = max(counts.values())
+    best_cats = [cat for cat, cnt in counts.items() if cnt == best_count]
     confidence = best_count / len(titles)
 
-    if confidence < _KEYWORD_THRESHOLD:
+    if len(best_cats) != 1 or confidence < _KEYWORD_THRESHOLD:
         return None, confidence
 
-    return best_cat, confidence
+    return best_cats[0], confidence
 
 
 def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]: