|
|
|
|
@ -10,6 +10,7 @@ from pathlib import Path |
|
|
|
|
from typing import Dict, List, Optional, Tuple |
|
|
|
|
|
|
|
|
|
import numpy as np |
|
|
|
|
import re |
|
|
|
|
|
|
|
|
|
_logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
|
@ -103,6 +104,18 @@ _KEYWORDS: Dict[str, List[str]] = { |
|
|
|
|
], |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
# Pre-compiled regexes for keyword matching, one alternation pattern per
# category in _KEYWORDS.
#
# * Keywords are passed through re.escape so they match literally.
# * No word boundaries are added, because some keywords intentionally match
#   substrings (e.g. 'traditi' matching 'tradities').
# * re.IGNORECASE makes lowercasing the title unnecessary during matching.
# * Blank keywords are dropped, and a category left with no usable keyword
#   falls back to the never-matching pattern "(?!)". Without this, the
#   alternation would be the empty pattern, which matches *every* title and
#   would inflate that category's count.
#
# NOTE(review): kw.strip() removes leading/trailing whitespace from each
# keyword. If any entry in _KEYWORDS relies on padding spaces to limit
# substring matches, stripping widens what it matches -- confirm against the
# keyword table.
_KEYWORD_REGEXES: Dict[str, "re.Pattern[str]"] = {
    cat: re.compile(
        # An empty join result is falsy, so `or` substitutes the pattern
        # that can never match.
        "|".join(re.escape(kw.strip()) for kw in kws if kw.strip())
        or r"(?!)",
        re.IGNORECASE,
    )
    for cat, kws in _KEYWORDS.items()
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]: |
|
|
|
|
"""Classify a list of motion titles into an axis category using keyword matching. |
|
|
|
|
@ -116,19 +129,20 @@ def _classify_from_titles(titles: List[str]) -> Tuple[Optional[str], float]: |
|
|
|
|
|
|
|
|
|
counts: Dict[str, int] = {cat: 0 for cat in _KEYWORDS} |
|
|
|
|
for title in titles: |
|
|
|
|
lower = title.lower() |
|
|
|
|
for cat, keywords in _KEYWORDS.items(): |
|
|
|
|
if any(kw in lower for kw in keywords): |
|
|
|
|
for cat, rx in _KEYWORD_REGEXES.items(): |
|
|
|
|
if rx.search(title): |
|
|
|
|
counts[cat] += 1 |
|
|
|
|
|
|
|
|
|
best_cat = max(counts, key=lambda c: counts[c]) |
|
|
|
|
best_count = counts[best_cat] |
|
|
|
|
# Determine the best category, but be deterministic on ties: if more than |
|
|
|
|
# one category has the top count, return None to indicate ambiguity. |
|
|
|
|
best_count = max(counts.values()) |
|
|
|
|
best_cats = [cat for cat, cnt in counts.items() if cnt == best_count] |
|
|
|
|
confidence = best_count / len(titles) |
|
|
|
|
|
|
|
|
|
if confidence < _KEYWORD_THRESHOLD: |
|
|
|
|
if len(best_cats) != 1 or confidence < _KEYWORD_THRESHOLD: |
|
|
|
|
return None, confidence |
|
|
|
|
|
|
|
|
|
return best_cat, confidence |
|
|
|
|
return best_cats[0], confidence |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_ideology(csv_path: Path) -> Dict[str, Dict[str, float]]: |
|
|
|
|
|