You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
motief/analysis/right_wing/extremity_scorer.py

215 lines
7.0 KiB

#!/usr/bin/env python3
"""Policy extremity scorer: LLM-based radicalism scoring for right-wing motions.
Usage:
uv run python analysis/right_wing/extremity_scorer.py --sample 50
uv run python analysis/right_wing/extremity_scorer.py --sample -1 # all motions
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
from pathlib import Path
from typing import Any
import duckdb
# Directory three levels above this file, resolved to an absolute path.
# It is put at the front of sys.path (only if not already present) —
# presumably so the project-local imports that follow resolve when this file
# is executed directly as a script; confirm against the project layout.
ROOT = Path(__file__).parent.parent.parent.resolve()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from ai_provider import ProviderError, chat_completion_json_parallel
from analysis.config import config
# Configure logging at import time: INFO level, each record prefixed with its
# timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Module-level logger used by the scoring functions below.
logger = logging.getLogger(__name__)
# JSON schema enforcing the expected response shape: an object with exactly
# two required properties and no others —
#   * "score": integer between 1 and 5 inclusive
#   * "explanation": string (the prompt asks for Dutch)
# It is passed as the `json_schema` argument of chat_completion_json_parallel.
EXTREMITY_SCHEMA = {
    "name": "extremity_score",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": {
            "score": {
                "type": "integer",
                "description": "Radicalism score from 1 (mild/technical) to 5 (extreme/fundamental)",
                "minimum": 1,
                "maximum": 5,
            },
            "explanation": {
                "type": "string",
                "description": "Short explanation in Dutch of why this score was given",
            },
        },
        "required": ["score", "explanation"],
        "additionalProperties": False,
    },
}
# Dutch-language user prompt sent once per motion. The `{title}` and `{text}`
# placeholders are filled in with str.format by _build_prompt. The prompt asks
# what the motion concretely requests and for a radicalism rating from 1
# (mild/technical) to 5 (extreme/fundamental) with a short Dutch explanation.
PROMPT_TEMPLATE = """Dit is een motie in het Nederlandse parlement.
Titel: {title}
Tekst: {text}
Wat vraagt deze motie concreet? Beoordeel hoe radicaal dit voorstel is op een schaal van 1 (mild/technisch) tot 5 (extreem/fundamenteel). Geef alleen het cijfer en een korte verklaring in het Nederlands."""
def _build_prompt(title: str, body_text: str | None) -> str:
    """Render the scoring prompt for a single motion.

    The motion text is ``body_text`` when it is non-empty; otherwise the title
    is used, or an empty string when both are missing. Text longer than 800
    characters is cut to its first 800 characters followed by ``"..."`` so the
    prompt size stays reasonable.

    Args:
        title: Motion title; a falsy value is rendered as an empty string.
        body_text: Full motion text, or ``None``/empty when unavailable.

    Returns:
        ``PROMPT_TEMPLATE`` with its ``{title}`` and ``{text}`` fields filled.
    """
    safe_title = title if title else ""
    snippet = body_text if body_text else safe_title
    shortened = snippet if len(snippet) <= 800 else snippet[:800] + "..."
    return PROMPT_TEMPLATE.format(title=safe_title, text=shortened)
def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | None]) -> list[dict[str, Any]]:
    """Score a batch of motions in parallel via LLM.

    One single-message conversation is built per (title, text) pair and the
    whole batch is sent through ``chat_completion_json_parallel`` with
    ``EXTREMITY_SCHEMA`` as the response schema.

    Args:
        motion_ids: IDs of the motions in the batch. Only the length is used
            here, to size the returned list.
        titles: Motion titles, aligned with ``motion_ids``.
        texts: Motion body texts (entries may be ``None``), aligned with
            ``motion_ids``.

    Returns:
        A list with exactly ``len(motion_ids)`` independent dicts, each having
        the keys ``score`` (int in 1..5, or ``None`` on failure),
        ``explanation`` (as returned by the model, or ``None`` on failure) and
        ``error`` (``None`` on success, otherwise a short reason).
    """

    def _failure(reason: str) -> dict[str, Any]:
        # Always build a fresh dict so callers can mutate one entry safely.
        return {"score": None, "explanation": None, "error": reason}

    expected = len(motion_ids)
    message_batches = [
        [{"role": "user", "content": _build_prompt(title, text)}]
        for title, text in zip(titles, texts)
    ]
    try:
        results = chat_completion_json_parallel(
            message_batches,
            model=config.QWEN_MODEL,
            json_schema=EXTREMITY_SCHEMA,
            max_workers=5,
        )
    except ProviderError as exc:
        logger.error("Batch API call failed: %s", exc)
        # A comprehension instead of ``[d] * n``: list repetition would put
        # the *same* dict object in every slot.
        return [_failure(str(exc)) for _ in range(expected)]

    # Validate each result
    validated = []
    for res in results:
        if not isinstance(res, dict):
            validated.append(_failure("non-dict response"))
            continue
        score = res.get("score")
        # bool is a subclass of int, so True would otherwise pass as score 1.
        is_int = isinstance(score, int) and not isinstance(score, bool)
        if not is_int or score < 1 or score > 5:
            validated.append(_failure(f"invalid score: {score}"))
            continue
        validated.append({"score": score, "explanation": res.get("explanation"), "error": None})

    if len(validated) != expected:
        # With a different number of results than motions we cannot tell
        # which result belongs to which motion, so none of them is trusted.
        logger.error("Expected %d results but received %d", expected, len(validated))
        reason = f"result count mismatch: expected {expected}, got {len(validated)}"
        return [_failure(reason) for _ in range(expected)]
    return validated
def score_motions(
    db_path: str = "data/motions.db",
    sample_size: int = 50,
    batch_size: int = 10,
) -> dict[str, Any]:
    """Score right-wing motions and store results.

    Selects classified motions in random order, scores them batch by batch
    with ``_score_batch``, writes one row per motion into a freshly recreated
    ``extremity_scores`` table and finally refreshes
    ``yearly_right_wing_summary.extremity_index`` with the per-year average.

    Args:
        db_path: Path to the DuckDB database file.
        sample_size: Number of motions to score. -1 = all classified motions.
        batch_size: Number of motions sent to the LLM per batch; must be >= 1.

    Returns:
        ``{"scored": 0, "failed": 0}`` when no classified motions are found,
        otherwise ``{"scored": ..., "failed": ..., "sample_size": ...}`` where
        ``sample_size`` is the number of motions actually loaded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        FileNotFoundError: If ``db_path`` does not exist.
        RuntimeError: If the ``right_wing_motions`` table is missing.
    """
    # Reject bad batch sizes up front: 0 would crash inside range() only after
    # the old results table was dropped, and a negative value would silently
    # score nothing while still wiping previous results.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    db = Path(db_path)
    if not db.exists():
        raise FileNotFoundError(f"Database not found: {db}")
    con = duckdb.connect(str(db))
    try:
        # Ensure tables exist
        tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()}
        if "right_wing_motions" not in tables:
            raise RuntimeError("Run classify_motions.py first.")

        # Load classified motions. The limit is coerced with int() so that
        # only an integer literal can ever be interpolated into the SQL text.
        limit_clause = "" if sample_size < 0 else f"LIMIT {int(sample_size)}"
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            {limit_clause}
            """
        ).fetchall()
        if not rows:
            logger.warning("No classified right-wing motions found.")
            return {"scored": 0, "failed": 0}
        logger.info("Scoring %d motions in batches of %d...", len(rows), batch_size)

        # Create output table (results of any previous run are discarded)
        con.execute("DROP TABLE IF EXISTS extremity_scores")
        con.execute(
            """
            CREATE TABLE extremity_scores (
                motion_id INTEGER PRIMARY KEY,
                score INTEGER,
                explanation VARCHAR,
                error VARCHAR
            )
            """
        )

        scored = 0
        failed = 0
        total_batches = (len(rows) - 1) // batch_size + 1
        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            motion_ids = [r[0] for r in batch]
            titles = [r[1] for r in batch]
            texts = [r[2] for r in batch]
            logger.info("Batch %d/%d (%d motions)", start // batch_size + 1, total_batches, len(batch))
            results = _score_batch(motion_ids, titles, texts)
            for mid, res in zip(motion_ids, results):
                con.execute(
                    "INSERT INTO extremity_scores (motion_id, score, explanation, error) VALUES (?, ?, ?, ?)",
                    (mid, res.get("score"), res.get("explanation"), res.get("error")),
                )
                if res.get("score") is not None:
                    scored += 1
                else:
                    failed += 1
            # Commit per batch so completed batches survive a later failure.
            con.commit()

        # Update yearly summary with average extremity. Years without any
        # successfully scored motion get NULL from the AVG subquery.
        con.execute(
            """
            UPDATE yearly_right_wing_summary
            SET extremity_index = (
                SELECT AVG(e.score)
                FROM extremity_scores e
                JOIN right_wing_motions r ON e.motion_id = r.motion_id
                WHERE r.year = yearly_right_wing_summary.year
                AND e.score IS NOT NULL
            )
            """
        )
        con.commit()
        logger.info("Scored %d motions, %d failures", scored, failed)
        return {"scored": scored, "failed": failed, "sample_size": len(rows)}
    finally:
        con.close()
def main() -> int:
    """Command-line entry point: parse options, run the scorer, print the summary.

    Recognised options are ``--db`` (database path), ``--sample`` (number of
    motions, -1 for all) and ``--batch-size``. The dict returned by
    ``score_motions`` is printed to stdout as indented JSON.

    Returns:
        0 once scoring has finished without raising.
    """
    cli = argparse.ArgumentParser(description="Score policy extremity of right-wing motions")
    cli.add_argument("--db", default="data/motions.db")
    cli.add_argument("--sample", type=int, default=50, help="Number of motions to score (-1 for all)")
    cli.add_argument("--batch-size", type=int, default=10)
    opts = cli.parse_args()

    summary = score_motions(
        db_path=opts.db,
        sample_size=opts.sample,
        batch_size=opts.batch_size,
    )
    print(json.dumps(summary, indent=2))
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())