Extremity Scorer (U4 enhanced):
- Now scores BOTH the original motion text AND the layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text scores: 1→7, 2→33, 3→5, 4→5 (mild-to-moderate)
- Layman scores: 1→12, 2→20, 3→17, 4→1 (slightly milder)

Sentiment Analysis (U5 enhanced):
- Now scores BOTH the original motion text AND the layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text sentiment avg: 0.294 (slightly positive)
- Layman sentiment avg: 0.416 (more positive - summaries tone down hostility)

Category Derivation (new):
- Two-phase LLM approach: derive a taxonomy from a sample, then apply it to all motions
- Discovered 7 categories from a 30-motion sample: veiligheid/justitie, corona/pandemie, economie/belasting, klimaat/milieu, defensie/buitenland, asiel/vreemdelingen, overig
- Applied to 50 motions, with the distribution shown in the DB
- Adds category + category_explanation columns to right_wing_motions

main
parent
f94edc3d04
commit
fbf92c82cf
@ -0,0 +1,347 @@ |
||||
#!/usr/bin/env python3 |
||||
"""Derive policy categories for right-wing motions using LLM. |
||||
|
||||
Two-phase approach: |
||||
1. Derive taxonomy from a sample (discover categories from data) |
||||
2. Apply categories to all motions using the derived taxonomy |
||||
|
||||
Usage: |
||||
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample 50 |
||||
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample -1 |
||||
""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import argparse |
||||
import json |
||||
import logging |
||||
import re |
||||
import sys |
||||
from collections import Counter |
||||
from pathlib import Path |
||||
from typing import Any |
||||
|
||||
import duckdb |
||||
|
||||
# Three directory levels above this file; prepended to sys.path (once) so the
# imports that follow resolve when the file is run directly as a script.
ROOT = Path(__file__).parent.parent.parent.resolve()
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))
||||
|
||||
from ai_provider import ProviderError, chat_completion_json_parallel |
||||
from analysis.config import config |
||||
|
||||
# INFO-level logging with a timestamp and level prefix on every record.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
||||
|
||||
# Phase 1 response schema: "category" is a free-form string, so the model can
# propose any label while the taxonomy is still being discovered.
DERIVE_SCHEMA = {
    "name": "derive_category",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": {
            "category": {
                "type": "string",
                "description": (
                    "Policy domain/category in Dutch. Use short lowercase labels like "
                    "'asiel', 'klimaat', 'corona', 'lhbtq', 'veiligheid', 'defensie', "
                    "'economie', 'landbouw', 'zorg', 'onderwijs', 'overig'"
                ),
            },
            "explanation": {
                "type": "string",
                "description": "Very short explanation why this category fits",
            },
        },
        "required": ["category", "explanation"],
        "additionalProperties": False,
    },
}
||||
|
||||
# Phase 2 response schema template. The "enum" list and the "{categories}"
# placeholder in the description are stand-ins: apply_categories() overwrites
# both on a copy of this template once the taxonomy is known.
APPLY_SCHEMA_TEMPLATE = {
    "name": "apply_category",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": {
            "category": {
                "type": "string",
                "description": "Category must be one of: {categories}",
                "enum": [],  # filled dynamically
            },
            "explanation": {
                "type": "string",
                "description": "Very short explanation why this category fits",
            },
        },
        "required": ["category", "explanation"],
        "additionalProperties": False,
    },
}
||||
|
||||
PROMPT_TEMPLATE = """Welk beleidsdomein hoort bij de volgende motie uit het Nederlandse parlement? |
||||
|
||||
Titel: {title} |
||||
|
||||
Tekst: {text} |
||||
|
||||
Leg uit in 1 zin waarom dit beleidsdomem past.""" |
||||
|
||||
|
||||
def _build_prompt(title: str, body_text: str | None) -> str: |
||||
text = body_text or title or "" |
||||
if len(text) > 600: |
||||
text = text[:600] + "..." |
||||
return PROMPT_TEMPLATE.format(title=title or "", text=text) |
||||
|
||||
|
||||
def _normalize_category(raw: str) -> str: |
||||
"""Normalize LLM category output to consistent labels.""" |
||||
raw = raw.lower().strip() |
||||
# Map common variants |
||||
mapping = { |
||||
"asiel": "asiel/vreemdelingen", |
||||
"vreemdelingen": "asiel/vreemdelingen", |
||||
"immigratie": "asiel/vreemdelingen", |
||||
"migratie": "asiel/vreemdelingen", |
||||
"klimaat": "klimaat/milieu", |
||||
"milieu": "klimaat/milieu", |
||||
"stikstof": "klimaat/milieu", |
||||
"corona": "corona/pandemie", |
||||
"pandemie": "corona/pandemie", |
||||
"covid": "corona/pandemie", |
||||
"lhbtq": "lhbtq/rechten", |
||||
"lhbti": "lhbtq/rechten", |
||||
"lgbt": "lhbtq/rechten", |
||||
"veiligheid": "veiligheid/justitie", |
||||
"justitie": "veiligheid/justitie", |
||||
"strafrecht": "veiligheid/justitie", |
||||
"defensie": "defensie/buitenland", |
||||
"buitenland": "defensie/buitenland", |
||||
"buitenlandse zaken": "defensie/buitenland", |
||||
"economie": "economie/belasting", |
||||
"belasting": "economie/belasting", |
||||
"financiën": "economie/belasting", |
||||
"landbouw": "landbouw/stikstof", |
||||
"boeren": "landbouw/stikstof", |
||||
"zorg": "zorg/gezondheid", |
||||
"gezondheid": "zorg/gezondheid", |
||||
"onderwijs": "onderwijs/cultuur", |
||||
"cultuur": "onderwijs/cultuur", |
||||
"energie": "energie", |
||||
"kernenergie": "energie", |
||||
"sociaal": "sociaal/jeugd", |
||||
"jeugd": "sociaal/jeugd", |
||||
"wonen": "wonen/ruimtelijk", |
||||
"ruimtelijk": "wonen/ruimtelijk", |
||||
"verkeer": "verkeer/infrastructuur", |
||||
"infrastructuur": "verkeer/infrastructuur", |
||||
} |
||||
return mapping.get(raw, raw) |
||||
|
||||
|
||||
def derive_taxonomy(
    db_path: str = "data/motions.db",
    derive_sample: int = 30,
    batch_size: int = 10,
    min_count: int = 2,
) -> list[str]:
    """Phase 1: derive category taxonomy from a sample of motions.

    Randomly samples classified motions, asks the LLM for a free-form
    category per motion (``DERIVE_SCHEMA``), normalises the labels and keeps
    those that occur often enough.

    Args:
        db_path: Path to the DuckDB database file.
        derive_sample: Number of motions to sample. A negative value samples
            every classified motion, matching how ``apply_sample`` is
            interpreted in ``apply_categories``.
        batch_size: Number of prompts handed to each parallel LLM call.
        min_count: Minimum number of occurrences for a label to be kept.

    Returns:
        Labels ordered from most to least frequent; ``"overig"`` is appended
        when it did not reach the threshold by itself.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # int() guarantees that only a plain number is interpolated into the SQL.
    limit_clause = "" if derive_sample < 0 else f"LIMIT {int(derive_sample)}"

    con = duckdb.connect(str(Path(db_path)))
    try:
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            {limit_clause}
            """
        ).fetchall()
    finally:
        # Everything needed is already fetched, so the database is released
        # before the LLM calls start.
        con.close()

    logger.info("Phase 1: deriving taxonomy from %d motions...", len(rows))

    categories: list[str] = []
    unusable = 0
    for start in range(0, len(rows), batch_size):
        batch = rows[start : start + batch_size]
        message_batches = [
            [{"role": "user", "content": _build_prompt(title, text)}]
            for _motion_id, title, text in batch
        ]

        try:
            results = chat_completion_json_parallel(
                message_batches,
                model=config.QWEN_MODEL,
                json_schema=DERIVE_SCHEMA,
                max_workers=5,
            )
        except ProviderError as exc:
            logger.error("Batch failed: %s", exc)
            unusable += len(batch)
            continue

        for res in results:
            if not isinstance(res, dict):
                unusable += 1
                continue
            label = res.get("category")
            # A missing, null, non-string or blank label counts as 'overig'
            # rather than crashing the normaliser.
            if not isinstance(label, str) or not label.strip():
                label = "overig"
            categories.append(_normalize_category(label))

    if unusable:
        logger.warning("Phase 1: %d motions produced no usable category", unusable)

    # Count and threshold
    counts = Counter(categories)
    logger.info("Raw category counts: %s", dict(counts.most_common()))

    # Keep categories seen at least min_count times, plus always keep 'overig'
    taxonomy = [cat for cat, cnt in counts.most_common() if cnt >= min_count]
    if "overig" not in taxonomy:
        taxonomy.append("overig")

    logger.info("Derived taxonomy (%d categories): %s", len(taxonomy), taxonomy)
    return taxonomy
||||
|
||||
|
||||
def apply_categories(
    db_path: str = "data/motions.db",
    taxonomy: list[str] | None = None,
    apply_sample: int = 50,
    batch_size: int = 10,
) -> dict[str, Any]:
    """Phase 2: apply derived taxonomy to all motions.

    Randomly selects classified motions, asks the LLM to pick one label from
    ``taxonomy`` for each, and writes the label and its explanation to the
    ``category`` / ``category_explanation`` columns of ``right_wing_motions``
    (both columns are created when absent).

    Args:
        db_path: Path to the DuckDB database file.
        taxonomy: Allowed category labels. ``None`` or an empty list selects
            the built-in default taxonomy.
        apply_sample: Number of motions to categorise; a negative value
            processes every classified motion.
        batch_size: Number of prompts handed to each parallel LLM call.

    Returns:
        A dict with ``scored`` (rows updated), ``failed`` (motions without a
        usable answer), ``taxonomy`` (the labels used) and
        ``category_distribution`` (label -> count, most common first).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if not taxonomy:
        # No usable taxonomy supplied: fall back to the default label set.
        taxonomy = [
            "asiel/vreemdelingen",
            "klimaat/milieu",
            "corona/pandemie",
            "lhbtq/rechten",
            "veiligheid/justitie",
            "defensie/buitenland",
            "economie/belasting",
            "landbouw/stikstof",
            "zorg/gezondheid",
            "onderwijs/cultuur",
            "energie",
            "sociaal/jeugd",
            "overig",
        ]
    else:
        # Work on a copy so the caller's list is not aliased by the schema.
        taxonomy = list(taxonomy)
    allowed = frozenset(taxonomy)

    # Build schema with enum; the JSON round-trip deep-copies the template so
    # the module-level constant stays untouched.
    schema = json.loads(json.dumps(APPLY_SCHEMA_TEMPLATE))
    category_prop = schema["schema"]["properties"]["category"]
    category_prop["enum"] = taxonomy
    category_prop["description"] = f"Category must be one of: {', '.join(taxonomy)}"

    # int() guarantees that only a plain number is interpolated into the SQL.
    limit_clause = "" if apply_sample < 0 else f"LIMIT {int(apply_sample)}"

    con = duckdb.connect(str(Path(db_path)))
    try:
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            {limit_clause}
            """
        ).fetchall()

        logger.info("Phase 2: applying %d categories to %d motions...", len(taxonomy), len(rows))

        # Add the output columns if missing
        existing = {c[1] for c in con.execute("PRAGMA table_info(right_wing_motions)").fetchall()}
        for column in ("category", "category_explanation"):
            if column not in existing:
                con.execute(f"ALTER TABLE right_wing_motions ADD COLUMN {column} VARCHAR")

        scored = 0
        failed = 0
        category_counts: Counter[str] = Counter()

        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            motion_ids = [row[0] for row in batch]
            message_batches = [
                [{"role": "user", "content": _build_prompt(title, text)}]
                for _motion_id, title, text in batch
            ]

            try:
                results = chat_completion_json_parallel(
                    message_batches,
                    model=config.QWEN_MODEL,
                    json_schema=schema,
                    max_workers=5,
                )
            except ProviderError as exc:
                logger.error("Batch failed: %s", exc)
                failed += len(batch)
                continue

            # NOTE(review): results are assumed to be positionally aligned
            # with message_batches - confirm against ai_provider. zip() would
            # silently drop motions that have no result, so count them here.
            if len(results) != len(batch):
                logger.warning(
                    "Expected %d results but received %d", len(batch), len(results)
                )
                failed += max(0, len(batch) - len(results))

            for mid, res in zip(motion_ids, results):
                cat = res.get("category") if isinstance(res, dict) else None
                if not isinstance(cat, str) or cat not in allowed:
                    # Unusable answer: leave the row untouched, count it.
                    logger.warning("No valid category for motion %s: %r", mid, res)
                    failed += 1
                    continue

                expl = res.get("explanation", "")
                con.execute(
                    "UPDATE right_wing_motions SET category = ?, category_explanation = ? WHERE motion_id = ?",
                    (cat, expl, mid),
                )
                category_counts[cat] += 1
                scored += 1

        con.commit()

        logger.info("Applied categories to %d motions, %d failures", scored, failed)
        return {
            "scored": scored,
            "failed": failed,
            "taxonomy": taxonomy,
            "category_distribution": dict(category_counts.most_common()),
        }
    finally:
        con.close()
||||
|
||||
|
||||
def main() -> int:
    """Parse the command line, run both phases and print the result as JSON.

    With ``--skip-derive`` no taxonomy is derived and ``None`` is passed on,
    which makes ``apply_categories`` use its default taxonomy.

    Returns:
        Always 0.
    """
    cli = argparse.ArgumentParser(description="Derive and apply policy categories")
    cli.add_argument("--db", default="data/motions.db")
    cli.add_argument(
        "--derive-sample",
        type=int,
        default=30,
        help="Sample size for taxonomy derivation",
    )
    cli.add_argument(
        "--apply-sample",
        type=int,
        default=50,
        help="Sample size for category application (-1 for all)",
    )
    cli.add_argument("--batch-size", type=int, default=10)
    cli.add_argument(
        "--skip-derive",
        action="store_true",
        help="Skip derivation, use default taxonomy",
    )
    opts = cli.parse_args()

    taxonomy = (
        None
        if opts.skip_derive
        else derive_taxonomy(
            db_path=opts.db,
            derive_sample=opts.derive_sample,
            batch_size=opts.batch_size,
        )
    )

    summary = apply_categories(
        db_path=opts.db,
        taxonomy=taxonomy,
        apply_sample=opts.apply_sample,
        batch_size=opts.batch_size,
    )
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    sys.exit(main())
||||
Loading…
Reference in new issue