Extremity Scorer (U4 enhanced):
- Now scores BOTH original motion text AND layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text scores: 1→7, 2→33, 3→5, 4→5 (mild-to-moderate)
- Layman scores: 1→12, 2→20, 3→17, 4→1 (slightly milder)

Sentiment Analysis (U5 enhanced):
- Now scores BOTH original motion text AND layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text sentiment avg: 0.294 (slightly positive)
- Layman sentiment avg: 0.416 (more positive - summaries tone down hostility)

Category Derivation (new):
- Two-phase LLM approach: derive taxonomy from sample, then apply to all
- Discovered 7 categories from 30-motion sample: veiligheid/justitie, corona/pandemie, economie/belasting, klimaat/milieu, defensie/buitenland, asiel/vreemdelingen, overig
- Applied to 50 motions with distribution shown in DB
- Adds category + category_explanation columns to right_wing_motions

main
parent
f94edc3d04
commit
fbf92c82cf
@ -0,0 +1,347 @@ |
|||||||
|
#!/usr/bin/env python3 |
||||||
|
"""Derive policy categories for right-wing motions using LLM. |
||||||
|
|
||||||
|
Two-phase approach: |
||||||
|
1. Derive taxonomy from a sample (discover categories from data) |
||||||
|
2. Apply categories to all motions using the derived taxonomy |
||||||
|
|
||||||
|
Usage: |
||||||
|
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample 50 |
||||||
|
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample -1 |
||||||
|
""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
|
||||||
|
import argparse |
||||||
|
import json |
||||||
|
import logging |
||||||
|
import re |
||||||
|
import sys |
||||||
|
from collections import Counter |
||||||
|
from pathlib import Path |
||||||
|
from typing import Any |
||||||
|
|
||||||
|
import duckdb |
||||||
|
|
||||||
|
ROOT = Path(__file__).parent.parent.parent.resolve() |
||||||
|
if str(ROOT) not in sys.path: |
||||||
|
sys.path.insert(0, str(ROOT)) |
||||||
|
|
||||||
|
from ai_provider import ProviderError, chat_completion_json_parallel |
||||||
|
from analysis.config import config |
||||||
|
|
||||||
|
# Root logging: INFO level, each record prefixed with timestamp and level name.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by both phases and the CLI.
logger = logging.getLogger(__name__)
||||||
|
|
||||||
|
def _category_schema(name: str, category_property: dict[str, Any]) -> dict[str, Any]:
    """Build a strict JSON-schema envelope around a ``category`` property.

    Both phases share everything except the schema name and the definition of
    the ``category`` field, so only those two vary. A fresh dict is built on
    every call; nothing is shared between the returned schemas.
    """
    return {
        "name": name,
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "category": category_property,
                "explanation": {
                    "type": "string",
                    "description": "Very short explanation why this category fits",
                },
            },
            "required": ["category", "explanation"],
            "additionalProperties": False,
        },
    }


# Phase 1: open-ended schema, the model is free to name any category.
DERIVE_SCHEMA = _category_schema(
    "derive_category",
    {
        "type": "string",
        "description": "Policy domain/category in Dutch. Use short lowercase labels like 'asiel', 'klimaat', 'corona', 'lhbtq', 'veiligheid', 'defensie', 'economie', 'landbouw', 'zorg', 'onderwijs', 'overig'",
    },
)

# Phase 2: constrained schema; "enum" and "description" are placeholders that
# are replaced with the derived taxonomy before the schema is used.
APPLY_SCHEMA_TEMPLATE = _category_schema(
    "apply_category",
    {
        "type": "string",
        "description": "Category must be one of: {categories}",
        "enum": [],  # filled dynamically
    },
)
||||||
|
|
||||||
|
PROMPT_TEMPLATE = """Welk beleidsdomein hoort bij de volgende motie uit het Nederlandse parlement? |
||||||
|
|
||||||
|
Titel: {title} |
||||||
|
|
||||||
|
Tekst: {text} |
||||||
|
|
||||||
|
Leg uit in 1 zin waarom dit beleidsdomem past.""" |
||||||
|
|
||||||
|
|
||||||
|
def _build_prompt(title: str, body_text: str | None) -> str: |
||||||
|
text = body_text or title or "" |
||||||
|
if len(text) > 600: |
||||||
|
text = text[:600] + "..." |
||||||
|
return PROMPT_TEMPLATE.format(title=title or "", text=text) |
||||||
|
|
||||||
|
|
||||||
|
# Single-word labels the model tends to return, mapped onto the combined
# labels used in the taxonomy. Labels not listed here are kept as they are.
_CATEGORY_ALIASES = {
    "asiel": "asiel/vreemdelingen",
    "vreemdelingen": "asiel/vreemdelingen",
    "immigratie": "asiel/vreemdelingen",
    "migratie": "asiel/vreemdelingen",
    "klimaat": "klimaat/milieu",
    "milieu": "klimaat/milieu",
    "stikstof": "klimaat/milieu",
    "corona": "corona/pandemie",
    "pandemie": "corona/pandemie",
    "covid": "corona/pandemie",
    "lhbtq": "lhbtq/rechten",
    "lhbti": "lhbtq/rechten",
    "lgbt": "lhbtq/rechten",
    "veiligheid": "veiligheid/justitie",
    "justitie": "veiligheid/justitie",
    "strafrecht": "veiligheid/justitie",
    "defensie": "defensie/buitenland",
    "buitenland": "defensie/buitenland",
    "buitenlandse zaken": "defensie/buitenland",
    "economie": "economie/belasting",
    "belasting": "economie/belasting",
    "financiën": "economie/belasting",
    "landbouw": "landbouw/stikstof",
    "boeren": "landbouw/stikstof",
    "zorg": "zorg/gezondheid",
    "gezondheid": "zorg/gezondheid",
    "onderwijs": "onderwijs/cultuur",
    "cultuur": "onderwijs/cultuur",
    "energie": "energie",
    "kernenergie": "energie",
    "sociaal": "sociaal/jeugd",
    "jeugd": "sociaal/jeugd",
    "wonen": "wonen/ruimtelijk",
    "ruimtelijk": "wonen/ruimtelijk",
    "verkeer": "verkeer/infrastructuur",
    "infrastructuur": "verkeer/infrastructuur",
}


def _normalize_category(raw: str) -> str:
    """Normalize LLM category output to consistent labels.

    The label is lowercased and stripped, then looked up in the alias table;
    labels without an alias are returned in their lowercased, stripped form.

    Args:
        raw: Category label as returned by the model.

    Returns:
        The canonical label. A value that is not a string, or that is blank
        after stripping, yields 'overig' so that an empty label can never end
        up in the taxonomy.
    """
    if not isinstance(raw, str):
        return "overig"
    label = raw.lower().strip()
    if not label:
        return "overig"
    return _CATEGORY_ALIASES.get(label, label)
||||||
|
|
||||||
|
|
||||||
|
def derive_taxonomy(
    db_path: str = "data/motions.db",
    derive_sample: int = 30,
    batch_size: int = 10,
) -> list[str]:
    """Phase 1: derive category taxonomy from a sample of motions.

    Draws a random sample of classified right-wing motions, asks the LLM for a
    free-form category per motion, normalizes the labels and keeps every label
    that occurs at least twice. 'overig' is always part of the result.

    Args:
        db_path: Path to the DuckDB database file.
        derive_sample: Number of motions to sample for the derivation.
        batch_size: Number of motions sent to the LLM per parallel call.

    Returns:
        Category labels ordered by descending frequency, with 'overig'
        appended when it did not reach the threshold by itself.
    """
    # Only one read is needed, so the connection is released before the
    # (slow) LLM calls start.
    con = duckdb.connect(str(Path(db_path)))
    try:
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            LIMIT {int(derive_sample)}
            """
        ).fetchall()
    finally:
        con.close()

    logger.info("Phase 1: deriving taxonomy from %d motions...", len(rows))

    categories: list[str] = []
    for start in range(0, len(rows), batch_size):
        batch = rows[start : start + batch_size]
        message_batches = [
            [{"role": "user", "content": _build_prompt(title, text)}]
            for _motion_id, title, text in batch
        ]

        try:
            results = chat_completion_json_parallel(
                message_batches,
                model=config.QWEN_MODEL,
                json_schema=DERIVE_SCHEMA,
                max_workers=5,
            )
        except ProviderError as exc:
            # A failed batch only shrinks the sample; the remaining batches
            # are still processed.
            logger.error("Batch failed: %s", exc)
            continue

        for res in results:
            if not isinstance(res, dict):
                continue
            cat = res.get("category")
            # A missing, null, non-string or blank category is counted as
            # 'overig' instead of crashing or producing an empty label.
            if isinstance(cat, str) and cat.strip():
                categories.append(_normalize_category(cat))
            else:
                categories.append("overig")

    if not categories:
        logger.warning("No categories could be derived; taxonomy will only contain 'overig'")

    # Count and threshold
    counts = Counter(categories)
    logger.info("Raw category counts: %s", dict(counts.most_common()))

    # Keep categories with >= 2 occurrences, plus always keep 'overig'
    taxonomy = [cat for cat, cnt in counts.most_common() if cnt >= 2]
    if "overig" not in taxonomy:
        taxonomy.append("overig")

    logger.info("Derived taxonomy (%d categories): %s", len(taxonomy), taxonomy)
    return taxonomy
||||||
|
|
||||||
|
|
||||||
|
# Fallback taxonomy used when the caller supplies none.
_DEFAULT_TAXONOMY = (
    "asiel/vreemdelingen",
    "klimaat/milieu",
    "corona/pandemie",
    "lhbtq/rechten",
    "veiligheid/justitie",
    "defensie/buitenland",
    "economie/belasting",
    "landbouw/stikstof",
    "zorg/gezondheid",
    "onderwijs/cultuur",
    "energie",
    "sociaal/jeugd",
    "overig",
)


def apply_categories(
    db_path: str = "data/motions.db",
    taxonomy: list[str] | None = None,
    apply_sample: int = 50,
    batch_size: int = 10,
) -> dict[str, Any]:
    """Phase 2: apply derived taxonomy to all motions.

    Selects classified right-wing motions in random order, asks the LLM to
    pick one taxonomy label per motion, and stores the label and its
    explanation in ``right_wing_motions``. The ``category`` and
    ``category_explanation`` columns are added when they do not exist yet.

    Args:
        db_path: Path to the DuckDB database file.
        taxonomy: Allowed category labels. None or an empty list selects the
            built-in default taxonomy (an empty enum would make the response
            schema unusable).
        apply_sample: Number of motions to categorize; a negative value means
            all of them.
        batch_size: Number of motions sent to the LLM per parallel call.

    Returns:
        A dict with ``scored`` (rows updated), ``failed`` (motions skipped
        because the batch failed or the response was invalid), ``taxonomy``
        (the labels used) and ``category_distribution`` (label -> count).
    """
    db = Path(db_path)
    con = duckdb.connect(str(db))
    try:
        if not taxonomy:
            taxonomy = list(_DEFAULT_TAXONOMY)

        # Deep-copy the template (JSON round trip) so the module-level
        # constant is never mutated, then fill in the allowed labels.
        schema = json.loads(json.dumps(APPLY_SCHEMA_TEMPLATE))
        category_property = schema["schema"]["properties"]["category"]
        category_property["enum"] = taxonomy
        category_property["description"] = f"Category must be one of: {', '.join(taxonomy)}"

        limit_clause = "" if apply_sample < 0 else f"LIMIT {int(apply_sample)}"
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            {limit_clause}
            """
        ).fetchall()

        logger.info("Phase 2: applying %d categories to %d motions...", len(taxonomy), len(rows))

        # Add the result columns if missing; the second field of each
        # table_info row is read as the column name.
        cols = {c[1] for c in con.execute("PRAGMA table_info(right_wing_motions)").fetchall()}
        if "category" not in cols:
            con.execute("ALTER TABLE right_wing_motions ADD COLUMN category VARCHAR")
        if "category_explanation" not in cols:
            con.execute("ALTER TABLE right_wing_motions ADD COLUMN category_explanation VARCHAR")

        scored = 0
        failed = 0
        category_counts: Counter[str] = Counter()

        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            message_batches = [
                [{"role": "user", "content": _build_prompt(title, text)}]
                for _motion_id, title, text in batch
            ]

            try:
                results = chat_completion_json_parallel(
                    message_batches,
                    model=config.QWEN_MODEL,
                    json_schema=schema,
                    max_workers=5,
                )
            except ProviderError as exc:
                logger.error("Batch failed: %s", exc)
                failed += len(batch)
                continue

            for (mid, _title, _text), res in zip(batch, results):
                cat = res.get("category") if isinstance(res, dict) else None
                if cat not in taxonomy:
                    # Invalid responses are counted and left untouched in the
                    # database; log them so the failure is traceable.
                    logger.warning("Skipping motion %s: invalid category response %r", mid, res)
                    failed += 1
                    continue

                con.execute(
                    "UPDATE right_wing_motions SET category = ?, category_explanation = ? WHERE motion_id = ?",
                    (cat, res.get("explanation", ""), mid),
                )
                category_counts[cat] += 1
                scored += 1

        con.commit()

        logger.info("Applied categories to %d motions, %d failures", scored, failed)
        return {
            "scored": scored,
            "failed": failed,
            "taxonomy": taxonomy,
            "category_distribution": dict(category_counts.most_common()),
        }
    finally:
        con.close()
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Command-line entry point: derive a taxonomy, apply it, print the result.

    Returns:
        Process exit code, always 0.
    """
    cli = argparse.ArgumentParser(description="Derive and apply policy categories")
    cli.add_argument("--db", default="data/motions.db")
    cli.add_argument(
        "--derive-sample",
        type=int,
        default=30,
        help="Sample size for taxonomy derivation",
    )
    cli.add_argument(
        "--apply-sample",
        type=int,
        default=50,
        help="Sample size for category application (-1 for all)",
    )
    cli.add_argument("--batch-size", type=int, default=10)
    cli.add_argument(
        "--skip-derive",
        action="store_true",
        help="Skip derivation, use default taxonomy",
    )
    opts = cli.parse_args()

    # With --skip-derive the taxonomy stays None, which makes
    # apply_categories fall back to its built-in default list.
    derived = None
    if not opts.skip_derive:
        derived = derive_taxonomy(
            db_path=opts.db,
            derive_sample=opts.derive_sample,
            batch_size=opts.batch_size,
        )

    summary = apply_categories(
        db_path=opts.db,
        taxonomy=derived,
        apply_sample=opts.apply_sample,
        batch_size=opts.batch_size,
    )
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    sys.exit(main())
||||||
Loading…
Reference in new issue