feat(right-wing): dual-scoring extremity/sentiment + derived categories

Extremity Scorer (U4 enhanced):
- Now scores BOTH original motion text AND layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text scores: 1→7, 2→33, 3→5, 4→5 (mild-to-moderate)
- Layman scores: 1→12, 2→20, 3→17, 4→1 (slightly milder)

Sentiment Analysis (U5 enhanced):
- Now scores BOTH original motion text AND layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text sentiment avg: 0.294 (slightly positive)
- Layman sentiment avg: 0.416 (more positive - summaries tone down hostility)

Category Derivation (new):
- Two-phase LLM approach: derive taxonomy from sample, then apply to all
- Discovered 7 categories from 30-motion sample:
  veiligheid/justitie, corona/pandemie, economie/belasting, klimaat/milieu,
  defensie/buitenland, asiel/vreemdelingen, overig
- Applied to 50 motions with distribution shown in DB
- Adds category + category_explanation columns to right_wing_motions
main
Sven Geboers 1 month ago
parent f94edc3d04
commit fbf92c82cf
  1. 347
      analysis/right_wing/derive_categories.py
  2. 143
      analysis/right_wing/extremity_scorer.py
  3. 134
      analysis/right_wing/sentiment_analysis.py

@ -0,0 +1,347 @@
#!/usr/bin/env python3
"""Derive policy categories for right-wing motions using LLM.
Two-phase approach:
1. Derive taxonomy from a sample (discover categories from data)
2. Apply categories to all motions using the derived taxonomy
Usage:
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample 50
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample -1
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Any
import duckdb
ROOT = Path(__file__).parent.parent.parent.resolve()
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from ai_provider import ProviderError, chat_completion_json_parallel
from analysis.config import config
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# Phase 1 response schema. "category" is a free-form string (no enum), so the
# model may propose its own labels; derive_taxonomy passes this schema as the
# json_schema argument of the LLM call.
_DERIVE_PROPERTIES = {
    "category": {
        "type": "string",
        "description": "Policy domain/category in Dutch. Use short lowercase labels like 'asiel', 'klimaat', 'corona', 'lhbtq', 'veiligheid', 'defensie', 'economie', 'landbouw', 'zorg', 'onderwijs', 'overig'",
    },
    "explanation": {
        "type": "string",
        "description": "Very short explanation why this category fits",
    },
}

DERIVE_SCHEMA = {
    "name": "derive_category",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": _DERIVE_PROPERTIES,
        # Every declared property is required: ["category", "explanation"].
        "required": list(_DERIVE_PROPERTIES),
        "additionalProperties": False,
    },
}
# Phase 2 response schema template. The empty "enum" and the placeholder
# "description" of "category" are overwritten with the actual taxonomy by
# apply_categories, which works on a deep copy of this template.
_APPLY_PROPERTIES = {
    "category": {
        "type": "string",
        "description": "Category must be one of: {categories}",
        "enum": [],  # filled dynamically
    },
    "explanation": {
        "type": "string",
        "description": "Very short explanation why this category fits",
    },
}

APPLY_SCHEMA_TEMPLATE = {
    "name": "apply_category",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": _APPLY_PROPERTIES,
        # Every declared property is required: ["category", "explanation"].
        "required": list(_APPLY_PROPERTIES),
        "additionalProperties": False,
    },
}
PROMPT_TEMPLATE = """Welk beleidsdomein hoort bij de volgende motie uit het Nederlandse parlement?
Titel: {title}
Tekst: {text}
Leg uit in 1 zin waarom dit beleidsdomem past."""
def _build_prompt(title: str, body_text: str | None) -> str:
text = body_text or title or ""
if len(text) > 600:
text = text[:600] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text)
def _normalize_category(raw: str) -> str:
"""Normalize LLM category output to consistent labels."""
raw = raw.lower().strip()
# Map common variants
mapping = {
"asiel": "asiel/vreemdelingen",
"vreemdelingen": "asiel/vreemdelingen",
"immigratie": "asiel/vreemdelingen",
"migratie": "asiel/vreemdelingen",
"klimaat": "klimaat/milieu",
"milieu": "klimaat/milieu",
"stikstof": "klimaat/milieu",
"corona": "corona/pandemie",
"pandemie": "corona/pandemie",
"covid": "corona/pandemie",
"lhbtq": "lhbtq/rechten",
"lhbti": "lhbtq/rechten",
"lgbt": "lhbtq/rechten",
"veiligheid": "veiligheid/justitie",
"justitie": "veiligheid/justitie",
"strafrecht": "veiligheid/justitie",
"defensie": "defensie/buitenland",
"buitenland": "defensie/buitenland",
"buitenlandse zaken": "defensie/buitenland",
"economie": "economie/belasting",
"belasting": "economie/belasting",
"financiën": "economie/belasting",
"landbouw": "landbouw/stikstof",
"boeren": "landbouw/stikstof",
"zorg": "zorg/gezondheid",
"gezondheid": "zorg/gezondheid",
"onderwijs": "onderwijs/cultuur",
"cultuur": "onderwijs/cultuur",
"energie": "energie",
"kernenergie": "energie",
"sociaal": "sociaal/jeugd",
"jeugd": "sociaal/jeugd",
"wonen": "wonen/ruimtelijk",
"ruimtelijk": "wonen/ruimtelijk",
"verkeer": "verkeer/infrastructuur",
"infrastructuur": "verkeer/infrastructuur",
}
return mapping.get(raw, raw)
def derive_taxonomy(
    db_path: str = "data/motions.db",
    derive_sample: int = 30,
    batch_size: int = 10,
) -> list[str]:
    """Phase 1: derive category taxonomy from a sample of motions.

    A random sample of classified right-wing motions is sent to the LLM in
    batches using the open-ended DERIVE_SCHEMA. The returned labels are
    normalized and counted; labels seen at least twice form the taxonomy.

    Args:
        db_path: Path to the DuckDB database file.
        derive_sample: Number of motions to sample. A negative value means
            "all classified motions", matching ``apply_sample`` in
            apply_categories.
        batch_size: Number of motions sent per parallel LLM call.

    Returns:
        Category labels ordered by descending frequency, with "overig"
        appended when it is not already present.
    """
    limit_clause = "" if derive_sample < 0 else f"LIMIT {int(derive_sample)}"

    # The connection is only needed for the initial read; close it before the
    # slow LLM phase instead of holding the database file open throughout.
    con = duckdb.connect(str(Path(db_path)))
    try:
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            {limit_clause}
            """
        ).fetchall()
    finally:
        con.close()

    logger.info("Phase 1: deriving taxonomy from %d motions...", len(rows))

    categories: list[str] = []
    for start in range(0, len(rows), batch_size):
        batch = rows[start : start + batch_size]
        message_batches = [
            [{"role": "user", "content": _build_prompt(title, text)}]
            for _motion_id, title, text in batch
        ]
        try:
            results = chat_completion_json_parallel(
                message_batches,
                model=config.QWEN_MODEL,
                json_schema=DERIVE_SCHEMA,
                max_workers=5,
            )
        except ProviderError as exc:
            # A failed batch only shrinks the sample; keep going with the rest.
            logger.error("Batch failed: %s", exc)
            continue
        # Non-dict results carry no category and are ignored.
        categories.extend(
            _normalize_category(res.get("category", "overig"))
            for res in results
            if isinstance(res, dict)
        )

    if not categories:
        logger.warning("No categories were derived; taxonomy will only contain 'overig'")

    # Count and threshold
    counts = Counter(categories)
    logger.info("Raw category counts: %s", dict(counts.most_common()))

    # Keep categories with >= 2 occurrences, plus always keep 'overig'
    taxonomy = [cat for cat, cnt in counts.most_common() if cnt >= 2]
    if "overig" not in taxonomy:
        taxonomy.append("overig")

    logger.info("Derived taxonomy (%d categories): %s", len(taxonomy), taxonomy)
    return taxonomy
def apply_categories(
    db_path: str = "data/motions.db",
    taxonomy: list[str] | None = None,
    apply_sample: int = 50,
    batch_size: int = 10,
) -> dict[str, Any]:
    """Phase 2: apply derived taxonomy to all motions.

    Classified right-wing motions are sent to the LLM in batches with a schema
    whose "category" enum is restricted to ``taxonomy``. Valid answers are
    written to the ``category`` and ``category_explanation`` columns of
    ``right_wing_motions`` (the columns are added when missing).

    Args:
        db_path: Path to the DuckDB database file.
        taxonomy: Allowed category labels. When None, a built-in default
            taxonomy is used.
        apply_sample: Number of motions to categorise; negative means all.
        batch_size: Number of motions sent per parallel LLM call.

    Returns:
        A dict with the keys "scored", "failed", "taxonomy" and
        "category_distribution" (category -> count, most common first).
    """
    if taxonomy is None:
        # No derived taxonomy was supplied: fall back to the built-in default.
        taxonomy = [
            "asiel/vreemdelingen",
            "klimaat/milieu",
            "corona/pandemie",
            "lhbtq/rechten",
            "veiligheid/justitie",
            "defensie/buitenland",
            "economie/belasting",
            "landbouw/stikstof",
            "zorg/gezondheid",
            "onderwijs/cultuur",
            "energie",
            "sociaal/jeugd",
            "overig",
        ]

    # Build schema with enum; the JSON round-trip deep-copies the template so
    # the module-level constant is never modified.
    schema = json.loads(json.dumps(APPLY_SCHEMA_TEMPLATE))
    category_schema = schema["schema"]["properties"]["category"]
    category_schema["enum"] = taxonomy
    category_schema["description"] = f"Category must be one of: {', '.join(taxonomy)}"

    limit_clause = "" if apply_sample < 0 else f"LIMIT {apply_sample}"

    con = duckdb.connect(str(Path(db_path)))
    try:
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            {limit_clause}
            """
        ).fetchall()
        logger.info("Phase 2: applying %d categories to %d motions...", len(taxonomy), len(rows))

        # Add category column if missing (index 1 of each table_info row is
        # used as the column name).
        cols = {c[1] for c in con.execute("PRAGMA table_info(right_wing_motions)").fetchall()}
        if "category" not in cols:
            con.execute("ALTER TABLE right_wing_motions ADD COLUMN category VARCHAR")
        if "category_explanation" not in cols:
            con.execute("ALTER TABLE right_wing_motions ADD COLUMN category_explanation VARCHAR")

        scored = 0
        failed = 0
        category_counts: Counter[str] = Counter()

        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            motion_ids = [row[0] for row in batch]
            message_batches = [
                [{"role": "user", "content": _build_prompt(title, text)}]
                for _motion_id, title, text in batch
            ]
            try:
                results = chat_completion_json_parallel(
                    message_batches,
                    model=config.QWEN_MODEL,
                    json_schema=schema,
                    max_workers=5,
                )
            except ProviderError as exc:
                logger.error("Batch failed: %s", exc)
                failed += len(batch)
                continue

            for mid, res in zip(motion_ids, results):
                if not isinstance(res, dict) or res.get("category") not in taxonomy:
                    # Invalid answers are counted as failures and nothing is
                    # written for the motion.
                    logger.warning("Motion %s: invalid category response, skipping: %r", mid, res)
                    failed += 1
                    continue
                cat = res["category"]
                expl = res.get("explanation", "")
                con.execute(
                    "UPDATE right_wing_motions SET category = ?, category_explanation = ? WHERE motion_id = ?",
                    (cat, expl, mid),
                )
                category_counts[cat] += 1
                scored += 1

        con.commit()
        logger.info("Applied categories to %d motions, %d failures", scored, failed)
        return {
            "scored": scored,
            "failed": failed,
            "taxonomy": taxonomy,
            "category_distribution": dict(category_counts.most_common()),
        }
    finally:
        con.close()
def main() -> int:
    """Command-line entry point: derive a taxonomy (unless skipped) and apply it.

    Parses the command-line flags, runs derive_taxonomy unless --skip-derive
    is given (in which case apply_categories receives ``None`` and uses its
    default taxonomy), runs apply_categories and prints its result as
    indented JSON.

    Returns:
        Always 0, used as the process exit status.
    """
    parser = argparse.ArgumentParser(description="Derive and apply policy categories")
    parser.add_argument("--db", default="data/motions.db")
    # The two sample-size flags share the same shape; register them in order.
    for flag, default, help_text in (
        ("--derive-sample", 30, "Sample size for taxonomy derivation"),
        ("--apply-sample", 50, "Sample size for category application (-1 for all)"),
    ):
        parser.add_argument(flag, type=int, default=default, help=help_text)
    parser.add_argument("--batch-size", type=int, default=10)
    parser.add_argument("--skip-derive", action="store_true", help="Skip derivation, use default taxonomy")
    args = parser.parse_args()

    taxonomy = (
        None
        if args.skip_derive
        else derive_taxonomy(
            db_path=args.db,
            derive_sample=args.derive_sample,
            batch_size=args.batch_size,
        )
    )

    summary = apply_categories(
        db_path=args.db,
        taxonomy=taxonomy,
        apply_sample=args.apply_sample,
        batch_size=args.batch_size,
    )
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

@ -1,6 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Policy extremity scorer: LLM-based radicalism scoring for right-wing motions. """Policy extremity scorer: LLM-based radicalism scoring for right-wing motions.
Scores BOTH the original motion text and the layman explanation separately.
Usage: Usage:
uv run python analysis/right_wing/extremity_scorer.py --sample 50 uv run python analysis/right_wing/extremity_scorer.py --sample 50
uv run python analysis/right_wing/extremity_scorer.py --sample -1 # all motions uv run python analysis/right_wing/extremity_scorer.py --sample -1 # all motions
@ -11,7 +13,6 @@ from __future__ import annotations
import argparse import argparse
import json import json
import logging import logging
import os
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -28,51 +29,70 @@ from analysis.config import config
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# JSON schema enforcing the expected response shape
EXTREMITY_SCHEMA = { EXTREMITY_SCHEMA = {
"name": "extremity_score", "name": "extremity_score",
"strict": True, "strict": True,
"schema": { "schema": {
"type": "object", "type": "object",
"properties": { "properties": {
"score": { "text_score": {
"type": "integer",
"description": "Radicalism of the original motion text (1=mild to 5=extreme)",
"minimum": 1,
"maximum": 5,
},
"text_explanation": {
"type": "string",
"description": "Why the motion text got this score (Dutch)",
},
"layman_score": {
"type": "integer", "type": "integer",
"description": "Radicalism score from 1 (mild/technical) to 5 (extreme/fundamental)", "description": "Radicalism of the layman explanation (1=mild to 5=extreme)",
"minimum": 1, "minimum": 1,
"maximum": 5, "maximum": 5,
}, },
"explanation": { "layman_explanation": {
"type": "string", "type": "string",
"description": "Short explanation in Dutch of why this score was given", "description": "Why the layman explanation got this score (Dutch)",
}, },
}, },
"required": ["score", "explanation"], "required": ["text_score", "text_explanation", "layman_score", "layman_explanation"],
"additionalProperties": False, "additionalProperties": False,
}, },
} }
PROMPT_TEMPLATE = """Dit is een motie in het Nederlandse parlement. PROMPT_TEMPLATE = """Beoordeel de radicalisme van de volgende motie op twee manieren:
1) Het ORIGINELE motietekst:
Titel: {title} Titel: {title}
Tekst: {text} Tekst: {text}
Wat vraagt deze motie concreet? Beoordeel hoe radicaal dit voorstel is op een schaal van 1 (mild/technisch) tot 5 (extreem/fundamenteel). Geef alleen het cijfer en een korte verklaring in het Nederlands.""" 2) De VEREENVOUDIGDE uitleg:
{layman}
Geef voor ELKE versie een score van 1 (mild/technisch) tot 5 (extreem/fundamenteel) plus een korte verklaring in het Nederlands."""
def _build_prompt(title: str, body_text: str | None) -> str:
text = body_text or title or ""
# Truncate body_text to keep prompt size reasonable
if len(text) > 800:
text = text[:800] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text)
def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | None]) -> list[dict[str, Any]]: def _build_prompt(title: str, body_text: str | None, layman: str | None) -> str:
text = body_text or title or ""
if len(text) > 500:
text = text[:500] + "..."
layman = layman or "(geen vereenvoudigde uitleg beschikbaar)"
if len(layman) > 400:
layman = layman[:400] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text, layman=layman)
def _score_batch(
motion_ids: list[int],
titles: list[str],
texts: list[str | None],
laymen: list[str | None],
) -> list[dict[str, Any]]:
"""Score a batch of motions in parallel via LLM.""" """Score a batch of motions in parallel via LLM."""
message_batches = [] message_batches = []
for title, text in zip(titles, texts): for title, text, layman in zip(titles, texts, laymen):
prompt = _build_prompt(title, text) prompt = _build_prompt(title, text, layman)
message_batches.append([{"role": "user", "content": prompt}]) message_batches.append([{"role": "user", "content": prompt}])
try: try:
@ -84,20 +104,44 @@ def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | Non
) )
except ProviderError as exc: except ProviderError as exc:
logger.error("Batch API call failed: %s", exc) logger.error("Batch API call failed: %s", exc)
return [{"score": None, "explanation": None, "error": str(exc)}] * len(motion_ids) return [{
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": str(exc),
}] * len(motion_ids)
# Validate each result
validated = [] validated = []
for res in results: for res in results:
if not isinstance(res, dict): if not isinstance(res, dict):
validated.append({"score": None, "explanation": None, "error": "non-dict response"}) validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": "non-dict response",
})
continue
ts = res.get("text_score")
te = res.get("text_explanation")
ls = res.get("layman_score")
le = res.get("layman_explanation")
if not isinstance(ts, int) or ts < 1 or ts > 5:
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": f"invalid text_score: {ts}",
})
continue continue
score = res.get("score") if not isinstance(ls, int) or ls < 1 or ls > 5:
explanation = res.get("explanation") validated.append({
if not isinstance(score, int) or score < 1 or score > 5: "text_score": None, "text_explanation": None,
validated.append({"score": None, "explanation": None, "error": f"invalid score: {score}"}) "layman_score": None, "layman_explanation": None,
"error": f"invalid layman_score: {ls}",
})
continue continue
validated.append({"score": score, "explanation": explanation, "error": None}) validated.append({
"text_score": ts, "text_explanation": te,
"layman_score": ls, "layman_explanation": le,
"error": None,
})
return validated return validated
@ -106,27 +150,21 @@ def score_motions(
sample_size: int = 50, sample_size: int = 50,
batch_size: int = 10, batch_size: int = 10,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Score right-wing motions and store results. """Score right-wing motions and store results."""
Args:
sample_size: Number of motions to score. -1 = all classified motions.
"""
db = Path(db_path) db = Path(db_path)
if not db.exists(): if not db.exists():
raise FileNotFoundError(f"Database not found: {db}") raise FileNotFoundError(f"Database not found: {db}")
con = duckdb.connect(str(db)) con = duckdb.connect(str(db))
try: try:
# Ensure tables exist
tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()} tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()}
if "right_wing_motions" not in tables: if "right_wing_motions" not in tables:
raise RuntimeError("Run classify_motions.py first.") raise RuntimeError("Run classify_motions.py first.")
# Load classified motions
limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}" limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}"
rows = con.execute( rows = con.execute(
f""" f"""
SELECT r.motion_id, m.title, m.body_text SELECT r.motion_id, m.title, m.body_text, m.layman_explanation
FROM right_wing_motions r FROM right_wing_motions r
JOIN motions m ON r.motion_id = m.id JOIN motions m ON r.motion_id = m.id
WHERE r.classified = TRUE WHERE r.classified = TRUE
@ -141,14 +179,15 @@ def score_motions(
logger.info("Scoring %d motions in batches of %d...", len(rows), batch_size) logger.info("Scoring %d motions in batches of %d...", len(rows), batch_size)
# Create output table
con.execute("DROP TABLE IF EXISTS extremity_scores") con.execute("DROP TABLE IF EXISTS extremity_scores")
con.execute( con.execute(
""" """
CREATE TABLE extremity_scores ( CREATE TABLE extremity_scores (
motion_id INTEGER PRIMARY KEY, motion_id INTEGER PRIMARY KEY,
score INTEGER, text_score INTEGER,
explanation VARCHAR, text_explanation VARCHAR,
layman_score INTEGER,
layman_explanation VARCHAR,
error VARCHAR error VARCHAR
) )
""" """
@ -162,32 +201,44 @@ def score_motions(
motion_ids = [r[0] for r in batch] motion_ids = [r[0] for r in batch]
titles = [r[1] for r in batch] titles = [r[1] for r in batch]
texts = [r[2] for r in batch] texts = [r[2] for r in batch]
laymen = [r[3] for r in batch]
logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch)) logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch))
results = _score_batch(motion_ids, titles, texts) results = _score_batch(motion_ids, titles, texts, laymen)
for mid, res in zip(motion_ids, results): for mid, res in zip(motion_ids, results):
con.execute( con.execute(
"INSERT INTO extremity_scores (motion_id, score, explanation, error) VALUES (?, ?, ?, ?)", """
(mid, res.get("score"), res.get("explanation"), res.get("error")), INSERT INTO extremity_scores
(motion_id, text_score, text_explanation, layman_score, layman_explanation, error)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
mid,
res.get("text_score"),
res.get("text_explanation"),
res.get("layman_score"),
res.get("layman_explanation"),
res.get("error"),
),
) )
if res.get("score") is not None: if res.get("error") is None:
scored += 1 scored += 1
else: else:
failed += 1 failed += 1
con.commit() con.commit()
# Update yearly summary with average extremity # Update yearly summary with average extremity (using text_score as primary)
con.execute( con.execute(
""" """
UPDATE yearly_right_wing_summary UPDATE yearly_right_wing_summary
SET extremity_index = ( SET extremity_index = (
SELECT AVG(e.score) SELECT AVG(e.text_score)
FROM extremity_scores e FROM extremity_scores e
JOIN right_wing_motions r ON e.motion_id = r.motion_id JOIN right_wing_motions r ON e.motion_id = r.motion_id
WHERE r.year = yearly_right_wing_summary.year WHERE r.year = yearly_right_wing_summary.year
AND e.score IS NOT NULL AND e.text_score IS NOT NULL
) )
""" """
) )

@ -1,8 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Sentiment analysis pipeline: Dutch sentiment scoring for right-wing motions. """Sentiment analysis pipeline: Dutch sentiment scoring for right-wing motions.
Uses LLM batch calls (fallback when no local Dutch sentiment model is available). Scores BOTH the original motion text and the layman explanation separately.
Maps outputs to [-1, 1] scale where negative = hostile/aggressive, positive = constructive. Uses LLM batch calls. Maps outputs to [-1, 1] scale.
Usage: Usage:
uv run python analysis/right_wing/sentiment_analysis.py --sample 50 uv run python analysis/right_wing/sentiment_analysis.py --sample 50
@ -36,43 +36,64 @@ SENTIMENT_SCHEMA = {
"schema": { "schema": {
"type": "object", "type": "object",
"properties": { "properties": {
"score": { "text_score": {
"type": "number", "type": "number",
"description": "Sentiment score from -1 (very negative/hostile) to 1 (very positive/constructive)", "description": "Sentiment of original motion text from -1 (hostile) to 1 (constructive)",
"minimum": -1, "minimum": -1,
"maximum": 1, "maximum": 1,
}, },
"explanation": { "text_explanation": {
"type": "string", "type": "string",
"description": "Short explanation in Dutch of why this sentiment was given", "description": "Why the motion text got this score (Dutch)",
}, },
"layman_score": {
"type": "number",
"description": "Sentiment of layman explanation from -1 (hostile) to 1 (constructive)",
"minimum": -1,
"maximum": 1,
}, },
"required": ["score", "explanation"], "layman_explanation": {
"type": "string",
"description": "Why the layman explanation got this score (Dutch)",
},
},
"required": ["text_score", "text_explanation", "layman_score", "layman_explanation"],
"additionalProperties": False, "additionalProperties": False,
}, },
} }
PROMPT_TEMPLATE = """Beoordeel de sentiment van de volgende motie uit het Nederlandse parlement. PROMPT_TEMPLATE = """Beoordeel de sentiment van de volgende motie op twee manieren:
1) Het ORIGINELE motietekst:
Titel: {title} Titel: {title}
Tekst: {text} Tekst: {text}
Geef een sentiment score van -1 (zeer negatief, agressief, vijandig) tot 1 (zeer positief, constructief, coöperatief). Geef ook een korte verklaring in het Nederlands.""" 2) De VEREENVOUDIGDE uitleg:
{layman}
Geef voor ELKE versie een sentiment score van -1 (zeer negatief, agressief, vijandig) tot 1 (zeer positief, constructief, coöperatief) plus een korte verklaring in het Nederlands."""
def _build_prompt(title: str, body_text: str | None) -> str:
def _build_prompt(title: str, body_text: str | None, layman: str | None) -> str:
text = body_text or title or "" text = body_text or title or ""
if len(text) > 400: if len(text) > 400:
text = text[:400] + "..." text = text[:400] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text) layman = layman or "(geen vereenvoudigde uitleg beschikbaar)"
if len(layman) > 300:
layman = layman[:300] + "..."
def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | None]) -> list[dict[str, Any]]: return PROMPT_TEMPLATE.format(title=title or "", text=text, layman=layman)
def _score_batch(
motion_ids: list[int],
titles: list[str],
texts: list[str | None],
laymen: list[str | None],
) -> list[dict[str, Any]]:
"""Score sentiment for a batch of motions in parallel via LLM.""" """Score sentiment for a batch of motions in parallel via LLM."""
message_batches = [] message_batches = []
for title, text in zip(titles, texts): for title, text, layman in zip(titles, texts, laymen):
prompt = _build_prompt(title, text) prompt = _build_prompt(title, text, layman)
message_batches.append([{"role": "user", "content": prompt}]) message_batches.append([{"role": "user", "content": prompt}])
try: try:
@ -84,19 +105,44 @@ def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | Non
) )
except ProviderError as exc: except ProviderError as exc:
logger.error("Batch API call failed: %s", exc) logger.error("Batch API call failed: %s", exc)
return [{"score": None, "explanation": None, "error": str(exc)}] * len(motion_ids) return [{
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": str(exc),
}] * len(motion_ids)
validated = [] validated = []
for res in results: for res in results:
if not isinstance(res, dict): if not isinstance(res, dict):
validated.append({"score": None, "explanation": None, "error": "non-dict response"}) validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": "non-dict response",
})
continue continue
score = res.get("score") ts = res.get("text_score")
explanation = res.get("explanation") te = res.get("text_explanation")
if not isinstance(score, (int, float)) or score < -1 or score > 1: ls = res.get("layman_score")
validated.append({"score": None, "explanation": None, "error": f"invalid score: {score}"}) le = res.get("layman_explanation")
if not isinstance(ts, (int, float)) or ts < -1 or ts > 1:
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": f"invalid text_score: {ts}",
})
continue continue
validated.append({"score": float(score), "explanation": explanation, "error": None}) if not isinstance(ls, (int, float)) or ls < -1 or ls > 1:
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": f"invalid layman_score: {ls}",
})
continue
validated.append({
"text_score": float(ts), "text_explanation": te,
"layman_score": float(ls), "layman_explanation": le,
"error": None,
})
return validated return validated
@ -119,7 +165,7 @@ def analyze_sentiment(
limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}" limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}"
rows = con.execute( rows = con.execute(
f""" f"""
SELECT r.motion_id, r.year, m.title, m.body_text SELECT r.motion_id, r.year, m.title, m.body_text, m.layman_explanation
FROM right_wing_motions r FROM right_wing_motions r
JOIN motions m ON r.motion_id = m.id JOIN motions m ON r.motion_id = m.id
WHERE r.classified = TRUE WHERE r.classified = TRUE
@ -140,8 +186,10 @@ def analyze_sentiment(
CREATE TABLE sentiment_scores ( CREATE TABLE sentiment_scores (
motion_id INTEGER PRIMARY KEY, motion_id INTEGER PRIMARY KEY,
year INTEGER, year INTEGER,
score DOUBLE, text_score DOUBLE,
explanation VARCHAR, text_explanation VARCHAR,
layman_score DOUBLE,
layman_explanation VARCHAR,
error VARCHAR error VARCHAR
) )
""" """
@ -156,16 +204,26 @@ def analyze_sentiment(
years = [r[1] for r in batch] years = [r[1] for r in batch]
titles = [r[2] for r in batch] titles = [r[2] for r in batch]
texts = [r[3] for r in batch] texts = [r[3] for r in batch]
laymen = [r[4] for r in batch]
logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch)) logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch))
results = _score_batch(motion_ids, titles, texts) results = _score_batch(motion_ids, titles, texts, laymen)
for mid, year, res in zip(motion_ids, years, results): for mid, year, res in zip(motion_ids, years, results):
con.execute( con.execute(
"INSERT INTO sentiment_scores (motion_id, year, score, explanation, error) VALUES (?, ?, ?, ?, ?)", """
(mid, year, res.get("score"), res.get("explanation"), res.get("error")), INSERT INTO sentiment_scores
(motion_id, year, text_score, text_explanation, layman_score, layman_explanation, error)
VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(
mid, year,
res.get("text_score"), res.get("text_explanation"),
res.get("layman_score"), res.get("layman_explanation"),
res.get("error"),
),
) )
if res.get("score") is not None: if res.get("error") is None:
scored += 1 scored += 1
else: else:
failed += 1 failed += 1
@ -173,7 +231,7 @@ def analyze_sentiment(
con.commit() con.commit()
# Add sentiment columns to yearly summary if not present # Add sentiment columns to yearly summary if not present
cols = {c[0] for c in con.execute("PRAGMA table_info(yearly_right_wing_summary)").fetchall()} cols = {c[1] for c in con.execute("PRAGMA table_info(yearly_right_wing_summary)").fetchall()}
if "avg_sentiment" not in cols: if "avg_sentiment" not in cols:
con.execute("ALTER TABLE yearly_right_wing_summary ADD COLUMN avg_sentiment DOUBLE") con.execute("ALTER TABLE yearly_right_wing_summary ADD COLUMN avg_sentiment DOUBLE")
if "sentiment_std" not in cols: if "sentiment_std" not in cols:
@ -185,22 +243,22 @@ def analyze_sentiment(
""" """
UPDATE yearly_right_wing_summary UPDATE yearly_right_wing_summary
SET avg_sentiment = ( SET avg_sentiment = (
SELECT AVG(s.score) SELECT AVG(s.text_score)
FROM sentiment_scores s FROM sentiment_scores s
WHERE s.year = yearly_right_wing_summary.year WHERE s.year = yearly_right_wing_summary.year
AND s.score IS NOT NULL AND s.text_score IS NOT NULL
), ),
sentiment_std = ( sentiment_std = (
SELECT STDDEV(s.score) SELECT STDDEV(s.text_score)
FROM sentiment_scores s FROM sentiment_scores s
WHERE s.year = yearly_right_wing_summary.year WHERE s.year = yearly_right_wing_summary.year
AND s.score IS NOT NULL AND s.text_score IS NOT NULL
), ),
pct_strongly_negative = ( pct_strongly_negative = (
SELECT COUNT(CASE WHEN s.score < -0.5 THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0) SELECT COUNT(CASE WHEN s.text_score < -0.5 THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0)
FROM sentiment_scores s FROM sentiment_scores s
WHERE s.year = yearly_right_wing_summary.year WHERE s.year = yearly_right_wing_summary.year
AND s.score IS NOT NULL AND s.text_score IS NOT NULL
) )
""" """
) )

Loading…
Cancel
Save