feat(right-wing): dual-scoring extremity/sentiment + derived categories

Extremity Scorer (U4 enhanced):
- Now scores BOTH original motion text AND layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text scores: 1→7, 2→33, 3→5, 4→5 (mild-to-moderate)
- Layman scores: 1→12, 2→20, 3→17, 4→1 (slightly milder)

Sentiment Analysis (U5 enhanced):
- Now scores BOTH original motion text AND layman explanation separately
- Schema: text_score, text_explanation, layman_score, layman_explanation
- Text sentiment avg: 0.294 (slightly positive)
- Layman sentiment avg: 0.416 (more positive - summaries tone down hostility)

Category Derivation (new):
- Two-phase LLM approach: derive taxonomy from sample, then apply to all
- Discovered 7 categories from 30-motion sample:
  veiligheid/justitie, corona/pandemie, economie/belasting, klimaat/milieu,
  defensie/buitenland, asiel/vreemdelingen, overig
- Applied to 50 motions with distribution shown in DB
- Adds category + category_explanation columns to right_wing_motions
main
Sven Geboers 1 month ago
parent f94edc3d04
commit fbf92c82cf
  1. 347
      analysis/right_wing/derive_categories.py
  2. 143
      analysis/right_wing/extremity_scorer.py
  3. 134
      analysis/right_wing/sentiment_analysis.py

@ -0,0 +1,347 @@
#!/usr/bin/env python3
"""Derive policy categories for right-wing motions using LLM.
Two-phase approach:
1. Derive taxonomy from a sample (discover categories from data)
2. Apply categories to all motions using the derived taxonomy
Usage:
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample 50
uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample -1
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Any
import duckdb
# Put the repository root (three levels above this file) on sys.path so the
# project-level imports that follow resolve when this file is run as a script.
ROOT = Path(__file__).parent.parent.parent.resolve()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from ai_provider import ProviderError, chat_completion_json_parallel
from analysis.config import config
# Configure the root logger at import time (INFO level, timestamped lines) and
# create the module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# Phase 1 response schema: `category` is an unconstrained string (no enum), so
# the model is free to propose labels while the taxonomy is being discovered.
_DERIVE_PROPERTIES = {
    "category": {
        "type": "string",
        "description": (
            "Policy domain/category in Dutch. Use short lowercase labels like "
            "'asiel', 'klimaat', 'corona', 'lhbtq', 'veiligheid', 'defensie', "
            "'economie', 'landbouw', 'zorg', 'onderwijs', 'overig'"
        ),
    },
    "explanation": {
        "type": "string",
        "description": "Very short explanation why this category fits",
    },
}

DERIVE_SCHEMA = {
    "name": "derive_category",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": _DERIVE_PROPERTIES,
        # Every declared property is mandatory.
        "required": list(_DERIVE_PROPERTIES),
        "additionalProperties": False,
    },
}
# Phase 2 response schema template: same shape as the discovery schema, but
# `category` carries an `enum`. The empty enum and the "{categories}"
# description are placeholders that apply_categories() overwrites on a copy.
_APPLY_PROPERTIES = {
    "category": {
        "type": "string",
        "description": "Category must be one of: {categories}",
        "enum": [],  # filled dynamically
    },
    "explanation": {
        "type": "string",
        "description": "Very short explanation why this category fits",
    },
}

APPLY_SCHEMA_TEMPLATE = {
    "name": "apply_category",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": _APPLY_PROPERTIES,
        # Every declared property is mandatory.
        "required": list(_APPLY_PROPERTIES),
        "additionalProperties": False,
    },
}
PROMPT_TEMPLATE = """Welk beleidsdomein hoort bij de volgende motie uit het Nederlandse parlement?
Titel: {title}
Tekst: {text}
Leg uit in 1 zin waarom dit beleidsdomem past."""
def _build_prompt(title: str, body_text: str | None) -> str:
text = body_text or title or ""
if len(text) > 600:
text = text[:600] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text)
def _normalize_category(raw: str) -> str:
"""Normalize LLM category output to consistent labels."""
raw = raw.lower().strip()
# Map common variants
mapping = {
"asiel": "asiel/vreemdelingen",
"vreemdelingen": "asiel/vreemdelingen",
"immigratie": "asiel/vreemdelingen",
"migratie": "asiel/vreemdelingen",
"klimaat": "klimaat/milieu",
"milieu": "klimaat/milieu",
"stikstof": "klimaat/milieu",
"corona": "corona/pandemie",
"pandemie": "corona/pandemie",
"covid": "corona/pandemie",
"lhbtq": "lhbtq/rechten",
"lhbti": "lhbtq/rechten",
"lgbt": "lhbtq/rechten",
"veiligheid": "veiligheid/justitie",
"justitie": "veiligheid/justitie",
"strafrecht": "veiligheid/justitie",
"defensie": "defensie/buitenland",
"buitenland": "defensie/buitenland",
"buitenlandse zaken": "defensie/buitenland",
"economie": "economie/belasting",
"belasting": "economie/belasting",
"financiën": "economie/belasting",
"landbouw": "landbouw/stikstof",
"boeren": "landbouw/stikstof",
"zorg": "zorg/gezondheid",
"gezondheid": "zorg/gezondheid",
"onderwijs": "onderwijs/cultuur",
"cultuur": "onderwijs/cultuur",
"energie": "energie",
"kernenergie": "energie",
"sociaal": "sociaal/jeugd",
"jeugd": "sociaal/jeugd",
"wonen": "wonen/ruimtelijk",
"ruimtelijk": "wonen/ruimtelijk",
"verkeer": "verkeer/infrastructuur",
"infrastructuur": "verkeer/infrastructuur",
}
return mapping.get(raw, raw)
def derive_taxonomy(
    db_path: str = "data/motions.db",
    derive_sample: int = 30,
    batch_size: int = 10,
) -> list[str]:
    """Phase 1: derive a category taxonomy from a random sample of motions.

    Each sampled motion is sent to the LLM with the open-ended DERIVE_SCHEMA;
    the returned labels are normalized and counted.

    Args:
        db_path: Path to the DuckDB database file.
        derive_sample: Number of classified motions to sample.
        batch_size: Number of motions sent per parallel LLM call.

    Returns:
        Labels that occurred at least twice, most common first, with
        "overig" appended when it did not make the cut on its own.

    Raises:
        FileNotFoundError: If ``db_path`` does not exist. Without this check
            duckdb would create a new, empty database at that path.
    """
    db = Path(db_path)
    if not db.exists():
        raise FileNotFoundError(f"Database not found: {db}")

    con = duckdb.connect(str(db))
    try:
        # int() guarantees only an integer literal is interpolated into SQL.
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            LIMIT {int(derive_sample)}
            """
        ).fetchall()
        logger.info("Phase 1: deriving taxonomy from %d motions...", len(rows))

        categories: list[str] = []
        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            message_batches = [
                [{"role": "user", "content": _build_prompt(title, body_text)}]
                for _motion_id, title, body_text in batch
            ]
            try:
                results = chat_completion_json_parallel(
                    message_batches,
                    model=config.QWEN_MODEL,
                    json_schema=DERIVE_SCHEMA,
                    max_workers=5,
                )
            except ProviderError as exc:
                # Best effort: a failed batch only shrinks the sample.
                logger.error("Batch failed: %s", exc)
                continue

            for res in results:
                if not isinstance(res, dict):
                    continue
                cat = res.get("category", "overig")
                # A null/non-string label is malformed output; skip it
                # instead of crashing the whole derivation.
                if not isinstance(cat, str):
                    continue
                categories.append(_normalize_category(cat))

        # Count and threshold
        counts = Counter(categories)
        logger.info("Raw category counts: %s", dict(counts.most_common()))

        # Keep categories with >= 2 occurrences, plus always keep 'overig'
        taxonomy = [cat for cat, cnt in counts.most_common() if cnt >= 2]
        if "overig" not in taxonomy:
            taxonomy.append("overig")

        logger.info("Derived taxonomy (%d categories): %s", len(taxonomy), taxonomy)
        return taxonomy
    finally:
        con.close()
def apply_categories(
    db_path: str = "data/motions.db",
    taxonomy: list[str] | None = None,
    apply_sample: int = 50,
    batch_size: int = 10,
) -> dict[str, Any]:
    """Phase 2: assign one taxonomy category to each sampled motion.

    Motions are sent to the LLM with a schema whose ``category`` enum is the
    taxonomy. Valid answers are written to the ``category`` and
    ``category_explanation`` columns of ``right_wing_motions`` (the columns
    are added when missing).

    Args:
        db_path: Path to the DuckDB database file.
        taxonomy: Allowed category labels. ``None`` or an empty list selects
            the built-in default taxonomy.
        apply_sample: Number of classified motions to categorize, picked at
            random; a negative value means all classified motions.
        batch_size: Number of motions sent per parallel LLM call.

    Returns:
        A dict with ``scored`` and ``failed`` counts, the ``taxonomy`` used
        and the ``category_distribution`` of the stored categories.

    Raises:
        FileNotFoundError: If ``db_path`` does not exist. Without this check
            duckdb would create a new, empty database at that path.
    """
    db = Path(db_path)
    if not db.exists():
        raise FileNotFoundError(f"Database not found: {db}")

    con = duckdb.connect(str(db))
    try:
        if not taxonomy:
            # Default taxonomy; an empty list is also replaced because an
            # empty enum would leave the model with no valid answer.
            taxonomy = [
                "asiel/vreemdelingen",
                "klimaat/milieu",
                "corona/pandemie",
                "lhbtq/rechten",
                "veiligheid/justitie",
                "defensie/buitenland",
                "economie/belasting",
                "landbouw/stikstof",
                "zorg/gezondheid",
                "onderwijs/cultuur",
                "energie",
                "sociaal/jeugd",
                "overig",
            ]

        # Build schema with enum; the JSON round-trip deep-copies the
        # template so the module-level constant is never mutated.
        schema = json.loads(json.dumps(APPLY_SCHEMA_TEMPLATE))
        category_spec = schema["schema"]["properties"]["category"]
        category_spec["enum"] = taxonomy
        category_spec["description"] = f"Category must be one of: {', '.join(taxonomy)}"

        # int() guarantees only an integer literal is interpolated into SQL.
        limit_clause = "" if apply_sample < 0 else f"LIMIT {int(apply_sample)}"
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.classified = TRUE
            ORDER BY RANDOM()
            {limit_clause}
            """
        ).fetchall()
        logger.info("Phase 2: applying %d categories to %d motions...", len(taxonomy), len(rows))

        # Add category columns if missing (index 1 of table_info is the name)
        cols = {c[1] for c in con.execute("PRAGMA table_info(right_wing_motions)").fetchall()}
        if "category" not in cols:
            con.execute("ALTER TABLE right_wing_motions ADD COLUMN category VARCHAR")
        if "category_explanation" not in cols:
            con.execute("ALTER TABLE right_wing_motions ADD COLUMN category_explanation VARCHAR")

        allowed = set(taxonomy)
        scored = 0
        failed = 0
        category_counts: Counter[str] = Counter()

        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            motion_ids = [row[0] for row in batch]
            message_batches = [
                [{"role": "user", "content": _build_prompt(title, body_text)}]
                for _motion_id, title, body_text in batch
            ]
            try:
                results = chat_completion_json_parallel(
                    message_batches,
                    model=config.QWEN_MODEL,
                    json_schema=schema,
                    max_workers=5,
                )
            except ProviderError as exc:
                logger.error("Batch failed: %s", exc)
                failed += len(batch)
                continue

            for mid, res in zip(motion_ids, results):
                cat = res.get("category") if isinstance(res, dict) else None
                if not isinstance(cat, str) or cat not in allowed:
                    # Invalid answers are counted and left unwritten so the
                    # table never holds a guessed category.
                    logger.warning("Motion %s: invalid category response: %r", mid, res)
                    failed += 1
                    continue
                con.execute(
                    "UPDATE right_wing_motions SET category = ?, category_explanation = ? WHERE motion_id = ?",
                    (cat, res.get("explanation", ""), mid),
                )
                category_counts[cat] += 1
                scored += 1

        con.commit()
        logger.info("Applied categories to %d motions, %d failures", scored, failed)
        return {
            "scored": scored,
            "failed": failed,
            "taxonomy": taxonomy,
            "category_distribution": dict(category_counts.most_common()),
        }
    finally:
        con.close()
def main() -> int:
    """Command-line entry point: derive a taxonomy, apply it, print a summary.

    With ``--skip-derive`` the derivation phase is skipped and no taxonomy is
    passed on, so apply_categories() uses its default one.

    Returns:
        Process exit code (always 0).
    """
    cli = argparse.ArgumentParser(description="Derive and apply policy categories")
    cli.add_argument("--db", default="data/motions.db")
    cli.add_argument(
        "--derive-sample",
        type=int,
        default=30,
        help="Sample size for taxonomy derivation",
    )
    cli.add_argument(
        "--apply-sample",
        type=int,
        default=50,
        help="Sample size for category application (-1 for all)",
    )
    cli.add_argument("--batch-size", type=int, default=10)
    cli.add_argument(
        "--skip-derive",
        action="store_true",
        help="Skip derivation, use default taxonomy",
    )
    opts = cli.parse_args()

    taxonomy = (
        None
        if opts.skip_derive
        else derive_taxonomy(
            db_path=opts.db,
            derive_sample=opts.derive_sample,
            batch_size=opts.batch_size,
        )
    )

    summary = apply_categories(
        db_path=opts.db,
        taxonomy=taxonomy,
        apply_sample=opts.apply_sample,
        batch_size=opts.batch_size,
    )
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

@ -1,6 +1,8 @@
#!/usr/bin/env python3
"""Policy extremity scorer: LLM-based radicalism scoring for right-wing motions.
Scores BOTH the original motion text and the layman explanation separately.
Usage:
uv run python analysis/right_wing/extremity_scorer.py --sample 50
uv run python analysis/right_wing/extremity_scorer.py --sample -1 # all motions
@ -11,7 +13,6 @@ from __future__ import annotations
import argparse
import json
import logging
import os
import sys
from pathlib import Path
from typing import Any
@ -28,51 +29,70 @@ from analysis.config import config
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# JSON schema enforcing the expected response shape
EXTREMITY_SCHEMA = {
"name": "extremity_score",
"strict": True,
"schema": {
"type": "object",
"properties": {
"score": {
"text_score": {
"type": "integer",
"description": "Radicalism of the original motion text (1=mild to 5=extreme)",
"minimum": 1,
"maximum": 5,
},
"text_explanation": {
"type": "string",
"description": "Why the motion text got this score (Dutch)",
},
"layman_score": {
"type": "integer",
"description": "Radicalism score from 1 (mild/technical) to 5 (extreme/fundamental)",
"description": "Radicalism of the layman explanation (1=mild to 5=extreme)",
"minimum": 1,
"maximum": 5,
},
"explanation": {
"layman_explanation": {
"type": "string",
"description": "Short explanation in Dutch of why this score was given",
"description": "Why the layman explanation got this score (Dutch)",
},
},
"required": ["score", "explanation"],
"required": ["text_score", "text_explanation", "layman_score", "layman_explanation"],
"additionalProperties": False,
},
}
PROMPT_TEMPLATE = """Dit is een motie in het Nederlandse parlement.
PROMPT_TEMPLATE = """Beoordeel de radicalisme van de volgende motie op twee manieren:
1) Het ORIGINELE motietekst:
Titel: {title}
Tekst: {text}
Wat vraagt deze motie concreet? Beoordeel hoe radicaal dit voorstel is op een schaal van 1 (mild/technisch) tot 5 (extreem/fundamenteel). Geef alleen het cijfer en een korte verklaring in het Nederlands."""
2) De VEREENVOUDIGDE uitleg:
{layman}
def _build_prompt(title: str, body_text: str | None) -> str:
text = body_text or title or ""
# Truncate body_text to keep prompt size reasonable
if len(text) > 800:
text = text[:800] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text)
Geef voor ELKE versie een score van 1 (mild/technisch) tot 5 (extreem/fundamenteel) plus een korte verklaring in het Nederlands."""
def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | None]) -> list[dict[str, Any]]:
def _build_prompt(title: str, body_text: str | None, layman: str | None) -> str:
text = body_text or title or ""
if len(text) > 500:
text = text[:500] + "..."
layman = layman or "(geen vereenvoudigde uitleg beschikbaar)"
if len(layman) > 400:
layman = layman[:400] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text, layman=layman)
def _score_batch(
motion_ids: list[int],
titles: list[str],
texts: list[str | None],
laymen: list[str | None],
) -> list[dict[str, Any]]:
"""Score a batch of motions in parallel via LLM."""
message_batches = []
for title, text in zip(titles, texts):
prompt = _build_prompt(title, text)
for title, text, layman in zip(titles, texts, laymen):
prompt = _build_prompt(title, text, layman)
message_batches.append([{"role": "user", "content": prompt}])
try:
@ -84,20 +104,44 @@ def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | Non
)
except ProviderError as exc:
logger.error("Batch API call failed: %s", exc)
return [{"score": None, "explanation": None, "error": str(exc)}] * len(motion_ids)
return [{
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": str(exc),
}] * len(motion_ids)
# Validate each result
validated = []
for res in results:
if not isinstance(res, dict):
validated.append({"score": None, "explanation": None, "error": "non-dict response"})
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": "non-dict response",
})
continue
ts = res.get("text_score")
te = res.get("text_explanation")
ls = res.get("layman_score")
le = res.get("layman_explanation")
if not isinstance(ts, int) or ts < 1 or ts > 5:
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": f"invalid text_score: {ts}",
})
continue
score = res.get("score")
explanation = res.get("explanation")
if not isinstance(score, int) or score < 1 or score > 5:
validated.append({"score": None, "explanation": None, "error": f"invalid score: {score}"})
if not isinstance(ls, int) or ls < 1 or ls > 5:
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": f"invalid layman_score: {ls}",
})
continue
validated.append({"score": score, "explanation": explanation, "error": None})
validated.append({
"text_score": ts, "text_explanation": te,
"layman_score": ls, "layman_explanation": le,
"error": None,
})
return validated
@ -106,27 +150,21 @@ def score_motions(
sample_size: int = 50,
batch_size: int = 10,
) -> dict[str, Any]:
"""Score right-wing motions and store results.
Args:
sample_size: Number of motions to score. -1 = all classified motions.
"""
"""Score right-wing motions and store results."""
db = Path(db_path)
if not db.exists():
raise FileNotFoundError(f"Database not found: {db}")
con = duckdb.connect(str(db))
try:
# Ensure tables exist
tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()}
if "right_wing_motions" not in tables:
raise RuntimeError("Run classify_motions.py first.")
# Load classified motions
limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}"
rows = con.execute(
f"""
SELECT r.motion_id, m.title, m.body_text
SELECT r.motion_id, m.title, m.body_text, m.layman_explanation
FROM right_wing_motions r
JOIN motions m ON r.motion_id = m.id
WHERE r.classified = TRUE
@ -141,14 +179,15 @@ def score_motions(
logger.info("Scoring %d motions in batches of %d...", len(rows), batch_size)
# Create output table
con.execute("DROP TABLE IF EXISTS extremity_scores")
con.execute(
"""
CREATE TABLE extremity_scores (
motion_id INTEGER PRIMARY KEY,
score INTEGER,
explanation VARCHAR,
text_score INTEGER,
text_explanation VARCHAR,
layman_score INTEGER,
layman_explanation VARCHAR,
error VARCHAR
)
"""
@ -162,32 +201,44 @@ def score_motions(
motion_ids = [r[0] for r in batch]
titles = [r[1] for r in batch]
texts = [r[2] for r in batch]
laymen = [r[3] for r in batch]
logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch))
results = _score_batch(motion_ids, titles, texts)
results = _score_batch(motion_ids, titles, texts, laymen)
for mid, res in zip(motion_ids, results):
con.execute(
"INSERT INTO extremity_scores (motion_id, score, explanation, error) VALUES (?, ?, ?, ?)",
(mid, res.get("score"), res.get("explanation"), res.get("error")),
"""
INSERT INTO extremity_scores
(motion_id, text_score, text_explanation, layman_score, layman_explanation, error)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
mid,
res.get("text_score"),
res.get("text_explanation"),
res.get("layman_score"),
res.get("layman_explanation"),
res.get("error"),
),
)
if res.get("score") is not None:
if res.get("error") is None:
scored += 1
else:
failed += 1
con.commit()
# Update yearly summary with average extremity
# Update yearly summary with average extremity (using text_score as primary)
con.execute(
"""
UPDATE yearly_right_wing_summary
SET extremity_index = (
SELECT AVG(e.score)
SELECT AVG(e.text_score)
FROM extremity_scores e
JOIN right_wing_motions r ON e.motion_id = r.motion_id
WHERE r.year = yearly_right_wing_summary.year
AND e.score IS NOT NULL
AND e.text_score IS NOT NULL
)
"""
)

@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""Sentiment analysis pipeline: Dutch sentiment scoring for right-wing motions.
Uses LLM batch calls (fallback when no local Dutch sentiment model is available).
Maps outputs to [-1, 1] scale where negative = hostile/aggressive, positive = constructive.
Scores BOTH the original motion text and the layman explanation separately.
Uses LLM batch calls. Maps outputs to [-1, 1] scale.
Usage:
uv run python analysis/right_wing/sentiment_analysis.py --sample 50
@ -36,43 +36,64 @@ SENTIMENT_SCHEMA = {
"schema": {
"type": "object",
"properties": {
"score": {
"text_score": {
"type": "number",
"description": "Sentiment score from -1 (very negative/hostile) to 1 (very positive/constructive)",
"description": "Sentiment of original motion text from -1 (hostile) to 1 (constructive)",
"minimum": -1,
"maximum": 1,
},
"explanation": {
"text_explanation": {
"type": "string",
"description": "Short explanation in Dutch of why this sentiment was given",
"description": "Why the motion text got this score (Dutch)",
},
"layman_score": {
"type": "number",
"description": "Sentiment of layman explanation from -1 (hostile) to 1 (constructive)",
"minimum": -1,
"maximum": 1,
},
"layman_explanation": {
"type": "string",
"description": "Why the layman explanation got this score (Dutch)",
},
},
"required": ["score", "explanation"],
"required": ["text_score", "text_explanation", "layman_score", "layman_explanation"],
"additionalProperties": False,
},
}
PROMPT_TEMPLATE = """Beoordeel de sentiment van de volgende motie uit het Nederlandse parlement.
PROMPT_TEMPLATE = """Beoordeel de sentiment van de volgende motie op twee manieren:
1) Het ORIGINELE motietekst:
Titel: {title}
Tekst: {text}
Geef een sentiment score van -1 (zeer negatief, agressief, vijandig) tot 1 (zeer positief, constructief, coöperatief). Geef ook een korte verklaring in het Nederlands."""
2) De VEREENVOUDIGDE uitleg:
{layman}
Geef voor ELKE versie een sentiment score van -1 (zeer negatief, agressief, vijandig) tot 1 (zeer positief, constructief, coöperatief) plus een korte verklaring in het Nederlands."""
def _build_prompt(title: str, body_text: str | None) -> str:
def _build_prompt(title: str, body_text: str | None, layman: str | None) -> str:
text = body_text or title or ""
if len(text) > 400:
text = text[:400] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text)
def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | None]) -> list[dict[str, Any]]:
layman = layman or "(geen vereenvoudigde uitleg beschikbaar)"
if len(layman) > 300:
layman = layman[:300] + "..."
return PROMPT_TEMPLATE.format(title=title or "", text=text, layman=layman)
def _score_batch(
motion_ids: list[int],
titles: list[str],
texts: list[str | None],
laymen: list[str | None],
) -> list[dict[str, Any]]:
"""Score sentiment for a batch of motions in parallel via LLM."""
message_batches = []
for title, text in zip(titles, texts):
prompt = _build_prompt(title, text)
for title, text, layman in zip(titles, texts, laymen):
prompt = _build_prompt(title, text, layman)
message_batches.append([{"role": "user", "content": prompt}])
try:
@ -84,19 +105,44 @@ def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | Non
)
except ProviderError as exc:
logger.error("Batch API call failed: %s", exc)
return [{"score": None, "explanation": None, "error": str(exc)}] * len(motion_ids)
return [{
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": str(exc),
}] * len(motion_ids)
validated = []
for res in results:
if not isinstance(res, dict):
validated.append({"score": None, "explanation": None, "error": "non-dict response"})
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": "non-dict response",
})
continue
ts = res.get("text_score")
te = res.get("text_explanation")
ls = res.get("layman_score")
le = res.get("layman_explanation")
if not isinstance(ts, (int, float)) or ts < -1 or ts > 1:
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": f"invalid text_score: {ts}",
})
continue
score = res.get("score")
explanation = res.get("explanation")
if not isinstance(score, (int, float)) or score < -1 or score > 1:
validated.append({"score": None, "explanation": None, "error": f"invalid score: {score}"})
if not isinstance(ls, (int, float)) or ls < -1 or ls > 1:
validated.append({
"text_score": None, "text_explanation": None,
"layman_score": None, "layman_explanation": None,
"error": f"invalid layman_score: {ls}",
})
continue
validated.append({"score": float(score), "explanation": explanation, "error": None})
validated.append({
"text_score": float(ts), "text_explanation": te,
"layman_score": float(ls), "layman_explanation": le,
"error": None,
})
return validated
@ -119,7 +165,7 @@ def analyze_sentiment(
limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}"
rows = con.execute(
f"""
SELECT r.motion_id, r.year, m.title, m.body_text
SELECT r.motion_id, r.year, m.title, m.body_text, m.layman_explanation
FROM right_wing_motions r
JOIN motions m ON r.motion_id = m.id
WHERE r.classified = TRUE
@ -140,8 +186,10 @@ def analyze_sentiment(
CREATE TABLE sentiment_scores (
motion_id INTEGER PRIMARY KEY,
year INTEGER,
score DOUBLE,
explanation VARCHAR,
text_score DOUBLE,
text_explanation VARCHAR,
layman_score DOUBLE,
layman_explanation VARCHAR,
error VARCHAR
)
"""
@ -156,16 +204,26 @@ def analyze_sentiment(
years = [r[1] for r in batch]
titles = [r[2] for r in batch]
texts = [r[3] for r in batch]
laymen = [r[4] for r in batch]
logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch))
results = _score_batch(motion_ids, titles, texts)
results = _score_batch(motion_ids, titles, texts, laymen)
for mid, year, res in zip(motion_ids, years, results):
con.execute(
"INSERT INTO sentiment_scores (motion_id, year, score, explanation, error) VALUES (?, ?, ?, ?, ?)",
(mid, year, res.get("score"), res.get("explanation"), res.get("error")),
"""
INSERT INTO sentiment_scores
(motion_id, year, text_score, text_explanation, layman_score, layman_explanation, error)
VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(
mid, year,
res.get("text_score"), res.get("text_explanation"),
res.get("layman_score"), res.get("layman_explanation"),
res.get("error"),
),
)
if res.get("score") is not None:
if res.get("error") is None:
scored += 1
else:
failed += 1
@ -173,7 +231,7 @@ def analyze_sentiment(
con.commit()
# Add sentiment columns to yearly summary if not present
cols = {c[0] for c in con.execute("PRAGMA table_info(yearly_right_wing_summary)").fetchall()}
cols = {c[1] for c in con.execute("PRAGMA table_info(yearly_right_wing_summary)").fetchall()}
if "avg_sentiment" not in cols:
con.execute("ALTER TABLE yearly_right_wing_summary ADD COLUMN avg_sentiment DOUBLE")
if "sentiment_std" not in cols:
@ -185,22 +243,22 @@ def analyze_sentiment(
"""
UPDATE yearly_right_wing_summary
SET avg_sentiment = (
SELECT AVG(s.score)
SELECT AVG(s.text_score)
FROM sentiment_scores s
WHERE s.year = yearly_right_wing_summary.year
AND s.score IS NOT NULL
AND s.text_score IS NOT NULL
),
sentiment_std = (
SELECT STDDEV(s.score)
SELECT STDDEV(s.text_score)
FROM sentiment_scores s
WHERE s.year = yearly_right_wing_summary.year
AND s.score IS NOT NULL
AND s.text_score IS NOT NULL
),
pct_strongly_negative = (
SELECT COUNT(CASE WHEN s.score < -0.5 THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0)
SELECT COUNT(CASE WHEN s.text_score < -0.5 THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0)
FROM sentiment_scores s
WHERE s.year = yearly_right_wing_summary.year
AND s.score IS NOT NULL
AND s.text_score IS NOT NULL
)
"""
)

Loading…
Cancel
Save