diff --git a/analysis/right_wing/derive_categories.py b/analysis/right_wing/derive_categories.py new file mode 100644 index 0000000..ff25685 --- /dev/null +++ b/analysis/right_wing/derive_categories.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +"""Derive policy categories for right-wing motions using LLM. + +Two-phase approach: + 1. Derive taxonomy from a sample (discover categories from data) + 2. Apply categories to all motions using the derived taxonomy + +Usage: + uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample 50 + uv run python analysis/right_wing/derive_categories.py --derive-sample 30 --apply-sample -1 +""" + +from __future__ import annotations + +import argparse +import json +import logging +import re +import sys +from collections import Counter +from pathlib import Path +from typing import Any + +import duckdb + +ROOT = Path(__file__).parent.parent.parent.resolve() +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from ai_provider import ProviderError, chat_completion_json_parallel +from analysis.config import config + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + +# Phase 1: open-ended schema to discover categories +DERIVE_SCHEMA = { + "name": "derive_category", + "strict": True, + "schema": { + "type": "object", + "properties": { + "category": { + "type": "string", + "description": "Policy domain/category in Dutch. 
Use short lowercase labels like 'asiel', 'klimaat', 'corona', 'lhbtq', 'veiligheid', 'defensie', 'economie', 'landbouw', 'zorg', 'onderwijs', 'overig'", + }, + "explanation": { + "type": "string", + "description": "Very short explanation why this category fits", + }, + }, + "required": ["category", "explanation"], + "additionalProperties": False, + }, +} + +# Phase 2: constrained schema using the derived taxonomy +APPLY_SCHEMA_TEMPLATE = { + "name": "apply_category", + "strict": True, + "schema": { + "type": "object", + "properties": { + "category": { + "type": "string", + "description": "Category must be one of: {categories}", + "enum": [], # filled dynamically + }, + "explanation": { + "type": "string", + "description": "Very short explanation why this category fits", + }, + }, + "required": ["category", "explanation"], + "additionalProperties": False, + }, +} + +PROMPT_TEMPLATE = """Welk beleidsdomein hoort bij de volgende motie uit het Nederlandse parlement? + +Titel: {title} + +Tekst: {text} + +Leg uit in 1 zin waarom dit beleidsdomem past.""" + + +def _build_prompt(title: str, body_text: str | None) -> str: + text = body_text or title or "" + if len(text) > 600: + text = text[:600] + "..." 
+ return PROMPT_TEMPLATE.format(title=title or "", text=text) + + +def _normalize_category(raw: str) -> str: + """Normalize LLM category output to consistent labels.""" + raw = raw.lower().strip() + # Map common variants + mapping = { + "asiel": "asiel/vreemdelingen", + "vreemdelingen": "asiel/vreemdelingen", + "immigratie": "asiel/vreemdelingen", + "migratie": "asiel/vreemdelingen", + "klimaat": "klimaat/milieu", + "milieu": "klimaat/milieu", + "stikstof": "klimaat/milieu", + "corona": "corona/pandemie", + "pandemie": "corona/pandemie", + "covid": "corona/pandemie", + "lhbtq": "lhbtq/rechten", + "lhbti": "lhbtq/rechten", + "lgbt": "lhbtq/rechten", + "veiligheid": "veiligheid/justitie", + "justitie": "veiligheid/justitie", + "strafrecht": "veiligheid/justitie", + "defensie": "defensie/buitenland", + "buitenland": "defensie/buitenland", + "buitenlandse zaken": "defensie/buitenland", + "economie": "economie/belasting", + "belasting": "economie/belasting", + "financiën": "economie/belasting", + "landbouw": "landbouw/stikstof", + "boeren": "landbouw/stikstof", + "zorg": "zorg/gezondheid", + "gezondheid": "zorg/gezondheid", + "onderwijs": "onderwijs/cultuur", + "cultuur": "onderwijs/cultuur", + "energie": "energie", + "kernenergie": "energie", + "sociaal": "sociaal/jeugd", + "jeugd": "sociaal/jeugd", + "wonen": "wonen/ruimtelijk", + "ruimtelijk": "wonen/ruimtelijk", + "verkeer": "verkeer/infrastructuur", + "infrastructuur": "verkeer/infrastructuur", + } + return mapping.get(raw, raw) + + +def derive_taxonomy( + db_path: str = "data/motions.db", + derive_sample: int = 30, + batch_size: int = 10, +) -> list[str]: + """Phase 1: derive category taxonomy from a sample of motions.""" + db = Path(db_path) + con = duckdb.connect(str(db)) + try: + rows = con.execute( + f""" + SELECT r.motion_id, m.title, m.body_text + FROM right_wing_motions r + JOIN motions m ON r.motion_id = m.id + WHERE r.classified = TRUE + ORDER BY RANDOM() + LIMIT {derive_sample} + """ + ).fetchall() + + 
# Taxonomy used by apply_categories() when no derived taxonomy is supplied.
_DEFAULT_TAXONOMY = (
    "asiel/vreemdelingen",
    "klimaat/milieu",
    "corona/pandemie",
    "lhbtq/rechten",
    "veiligheid/justitie",
    "defensie/buitenland",
    "economie/belasting",
    "landbouw/stikstof",
    "zorg/gezondheid",
    "onderwijs/cultuur",
    "energie",
    "sociaal/jeugd",
    "overig",
)


def _open_existing_db(db_path: str) -> "duckdb.DuckDBPyConnection":
    """Open *db_path* with DuckDB, refusing to create it when it is missing.

    ``duckdb.connect`` creates a new, empty database file for an unknown path,
    which would leave a stray file behind and fail later with a confusing
    "table does not exist" error.

    Raises:
        FileNotFoundError: If *db_path* does not exist.
    """
    db = Path(db_path)
    if not db.exists():
        raise FileNotFoundError(f"Database not found: {db}")
    return duckdb.connect(str(db))


def _fetch_classified_motions(con: Any, limit: int) -> list[tuple[Any, ...]]:
    """Return randomly ordered ``(motion_id, title, body_text)`` rows.

    Args:
        con: Open DuckDB connection.
        limit: Maximum number of rows; a negative value means "all rows".
    """
    # int() guarantees that only a number is ever interpolated into the SQL.
    limit_clause = "" if limit < 0 else f"LIMIT {int(limit)}"
    return con.execute(
        f"""
        SELECT r.motion_id, m.title, m.body_text
        FROM right_wing_motions r
        JOIN motions m ON r.motion_id = m.id
        WHERE r.classified = TRUE
        ORDER BY RANDOM()
        {limit_clause}
        """
    ).fetchall()


def _request_categories(batch: list[tuple[Any, ...]], schema: dict[str, Any]) -> list[Any]:
    """Ask the LLM for one structured answer per motion row in *batch*.

    ``ProviderError`` from the provider is deliberately not caught here; the
    two pipeline phases account for a failed batch differently.
    """
    message_batches = [
        [{"role": "user", "content": _build_prompt(title, text)}]
        for _motion_id, title, text in batch
    ]
    # presumably one result per message, in order -- verify against ai_provider
    return list(
        chat_completion_json_parallel(
            message_batches,
            model=config.QWEN_MODEL,
            json_schema=schema,
            max_workers=5,
        )
    )


def derive_taxonomy(
    db_path: str = "data/motions.db",
    derive_sample: int = 30,
    batch_size: int = 10,
) -> list[str]:
    """Phase 1: derive a category taxonomy from a random sample of motions.

    Args:
        db_path: Path to the DuckDB database holding ``motions`` and
            ``right_wing_motions``.
        derive_sample: Number of classified motions to sample; a negative
            value samples all of them.
        batch_size: Number of motions sent to the LLM per parallel call.

    Returns:
        Categories seen at least twice in the sample, most frequent first,
        always including ``"overig"``.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
        FileNotFoundError: If *db_path* does not exist.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    con = _open_existing_db(db_path)
    try:
        rows = _fetch_classified_motions(con, derive_sample)
        logger.info("Phase 1: deriving taxonomy from %d motions...", len(rows))

        categories: list[str] = []
        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            try:
                results = _request_categories(batch, DERIVE_SCHEMA)
            except ProviderError as exc:
                # Best effort: a failed batch only shrinks the sample.
                logger.error("Batch failed: %s", exc)
                continue

            for res in results:
                if not isinstance(res, dict):
                    continue
                cat = res.get("category")
                # A present-but-null or blank category must not crash the
                # normalizer or create an empty label.
                if not isinstance(cat, str) or not cat.strip():
                    cat = "overig"
                categories.append(_normalize_category(cat))

        counts = Counter(categories)
        logger.info("Raw category counts: %s", dict(counts.most_common()))

        # One-off labels are treated as noise; 'overig' is always available
        # as the catch-all bucket.
        taxonomy = [cat for cat, cnt in counts.most_common() if cnt >= 2]
        if "overig" not in taxonomy:
            taxonomy.append("overig")

        logger.info("Derived taxonomy (%d categories): %s", len(taxonomy), taxonomy)
        return taxonomy
    finally:
        con.close()


def apply_categories(
    db_path: str = "data/motions.db",
    taxonomy: list[str] | None = None,
    apply_sample: int = 50,
    batch_size: int = 10,
) -> dict[str, Any]:
    """Phase 2: label motions with a category from *taxonomy*.

    Results are written to the ``category`` and ``category_explanation``
    columns of ``right_wing_motions`` (added when missing).

    Args:
        db_path: Path to the DuckDB database.
        taxonomy: Allowed category labels.  ``None`` or an empty list selects
            the built-in default taxonomy.
        apply_sample: Number of classified motions to label; a negative value
            labels all of them.
        batch_size: Number of motions sent to the LLM per parallel call.

    Returns:
        A dict with ``scored``, ``failed``, ``taxonomy`` and
        ``category_distribution``.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
        FileNotFoundError: If *db_path* does not exist.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    con = _open_existing_db(db_path)
    try:
        # An empty taxonomy would produce an empty, unsatisfiable enum.
        taxonomy = list(taxonomy) if taxonomy else list(_DEFAULT_TAXONOMY)
        allowed = set(taxonomy)

        # JSON round-trip = deep copy, so the module-level template stays pristine.
        schema = json.loads(json.dumps(APPLY_SCHEMA_TEMPLATE))
        category_spec = schema["schema"]["properties"]["category"]
        category_spec["enum"] = taxonomy
        category_spec["description"] = f"Category must be one of: {', '.join(taxonomy)}"

        rows = _fetch_classified_motions(con, apply_sample)
        logger.info("Phase 2: applying %d categories to %d motions...", len(taxonomy), len(rows))

        # Column 1 of each PRAGMA table_info row is the column name.
        existing = {c[1] for c in con.execute("PRAGMA table_info(right_wing_motions)").fetchall()}
        for column in ("category", "category_explanation"):
            if column not in existing:
                con.execute(f"ALTER TABLE right_wing_motions ADD COLUMN {column} VARCHAR")

        scored = 0
        failed = 0
        category_counts: Counter[str] = Counter()

        for start in range(0, len(rows), batch_size):
            batch = rows[start : start + batch_size]
            try:
                results = _request_categories(batch, schema)
            except ProviderError as exc:
                logger.error("Batch failed: %s", exc)
                failed += len(batch)
                continue

            # zip() below stops at the shorter input; motions without any
            # result must still show up in the failure count.
            unanswered = len(batch) - len(results)
            if unanswered > 0:
                logger.warning("%d motions in batch received no response", unanswered)
                failed += unanswered

            for (motion_id, _title, _text), res in zip(batch, results):
                category = res.get("category") if isinstance(res, dict) else None
                if not isinstance(category, str) or category not in allowed:
                    # Leave the row untouched rather than storing a guess.
                    logger.warning("Skipping motion %s: unusable LLM response %r", motion_id, res)
                    failed += 1
                    continue

                con.execute(
                    "UPDATE right_wing_motions SET category = ?, category_explanation = ? WHERE motion_id = ?",
                    (category, res.get("explanation", ""), motion_id),
                )
                category_counts[category] += 1
                scored += 1

            # Persist progress after every batch so a later failure keeps it.
            con.commit()

        logger.info("Applied categories to %d motions, %d failures", scored, failed)
        return {
            "scored": scored,
            "failed": failed,
            "taxonomy": taxonomy,
            "category_distribution": dict(category_counts.most_common()),
        }
    finally:
        con.close()


def main() -> int:
    """Command-line entry point: derive a taxonomy, apply it, print a JSON summary.

    Returns:
        Process exit code (always 0; errors propagate as exceptions).
    """
    parser = argparse.ArgumentParser(description="Derive and apply policy categories")
    parser.add_argument("--db", default="data/motions.db")
    parser.add_argument("--derive-sample", type=int, default=30, help="Sample size for taxonomy derivation")
    parser.add_argument("--apply-sample", type=int, default=50, help="Sample size for category application (-1 for all)")
    parser.add_argument("--batch-size", type=int, default=10)
    parser.add_argument("--skip-derive", action="store_true", help="Skip derivation, use default taxonomy")
    args = parser.parse_args()

    if args.skip_derive:
        # None makes apply_categories() fall back to its default taxonomy.
        taxonomy = None
    else:
        taxonomy = derive_taxonomy(
            db_path=args.db,
            derive_sample=args.derive_sample,
            batch_size=args.batch_size,
        )

    result = apply_categories(
        db_path=args.db,
        taxonomy=taxonomy,
        apply_sample=args.apply_sample,
        batch_size=args.batch_size,
    )
    print(json.dumps(result, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+ Usage: uv run python analysis/right_wing/extremity_scorer.py --sample 50 uv run python analysis/right_wing/extremity_scorer.py --sample -1 # all motions @@ -11,7 +13,6 @@ from __future__ import annotations import argparse import json import logging -import os import sys from pathlib import Path from typing import Any @@ -28,51 +29,70 @@ from analysis.config import config logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logger = logging.getLogger(__name__) -# JSON schema enforcing the expected response shape EXTREMITY_SCHEMA = { "name": "extremity_score", "strict": True, "schema": { "type": "object", "properties": { - "score": { + "text_score": { + "type": "integer", + "description": "Radicalism of the original motion text (1=mild to 5=extreme)", + "minimum": 1, + "maximum": 5, + }, + "text_explanation": { + "type": "string", + "description": "Why the motion text got this score (Dutch)", + }, + "layman_score": { "type": "integer", - "description": "Radicalism score from 1 (mild/technical) to 5 (extreme/fundamental)", + "description": "Radicalism of the layman explanation (1=mild to 5=extreme)", "minimum": 1, "maximum": 5, }, - "explanation": { + "layman_explanation": { "type": "string", - "description": "Short explanation in Dutch of why this score was given", + "description": "Why the layman explanation got this score (Dutch)", }, }, - "required": ["score", "explanation"], + "required": ["text_score", "text_explanation", "layman_score", "layman_explanation"], "additionalProperties": False, }, } -PROMPT_TEMPLATE = """Dit is een motie in het Nederlandse parlement. +PROMPT_TEMPLATE = """Beoordeel de radicalisme van de volgende motie op twee manieren: +1) Het ORIGINELE motietekst: Titel: {title} - Tekst: {text} -Wat vraagt deze motie concreet? Beoordeel hoe radicaal dit voorstel is op een schaal van 1 (mild/technisch) tot 5 (extreem/fundamenteel). 
Geef alleen het cijfer en een korte verklaring in het Nederlands.""" - +2) De VEREENVOUDIGDE uitleg: +{layman} -def _build_prompt(title: str, body_text: str | None) -> str: - text = body_text or title or "" - # Truncate body_text to keep prompt size reasonable - if len(text) > 800: - text = text[:800] + "..." - return PROMPT_TEMPLATE.format(title=title or "", text=text) +Geef voor ELKE versie een score van 1 (mild/technisch) tot 5 (extreem/fundamenteel) plus een korte verklaring in het Nederlands.""" -def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | None]) -> list[dict[str, Any]]: +def _build_prompt(title: str, body_text: str | None, layman: str | None) -> str: + text = body_text or title or "" + if len(text) > 500: + text = text[:500] + "..." + layman = layman or "(geen vereenvoudigde uitleg beschikbaar)" + if len(layman) > 400: + layman = layman[:400] + "..." + return PROMPT_TEMPLATE.format(title=title or "", text=text, layman=layman) + + +def _score_batch( + motion_ids: list[int], + titles: list[str], + texts: list[str | None], + laymen: list[str | None], +) -> list[dict[str, Any]]: """Score a batch of motions in parallel via LLM.""" message_batches = [] - for title, text in zip(titles, texts): - prompt = _build_prompt(title, text) + for title, text, layman in zip(titles, texts, laymen): + prompt = _build_prompt(title, text, layman) message_batches.append([{"role": "user", "content": prompt}]) try: @@ -84,20 +104,44 @@ def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | Non ) except ProviderError as exc: logger.error("Batch API call failed: %s", exc) - return [{"score": None, "explanation": None, "error": str(exc)}] * len(motion_ids) + return [{ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": str(exc), + }] * len(motion_ids) - # Validate each result validated = [] for res in results: if not isinstance(res, dict): - validated.append({"score": None, 
"explanation": None, "error": "non-dict response"}) + validated.append({ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": "non-dict response", + }) + continue + ts = res.get("text_score") + te = res.get("text_explanation") + ls = res.get("layman_score") + le = res.get("layman_explanation") + if not isinstance(ts, int) or ts < 1 or ts > 5: + validated.append({ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": f"invalid text_score: {ts}", + }) continue - score = res.get("score") - explanation = res.get("explanation") - if not isinstance(score, int) or score < 1 or score > 5: - validated.append({"score": None, "explanation": None, "error": f"invalid score: {score}"}) + if not isinstance(ls, int) or ls < 1 or ls > 5: + validated.append({ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": f"invalid layman_score: {ls}", + }) continue - validated.append({"score": score, "explanation": explanation, "error": None}) + validated.append({ + "text_score": ts, "text_explanation": te, + "layman_score": ls, "layman_explanation": le, + "error": None, + }) return validated @@ -106,27 +150,21 @@ def score_motions( sample_size: int = 50, batch_size: int = 10, ) -> dict[str, Any]: - """Score right-wing motions and store results. - - Args: - sample_size: Number of motions to score. -1 = all classified motions. 
- """ + """Score right-wing motions and store results.""" db = Path(db_path) if not db.exists(): raise FileNotFoundError(f"Database not found: {db}") con = duckdb.connect(str(db)) try: - # Ensure tables exist tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()} if "right_wing_motions" not in tables: raise RuntimeError("Run classify_motions.py first.") - # Load classified motions limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}" rows = con.execute( f""" - SELECT r.motion_id, m.title, m.body_text + SELECT r.motion_id, m.title, m.body_text, m.layman_explanation FROM right_wing_motions r JOIN motions m ON r.motion_id = m.id WHERE r.classified = TRUE @@ -141,14 +179,15 @@ def score_motions( logger.info("Scoring %d motions in batches of %d...", len(rows), batch_size) - # Create output table con.execute("DROP TABLE IF EXISTS extremity_scores") con.execute( """ CREATE TABLE extremity_scores ( motion_id INTEGER PRIMARY KEY, - score INTEGER, - explanation VARCHAR, + text_score INTEGER, + text_explanation VARCHAR, + layman_score INTEGER, + layman_explanation VARCHAR, error VARCHAR ) """ @@ -162,32 +201,44 @@ def score_motions( motion_ids = [r[0] for r in batch] titles = [r[1] for r in batch] texts = [r[2] for r in batch] + laymen = [r[3] for r in batch] logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch)) - results = _score_batch(motion_ids, titles, texts) + results = _score_batch(motion_ids, titles, texts, laymen) for mid, res in zip(motion_ids, results): con.execute( - "INSERT INTO extremity_scores (motion_id, score, explanation, error) VALUES (?, ?, ?, ?)", - (mid, res.get("score"), res.get("explanation"), res.get("error")), + """ + INSERT INTO extremity_scores + (motion_id, text_score, text_explanation, layman_score, layman_explanation, error) + VALUES (?, ?, ?, ?, ?, ?) 
+ """, + ( + mid, + res.get("text_score"), + res.get("text_explanation"), + res.get("layman_score"), + res.get("layman_explanation"), + res.get("error"), + ), ) - if res.get("score") is not None: + if res.get("error") is None: scored += 1 else: failed += 1 con.commit() - # Update yearly summary with average extremity + # Update yearly summary with average extremity (using text_score as primary) con.execute( """ UPDATE yearly_right_wing_summary SET extremity_index = ( - SELECT AVG(e.score) + SELECT AVG(e.text_score) FROM extremity_scores e JOIN right_wing_motions r ON e.motion_id = r.motion_id WHERE r.year = yearly_right_wing_summary.year - AND e.score IS NOT NULL + AND e.text_score IS NOT NULL ) """ ) diff --git a/analysis/right_wing/sentiment_analysis.py b/analysis/right_wing/sentiment_analysis.py index 689c993..19d9a59 100644 --- a/analysis/right_wing/sentiment_analysis.py +++ b/analysis/right_wing/sentiment_analysis.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """Sentiment analysis pipeline: Dutch sentiment scoring for right-wing motions. -Uses LLM batch calls (fallback when no local Dutch sentiment model is available). -Maps outputs to [-1, 1] scale where negative = hostile/aggressive, positive = constructive. +Scores BOTH the original motion text and the layman explanation separately. +Uses LLM batch calls. Maps outputs to [-1, 1] scale. 
Usage: uv run python analysis/right_wing/sentiment_analysis.py --sample 50 @@ -36,43 +36,64 @@ SENTIMENT_SCHEMA = { "schema": { "type": "object", "properties": { - "score": { + "text_score": { "type": "number", - "description": "Sentiment score from -1 (very negative/hostile) to 1 (very positive/constructive)", + "description": "Sentiment of original motion text from -1 (hostile) to 1 (constructive)", "minimum": -1, "maximum": 1, }, - "explanation": { + "text_explanation": { "type": "string", - "description": "Short explanation in Dutch of why this sentiment was given", + "description": "Why the motion text got this score (Dutch)", + }, + "layman_score": { + "type": "number", + "description": "Sentiment of layman explanation from -1 (hostile) to 1 (constructive)", + "minimum": -1, + "maximum": 1, + }, + "layman_explanation": { + "type": "string", + "description": "Why the layman explanation got this score (Dutch)", }, }, - "required": ["score", "explanation"], + "required": ["text_score", "text_explanation", "layman_score", "layman_explanation"], "additionalProperties": False, }, } -PROMPT_TEMPLATE = """Beoordeel de sentiment van de volgende motie uit het Nederlandse parlement. +PROMPT_TEMPLATE = """Beoordeel de sentiment van de volgende motie op twee manieren: +1) Het ORIGINELE motietekst: Titel: {title} - Tekst: {text} -Geef een sentiment score van -1 (zeer negatief, agressief, vijandig) tot 1 (zeer positief, constructief, coöperatief). Geef ook een korte verklaring in het Nederlands.""" +2) De VEREENVOUDIGDE uitleg: +{layman} + +Geef voor ELKE versie een sentiment score van -1 (zeer negatief, agressief, vijandig) tot 1 (zeer positief, constructief, coöperatief) plus een korte verklaring in het Nederlands.""" -def _build_prompt(title: str, body_text: str | None) -> str: +def _build_prompt(title: str, body_text: str | None, layman: str | None) -> str: text = body_text or title or "" if len(text) > 400: text = text[:400] + "..." 
- return PROMPT_TEMPLATE.format(title=title or "", text=text) - - -def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | None]) -> list[dict[str, Any]]: + layman = layman or "(geen vereenvoudigde uitleg beschikbaar)" + if len(layman) > 300: + layman = layman[:300] + "..." + return PROMPT_TEMPLATE.format(title=title or "", text=text, layman=layman) + + +def _score_batch( + motion_ids: list[int], + titles: list[str], + texts: list[str | None], + laymen: list[str | None], +) -> list[dict[str, Any]]: """Score sentiment for a batch of motions in parallel via LLM.""" message_batches = [] - for title, text in zip(titles, texts): - prompt = _build_prompt(title, text) + for title, text, layman in zip(titles, texts, laymen): + prompt = _build_prompt(title, text, layman) message_batches.append([{"role": "user", "content": prompt}]) try: @@ -84,19 +105,44 @@ def _score_batch(motion_ids: list[int], titles: list[str], texts: list[str | Non ) except ProviderError as exc: logger.error("Batch API call failed: %s", exc) - return [{"score": None, "explanation": None, "error": str(exc)}] * len(motion_ids) + return [{ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": str(exc), + }] * len(motion_ids) validated = [] for res in results: if not isinstance(res, dict): - validated.append({"score": None, "explanation": None, "error": "non-dict response"}) + validated.append({ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": "non-dict response", + }) + continue + ts = res.get("text_score") + te = res.get("text_explanation") + ls = res.get("layman_score") + le = res.get("layman_explanation") + if not isinstance(ts, (int, float)) or ts < -1 or ts > 1: + validated.append({ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": f"invalid text_score: {ts}", + }) continue - score = res.get("score") - 
explanation = res.get("explanation") - if not isinstance(score, (int, float)) or score < -1 or score > 1: - validated.append({"score": None, "explanation": None, "error": f"invalid score: {score}"}) + if not isinstance(ls, (int, float)) or ls < -1 or ls > 1: + validated.append({ + "text_score": None, "text_explanation": None, + "layman_score": None, "layman_explanation": None, + "error": f"invalid layman_score: {ls}", + }) continue - validated.append({"score": float(score), "explanation": explanation, "error": None}) + validated.append({ + "text_score": float(ts), "text_explanation": te, + "layman_score": float(ls), "layman_explanation": le, + "error": None, + }) return validated @@ -119,7 +165,7 @@ def analyze_sentiment( limit_clause = "" if sample_size < 0 else f"LIMIT {sample_size}" rows = con.execute( f""" - SELECT r.motion_id, r.year, m.title, m.body_text + SELECT r.motion_id, r.year, m.title, m.body_text, m.layman_explanation FROM right_wing_motions r JOIN motions m ON r.motion_id = m.id WHERE r.classified = TRUE @@ -140,8 +186,10 @@ def analyze_sentiment( CREATE TABLE sentiment_scores ( motion_id INTEGER PRIMARY KEY, year INTEGER, - score DOUBLE, - explanation VARCHAR, + text_score DOUBLE, + text_explanation VARCHAR, + layman_score DOUBLE, + layman_explanation VARCHAR, error VARCHAR ) """ @@ -156,16 +204,26 @@ def analyze_sentiment( years = [r[1] for r in batch] titles = [r[2] for r in batch] texts = [r[3] for r in batch] + laymen = [r[4] for r in batch] logger.info("Batch %d/%d (%d motions)", i // batch_size + 1, (len(rows) - 1) // batch_size + 1, len(batch)) - results = _score_batch(motion_ids, titles, texts) + results = _score_batch(motion_ids, titles, texts, laymen) for mid, year, res in zip(motion_ids, years, results): con.execute( - "INSERT INTO sentiment_scores (motion_id, year, score, explanation, error) VALUES (?, ?, ?, ?, ?)", - (mid, year, res.get("score"), res.get("explanation"), res.get("error")), + """ + INSERT INTO sentiment_scores + 
(motion_id, year, text_score, text_explanation, layman_score, layman_explanation, error) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ( + mid, year, + res.get("text_score"), res.get("text_explanation"), + res.get("layman_score"), res.get("layman_explanation"), + res.get("error"), + ), ) - if res.get("score") is not None: + if res.get("error") is None: scored += 1 else: failed += 1 @@ -173,7 +231,7 @@ def analyze_sentiment( con.commit() # Add sentiment columns to yearly summary if not present - cols = {c[0] for c in con.execute("PRAGMA table_info(yearly_right_wing_summary)").fetchall()} + cols = {c[1] for c in con.execute("PRAGMA table_info(yearly_right_wing_summary)").fetchall()} if "avg_sentiment" not in cols: con.execute("ALTER TABLE yearly_right_wing_summary ADD COLUMN avg_sentiment DOUBLE") if "sentiment_std" not in cols: @@ -185,22 +243,22 @@ def analyze_sentiment( """ UPDATE yearly_right_wing_summary SET avg_sentiment = ( - SELECT AVG(s.score) + SELECT AVG(s.text_score) FROM sentiment_scores s WHERE s.year = yearly_right_wing_summary.year - AND s.score IS NOT NULL + AND s.text_score IS NOT NULL ), sentiment_std = ( - SELECT STDDEV(s.score) + SELECT STDDEV(s.text_score) FROM sentiment_scores s WHERE s.year = yearly_right_wing_summary.year - AND s.score IS NOT NULL + AND s.text_score IS NOT NULL ), pct_strongly_negative = ( - SELECT COUNT(CASE WHEN s.score < -0.5 THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0) + SELECT COUNT(CASE WHEN s.text_score < -0.5 THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0) FROM sentiment_scores s WHERE s.year = yearly_right_wing_summary.year - AND s.score IS NOT NULL + AND s.text_score IS NOT NULL ) """ )