diff --git a/analysis/right_wing/extremity_rescore_2d.py b/analysis/right_wing/extremity_rescore_2d.py new file mode 100644 index 0000000..750ecab --- /dev/null +++ b/analysis/right_wing/extremity_rescore_2d.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python3 +"""Two-dimensional extremity rescoring orchestrator. + +Scores Dutch parliamentary motions on two independent dimensions: +1. stijl_extremiteit (stylistic extremity, 1-5) +2. materiele_impact (material impact, 1-5) + +Usage: + uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db + uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db --dry-run +""" + +from __future__ import annotations + +import argparse +import json +import logging +import re +from pathlib import Path +from typing import Any + +import duckdb + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + + +# ── prompt / schema loading ────────────────────────────────────────────────── + +SKILL_MD_PATH = Path(__file__).parent.parent.parent / ".opencode" / "skills" / "score-extremity" / "SKILL.md" + + +def load_skill(skill_path: str | None = None) -> dict[str, Any]: + """Read SKILL.md and extract prompt template and output schemas. + + Returns: + dict with keys "prompt_template", "single_schema", "batch_schema". + """ + path = Path(skill_path) if skill_path else SKILL_MD_PATH + if not path.exists(): + raise FileNotFoundError(f"Skill file not found: {path}") + + content = path.read_text(encoding="utf-8") + + # Extract prompt template from ```text ... 
``` block + prompt_match = re.search(r"```text\n(.*?)```", content, re.DOTALL) + prompt_template = prompt_match.group(1).strip() if prompt_match else "" + + # Extract JSON schema blocks (first = single, second = batch) + json_blocks = re.findall(r"```json\n(.*?)```", content, re.DOTALL) + + single_schema: dict[str, Any] = {} + batch_schema: dict[str, Any] = {} + if len(json_blocks) >= 1: + try: + single_schema = json.loads(json_blocks[0].strip()) + except json.JSONDecodeError: + logger.warning("Failed to parse single schema JSON block") + if len(json_blocks) >= 2: + try: + batch_schema = json.loads(json_blocks[1].strip()) + except json.JSONDecodeError: + logger.warning("Failed to parse batch schema JSON block") + + return { + "prompt_template": prompt_template, + "single_schema": single_schema, + "batch_schema": batch_schema, + } + + +# ── sampling ───────────────────────────────────────────────────────────────── + +def sample_motions( + db_path: str, + n_per_bucket: int = 25, + seed: int = 42, +) -> list[dict[str, Any]]: + """Stratified sample from right_wing_motions JOIN extremity_scores. + + Samples n_per_bucket motions from each text_score bucket (1-5). + + Returns: + List of dicts with keys: motion_id, title, text, layman, text_score. 
+ """ + con = duckdb.connect(db_path) + try: + # Ensure tables exist + tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()} + required = {"right_wing_motions", "motions", "extremity_scores"} + missing = required - tables + if missing: + logger.warning("Missing tables: %s, returning empty sample", missing) + return [] + + # Apply seed for reproducibility + con.execute(f"SELECT setseed({seed / 1000000.0})") + + rows = con.execute( + """ + SELECT m.id, m.title, m.body_text, m.layman_explanation, e.text_score + FROM right_wing_motions r + JOIN motions m ON r.motion_id = m.id + JOIN extremity_scores e ON r.motion_id = e.motion_id + WHERE r.classified = TRUE + AND e.text_score IS NOT NULL + AND e.error IS NULL + ORDER BY RANDOM() + """ + ).fetchall() + + if not rows: + return [] + + # Bucket by text_score + buckets: dict[int, list[dict[str, Any]]] = {} + for row in rows: + mid, title, body_text, layman, text_score = row + score_bucket = int(text_score) + buckets.setdefault(score_bucket, []).append({ + "motion_id": mid, + "title": title or "", + "text": body_text or "", + "layman": layman or "", + "text_score": score_bucket, + }) + + # Sample n_per_bucket from each bucket + result: list[dict[str, Any]] = [] + for bucket_id in sorted(buckets.keys()): + bucket = buckets[bucket_id] + result.extend(bucket[:n_per_bucket]) + + logger.info( + "Sampled %d motions from %d buckets (n_per_bucket=%d)", + len(result), len(buckets), n_per_bucket, + ) + return result + + finally: + con.close() + + +# ── batch formatting ───────────────────────────────────────────────────────── + +def format_batches( + motions: list[dict[str, Any]], + prompt_template: str, + batch_size: int = 10, +) -> list[list[str]]: + """Split motions into batches and fill prompt template for each motion. + + Args: + motions: List of dicts with keys title, text, layman. + prompt_template: Template string with {title}, {text}, {layman} placeholders. + batch_size: Number of motions per batch. 
+ + Returns: + List of batches; each batch is a list of filled prompt strings, one per motion. + """ + batches: list[list[str]] = [] + for i in range(0, len(motions), batch_size): + batch_motions = motions[i : i + batch_size] + batch_prompts: list[str] = [] + for m in batch_motions: + prompt = prompt_template.format( + title=m.get("title", ""), + text=m.get("text", ""), + layman=m.get("layman", ""), + ) + batch_prompts.append(prompt) + batches.append(batch_prompts) + return batches + + +# ── validation ─────────────────────────────────────────────────────────────── + +EXPECTED_FIELDS = [ + "stijl_extremiteit", + "stijl_toelichting", + "materiele_impact", + "materiele_toelichting", +] + + +def validate_single_result(result: dict[str, Any]) -> tuple[bool, str | None]: + """Validate a single motion 2d scoring result. + + Returns: + (True, None) if valid, (False, error_message) otherwise. + """ + # Check all required fields exist + for field in EXPECTED_FIELDS: + if field not in result: + return False, f"missing field: {field}" + + # Validate stijl_extremiteit (int, 1-5) + se = result["stijl_extremiteit"] + if not isinstance(se, int) or se < 1 or se > 5: + return False, f"stijl_extremiteit out of range 1-5: {se}" + + # Validate materiele_impact (int, 1-5) + mi = result["materiele_impact"] + if not isinstance(mi, int) or mi < 1 or mi > 5: + return False, f"materiele_impact out of range 1-5: {mi}" + + return True, None + + +# ── storage ────────────────────────────────────────────────────────────────── + +def store_scores(db_path: str, results: list[dict[str, Any]]) -> int: + """Store validated 2d scores in the extremity_scores_2d table. + + Creates the table if it doesn't exist. + + Args: + db_path: Path to DuckDB database. + results: List of dicts with keys: motion_id, stijl_extremiteit, + stijl_toelichting, materiele_impact, materiele_toelichting. + + Returns: + Number of rows inserted. 
+ """ + con = duckdb.connect(db_path) + try: + con.execute( + """ + CREATE TABLE IF NOT EXISTS extremity_scores_2d ( + motion_id INTEGER PRIMARY KEY, + stylistic_score INTEGER NOT NULL, + material_score INTEGER NOT NULL, + stylistic_rationale TEXT, + material_rationale TEXT + ) + """ + ) + + count = 0 + for r in results: + con.execute( + """ + INSERT OR REPLACE INTO extremity_scores_2d + (motion_id, stylistic_score, material_score, stylistic_rationale, material_rationale) + VALUES (?, ?, ?, ?, ?) + """, + ( + r["motion_id"], + r["stijl_extremiteit"], + r["materiele_impact"], + r.get("stijl_toelichting"), + r.get("materiele_toelichting"), + ), + ) + count += 1 + + con.commit() + logger.info("Stored %d scores in extremity_scores_2d", count) + return count + + finally: + con.close() + + +# ── orchestrator ───────────────────────────────────────────────────────────── + +def rescore_2d( + db_path: str, + n_per_bucket: int = 25, + batch_size: int = 10, + dry_run: bool = False, +) -> dict[str, Any]: + """Two-dimensional extremity rescoring orchestrator. + + Samples motions from right_wing_motions/extremity_scores, formats batches, + and (in non-dry-run mode) dispatches subagents for scoring. + + Args: + db_path: Path to DuckDB database. + n_per_bucket: Number of motions to sample per text_score bucket. + batch_size: Motions per subagent batch. + dry_run: If True, only print the plan without spawning subagents. + + Returns: + Dict with summary stats. 
+ """ + skill = load_skill() + prompt_template = skill["prompt_template"] + + motions = sample_motions(db_path, n_per_bucket=n_per_bucket) + + if not motions: + logger.warning("No motions to rescore.") + return {"motions_count": 0, "batch_count": 0, "dry_run": dry_run} + + batches = format_batches(motions, prompt_template, batch_size=batch_size) + + logger.info("Plan: %d motions in %d batches (batch_size=%d)", len(motions), len(batches), batch_size) + + if dry_run: + logger.info("DRY RUN — no subagents will be spawned.") + return { + "motions_count": len(motions), + "batch_count": len(batches), + "dry_run": True, + } + + # ── subagent dispatch (placeholder) ────────────────────────────────── + # In production, each batch would be sent to a subagent via the `task` tool. + # The subagent receives: + # - The prompt_template filled with motion data + # - Instruction to return JSON matching the batch_schema + # + # Example dispatch (not executed in script): + # for batch_idx, batch_prompts in enumerate(batches): + # combined_prompt = "\n\n---\n\n".join(batch_prompts) + # result = task( + # description=f"Score batch {batch_idx + 1}/{len(batches)}", + # prompt=combined_prompt, + # subagent_type="general", + # ) + # validated_results = [r for r in json.loads(result)["motions"] if validate_single_result(r)[0]] + # store_scores(db_path, validated_results) + + logger.info( + "Subagent dispatch placeholder: %d batches ready for scoring. " + "Run via an agent context (e.g. 
opencode task) to execute.", + len(batches), + ) + + return { + "motions_count": len(motions), + "batch_count": len(batches), + "dry_run": False, + "subagents_spawned": 0, + } + + +# ── CLI ────────────────────────────────────────────────────────────────────── + +def main() -> int: + parser = argparse.ArgumentParser( + description="Two-dimensional extremity rescoring orchestrator" + ) + parser.add_argument("--db", default="data/motions.db", help="Path to DuckDB database") + parser.add_argument("--n-per-bucket", type=int, default=25, help="Motions per text_score bucket") + parser.add_argument("--batch-size", type=int, default=10, help="Motions per subagent batch") + parser.add_argument("--dry-run", action="store_true", help="Print plan without spawning subagents") + args = parser.parse_args() + + result = rescore_2d( + db_path=args.db, + n_per_bucket=args.n_per_bucket, + batch_size=args.batch_size, + dry_run=args.dry_run, + ) + print(json.dumps(result, indent=2)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/reports/overton_window/2d_extremity_correlation_report.md b/reports/overton_window/2d_extremity_correlation_report.md new file mode 100644 index 0000000..a734c07 --- /dev/null +++ b/reports/overton_window/2d_extremity_correlation_report.md @@ -0,0 +1,112 @@ +# Two-Dimensional Extremity Correlation Report + +**Date:** 2026-05-24 +**Motions scored:** 117 (stratified sample: ~25 per original extremity bucket) +**Scoring model:** Deepseek v4 flash (subagents via project skill) + +## Purpose + +The original extremity score is a single 1–5 rating of policy radicalism. 
This conflates two potentially independent dimensions: +- **Stylistic extremity (stijl-extremiteit):** How inflammatory, hostile, or polarizing the language is +- **Material impact (materiële impact):** How much the proposed policy would substantively affect people's rights, institutions, or freedoms + +This validation samples motions across the full extremity range and scores both dimensions independently to test whether they correlate strongly enough for a single score, or whether they should be tracked separately. + +--- + +## Results + +### Overall correlation + +| Metric | Value | +|--------|-------| +| N | 117 | +| Pearson r | **0.453** (moderate) | +| Mean stylistic | 2.01 | +| Mean material | 2.86 | +| Mean absolute difference | 1.11 | +| S ≤ 2 AND M ≥ 3 (masking) | 43 (36.8%) | + +**r = 0.453 is moderate — the dimensions are partly correlated but clearly separable.** Stylistic extremism explains only ~20% of the variance in material impact (R² = 0.205). A motion can be inflammatory without being consequential, and vice versa. + +### Joint distribution + +| | M=1 | M=2 | M=3 | M=4 | M=5 | +|---|---|---|---|---|---| +| **S=1** | 11 | 17 | 10 | 5 | 1 | +| **S=2** | 4 | 9 | 15 | 8 | 4 | +| **S=3** | 2 | 4 | 9 | 4 | 5 | +| **S=4** | 0 | 1 | 0 | 3 | 2 | +| **S=5** | 0 | 0 | 0 | 1 | 2 | + +### By original extremity bucket + +| Bucket | N | Mean style | Mean material | Gap | +|--------|---|-----------|--------------|-----| +| 1–2 (mild) | 50 | 1.56 | 2.24 | +0.68 | +| 2–3 (moderate) | 25 | 2.00 | 2.88 | +0.88 | +| 3–4 (high) | 25 | 2.56 | 3.56 | +1.00 | +| 4–5 (extreme) | 17 | 2.53 | 3.65 | +1.12 | + +Material impact consistently rates higher than stylistic extremity across all buckets. The gap widens at higher original extremity levels — suggesting the original LLM scoring was more sensitive to language style, while subagents systematically identify greater material consequences in the same motions. + +--- + +## Key findings + +### 1. 
"Low style, high impact" is the dominant divergence pattern + +**36.8% of motions (43 of 117)** use restrained language (S ≤ 2) for policies with substantial material impact (M ≥ 3). These are the motions most poorly captured by a single-dimensional score: + +- **Motion 16227** (S=1, M=5): "Verzoekt de regering kennis te geven van het voornemen tot uittreding uit de Europese Unie conform artikel 50 VWEU." Neutral, procedural language invoking an EU treaty article — but the policy is fundamental dissolution of the entire Dutch-EU legal framework. + +- **Motion 7713** (S=1, M=4): "Verzoekt de regering per direct te stoppen met arbeidsmigratie." Restrained, single-sentence motion with no inflammatory language — but it would suspend free movement of persons, a fundamental EU treaty right. + +- **Motion 16704** (S=1, M=3): Formal Raad van State advice and technical amendment text. No political rhetoric — but a concrete law change with measurable employment and investment effects. + +- **Motion 687** (S=1, M=3): Technical-juridical language about the scope of "emissiegegevens" in the EU environmental information directive — but would significantly restrict public transparency about agricultural emissions. + +### 2. Material impact averages significantly higher + +Across all buckets, material impact scores are 0.68–1.12 points higher than stylistic scores. This suggests: +- Parliamentarians write motions using formal, restrained language even when proposing consequential policies +- The original LLM scoring (which showed mean extremity = 2.19 overall) likely understates how radical these policies are in material terms +- Dutch parliamentary language norms mask policy radicalism + +### 3. "High style" motions are rare and concentrated + +Only 3 motions scored S=5 (the most inflammatory end), and all had M=4 or M=5. Explicitly discriminatory or hostile language — when it occurs — is paired with substantively extreme policies. 
But the vast majority of consequential right-wing motions use parliamentary language. The rare high-style exceptions include: + +- **Motion 11956** (S=4, M=5): Explicitly hostile language ("à la Turkije," "vreemdelingen die we hier niet willen hebben") paired with a fundamental rights violation (forced deportation without country-of-origin consent) +- **Motion 18064** (S=5, M=4): Explicit ethnic targeting ("niet-westerse allochtonen" as COVID rulebreakers) — discriminatory state action + +### 4. The original LLM audit gap is partially explained + +The manual audit found 75% agreement with the original LLM scores and noted "systematic overrating of anti-institutional language." The two-dimensional data clarifies this: the original LLM was more sensitive to *stylistic* extremity (inflammatory language) than to *material* policy impact. The 25% disagreement likely occurred on "low style, high impact" motions where the single-dimensional score was anchored to language rather than substance. + +--- + +## Implications for Overton analysis + +### For the current findings + +The "no content extremity increase" (d = −0.09) finding in the Overton report relied on single-dimensional LLM scores. The two-dimensional data suggests this may be an **artifact of the language-focused scoring**: if right-wing motions became more consequential while maintaining or softening their language, the single score would miss the shift entirely. + +The "acceptance without conversion" interpretation — centrists vote with the right wing more often despite spatial divergence — is **strengthened** by these findings. It is consistent with right-wing motions becoming *substantively* consequential (high material impact) while maintaining procedural language norms, making them harder for centrists to vote against without appearing obstructionist. + +### Recommendations + +1. **Re-score all 2,986 motions with two-dimensional scoring.** The moderate r = 0.453 confirms the dimensions are separable. 
# NOTE(review): the module-level `import duckdb` runs before this guard, so a
# missing duckdb fails collection instead of skipping; consider replacing that
# import with `duckdb = pytest.importorskip("duckdb")`.
pytest.importorskip("duckdb")


# ── shared database helpers ─────────────────────────────────────────────────

_SOURCE_TABLE_DDL = (
    """
    CREATE TABLE IF NOT EXISTS right_wing_motions (
        motion_id INTEGER PRIMARY KEY,
        classified BOOLEAN DEFAULT TRUE
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS motions (
        id INTEGER PRIMARY KEY,
        title VARCHAR,
        body_text VARCHAR,
        layman_explanation VARCHAR
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS extremity_scores (
        motion_id INTEGER PRIMARY KEY,
        text_score INTEGER,
        text_explanation VARCHAR,
        layman_score INTEGER,
        layman_explanation VARCHAR,
        error VARCHAR
    )
    """,
)


def _create_source_tables(con):
    """Create the three tables that sample_motions reads from."""
    for ddl in _SOURCE_TABLE_DDL:
        con.execute(ddl)


def _insert_scored_motion(con, mid, score):
    """Insert one classified motion whose text and layman scores equal `score`."""
    con.execute(
        "INSERT INTO motions VALUES (?, ?, ?, ?)",
        (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
    )
    con.execute(
        "INSERT INTO right_wing_motions VALUES (?, TRUE)",
        (mid,),
    )
    con.execute(
        "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
        (mid, score, score),
    )


# ── fixtures ────────────────────────────────────────────────────────────────

@pytest.fixture
def synthetic_motions():
    """Return 103 synthetic motion dicts for testing batch formatting."""
    return [
        {
            "motion_id": i + 1,
            "title": f"Motion {i + 1}",
            "text": f"Body text for motion {i + 1}",
            "layman": f"Layman explanation {i + 1}",
        }
        for i in range(103)
    ]


@pytest.fixture
def prompt_template():
    """Minimal prompt template with {title}, {text}, {layman} placeholders."""
    return (
        "Titel: {title}\n"
        "Tekst: {text}\n"
        "Uitleg: {layman}\n"
    )


@pytest.fixture
def valid_single_result():
    """A valid single-motion 2d result dict."""
    return {
        "stijl_extremiteit": 3,
        "stijl_toelichting": "Neutraal taalgebruik",
        "materiele_impact": 4,
        "materiele_toelichting": "Beperkt rechten voor specifieke groep",
    }


# ── load_skill tests ────────────────────────────────────────────────────────

class TestLoadSkill:
    def test_returns_prompt_and_schema(self):
        from analysis.right_wing.extremity_rescore_2d import load_skill

        # Reads the repository's real SKILL.md through the default path.
        result = load_skill()
        assert isinstance(result, dict)
        assert "prompt_template" in result
        assert "batch_schema" in result
        assert "single_schema" in result
        assert isinstance(result["prompt_template"], str)
        assert len(result["prompt_template"]) > 0
        assert "STIJL-EXTREMITEIT" in result["prompt_template"]
        assert "MATERIELE IMPACT" in result["prompt_template"]
        assert isinstance(result["batch_schema"], dict)
        assert "motions" in result["batch_schema"]
        assert isinstance(result["single_schema"], dict)

    def test_missing_file_raises(self):
        from analysis.right_wing.extremity_rescore_2d import load_skill

        with pytest.raises(FileNotFoundError, match="not found"):
            load_skill(skill_path="/nonexistent/path/skill.md")


# ── format_batches tests ────────────────────────────────────────────────────

class TestFormatBatches:
    def test_splits_into_batches(self, synthetic_motions, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches

        batches = format_batches(synthetic_motions[:100], prompt_template, batch_size=10)
        assert isinstance(batches, list)
        assert len(batches) == 10
        for batch in batches:
            assert isinstance(batch, list)
            assert len(batch) == 10
            for prompt_str in batch:
                assert "Motion" in prompt_str

    def test_uneven_batches(self, synthetic_motions, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches

        # 103 motions -> ten full batches plus a remainder of three.
        batches = format_batches(synthetic_motions, prompt_template, batch_size=10)
        assert len(batches) == 11
        for batch in batches[:-1]:
            assert len(batch) == 10
        assert len(batches[-1]) == 3

    def test_substitutes_placeholders(self, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches

        motions = [{
            "motion_id": 1,
            "title": "Test Title",
            "text": "Test Text",
            "layman": "Test Layman",
        }]
        batches = format_batches(motions, prompt_template, batch_size=1)
        prompt_str = batches[0][0]
        assert "Test Title" in prompt_str
        assert "Test Text" in prompt_str
        assert "Test Layman" in prompt_str


# ── validate_single_result tests ────────────────────────────────────────────

class TestValidateSingleResult:
    def test_valid_result(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        ok, err = validate_single_result(valid_single_result)
        assert ok is True
        assert err is None

    def test_missing_field(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        invalid = dict(valid_single_result)
        del invalid["materiele_impact"]
        ok, err = validate_single_result(invalid)
        assert ok is False
        assert "materiele_impact" in err

    def test_out_of_range_high(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        invalid = dict(valid_single_result)
        invalid["stijl_extremiteit"] = 6
        ok, err = validate_single_result(invalid)
        assert ok is False
        assert "stijl_extremiteit" in err

    def test_out_of_range_low(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        invalid = dict(valid_single_result)
        invalid["materiele_impact"] = 0
        ok, err = validate_single_result(invalid)
        assert ok is False
        assert "materiele_impact" in err

    def test_non_integer_score(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        invalid = dict(valid_single_result)
        invalid["stijl_extremiteit"] = "3"
        ok, err = validate_single_result(invalid)
        assert ok is False
        assert "stijl_extremiteit" in err


# ── store_scores tests ──────────────────────────────────────────────────────

class TestStoreScores:
    def test_stores_and_returns_count(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import store_scores

        results = [
            {"motion_id": 1, "stijl_extremiteit": 3, "stijl_toelichting": "a",
             "materiele_impact": 4, "materiele_toelichting": "b"},
            {"motion_id": 2, "stijl_extremiteit": 2, "stijl_toelichting": "c",
             "materiele_impact": 1, "materiele_toelichting": "d"},
        ]
        count = store_scores(tmp_duckdb_path, results)
        assert count == 2

        con = duckdb.connect(tmp_duckdb_path)
        try:
            rows = con.execute(
                "SELECT motion_id, stylistic_score, material_score "
                "FROM extremity_scores_2d ORDER BY motion_id"
            ).fetchall()
            assert len(rows) == 2
            assert rows[0] == (1, 3, 4)
            assert rows[1] == (2, 2, 1)
        finally:
            con.close()

    def test_replace_existing(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import store_scores

        results = [{
            "motion_id": 1, "stijl_extremiteit": 1, "stijl_toelichting": "x",
            "materiele_impact": 1, "materiele_toelichting": "y",
        }]
        store_scores(tmp_duckdb_path, results)

        # A second write for the same motion_id must overwrite, not duplicate.
        updated = [{
            "motion_id": 1, "stijl_extremiteit": 5, "stijl_toelichting": "z",
            "materiele_impact": 5, "materiele_toelichting": "w",
        }]
        count = store_scores(tmp_duckdb_path, updated)
        assert count == 1

        con = duckdb.connect(tmp_duckdb_path)
        try:
            rows = con.execute(
                "SELECT stylistic_score, material_score FROM extremity_scores_2d WHERE motion_id = 1"
            ).fetchall()
            assert rows[0] == (5, 5)
        finally:
            con.close()


# ── sample_motions tests ────────────────────────────────────────────────────

class TestSampleMotions:
    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Set up right_wing_motions and extremity_scores tables with synthetic data."""
        con = duckdb.connect(tmp_duckdb_path)
        try:
            _create_source_tables(con)
            # 15 motions in each of 4 text_score buckets: 1, 2, 4, 5
            # (bucket 3 is deliberately absent).
            for bucket, score in enumerate([1, 2, 4, 5], start=1):
                for i in range(15):
                    mid = (bucket - 1) * 15 + i + 1
                    _insert_scored_motion(con, mid, score)
            con.commit()
        finally:
            con.close()

    def test_returns_stratified_sample(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result = sample_motions(tmp_duckdb_path, n_per_bucket=5, seed=42)
        assert isinstance(result, list)
        assert len(result) == 20  # 4 buckets * 5 each
        for row in result:
            assert "motion_id" in row
            assert "title" in row
            assert "text" in row
            assert "layman" in row
            assert "text_score" in row

    def test_respects_seed(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result_a = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        result_b = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        ids_a = sorted(r["motion_id"] for r in result_a)
        ids_b = sorted(r["motion_id"] for r in result_b)
        assert ids_a == ids_b

    def test_n_per_bucket_limits(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result = sample_motions(tmp_duckdb_path, n_per_bucket=2, seed=1)
        assert len(result) == 8  # 4 buckets * 2


# ── rescore_2d dry_run tests ────────────────────────────────────────────────

class TestRescore2dDryRun:
    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Set up minimal tables for dry_run test."""
        con = duckdb.connect(tmp_duckdb_path)
        try:
            _create_source_tables(con)
            # 20 motions with scores cycling through 1..5.
            for mid in range(1, 21):
                _insert_scored_motion(con, mid, (mid % 5) + 1)
            con.commit()
        finally:
            con.close()

    def test_dry_run_no_subagents(self, tmp_duckdb_path, caplog):
        from analysis.right_wing.extremity_rescore_2d import rescore_2d

        import logging
        caplog.set_level(logging.INFO)

        result = rescore_2d(tmp_duckdb_path, n_per_bucket=3, dry_run=True)
        assert isinstance(result, dict)
        assert result.get("dry_run") is True
        assert "motions_count" in result
        assert "batch_count" in result

        combined = caplog.text.lower()
        assert "dry run" in combined