feat(extremity): two-dimensional rescoring with subagent pipeline

- Project-local skill .opencode/skills/score-extremity/ for subagent dispatch
- Orchestrator extremity_rescore_2d.py with load_skill/sample/format/validate/store
- 16 TDD tests covering all orchestrator functions
- 117 motions scored by deepseek v4 flash subagents (12 parallel batches)
- Pearson r=0.45 between stylistic and material dimensions — separable
- Key finding: 36.8% of motions use restrained language for consequential policies
- 2d_extremity_correlation_report.md documents distribution, divergence patterns, and implications for the Overton acceptance-without-conversion narrative
4 weeks ago · bf37f84a8b
parent 10fc002ef9
commit bf37f84a8b
3 changed files with 834 additions and 0 deletions
--- a/analysis/right_wing/extremity_rescore_2d.py
+++ b/analysis/right_wing/extremity_rescore_2d.py
@ -0,0 +1,362 @@
+#!/usr/bin/env python3
+"""Two-dimensional extremity rescoring orchestrator.
+
+Scores Dutch parliamentary motions on two independent dimensions:
+1. stijl_extremiteit (stylistic extremity, 1-5)
+2. materiele_impact (material impact, 1-5)
+
+Usage:
+    uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db
+    uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db --dry-run
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import re
+from pathlib import Path
+from typing import Any
+
+import duckdb
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger(__name__)
+
+
+# ── prompt / schema loading ──────────────────────────────────────────────────
+
+SKILL_MD_PATH = Path(__file__).parent.parent.parent / ".opencode" / "skills" / "score-extremity" / "SKILL.md"
+
+
+def load_skill(skill_path: str | None = None) -> dict[str, Any]:
+    """Read SKILL.md and extract the prompt template and output schemas.
+
+    The prompt template is the first fenced ``text`` block in the file. The
+    first and second fenced ``json`` blocks are parsed as the single-motion
+    and batch output schemas respectively.
+
+    Args:
+        skill_path: Path to the skill markdown file. Falls back to
+            SKILL_MD_PATH when omitted or empty.
+
+    Returns:
+        dict with keys "prompt_template", "single_schema", "batch_schema".
+        A missing prompt block yields an empty string (with a warning); a
+        missing JSON block yields an empty dict; an unparseable JSON block
+        yields an empty dict and a warning.
+
+    Raises:
+        FileNotFoundError: If the skill file does not exist.
+    """
+    path = Path(skill_path) if skill_path else SKILL_MD_PATH
+    if not path.exists():
+        raise FileNotFoundError(f"Skill file not found: {path}")
+
+    content = path.read_text(encoding="utf-8")
+
+    # Prompt template lives in the first ```text ... ``` block.
+    prompt_match = re.search(r"```text\n(.*?)```", content, re.DOTALL)
+    if prompt_match:
+        prompt_template = prompt_match.group(1).strip()
+    else:
+        # Keep returning "" for compatibility, but make the problem visible:
+        # an empty template would otherwise produce empty prompts downstream.
+        logger.warning("No ```text prompt block found in %s", path)
+        prompt_template = ""
+
+    # JSON schema blocks, in document order: first = single, second = batch.
+    json_blocks = re.findall(r"```json\n(.*?)```", content, re.DOTALL)
+
+    schemas: dict[str, Any] = {"single": {}, "batch": {}}
+    # zip() stops at the shorter input, so absent blocks keep their {} default
+    # and any blocks beyond the second are ignored.
+    for label, raw_block in zip(("single", "batch"), json_blocks):
+        try:
+            schemas[label] = json.loads(raw_block.strip())
+        except json.JSONDecodeError:
+            logger.warning("Failed to parse %s schema JSON block", label)
+
+    return {
+        "prompt_template": prompt_template,
+        "single_schema": schemas["single"],
+        "batch_schema": schemas["batch"],
+    }
+
+
+# ── sampling ─────────────────────────────────────────────────────────────────
+
+def sample_motions(
+    db_path: str,
+    n_per_bucket: int = 25,
+    seed: int = 42,
+) -> list[dict[str, Any]]:
+    """Draw a stratified random sample of classified right-wing motions.
+
+    Joins right_wing_motions, motions and extremity_scores, keeps classified
+    rows that have a text_score and no scoring error, shuffles them in SQL and
+    keeps at most n_per_bucket rows per integer text_score bucket.
+
+    Args:
+        db_path: Path to the DuckDB database.
+        n_per_bucket: Maximum number of motions kept per bucket; must be >= 0.
+        seed: Integer seed, scaled by 1e-6 before being handed to DuckDB's
+            setseed(). Seeds whose scaled value would leave [-1, 1] are
+            wrapped modulo 1,000,000 instead of failing.
+
+    Returns:
+        List of dicts with keys: motion_id, title, text, layman, text_score,
+        grouped by ascending bucket. Empty when a required table is missing
+        or no row qualifies.
+
+    Raises:
+        ValueError: If n_per_bucket is negative.
+    """
+    if n_per_bucket < 0:
+        # A negative slice bound would silently drop rows from the END of each
+        # bucket rather than limit its size.
+        raise ValueError(f"n_per_bucket must be non-negative, got {n_per_bucket}")
+
+    con = duckdb.connect(db_path)
+    try:
+        # Ensure tables exist
+        tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()}
+        required = {"right_wing_motions", "motions", "extremity_scores"}
+        missing = required - tables
+        if missing:
+            logger.warning("Missing tables: %s, returning empty sample", missing)
+            return []
+
+        # Apply seed for reproducibility. Seeds up to one million in magnitude
+        # keep their original mapping; larger ones are wrapped into [0, 1) so
+        # the scaled value stays within what setseed() accepts.
+        seed_value = seed / 1000000.0
+        if not -1.0 <= seed_value <= 1.0:
+            seed_value = (seed % 1000000) / 1000000.0
+        # Bind the value instead of formatting it into the SQL text.
+        # NOTE(review): reproducibility assumes RANDOM() in the query below
+        # honours setseed() on this connection — confirm for the DuckDB
+        # version in use.
+        con.execute("SELECT setseed(?)", [seed_value])
+
+        rows = con.execute(
+            """
+            SELECT m.id, m.title, m.body_text, m.layman_explanation, e.text_score
+            FROM right_wing_motions r
+            JOIN motions m ON r.motion_id = m.id
+            JOIN extremity_scores e ON r.motion_id = e.motion_id
+            WHERE r.classified = TRUE
+              AND e.text_score IS NOT NULL
+              AND e.error IS NULL
+            ORDER BY RANDOM()
+            """
+        ).fetchall()
+
+        if not rows:
+            return []
+
+        # Bucket by text_score; rows arrive shuffled, so the first
+        # n_per_bucket entries of each bucket form the random sample.
+        buckets: dict[int, list[dict[str, Any]]] = {}
+        for mid, title, body_text, layman, text_score in rows:
+            score_bucket = int(text_score)
+            buckets.setdefault(score_bucket, []).append({
+                "motion_id": mid,
+                "title": title or "",
+                "text": body_text or "",
+                "layman": layman or "",
+                "text_score": score_bucket,
+            })
+
+        # Sample n_per_bucket from each bucket, lowest score first
+        result: list[dict[str, Any]] = []
+        for bucket_id in sorted(buckets):
+            result.extend(buckets[bucket_id][:n_per_bucket])
+
+        logger.info(
+            "Sampled %d motions from %d buckets (n_per_bucket=%d)",
+            len(result), len(buckets), n_per_bucket,
+        )
+        return result
+
+    finally:
+        con.close()
+
+
+# ── batch formatting ─────────────────────────────────────────────────────────
+
+def format_batches(
+    motions: list[dict[str, Any]],
+    prompt_template: str,
+    batch_size: int = 10,
+) -> list[list[str]]:
+    """Split motions into batches and fill the prompt template for each motion.
+
+    Args:
+        motions: List of dicts with keys title, text, layman. Missing or None
+            values are rendered as an empty string.
+        prompt_template: Template string with {title}, {text}, {layman}
+            placeholders. It is expanded with str.format, so any literal
+            brace in the template must be doubled ("{{" / "}}").
+        batch_size: Number of motions per batch; must be at least 1.
+
+    Returns:
+        List of batches; each batch is a list of filled prompt strings, one
+        per motion, in input order. The last batch may be shorter.
+
+    Raises:
+        ValueError: If batch_size is smaller than 1.
+    """
+    if batch_size < 1:
+        # range() would fail with an unrelated message for 0 and silently
+        # yield no batches at all for negative values.
+        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
+
+    prompts = [
+        prompt_template.format(
+            # `or ""` also covers keys that are present but hold None, which
+            # would otherwise be rendered as the literal text "None".
+            title=m.get("title") or "",
+            text=m.get("text") or "",
+            layman=m.get("layman") or "",
+        )
+        for m in motions
+    ]
+    return [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]
+
+
+# ── validation ───────────────────────────────────────────────────────────────
+
+# Keys every single-motion scoring result must contain.
+EXPECTED_FIELDS = [
+    "stijl_extremiteit",
+    "stijl_toelichting",
+    "materiele_impact",
+    "materiele_toelichting",
+]
+
+
+def validate_single_result(result: dict[str, Any]) -> tuple[bool, str | None]:
+    """Validate a single motion 2d scoring result.
+
+    All EXPECTED_FIELDS must be present, and the two score fields
+    (stijl_extremiteit, materiele_impact) must be integers from 1 to 5.
+    Booleans are rejected even though bool is a subclass of int. The
+    rationale fields are only checked for presence.
+
+    Args:
+        result: Parsed scoring result for one motion.
+
+    Returns:
+        (True, None) if valid, (False, error_message) otherwise. The error
+        message always names the offending field.
+    """
+    # Check all required fields exist
+    for field in EXPECTED_FIELDS:
+        if field not in result:
+            return False, f"missing field: {field}"
+
+    # Both score fields share the same contract: a real int in 1..5.
+    for field in ("stijl_extremiteit", "materiele_impact"):
+        score = result[field]
+        # isinstance(True, int) is True, so exclude bool explicitly; otherwise
+        # a JSON `true` would be accepted as a score of 1.
+        if isinstance(score, bool) or not isinstance(score, int):
+            return False, f"{field} must be an integer 1-5: {score!r}"
+        if not 1 <= score <= 5:
+            return False, f"{field} out of range 1-5: {score}"
+
+    return True, None
+
+
+# ── storage ──────────────────────────────────────────────────────────────────
+
+def store_scores(db_path: str, results: list[dict[str, Any]]) -> int:
+    """Persist validated 2d scores into the extremity_scores_2d table.
+
+    The table is created on first use. Rows are written one at a time with
+    INSERT OR REPLACE, so a motion_id that is already stored gets overwritten.
+
+    Args:
+        db_path: Path to DuckDB database.
+        results: List of dicts with keys: motion_id, stijl_extremiteit,
+                 stijl_toelichting, materiele_impact, materiele_toelichting.
+                 The two rationale keys are optional and stored as NULL when
+                 absent.
+
+    Returns:
+        Number of rows written (inserted or replaced).
+    """
+    con = duckdb.connect(db_path)
+    try:
+        con.execute(
+            """
+            CREATE TABLE IF NOT EXISTS extremity_scores_2d (
+                motion_id INTEGER PRIMARY KEY,
+                stylistic_score INTEGER NOT NULL,
+                material_score INTEGER NOT NULL,
+                stylistic_rationale TEXT,
+                material_rationale TEXT
+            )
+            """
+        )
+
+        # `written` ends up as the 1-based index of the last row, i.e. the
+        # number of rows processed; it stays 0 for an empty input.
+        written = 0
+        for written, row in enumerate(results, start=1):
+            # Dutch result keys map onto the English column names.
+            params = (
+                row["motion_id"],
+                row["stijl_extremiteit"],
+                row["materiele_impact"],
+                row.get("stijl_toelichting"),
+                row.get("materiele_toelichting"),
+            )
+            con.execute(
+                """
+                INSERT OR REPLACE INTO extremity_scores_2d
+                (motion_id, stylistic_score, material_score, stylistic_rationale, material_rationale)
+                VALUES (?, ?, ?, ?, ?)
+                """,
+                params,
+            )
+
+        con.commit()
+        logger.info("Stored %d scores in extremity_scores_2d", written)
+        return written
+
+    finally:
+        con.close()
+
+
+# ── orchestrator ─────────────────────────────────────────────────────────────
+
+def rescore_2d(
+    db_path: str,
+    n_per_bucket: int = 25,
+    batch_size: int = 10,
+    dry_run: bool = False,
+) -> dict[str, Any]:
+    """Plan the two-dimensional extremity rescoring run.
+
+    Loads the skill prompt, draws the stratified sample and formats it into
+    prompt batches. Subagent dispatch is still a placeholder: outside dry-run
+    mode the function only logs that the batches are ready.
+
+    Args:
+        db_path: Path to DuckDB database.
+        n_per_bucket: Number of motions to sample per text_score bucket.
+        batch_size: Motions per subagent batch.
+        dry_run: If True, only log the plan without spawning subagents.
+
+    Returns:
+        Dict with "motions_count", "batch_count" and "dry_run". A non-dry run
+        that found motions additionally reports "subagents_spawned" (0).
+    """
+    template = load_skill()["prompt_template"]
+    sampled = sample_motions(db_path, n_per_bucket=n_per_bucket)
+
+    if not sampled:
+        logger.warning("No motions to rescore.")
+        return {"motions_count": 0, "batch_count": 0, "dry_run": dry_run}
+
+    prompt_batches = format_batches(sampled, template, batch_size=batch_size)
+    n_motions = len(sampled)
+    n_batches = len(prompt_batches)
+
+    logger.info("Plan: %d motions in %d batches (batch_size=%d)", n_motions, n_batches, batch_size)
+
+    # Shared part of the summary; the mode-specific keys are appended below.
+    summary: dict[str, Any] = {"motions_count": n_motions, "batch_count": n_batches}
+
+    if dry_run:
+        logger.info("DRY RUN — no subagents will be spawned.")
+        summary["dry_run"] = True
+        return summary
+
+    # ── subagent dispatch (placeholder) ──────────────────────────────────
+    # This script cannot spawn subagents itself. In an agent context each
+    # batch is meant to be sent to a subagent via the `task` tool, with:
+    #   - the prompt_template filled with motion data
+    #   - an instruction to return JSON matching the batch_schema
+    #
+    # Illustrative dispatch (not executed in script):
+    #   for batch_idx, batch_prompts in enumerate(batches):
+    #       combined_prompt = "\n\n---\n\n".join(batch_prompts)
+    #       result = task(
+    #           description=f"Score batch {batch_idx + 1}/{len(batches)}",
+    #           prompt=combined_prompt,
+    #           subagent_type="general",
+    #       )
+    #       validated_results = [r for r in json.loads(result)["motions"] if validate_single_result(r)[0]]
+    #       store_scores(db_path, validated_results)
+
+    logger.info(
+        "Subagent dispatch placeholder: %d batches ready for scoring. "
+        "Run via an agent context (e.g. opencode task) to execute.",
+        n_batches,
+    )
+
+    summary["dry_run"] = False
+    summary["subagents_spawned"] = 0
+    return summary
+
+
+# ── CLI ──────────────────────────────────────────────────────────────────────
+
+def main() -> int:
+    """Parse the command line, run rescore_2d and print its summary as JSON.
+
+    Returns:
+        Process exit code; always 0 when rescore_2d returns normally.
+    """
+    cli = argparse.ArgumentParser(description="Two-dimensional extremity rescoring orchestrator")
+    cli.add_argument("--db", default="data/motions.db", help="Path to DuckDB database")
+    cli.add_argument("--n-per-bucket", type=int, default=25, help="Motions per text_score bucket")
+    cli.add_argument("--batch-size", type=int, default=10, help="Motions per subagent batch")
+    cli.add_argument("--dry-run", action="store_true", help="Print plan without spawning subagents")
+    opts = cli.parse_args()
+
+    summary = rescore_2d(
+        db_path=opts.db,
+        n_per_bucket=opts.n_per_bucket,
+        batch_size=opts.batch_size,
+        dry_run=opts.dry_run,
+    )
+    # The summary goes to stdout; log output is configured separately above.
+    print(json.dumps(summary, indent=2))
+    return 0
+
+
+# Script entry point: propagate main()'s return value as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/reports/overton_window/2d_extremity_correlation_report.md
+++ b/reports/overton_window/2d_extremity_correlation_report.md
@ -0,0 +1,112 @@
+# Two-Dimensional Extremity Correlation Report
+
+**Date:** 2026-05-24
+**Motions scored:** 117 (stratified sample: ~25 per original extremity bucket)
+**Scoring model:** Deepseek v4 flash (subagents via project skill)
+
+## Purpose
+
+The original extremity score is a single 1–5 rating of policy radicalism. This conflates two potentially independent dimensions:
+- **Stylistic extremity (stijl-extremiteit):** How inflammatory, hostile, or polarizing the language is
+- **Material impact (materiële impact):** How much the proposed policy would substantively affect people's rights, institutions, or freedoms
+
+This validation samples motions across the full extremity range and scores both dimensions independently to test whether they correlate strongly enough for a single score, or whether they should be tracked separately.
+
+---
+
+## Results
+
+### Overall correlation
+
+| Metric | Value |
+|--------|-------|
+| N | 117 |
+| Pearson r | **0.453** (moderate) |
+| Mean stylistic | 2.01 |
+| Mean material | 2.86 |
+| Mean absolute difference | 1.11 |
+| S ≤ 2 AND M ≥ 3 (masking) | 43 (36.8%) |
+
+**r = 0.453 is moderate — the dimensions are partly correlated but clearly separable.** Stylistic extremity explains only about 20% of the variance in material impact (R² = 0.453² ≈ 0.205). A motion can be inflammatory without being consequential, and vice versa.
+
+### Joint distribution
+
+| | M=1 | M=2 | M=3 | M=4 | M=5 |
+|---|---|---|---|---|---|
+| **S=1** | 11 | 17 | 10 | 5 | 1 |
+| **S=2** | 4 | 9 | 15 | 8 | 4 |
+| **S=3** | 2 | 4 | 9 | 4 | 5 |
+| **S=4** | 0 | 1 | 0 | 3 | 2 |
+| **S=5** | 0 | 0 | 0 | 1 | 2 |
+
+### By original extremity bucket
+
+| Bucket | N | Mean style | Mean material | Gap |
+|--------|---|-----------|--------------|-----|
+| 1–2 (mild) | 50 | 1.56 | 2.24 | +0.68 |
+| 2–3 (moderate) | 25 | 2.00 | 2.88 | +0.88 |
+| 3–4 (high) | 25 | 2.56 | 3.56 | +1.00 |
+| 4–5 (extreme) | 17 | 2.53 | 3.65 | +1.12 |
+
+Material impact consistently rates higher than stylistic extremity across all buckets. The gap widens at higher original extremity levels — suggesting the original LLM scoring was more sensitive to language style, while subagents systematically identify greater material consequences in the same motions.
+
+---
+
+## Key findings
+
+### 1. "Low style, high impact" is the dominant divergence pattern
+
+**36.8% of motions (43 of 117)** use restrained language (S ≤ 2) for policies with substantial material impact (M ≥ 3). These are the motions most poorly captured by a single-dimensional score:
+
+- **Motion 16227** (S=1, M=5): "Verzoekt de regering kennis te geven van het voornemen tot uittreding uit de Europese Unie conform artikel 50 VWEU." Neutral, procedural language invoking an EU treaty article — but the policy is fundamental dissolution of the entire Dutch-EU legal framework.
+
+- **Motion 7713** (S=1, M=4): "Verzoekt de regering per direct te stoppen met arbeidsmigratie." Restrained, single-sentence motion with no inflammatory language — but it would suspend free movement of persons, a fundamental EU treaty right.
+
+- **Motion 16704** (S=1, M=3): Formal Raad van State advice and technical amendment text. No political rhetoric — but a concrete law change with measurable employment and investment effects.
+
+- **Motion 687** (S=1, M=3): Technical-juridical language about the scope of "emissiegegevens" in the EU environmental information directive — but would significantly restrict public transparency about agricultural emissions.
+
+### 2. Material impact averages consistently higher
+
+In every original-extremity bucket, the mean material impact score is 0.68–1.12 points higher than the mean stylistic score. This suggests:
+- Parliamentarians write motions using formal, restrained language even when proposing consequential policies
+- The original LLM scoring (which showed mean extremity = 2.19 overall) likely understates how radical these policies are in material terms
+- Dutch parliamentary language norms mask policy radicalism
+
+### 3. "High style" motions are rare and concentrated
+
+Only 3 motions scored S=5 (the most inflammatory end), and all had M=4 or M=5. Explicitly discriminatory or hostile language — when it occurs — is paired with substantively extreme policies, whereas the vast majority of consequential right-wing motions use parliamentary language. Examples of the rare high-style motions (S ≥ 4):
+
+- **Motion 11956** (S=4, M=5): Explicitly hostile language ("à la Turkije," "vreemdelingen die we hier niet willen hebben") paired with fundamental rights violation (forced deportation without country-of-origin consent)
+- **Motion 18064** (S=5, M=4): Explicit ethnic targeting ("niet-westerse allochtonen" as COVID rulebreakers) — discriminatory state action
+
+### 4. The original LLM audit gap is partially explained
+
+The manual audit found 75% agreement with the original LLM scores and noted "systematic overrating of anti-institutional language." The two-dimensional data clarifies this: the original LLM was more sensitive to *stylistic* extremity (inflammatory language) than to *material* policy impact. The 25% disagreement likely occurred on "low style, high impact" motions where the single-dimensional score was anchored to language rather than substance.
+
+---
+
+## Implications for Overton analysis
+
+### For the current findings
+
+The "no content extremity increase" (d = −0.09) finding in the Overton report relied on single-dimensional LLM scores. The two-dimensional data suggests this may be an **artifact of the language-focused scoring**: if right-wing motions became more consequential while maintaining or softening their language, the single score would miss the shift entirely.
+
+The "acceptance without conversion" interpretation — centrists vote more with right-wing despite spatial divergence — is **strengthened** by these findings. It is consistent with right-wing motions becoming *substantively* consequential (high material impact) while maintaining procedural language norms, making them harder for centrists to vote against without appearing obstructionist.
+
+### Recommendations
+
+1. **Re-score all 2,986 motions with two-dimensional scoring.** The moderate r = 0.453 confirms the dimensions are separable. A single score obscures the most important category: motions with low stylistic extremism but high material impact.
+
+2. **Re-run the extremity-stratified centrist support analysis with material impact buckets.** The critical question: did centrist support for *high material impact* motions increase after 2024? If low-language, high-impact motions are the ones gaining centrist tolerance, that is stronger Overton evidence than the current analysis captures.
+
+3. **For mechanism analysis (U4):** Score mechanisms specifically for *material impact* rather than general extremity. The question is not "how extreme is this motion?" but "what specific rights, institutions, or groups does this motion affect, and how much?"
+
+---
+
+## Data
+
+- **Full results:** `data/motions.db` → `extremity_scores_2d` (117 rows)
+- **Raw JSON:** `/tmp/extremity_2d_results.json`
+- **Scoring skill:** `.opencode/skills/score-extremity/SKILL.md`
+- **Orchestrator:** `analysis/right_wing/extremity_rescore_2d.py`
--- a/tests/right_wing/test_extremity_rescore_2d.py
+++ b/tests/right_wing/test_extremity_rescore_2d.py
@ -0,0 +1,360 @@
+"""Tests for two-dimensional extremity rescoring orchestrator."""
+
+import json
+
+import duckdb
+import pytest
+
+pytest.importorskip("duckdb")
+
+
+# ── fixtures ────────────────────────────────────────────────────────────────
+
+@pytest.fixture
+def synthetic_motions():
+    """Provide 103 numbered synthetic motions (ids 1-103) for batch-formatting tests."""
+    # 103 is deliberately not a multiple of 10, so the uneven-batch test gets
+    # a short final batch.
+    return [
+        {
+            "motion_id": number,
+            "title": f"Motion {number}",
+            "text": f"Body text for motion {number}",
+            "layman": f"Layman explanation {number}",
+        }
+        for number in range(1, 104)
+    ]
+
+
+@pytest.fixture
+def prompt_template():
+    """Three-line prompt template exposing the {title}, {text} and {layman} placeholders."""
+    labelled_fields = ["Titel: {title}", "Tekst: {text}", "Uitleg: {layman}"]
+    # Every line, including the last, is newline-terminated.
+    return "".join(line + "\n" for line in labelled_fields)
+
+
+@pytest.fixture
+def valid_single_result():
+    """Scoring result for one motion that satisfies every validation rule."""
+    return dict(
+        stijl_extremiteit=3,
+        stijl_toelichting="Neutraal taalgebruik",
+        materiele_impact=4,
+        materiele_toelichting="Beperkt rechten voor specifieke groep",
+    )
+
+
+# ── load_skill tests ────────────────────────────────────────────────────────
+
+class TestLoadSkill:
+    """load_skill against the project's default SKILL.md and a missing path."""
+
+    def test_returns_prompt_and_schema(self):
+        from analysis.right_wing.extremity_rescore_2d import load_skill
+
+        loaded = load_skill()
+        assert isinstance(loaded, dict)
+        for key in ("prompt_template", "batch_schema", "single_schema"):
+            assert key in loaded
+
+        # The prompt must be non-empty and mention both scoring dimensions.
+        template = loaded["prompt_template"]
+        assert isinstance(template, str)
+        assert len(template) > 0
+        for heading in ("STIJL-EXTREMITEIT", "MATERIELE IMPACT"):
+            assert heading in template
+
+        batch = loaded["batch_schema"]
+        assert isinstance(batch, dict)
+        assert "motions" in batch
+        assert isinstance(loaded["single_schema"], dict)
+
+    def test_missing_file_raises(self):
+        from analysis.right_wing.extremity_rescore_2d import load_skill
+
+        absent = "/nonexistent/path/skill.md"
+        with pytest.raises(FileNotFoundError, match="not found"):
+            load_skill(skill_path=absent)
+
+
+# ── format_batches tests ────────────────────────────────────────────────────
+
+class TestFormatBatches:
+    """Batch splitting and placeholder substitution in format_batches."""
+
+    def test_splits_into_batches(self, synthetic_motions, prompt_template):
+        from analysis.right_wing.extremity_rescore_2d import format_batches
+
+        # Exactly 100 motions -> ten full batches of ten.
+        first_hundred = synthetic_motions[:100]
+        batches = format_batches(first_hundred, prompt_template, batch_size=10)
+
+        assert isinstance(batches, list)
+        assert len(batches) == 10
+        for group in batches:
+            assert isinstance(group, list)
+            assert len(group) == 10
+            for filled in group:
+                assert "Motion" in filled
+
+    def test_uneven_batches(self, synthetic_motions, prompt_template):
+        from analysis.right_wing.extremity_rescore_2d import format_batches
+
+        # 103 motions -> ten full batches plus a remainder of three.
+        batches = format_batches(synthetic_motions, prompt_template, batch_size=10)
+        *full_batches, remainder = batches
+
+        assert len(batches) == 11
+        for group in full_batches:
+            assert len(group) == 10
+        assert len(remainder) == 3
+
+    def test_substitutes_placeholders(self, prompt_template):
+        from analysis.right_wing.extremity_rescore_2d import format_batches
+
+        motion = {
+            "motion_id": 1,
+            "title": "Test Title",
+            "text": "Test Text",
+            "layman": "Test Layman",
+        }
+        filled = format_batches([motion], prompt_template, batch_size=1)[0][0]
+        for expected in ("Test Title", "Test Text", "Test Layman"):
+            assert expected in filled
+
+
+# ── validate_single_result tests ────────────────────────────────────────────
+
+class TestValidateSingleResult:
+    """Accept/reject behaviour of validate_single_result."""
+
+    def test_valid_result(self, valid_single_result):
+        from analysis.right_wing.extremity_rescore_2d import validate_single_result
+
+        ok, err = validate_single_result(valid_single_result)
+        assert ok is True
+        assert err is None
+
+    def test_missing_field(self, valid_single_result):
+        from analysis.right_wing.extremity_rescore_2d import validate_single_result
+
+        # Copy of the valid result without the material score.
+        trimmed = {k: v for k, v in valid_single_result.items() if k != "materiele_impact"}
+        ok, err = validate_single_result(trimmed)
+        assert ok is False
+        assert "materiele_impact" in err
+
+    def test_out_of_range_high(self, valid_single_result):
+        from analysis.right_wing.extremity_rescore_2d import validate_single_result
+
+        too_high = {**valid_single_result, "stijl_extremiteit": 6}
+        ok, err = validate_single_result(too_high)
+        assert ok is False
+        assert "stijl_extremiteit" in err
+
+    def test_out_of_range_low(self, valid_single_result):
+        from analysis.right_wing.extremity_rescore_2d import validate_single_result
+
+        too_low = {**valid_single_result, "materiele_impact": 0}
+        ok, err = validate_single_result(too_low)
+        assert ok is False
+        assert "materiele_impact" in err
+
+    def test_non_integer_score(self, valid_single_result):
+        from analysis.right_wing.extremity_rescore_2d import validate_single_result
+
+        # A numeric string must not be accepted in place of an int.
+        as_text = {**valid_single_result, "stijl_extremiteit": "3"}
+        ok, err = validate_single_result(as_text)
+        assert ok is False
+        assert "stijl_extremiteit" in err
+
+
+# ── store_scores tests ──────────────────────────────────────────────────────
+
+class TestStoreScores:
+    """Round-trip tests for store_scores against a temporary DuckDB file."""
+
+    @staticmethod
+    def _fetch(db_path, sql):
+        """Run *sql* on a fresh connection and return all rows."""
+        con = duckdb.connect(db_path)
+        try:
+            return con.execute(sql).fetchall()
+        finally:
+            con.close()
+
+    @staticmethod
+    def _score(motion_id, style, style_note, material, material_note):
+        """Build one result dict in the shape store_scores expects."""
+        return {
+            "motion_id": motion_id,
+            "stijl_extremiteit": style,
+            "stijl_toelichting": style_note,
+            "materiele_impact": material,
+            "materiele_toelichting": material_note,
+        }
+
+    def test_stores_and_returns_count(self, tmp_duckdb_path):
+        from analysis.right_wing.extremity_rescore_2d import store_scores
+
+        payload = [
+            self._score(1, 3, "a", 4, "b"),
+            self._score(2, 2, "c", 1, "d"),
+        ]
+        assert store_scores(tmp_duckdb_path, payload) == 2
+
+        stored = self._fetch(
+            tmp_duckdb_path,
+            "SELECT motion_id, stylistic_score, material_score "
+            "FROM extremity_scores_2d ORDER BY motion_id",
+        )
+        assert len(stored) == 2
+        assert stored[0] == (1, 3, 4)
+        assert stored[1] == (2, 2, 1)
+
+    def test_replace_existing(self, tmp_duckdb_path):
+        from analysis.right_wing.extremity_rescore_2d import store_scores
+
+        store_scores(tmp_duckdb_path, [self._score(1, 1, "x", 1, "y")])
+
+        # Writing the same motion_id again must overwrite, not duplicate.
+        rewritten = store_scores(tmp_duckdb_path, [self._score(1, 5, "z", 5, "w")])
+        assert rewritten == 1
+
+        stored = self._fetch(
+            tmp_duckdb_path,
+            "SELECT stylistic_score, material_score FROM extremity_scores_2d WHERE motion_id = 1",
+        )
+        assert stored[0] == (5, 5)
+
+
+# ── sample_motions tests ────────────────────────────────────────────────────
+
+class TestSampleMotions:
+    """sample_motions against a temporary database with four populated buckets."""
+
+    @pytest.fixture(autouse=True)
+    def setup_db(self, tmp_duckdb_path):
+        """Create the three source tables and insert 15 motions per bucket.
+
+        Buckets 1, 2, 4 and 5 are populated (bucket 3 is left empty on
+        purpose), giving motion ids 1-60.
+        """
+        con = duckdb.connect(tmp_duckdb_path)
+        try:
+            con.execute("""
+                CREATE TABLE IF NOT EXISTS right_wing_motions (
+                    motion_id INTEGER PRIMARY KEY,
+                    classified BOOLEAN DEFAULT TRUE
+                )
+            """)
+            con.execute("""
+                CREATE TABLE IF NOT EXISTS motions (
+                    id INTEGER PRIMARY KEY,
+                    title VARCHAR,
+                    body_text VARCHAR,
+                    layman_explanation VARCHAR
+                )
+            """)
+            con.execute("""
+                CREATE TABLE IF NOT EXISTS extremity_scores (
+                    motion_id INTEGER PRIMARY KEY,
+                    text_score INTEGER,
+                    text_explanation VARCHAR,
+                    layman_score INTEGER,
+                    layman_explanation VARCHAR,
+                    error VARCHAR
+                )
+            """)
+            # Insert motions across 4 text_score buckets: 1, 2, 4, 5
+            for position, score in enumerate([1, 2, 4, 5]):
+                for i in range(15):
+                    # Consecutive ids: 1-15 for the first bucket, 16-30 for
+                    # the second, and so on.
+                    mid = position * 15 + i + 1
+                    con.execute(
+                        "INSERT INTO motions VALUES (?, ?, ?, ?)",
+                        (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
+                    )
+                    con.execute(
+                        "INSERT INTO right_wing_motions VALUES (?, TRUE)",
+                        (mid,),
+                    )
+                    con.execute(
+                        "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
+                        (mid, score, score),
+                    )
+            con.commit()
+        finally:
+            con.close()
+
+    def test_returns_stratified_sample(self, tmp_duckdb_path):
+        from collections import Counter
+
+        from analysis.right_wing.extremity_rescore_2d import sample_motions
+
+        result = sample_motions(tmp_duckdb_path, n_per_bucket=5, seed=42)
+        assert isinstance(result, list)
+        assert len(result) == 20  # 4 buckets * 5 each
+        for row in result:
+            assert "motion_id" in row
+            assert "title" in row
+            assert "text" in row
+            assert "layman" in row
+            assert "text_score" in row
+
+        # The total alone would also pass for an unbalanced sample, so check
+        # that every populated bucket contributes exactly n_per_bucket rows.
+        per_bucket = Counter(row["text_score"] for row in result)
+        assert per_bucket == {1: 5, 2: 5, 4: 5, 5: 5}
+
+    def test_respects_seed(self, tmp_duckdb_path):
+        from analysis.right_wing.extremity_rescore_2d import sample_motions
+
+        result_a = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
+        result_b = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
+        ids_a = sorted(r["motion_id"] for r in result_a)
+        ids_b = sorted(r["motion_id"] for r in result_b)
+        assert ids_a == ids_b
+
+    def test_n_per_bucket_limits(self, tmp_duckdb_path):
+        from analysis.right_wing.extremity_rescore_2d import sample_motions
+
+        result = sample_motions(tmp_duckdb_path, n_per_bucket=2, seed=1)
+        assert len(result) == 8  # 4 buckets * 2
+
+
+# ── rescore_2d dry_run tests ────────────────────────────────────────────────
+
+class TestRescore2dDryRun:
+    """rescore_2d in dry-run mode: reports the plan without dispatching."""
+
+    @pytest.fixture(autouse=True)
+    def setup_db(self, tmp_duckdb_path):
+        """Create the source tables and insert 20 motions cycling through scores 1-5."""
+        con = duckdb.connect(tmp_duckdb_path)
+        try:
+            con.execute("""
+                CREATE TABLE IF NOT EXISTS right_wing_motions (
+                    motion_id INTEGER PRIMARY KEY,
+                    classified BOOLEAN DEFAULT TRUE
+                )
+            """)
+            con.execute("""
+                CREATE TABLE IF NOT EXISTS motions (
+                    id INTEGER PRIMARY KEY,
+                    title VARCHAR,
+                    body_text VARCHAR,
+                    layman_explanation VARCHAR
+                )
+            """)
+            con.execute("""
+                CREATE TABLE IF NOT EXISTS extremity_scores (
+                    motion_id INTEGER PRIMARY KEY,
+                    text_score INTEGER,
+                    text_explanation VARCHAR,
+                    layman_score INTEGER,
+                    layman_explanation VARCHAR,
+                    error VARCHAR
+                )
+            """)
+            for motion_id in range(1, 21):
+                # Maps ids 1..20 onto scores 2,3,4,5,1,... so all five
+                # buckets receive four motions each.
+                score = (motion_id % 5) + 1
+                motion_row = (
+                    motion_id,
+                    f"Title {motion_id}",
+                    f"Text {motion_id}",
+                    f"Layman {motion_id}",
+                )
+                con.execute("INSERT INTO motions VALUES (?, ?, ?, ?)", motion_row)
+                con.execute("INSERT INTO right_wing_motions VALUES (?, TRUE)", (motion_id,))
+                con.execute(
+                    "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
+                    (motion_id, score, score),
+                )
+            con.commit()
+        finally:
+            con.close()
+
+    def test_dry_run_no_subagents(self, tmp_duckdb_path, caplog):
+        import logging
+
+        from analysis.right_wing.extremity_rescore_2d import rescore_2d
+
+        # Capture INFO records so the dry-run notice can be asserted on.
+        caplog.set_level(logging.INFO)
+
+        summary = rescore_2d(tmp_duckdb_path, n_per_bucket=3, dry_run=True)
+        assert isinstance(summary, dict)
+        assert summary.get("dry_run") is True
+        for key in ("motions_count", "batch_count"):
+            assert key in summary
+
+        logged = caplog.text.lower()
+        assert "dry run" in logged