feat(extremity): two-dimensional rescoring with subagent pipeline

- Project-local skill .opencode/skills/score-extremity/ for subagent dispatch
- Orchestrator extremity_rescore_2d.py with load_skill/sample/format/validate/store
- 16 TDD tests covering all orchestrator functions
- 117 motions scored by deepseek v4 flash subagents (12 parallel batches)
- Pearson r=0.45 between stylistic and material dimensions — separable
- Key finding: 36.8% of motions use restrained language for consequential policies
- 2d_extremity_correlation_report.md documents distribution, divergence patterns, and implications for the Overton acceptance-without-conversion narrative
4 weeks ago · bf37f84a8b
parent 10fc002ef9
commit bf37f84a8b
3 changed files with 834 additions and 0 deletions
--- a/analysis/right_wing/extremity_rescore_2d.py
+++ b/analysis/right_wing/extremity_rescore_2d.py
@ -0,0 +1,362 @@
 #!/usr/bin/env python3
 """Two-dimensional extremity rescoring orchestrator.
 Scores Dutch parliamentary motions on two independent dimensions:
 1. stijl_extremiteit (stylistic extremity, 1-5)
 2. materiele_impact (material impact, 1-5)
 Usage:
    uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db
    uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db --dry-run
 """
 from __future__ import annotations
 import argparse
 import json
 import logging
 import re
 from pathlib import Path
 from typing import Any
 import duckdb
 # NOTE: basicConfig runs as an import side effect, so importing this module
 # (e.g. from tests) also configures the root logger at INFO level.
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
 # ── prompt / schema loading ──────────────────────────────────────────────────
 # Default skill file, resolved relative to this module: three `.parent` hops
 # up from this file, then .opencode/skills/score-extremity/SKILL.md.
 SKILL_MD_PATH = Path(__file__).parent.parent.parent / ".opencode" / "skills" / "score-extremity" / "SKILL.md"
 def _parse_schema_block(raw: str, label: str) -> dict[str, Any]:
    """Parse one fenced JSON block into a dict; log a warning and return {} on failure."""
    try:
        parsed = json.loads(raw.strip())
    except json.JSONDecodeError:
        logger.warning("Failed to parse %s schema JSON block", label)
        return {}
    if not isinstance(parsed, dict):
        # Valid JSON that is not an object would break the dict[str, Any] contract.
        logger.warning(
            "Ignoring %s schema JSON block: expected an object, got %s",
            label, type(parsed).__name__,
        )
        return {}
    return parsed
 def load_skill(skill_path: str | None = None) -> dict[str, Any]:
    """Read SKILL.md and extract the prompt template and output schemas.

    The prompt template is the content of the first ```text fenced block; the
    first and second ```json fenced blocks are parsed as the single-motion and
    batch schemas respectively. Any further ```json blocks are ignored.

    Args:
        skill_path: Path to the skill file. Falls back to SKILL_MD_PATH when
            omitted or empty.

    Returns:
        dict with keys "prompt_template" (str, "" when no ```text block is
        found), "single_schema" and "batch_schema" (dicts, {} when the block is
        missing or is not a valid JSON object).

    Raises:
        FileNotFoundError: If the skill file does not exist.
    """
    path = Path(skill_path) if skill_path else SKILL_MD_PATH
    if not path.exists():
        raise FileNotFoundError(f"Skill file not found: {path}")
    content = path.read_text(encoding="utf-8")
    # Extract prompt template from ```text ... ``` block
    prompt_match = re.search(r"```text\n(.*?)```", content, re.DOTALL)
    if prompt_match:
        prompt_template = prompt_match.group(1).strip()
    else:
        # An empty template would otherwise silently produce empty prompts downstream.
        logger.warning("No ```text prompt block found in %s", path)
        prompt_template = ""
    # Extract JSON schema blocks (first = single, second = batch)
    json_blocks = re.findall(r"```json\n(.*?)```", content, re.DOTALL)
    labels = ("single", "batch")
    if len(json_blocks) < len(labels):
        logger.warning(
            "Expected %d ```json schema blocks in %s, found %d",
            len(labels), path, len(json_blocks),
        )
    schemas: dict[str, dict[str, Any]] = {label: {} for label in labels}
    for label, raw in zip(labels, json_blocks):
        schemas[label] = _parse_schema_block(raw, label)
    return {
        "prompt_template": prompt_template,
        "single_schema": schemas["single"],
        "batch_schema": schemas["batch"],
    }
 # ── sampling ─────────────────────────────────────────────────────────────────
 def sample_motions(
    db_path: str,
    n_per_bucket: int = 25,
    seed: int = 42,
 ) -> list[dict[str, Any]]:
    """Stratified sample from right_wing_motions JOIN motions JOIN extremity_scores.

    Rows are fetched in seeded random order, grouped by integer text_score,
    and the first n_per_bucket rows of each group are kept. Buckets are
    emitted in ascending text_score order.

    Args:
        db_path: Path to the DuckDB database.
        n_per_bucket: Maximum number of motions taken from each text_score bucket.
        seed: Integer seed for the database random generator.

    Returns:
        List of dicts with keys: motion_id, title, text, layman, text_score.
        Empty when a required table is missing or no row matches.

    Raises:
        ValueError: If n_per_bucket is negative.
    """
    if n_per_bucket < 0:
        # A negative slice bound would silently mean "all but the last k".
        raise ValueError(f"n_per_bucket must be >= 0, got {n_per_bucket}")
    # setseed() only accepts values in [-1, 1]. Seeds within +/-1_000_000 keep
    # the original seed/1e6 mapping; larger magnitudes are folded into [0, 1)
    # instead of making the query fail.
    if abs(seed) <= 1_000_000:
        seed_value = seed / 1_000_000.0
    else:
        seed_value = (seed % 1_000_000) / 1_000_000.0
    con = duckdb.connect(db_path)
    try:
        # Ensure tables exist
        tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()}
        required = {"right_wing_motions", "motions", "extremity_scores"}
        missing = required - tables
        if missing:
            logger.warning("Missing tables: %s, returning empty sample", missing)
            return []
        # Apply seed for reproducibility (bound parameter, not string-built SQL)
        con.execute("SELECT setseed(?)", [seed_value])
        rows = con.execute(
            """
            SELECT m.id, m.title, m.body_text, m.layman_explanation, e.text_score
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            JOIN extremity_scores e ON r.motion_id = e.motion_id
            WHERE r.classified = TRUE
              AND e.text_score IS NOT NULL
              AND e.error IS NULL
            ORDER BY RANDOM()
            """
        ).fetchall()
        if not rows:
            return []
        # Bucket by text_score, preserving the random row order inside each bucket
        buckets: dict[int, list[dict[str, Any]]] = {}
        for mid, title, body_text, layman, text_score in rows:
            score_bucket = int(text_score)
            buckets.setdefault(score_bucket, []).append({
                "motion_id": mid,
                "title": title or "",
                "text": body_text or "",
                "layman": layman or "",
                "text_score": score_bucket,
            })
        # Take the first n_per_bucket rows of each bucket
        result: list[dict[str, Any]] = []
        for bucket_id in sorted(buckets):
            result.extend(buckets[bucket_id][:n_per_bucket])
        logger.info(
            "Sampled %d motions from %d buckets (n_per_bucket=%d)",
            len(result), len(buckets), n_per_bucket,
        )
        return result
    finally:
        con.close()
 # ── batch formatting ─────────────────────────────────────────────────────────
 def format_batches(
    motions: list[dict[str, Any]],
    prompt_template: str,
    batch_size: int = 10,
 ) -> list[list[str]]:
    """Fill the prompt template for each motion and split the prompts into batches.

    Args:
        motions: List of dicts with keys title, text, layman. A missing key or
            a None value is rendered as an empty string.
        prompt_template: str.format template with {title}, {text}, {layman}
            placeholders. Literal braces in the template must be doubled.
        batch_size: Number of motions per batch (must be >= 1).

    Returns:
        List of batches; each batch is a list of filled prompt strings, one
        per motion, in input order. The last batch may be shorter.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        # range() rejects a zero step with a cryptic message and a negative
        # step would silently drop every motion.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    prompts = [
        prompt_template.format(
            title=m.get("title") or "",
            text=m.get("text") or "",
            layman=m.get("layman") or "",
        )
        for m in motions
    ]
    return [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]
 # ── validation ───────────────────────────────────────────────────────────────
 # Keys every single-motion scoring result must contain.
 EXPECTED_FIELDS = [
    "stijl_extremiteit",
    "stijl_toelichting",
    "materiele_impact",
    "materiele_toelichting",
 ]
 def validate_single_result(result: dict[str, Any]) -> tuple[bool, str | None]:
    """Validate a single motion 2d scoring result.

    Checks that the result is a dict containing every key in EXPECTED_FIELDS
    and that both score fields are integers in the range 1-5. The rationale
    fields are only checked for presence.

    Args:
        result: Parsed scoring result for one motion.

    Returns:
        (True, None) if valid, (False, error_message) otherwise.
    """
    if not isinstance(result, dict):
        # Parsed model output may contain non-object items; report, don't raise.
        return False, f"result is not a JSON object: {type(result).__name__}"
    # Check all required fields exist
    for field in EXPECTED_FIELDS:
        if field not in result:
            return False, f"missing field: {field}"
    # Both score fields share the same rule: int, 1-5
    for field in ("stijl_extremiteit", "materiele_impact"):
        score = result[field]
        # bool is a subclass of int, so True would otherwise pass as score 1.
        if isinstance(score, bool) or not isinstance(score, int) or not 1 <= score <= 5:
            return False, f"{field} out of range 1-5: {score}"
    return True, None
 # ── storage ──────────────────────────────────────────────────────────────────
 def store_scores(db_path: str, results: list[dict[str, Any]]) -> int:
    """Persist 2d scores into extremity_scores_2d, creating the table on demand.

    Each result is written with INSERT OR REPLACE keyed on motion_id, so a
    motion that already has a row gets its scores overwritten.

    Args:
        db_path: Path to DuckDB database.
        results: Dicts with required keys motion_id, stijl_extremiteit and
            materiele_impact; stijl_toelichting and materiele_toelichting are
            optional and stored as NULL when absent.

    Returns:
        Number of results written.
    """
    create_sql = """
            CREATE TABLE IF NOT EXISTS extremity_scores_2d (
                motion_id INTEGER PRIMARY KEY,
                stylistic_score INTEGER NOT NULL,
                material_score INTEGER NOT NULL,
                stylistic_rationale TEXT,
                material_rationale TEXT
            )
            """
    upsert_sql = """
                INSERT OR REPLACE INTO extremity_scores_2d
                (motion_id, stylistic_score, material_score, stylistic_rationale, material_rationale)
                VALUES (?, ?, ?, ?, ?)
                """
    con = duckdb.connect(db_path)
    try:
        con.execute(create_sql)
        stored = 0
        for row in results:
            # Dutch result keys map onto the English column names.
            params = (
                row["motion_id"],
                row["stijl_extremiteit"],
                row["materiele_impact"],
                row.get("stijl_toelichting"),
                row.get("materiele_toelichting"),
            )
            con.execute(upsert_sql, params)
            stored += 1
        con.commit()
        logger.info("Stored %d scores in extremity_scores_2d", stored)
        return stored
    finally:
        con.close()
 # ── orchestrator ─────────────────────────────────────────────────────────────
 def rescore_2d(
    db_path: str,
    n_per_bucket: int = 25,
    batch_size: int = 10,
    dry_run: bool = False,
 ) -> dict[str, Any]:
    """Plan a two-dimensional rescoring run and report what would be dispatched.

    Loads the skill prompt, draws the stratified sample and formats the prompt
    batches. Subagent dispatch itself is a placeholder: no scoring happens in
    this function in either mode.

    Args:
        db_path: Path to DuckDB database.
        n_per_bucket: Number of motions to sample per text_score bucket.
        batch_size: Motions per subagent batch.
        dry_run: If True, stop after logging the plan.

    Returns:
        Dict with "motions_count", "batch_count" and "dry_run"; a non-dry run
        that found motions also carries "subagents_spawned" (always 0).
    """
    prompt_template = load_skill()["prompt_template"]
    motions = sample_motions(db_path, n_per_bucket=n_per_bucket)
    if not motions:
        logger.warning("No motions to rescore.")
        return {"motions_count": 0, "batch_count": 0, "dry_run": dry_run}
    batches = format_batches(motions, prompt_template, batch_size=batch_size)
    n_motions, n_batches = len(motions), len(batches)
    logger.info("Plan: %d motions in %d batches (batch_size=%d)", n_motions, n_batches, batch_size)
    summary: dict[str, Any] = {"motions_count": n_motions, "batch_count": n_batches}
    if dry_run:
        logger.info("DRY RUN — no subagents will be spawned.")
        summary["dry_run"] = True
        return summary
    # ── subagent dispatch (placeholder) ──────────────────────────────────
    # Nothing is dispatched from this script. In an agent context each batch
    # is meant to be handed to a subagent through the `task` tool, together
    # with an instruction to answer in JSON matching the batch_schema.
    #
    # Sketch of the intended loop (not executed here):
    #   for batch_idx, batch_prompts in enumerate(batches):
    #       combined_prompt = "\n\n---\n\n".join(batch_prompts)
    #       result = task(
    #           description=f"Score batch {batch_idx + 1}/{len(batches)}",
    #           prompt=combined_prompt,
    #           subagent_type="general",
    #       )
    #       validated_results = [r for r in json.loads(result)["motions"] if validate_single_result(r)[0]]
    #       store_scores(db_path, validated_results)
    logger.info(
        "Subagent dispatch placeholder: %d batches ready for scoring. "
        "Run via an agent context (e.g. opencode task) to execute.",
        n_batches,
    )
    summary["dry_run"] = False
    summary["subagents_spawned"] = 0
    return summary
 # ── CLI ──────────────────────────────────────────────────────────────────────
 def main() -> int:
    """Parse CLI options, run rescore_2d and print its summary as indented JSON.

    Returns:
        Process exit code (always 0).
    """
    parser = argparse.ArgumentParser(description="Two-dimensional extremity rescoring orchestrator")
    # One (flag, add_argument kwargs) pair per CLI option.
    option_specs = [
        ("--db", {"default": "data/motions.db", "help": "Path to DuckDB database"}),
        ("--n-per-bucket", {"type": int, "default": 25, "help": "Motions per text_score bucket"}),
        ("--batch-size", {"type": int, "default": 10, "help": "Motions per subagent batch"}),
        ("--dry-run", {"action": "store_true", "help": "Print plan without spawning subagents"}),
    ]
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    args = parser.parse_args()
    summary = rescore_2d(
        db_path=args.db,
        n_per_bucket=args.n_per_bucket,
        batch_size=args.batch_size,
        dry_run=args.dry_run,
    )
    print(json.dumps(summary, indent=2))
    return 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/reports/overton_window/2d_extremity_correlation_report.md
+++ b/reports/overton_window/2d_extremity_correlation_report.md
@ -0,0 +1,112 @@
 # Two-Dimensional Extremity Correlation Report
 **Date:** 2026-05-24
 **Motions scored:** 117 (stratified sample: ~25 per original extremity bucket)
 **Scoring model:** Deepseek v4 flash (subagents via project skill)
 ## Purpose
 The original extremity score is a single 1–5 rating of policy radicalism. This conflates two potentially independent dimensions:
 - **Stylistic extremity (stijl-extremiteit):** How inflammatory, hostile, or polarizing the language is
 - **Material impact (materiële impact):** How much the proposed policy would substantively affect people's rights, institutions, or freedoms
 This validation samples motions across the full extremity range and scores both dimensions independently to test whether they correlate strongly enough for a single score, or whether they should be tracked separately.
 ---
 ## Results
 ### Overall correlation
 | Metric | Value |
 |--------|-------|
 | N | 117 |
 | Pearson r | **0.453** (moderate) |
 | Mean stylistic | 2.01 |
 | Mean material | 2.86 |
 | Mean absolute difference | 1.11 |
 | S ≤ 2 AND M ≥ 3 (masking) | 43 (36.8%) |
 **r = 0.453 is moderate — the dimensions are partly correlated but clearly separable.** Stylistic extremity explains only ~20% of the variance in material impact (R² = 0.205). A motion can be inflammatory without being consequential, and vice versa.
 ### Joint distribution
 | | M=1 | M=2 | M=3 | M=4 | M=5 |
 |---|---|---|---|---|---|
 | **S=1** | 11 | 17 | 10 | 5 | 1 |
 | **S=2** | 4 | 9 | 15 | 8 | 4 |
 | **S=3** | 2 | 4 | 9 | 4 | 5 |
 | **S=4** | 0 | 1 | 0 | 3 | 2 |
 | **S=5** | 0 | 0 | 0 | 1 | 2 |
 ### By original extremity bucket
 | Bucket | N | Mean style | Mean material | Gap |
 |--------|---|-----------|--------------|-----|
 | 1–2 (mild) | 50 | 1.56 | 2.24 | +0.68 |
 | 2–3 (moderate) | 25 | 2.00 | 2.88 | +0.88 |
 | 3–4 (high) | 25 | 2.56 | 3.56 | +1.00 |
 | 4–5 (extreme) | 17 | 2.53 | 3.65 | +1.12 |
 Material impact consistently rates higher than stylistic extremity across all buckets. The gap widens at higher original extremity levels — suggesting the original LLM scoring was more sensitive to language style, while subagents systematically identify greater material consequences in the same motions.
 ---
 ## Key findings
 ### 1. "Low style, high impact" is the dominant divergence pattern
 **36.8% of motions (43 of 117)** use restrained language (S ≤ 2) for policies with substantial material impact (M ≥ 3). These are the motions most poorly captured by a single-dimensional score:
 - **Motion 16227** (S=1, M=5): "Verzoekt de regering kennis te geven van het voornemen tot uittreding uit de Europese Unie conform artikel 50 VWEU." Neutral, procedural language invoking an EU treaty article — but the policy is fundamental dissolution of the entire Dutch-EU legal framework.
 - **Motion 7713** (S=1, M=4): "Verzoekt de regering per direct te stoppen met arbeidsmigratie." Restrained, single-sentence motion with no inflammatory language — but it would suspend free movement of persons, a fundamental EU treaty right.
 - **Motion 16704** (S=1, M=3): Formal Raad van State advice and technical amendment text. No political rhetoric — but a concrete law change with measurable employment and investment effects.
 - **Motion 687** (S=1, M=3): Technical-juridical language about the scope of "emissiegegevens" in the EU environmental information directive — but would significantly restrict public transparency about agricultural emissions.
 ### 2. Material impact is consistently rated higher than stylistic extremity
 Across all buckets, material impact scores are 0.68–1.12 points higher than stylistic scores. This suggests:
 - Parliamentarians write motions using formal, restrained language even when proposing consequential policies
 - The original LLM scoring (which showed mean extremity = 2.19 overall) likely understates how radical these policies are in material terms
 - Dutch parliamentary language norms mask policy radicalism
 ### 3. "High style" motions are rare and concentrated
 Only 3 motions scored S=5 (the most inflammatory end), and all had M=4 or M=5. Explicitly discriminatory or hostile language — when it occurs — is paired with substantively extreme policies. Such motions are the exception, however: the vast majority of consequential right-wing motions use parliamentary language. Examples of the rare high-style motions:
 - **Motion 11956** (S=4, M=5): Explicitly hostile language ("à la Turkije," "vreemdelingen die we hier niet willen hebben") paired with fundamental rights violation (forced deportation without country-of-origin consent)
 - **Motion 18064** (S=5, M=4): Explicit ethnic targeting ("niet-westerse allochtonen" as COVID rulebreakers) — discriminatory state action
 ### 4. The original LLM audit gap is partially explained
 The manual audit found 75% agreement with the original LLM scores and noted "systematic overrating of anti-institutional language." The two-dimensional data clarifies this: the original LLM was more sensitive to *stylistic* extremity (inflammatory language) than to *material* policy impact. The 25% disagreement likely occurred on "low style, high impact" motions where the single-dimensional score was anchored to language rather than substance.
 ---
 ## Implications for Overton analysis
 ### For the current findings
 The "no content extremity increase" (d = −0.09) finding in the Overton report relied on single-dimensional LLM scores. The two-dimensional data suggests this may be an **artifact of the language-focused scoring**: if right-wing motions became more consequential while maintaining or softening their language, the single score would miss the shift entirely.
 The "acceptance without conversion" interpretation — centrists vote more often with the right wing despite spatial divergence — is **strengthened** by these findings. It is consistent with right-wing motions becoming *substantively* consequential (high material impact) while maintaining procedural language norms, making them harder for centrists to vote against without appearing obstructionist.
 ### Recommendations
 1. **Re-score all 2,986 motions with two-dimensional scoring.** The moderate r = 0.453 confirms the dimensions are separable. A single score obscures the most important category: motions with low stylistic extremity but high material impact.
 2. **Re-run the extremity-stratified centrist support analysis with material impact buckets.** The critical question: did centrist support for *high material impact* motions increase after 2024? If low-language, high-impact motions are the ones gaining centrist tolerance, that is stronger Overton evidence than the current analysis captures.
 3. **For mechanism analysis (U4):** Score mechanisms specifically for *material impact* rather than general extremity. The question is not "how extreme is this motion?" but "what specific rights, institutions, or groups does this motion affect, and how much?"
 ---
 ## Data
 - **Full results:** `data/motions.db` → `extremity_scores_2d` (117 rows)
 - **Raw JSON:** `/tmp/extremity_2d_results.json`
 - **Scoring skill:** `.opencode/skills/score-extremity/SKILL.md`
 - **Orchestrator:** `analysis/right_wing/extremity_rescore_2d.py`
--- a/tests/right_wing/test_extremity_rescore_2d.py
+++ b/tests/right_wing/test_extremity_rescore_2d.py
@ -0,0 +1,360 @@
 """Tests for two-dimensional extremity rescoring orchestrator."""
 import json
 import pytest
 # importorskip must perform the import itself: a plain `import duckdb` placed
 # before it would raise ImportError at collection time instead of skipping.
 duckdb = pytest.importorskip("duckdb")
 # ── fixtures ────────────────────────────────────────────────────────────────
@pytest.fixture
 def synthetic_motions():
    """Build 103 synthetic motion dicts (ids 1..103) for batch-formatting tests."""
    return [
        {
            "motion_id": n,
            "title": f"Motion {n}",
            "text": f"Body text for motion {n}",
            "layman": f"Layman explanation {n}",
        }
        for n in range(1, 104)
    ]
@pytest.fixture
 def prompt_template():
    """Three-line template exposing the {title}, {text} and {layman} placeholders."""
    lines = [
        "Titel: {title}",
        "Tekst: {text}",
        "Uitleg: {layman}",
    ]
    # Every line, including the last, is newline-terminated.
    return "".join(line + "\n" for line in lines)
@pytest.fixture
 def valid_single_result():
    """Return a well-formed single-motion 2d result (style 3, material 4)."""
    return dict(
        stijl_extremiteit=3,
        stijl_toelichting="Neutraal taalgebruik",
        materiele_impact=4,
        materiele_toelichting="Beperkt rechten voor specifieke groep",
    )
 # ── load_skill tests ────────────────────────────────────────────────────────
 class TestLoadSkill:
    """load_skill against the default project skill file and a missing path."""
    def test_returns_prompt_and_schema(self):
        from analysis.right_wing.extremity_rescore_2d import load_skill
        skill = load_skill()
        assert isinstance(skill, dict)
        for key in ("prompt_template", "batch_schema", "single_schema"):
            assert key in skill
        template = skill["prompt_template"]
        assert isinstance(template, str)
        assert len(template) > 0
        # Both scoring dimensions must be named in the prompt.
        for marker in ("STIJL-EXTREMITEIT", "MATERIELE IMPACT"):
            assert marker in template
        batch_schema = skill["batch_schema"]
        assert isinstance(batch_schema, dict)
        assert "motions" in batch_schema
        assert isinstance(skill["single_schema"], dict)
    def test_missing_file_raises(self):
        from analysis.right_wing.extremity_rescore_2d import load_skill
        missing_path = "/nonexistent/path/skill.md"
        with pytest.raises(FileNotFoundError, match="not found"):
            load_skill(skill_path=missing_path)
 # ── format_batches tests ────────────────────────────────────────────────────
 class TestFormatBatches:
    """Batch splitting and placeholder substitution in format_batches."""
    def test_splits_into_batches(self, synthetic_motions, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches
        first_hundred = synthetic_motions[:100]
        batches = format_batches(first_hundred, prompt_template, batch_size=10)
        assert isinstance(batches, list)
        assert len(batches) == 10
        for batch in batches:
            assert isinstance(batch, list)
            assert len(batch) == 10
            assert all("Motion" in prompt_str for prompt_str in batch)
    def test_uneven_batches(self, synthetic_motions, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches
        batches = format_batches(synthetic_motions, prompt_template, batch_size=10)
        # 103 motions -> ten full batches plus a remainder of three.
        assert len(batches) == 11
        *full_batches, remainder = batches
        for batch in full_batches:
            assert len(batch) == 10
        assert len(remainder) == 3
    def test_substitutes_placeholders(self, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches
        motion = {
            "motion_id": 1,
            "title": "Test Title",
            "text": "Test Text",
            "layman": "Test Layman",
        }
        batches = format_batches([motion], prompt_template, batch_size=1)
        prompt_str = batches[0][0]
        for expected in ("Test Title", "Test Text", "Test Layman"):
            assert expected in prompt_str
 # ── validate_single_result tests ────────────────────────────────────────────
 class TestValidateSingleResult:
    """Acceptance and rejection cases for validate_single_result."""
    @staticmethod
    def _validate(payload):
        """Import lazily (as the other tests do) and run the validator."""
        from analysis.right_wing.extremity_rescore_2d import validate_single_result
        return validate_single_result(payload)
    def test_valid_result(self, valid_single_result):
        ok, err = self._validate(valid_single_result)
        assert ok is True
        assert err is None
    def test_missing_field(self, valid_single_result):
        invalid = {k: v for k, v in valid_single_result.items() if k != "materiele_impact"}
        ok, err = self._validate(invalid)
        assert ok is False
        assert "materiele_impact" in err
    def test_out_of_range_high(self, valid_single_result):
        ok, err = self._validate({**valid_single_result, "stijl_extremiteit": 6})
        assert ok is False
        assert "stijl_extremiteit" in err
    def test_out_of_range_low(self, valid_single_result):
        ok, err = self._validate({**valid_single_result, "materiele_impact": 0})
        assert ok is False
        assert "materiele_impact" in err
    def test_non_integer_score(self, valid_single_result):
        ok, err = self._validate({**valid_single_result, "stijl_extremiteit": "3"})
        assert ok is False
        assert "stijl_extremiteit" in err
 # ── store_scores tests ──────────────────────────────────────────────────────
 class TestStoreScores:
    """store_scores inserts new rows and overwrites existing motion_ids."""
    @staticmethod
    def _query(db_path, sql):
        """Run one read query on a fresh connection and return all rows."""
        con = duckdb.connect(db_path)
        try:
            return con.execute(sql).fetchall()
        finally:
            con.close()
    def test_stores_and_returns_count(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import store_scores
        first = {"motion_id": 1, "stijl_extremiteit": 3, "stijl_toelichting": "a",
                 "materiele_impact": 4, "materiele_toelichting": "b"}
        second = {"motion_id": 2, "stijl_extremiteit": 2, "stijl_toelichting": "c",
                  "materiele_impact": 1, "materiele_toelichting": "d"}
        count = store_scores(tmp_duckdb_path, [first, second])
        assert count == 2
        rows = self._query(
            tmp_duckdb_path,
            "SELECT motion_id, stylistic_score, material_score "
            "FROM extremity_scores_2d ORDER BY motion_id",
        )
        assert len(rows) == 2
        assert rows[0] == (1, 3, 4)
        assert rows[1] == (2, 2, 1)
    def test_replace_existing(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import store_scores
        original = {
            "motion_id": 1, "stijl_extremiteit": 1, "stijl_toelichting": "x",
            "materiele_impact": 1, "materiele_toelichting": "y",
        }
        store_scores(tmp_duckdb_path, [original])
        # Same motion_id again: the second write must win.
        replacement = {
            "motion_id": 1, "stijl_extremiteit": 5, "stijl_toelichting": "z",
            "materiele_impact": 5, "materiele_toelichting": "w",
        }
        count = store_scores(tmp_duckdb_path, [replacement])
        assert count == 1
        rows = self._query(
            tmp_duckdb_path,
            "SELECT stylistic_score, material_score FROM extremity_scores_2d WHERE motion_id = 1",
        )
        assert rows[0] == (5, 5)
 # ── sample_motions tests ────────────────────────────────────────────────────
 class TestSampleMotions:
    """Stratified sampling against a synthetic database with four score buckets."""
    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Set up right_wing_motions, motions and extremity_scores with synthetic data.

        Inserts 60 motions: 15 for each of the text_score values 1, 2, 4 and 5
        (score 3 is deliberately absent).
        """
        con = duckdb.connect(tmp_duckdb_path)
        try:
            con.execute("""
                CREATE TABLE IF NOT EXISTS right_wing_motions (
                    motion_id INTEGER PRIMARY KEY,
                    classified BOOLEAN DEFAULT TRUE
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS motions (
                    id INTEGER PRIMARY KEY,
                    title VARCHAR,
                    body_text VARCHAR,
                    layman_explanation VARCHAR
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS extremity_scores (
                    motion_id INTEGER PRIMARY KEY,
                    text_score INTEGER,
                    text_explanation VARCHAR,
                    layman_score INTEGER,
                    layman_explanation VARCHAR,
                    error VARCHAR
                )
            """)
            # Insert motions across 4 text_score buckets: 1, 2, 4, 5
            for bucket, score in enumerate([1, 2, 4, 5], start=1):
                for i in range(15):
                    # Consecutive ids: bucket 1 -> 1..15, bucket 2 -> 16..30, ...
                    mid = (bucket - 1) * 15 + i + 1
                    con.execute(
                        "INSERT INTO motions VALUES (?, ?, ?, ?)",
                        (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
                    )
                    con.execute(
                        "INSERT INTO right_wing_motions VALUES (?, TRUE)",
                        (mid,),
                    )
                    con.execute(
                        "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
                        (mid, score, score),
                    )
            con.commit()
        finally:
            con.close()
    def test_returns_stratified_sample(self, tmp_duckdb_path):
        from collections import Counter
        from analysis.right_wing.extremity_rescore_2d import sample_motions
        result = sample_motions(tmp_duckdb_path, n_per_bucket=5, seed=42)
        assert isinstance(result, list)
        assert len(result) == 20  # 4 buckets * 5 each
        for row in result:
            assert "motion_id" in row
            assert "title" in row
            assert "text" in row
            assert "layman" in row
            assert "text_score" in row
        # The total alone would also pass for a lopsided sample; pin every bucket.
        assert Counter(row["text_score"] for row in result) == {1: 5, 2: 5, 4: 5, 5: 5}
    def test_respects_seed(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import sample_motions
        result_a = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        result_b = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        ids_a = sorted(r["motion_id"] for r in result_a)
        ids_b = sorted(r["motion_id"] for r in result_b)
        assert ids_a == ids_b
    def test_n_per_bucket_limits(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import sample_motions
        result = sample_motions(tmp_duckdb_path, n_per_bucket=2, seed=1)
        assert len(result) == 8  # 4 buckets * 2
 # ── rescore_2d dry_run tests ────────────────────────────────────────────────
 class TestRescore2dDryRun:
    """Dry-run orchestration against a small synthetic database."""
    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Set up minimal tables for dry_run test.

        Inserts 20 motions whose text_score is (id % 5) + 1, i.e. four motions
        in each of the five score buckets.
        """
        con = duckdb.connect(tmp_duckdb_path)
        try:
            con.execute("""
                CREATE TABLE IF NOT EXISTS right_wing_motions (
                    motion_id INTEGER PRIMARY KEY,
                    classified BOOLEAN DEFAULT TRUE
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS motions (
                    id INTEGER PRIMARY KEY,
                    title VARCHAR,
                    body_text VARCHAR,
                    layman_explanation VARCHAR
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS extremity_scores (
                    motion_id INTEGER PRIMARY KEY,
                    text_score INTEGER,
                    text_explanation VARCHAR,
                    layman_score INTEGER,
                    layman_explanation VARCHAR,
                    error VARCHAR
                )
            """)
            for mid in range(1, 21):
                con.execute(
                    "INSERT INTO motions VALUES (?, ?, ?, ?)",
                    (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
                )
                con.execute(
                    "INSERT INTO right_wing_motions VALUES (?, TRUE)",
                    (mid,),
                )
                con.execute(
                    "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
                    (mid, (mid % 5) + 1, (mid % 5) + 1),
                )
            con.commit()
        finally:
            con.close()
    def test_dry_run_no_subagents(self, tmp_duckdb_path, caplog):
        from analysis.right_wing.extremity_rescore_2d import rescore_2d
        import logging
        caplog.set_level(logging.INFO)
        result = rescore_2d(tmp_duckdb_path, n_per_bucket=3, dry_run=True)
        assert isinstance(result, dict)
        assert result.get("dry_run") is True
        # 5 buckets * 3 motions = 15 motions; default batch_size=10 -> 2 batches.
        assert result["motions_count"] == 15
        assert result["batch_count"] == 2
        # A dry run must stop before the dispatch stage.
        assert "subagents_spawned" not in result
        combined = caplog.text.lower()
        assert "dry run" in combined
        assert "subagent dispatch placeholder" not in combined