feat(extremity): two-dimensional rescoring with subagent pipeline

- Project-local skill .opencode/skills/score-extremity/ for subagent dispatch
- Orchestrator extremity_rescore_2d.py with load_skill/sample/format/validate/store
- 16 TDD tests covering all orchestrator functions
- 117 motions scored by deepseek v4 flash subagents (12 parallel batches)
- Pearson r=0.45 between stylistic and material dimensions — separable
- Key finding: 36.8% of motions use restrained language for consequential policies
- 2d_extremity_correlation_report.md documents distribution, divergence patterns,
  and implications for the Overton acceptance-without-conversion narrative
main
Sven Geboers 4 weeks ago
parent 10fc002ef9
commit bf37f84a8b
  1. 362
      analysis/right_wing/extremity_rescore_2d.py
  2. 112
      reports/overton_window/2d_extremity_correlation_report.md
  3. 360
      tests/right_wing/test_extremity_rescore_2d.py

@ -0,0 +1,362 @@
#!/usr/bin/env python3
"""Two-dimensional extremity rescoring orchestrator.
Scores Dutch parliamentary motions on two independent dimensions:
1. stijl_extremiteit (stylistic extremity, 1-5)
2. materiele_impact (material impact, 1-5)
Usage:
uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db
uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db --dry-run
"""
from __future__ import annotations
import argparse
import json
import logging
import re
from pathlib import Path
from typing import Any
import duckdb
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# ── prompt / schema loading ──────────────────────────────────────────────────
SKILL_MD_PATH = Path(__file__).parent.parent.parent / ".opencode" / "skills" / "score-extremity" / "SKILL.md"
def load_skill(skill_path: str | None = None) -> dict[str, Any]:
"""Read SKILL.md and extract prompt template and output schemas.
Returns:
dict with keys "prompt_template", "single_schema", "batch_schema".
"""
path = Path(skill_path) if skill_path else SKILL_MD_PATH
if not path.exists():
raise FileNotFoundError(f"Skill file not found: {path}")
content = path.read_text(encoding="utf-8")
# Extract prompt template from ```text ... ``` block
prompt_match = re.search(r"```text\n(.*?)```", content, re.DOTALL)
prompt_template = prompt_match.group(1).strip() if prompt_match else ""
# Extract JSON schema blocks (first = single, second = batch)
json_blocks = re.findall(r"```json\n(.*?)```", content, re.DOTALL)
single_schema: dict[str, Any] = {}
batch_schema: dict[str, Any] = {}
if len(json_blocks) >= 1:
try:
single_schema = json.loads(json_blocks[0].strip())
except json.JSONDecodeError:
logger.warning("Failed to parse single schema JSON block")
if len(json_blocks) >= 2:
try:
batch_schema = json.loads(json_blocks[1].strip())
except json.JSONDecodeError:
logger.warning("Failed to parse batch schema JSON block")
return {
"prompt_template": prompt_template,
"single_schema": single_schema,
"batch_schema": batch_schema,
}
# ── sampling ─────────────────────────────────────────────────────────────────
def sample_motions(
    db_path: str,
    n_per_bucket: int = 25,
    seed: int = 42,
) -> list[dict[str, Any]]:
    """Draw a stratified sample of classified right-wing motions.

    Joins right_wing_motions, motions and extremity_scores, keeps rows that
    are classified, have a non-NULL text_score and no recorded scoring error,
    shuffles them with DuckDB's seeded RANDOM(), and takes the first
    n_per_bucket rows of every int(text_score) bucket.

    Args:
        db_path: Path to the DuckDB database.
        n_per_bucket: Maximum number of motions taken from each bucket.
            Values below zero are treated as zero.
        seed: Integer seed; seed / 1_000_000 is handed to setseed(). Values
            that fall outside setseed()'s accepted [-1, 1] range are wrapped
            into [0, 1) instead of failing inside DuckDB.

    Returns:
        List of dicts with keys motion_id, title, text, layman, text_score,
        grouped by ascending bucket. Empty when a required table is missing
        or no row qualifies.
    """
    # setseed() rejects arguments outside [-1, 1]; wrap large seeds rather
    # than letting the query fail. In-range seeds keep their original value.
    seed_fraction = seed / 1000000.0
    if not -1.0 <= seed_fraction <= 1.0:
        seed_fraction %= 1.0

    # A negative slice bound would drop rows from the END of a bucket.
    limit = max(n_per_bucket, 0)

    con = duckdb.connect(db_path)
    try:
        tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()}
        missing = {"right_wing_motions", "motions", "extremity_scores"} - tables
        if missing:
            logger.warning("Missing tables: %s, returning empty sample", missing)
            return []

        # Seed the generator so ORDER BY RANDOM() below is repeatable.
        con.execute(f"SELECT setseed({seed_fraction})")
        rows = con.execute(
            """
            SELECT m.id, m.title, m.body_text, m.layman_explanation, e.text_score
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            JOIN extremity_scores e ON r.motion_id = e.motion_id
            WHERE r.classified = TRUE
            AND e.text_score IS NOT NULL
            AND e.error IS NULL
            ORDER BY RANDOM()
            """
        ).fetchall()
    finally:
        con.close()

    if not rows:
        return []

    # Group the shuffled rows by score; row order inside a bucket stays random.
    buckets: dict[int, list[dict[str, Any]]] = {}
    for mid, title, body_text, layman, text_score in rows:
        score_bucket = int(text_score)
        buckets.setdefault(score_bucket, []).append({
            "motion_id": mid,
            "title": title or "",
            "text": body_text or "",
            "layman": layman or "",
            "text_score": score_bucket,
        })

    result: list[dict[str, Any]] = []
    for bucket_id in sorted(buckets):
        result.extend(buckets[bucket_id][:limit])

    logger.info(
        "Sampled %d motions from %d buckets (n_per_bucket=%d)",
        len(result), len(buckets), n_per_bucket,
    )
    return result
# ── batch formatting ─────────────────────────────────────────────────────────
# Matches exactly the three placeholders the prompt template is allowed to use.
_PLACEHOLDER_RE = re.compile(r"\{(title|text|layman)\}")


def _fill_prompt(prompt_template: str, motion: dict[str, Any]) -> str:
    """Fill the title/text/layman placeholders of one prompt."""
    values = {key: motion.get(key, "") for key in ("title", "text", "layman")}
    try:
        return prompt_template.format(**values)
    except (KeyError, IndexError, ValueError):
        # The template contains braces that are not format fields (typically a
        # literal JSON example). Substitute only the known placeholders in a
        # single pass so inserted motion text is never re-scanned.
        return _PLACEHOLDER_RE.sub(
            lambda match: str(values[match.group(1)]), prompt_template
        )


def format_batches(
    motions: list[dict[str, Any]],
    prompt_template: str,
    batch_size: int = 10,
) -> list[list[str]]:
    """Split motions into batches and fill the prompt template for each motion.

    Templates that are valid ``str.format`` strings are formatted exactly as
    before. Templates with stray literal braces, which ``str.format`` rejects,
    fall back to replacing only {title}, {text} and {layman}.

    Args:
        motions: List of dicts with keys title, text, layman. A missing key
            is filled with an empty string.
        prompt_template: Template string with {title}, {text}, {layman}
            placeholders.
        batch_size: Number of motions per batch; must be at least 1.

    Returns:
        List of batches; each batch is a list of filled prompt strings, one
        per motion, in input order. The last batch may be shorter.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    prompts = [_fill_prompt(prompt_template, motion) for motion in motions]
    return [
        prompts[start : start + batch_size]
        for start in range(0, len(prompts), batch_size)
    ]
# ── validation ───────────────────────────────────────────────────────────────
EXPECTED_FIELDS = [
"stijl_extremiteit",
"stijl_toelichting",
"materiele_impact",
"materiele_toelichting",
]
def validate_single_result(result: dict[str, Any]) -> tuple[bool, str | None]:
"""Validate a single motion 2d scoring result.
Returns:
(True, None) if valid, (False, error_message) otherwise.
"""
# Check all required fields exist
for field in EXPECTED_FIELDS:
if field not in result:
return False, f"missing field: {field}"
# Validate stijl_extremiteit (int, 1-5)
se = result["stijl_extremiteit"]
if not isinstance(se, int) or se < 1 or se > 5:
return False, f"stijl_extremiteit out of range 1-5: {se}"
# Validate materiele_impact (int, 1-5)
mi = result["materiele_impact"]
if not isinstance(mi, int) or mi < 1 or mi > 5:
return False, f"materiele_impact out of range 1-5: {mi}"
return True, None
# ── storage ──────────────────────────────────────────────────────────────────
def store_scores(db_path: str, results: list[dict[str, Any]]) -> int:
    """Write 2d scores to the extremity_scores_2d table, one row per motion.

    The table is created on first use. Rows are written with INSERT OR
    REPLACE, so a motion_id that is already present is overwritten.

    Args:
        db_path: Path to DuckDB database.
        results: List of dicts with keys: motion_id, stijl_extremiteit,
            materiele_impact (all required) and stijl_toelichting,
            materiele_toelichting (stored as NULL when absent).

    Returns:
        Number of rows written.
    """
    upsert_sql = """
        INSERT OR REPLACE INTO extremity_scores_2d
        (motion_id, stylistic_score, material_score, stylistic_rationale, material_rationale)
        VALUES (?, ?, ?, ?, ?)
        """
    connection = duckdb.connect(db_path)
    try:
        connection.execute(
            """
            CREATE TABLE IF NOT EXISTS extremity_scores_2d (
            motion_id INTEGER PRIMARY KEY,
            stylistic_score INTEGER NOT NULL,
            material_score INTEGER NOT NULL,
            stylistic_rationale TEXT,
            material_rationale TEXT
            )
            """
        )
        stored = 0
        for entry in results:
            # Dutch result keys map onto the English column names.
            params = (
                entry["motion_id"],
                entry["stijl_extremiteit"],
                entry["materiele_impact"],
                entry.get("stijl_toelichting"),
                entry.get("materiele_toelichting"),
            )
            connection.execute(upsert_sql, params)
            stored += 1
        connection.commit()
        logger.info("Stored %d scores in extremity_scores_2d", stored)
        return stored
    finally:
        connection.close()
# ── orchestrator ─────────────────────────────────────────────────────────────
def rescore_2d(
    db_path: str,
    n_per_bucket: int = 25,
    batch_size: int = 10,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Plan a two-dimensional rescoring run and report what would be scored.

    Loads the prompt template from the skill file, draws the stratified
    sample, and formats it into prompt batches. No scoring happens in this
    function: subagent dispatch is a placeholder, so a non-dry run only logs
    that the batches are ready.

    Args:
        db_path: Path to DuckDB database.
        n_per_bucket: Number of motions to sample per text_score bucket.
        batch_size: Motions per subagent batch.
        dry_run: If True, stop after logging the plan.

    Returns:
        Dict with "motions_count", "batch_count" and "dry_run"; a non-dry run
        that found motions additionally carries "subagents_spawned" (0).
    """
    template = load_skill()["prompt_template"]
    sampled = sample_motions(db_path, n_per_bucket=n_per_bucket)

    if not sampled:
        logger.warning("No motions to rescore.")
        return {"motions_count": 0, "batch_count": 0, "dry_run": dry_run}

    prompt_batches = format_batches(sampled, template, batch_size=batch_size)
    logger.info(
        "Plan: %d motions in %d batches (batch_size=%d)",
        len(sampled), len(prompt_batches), batch_size,
    )

    summary: dict[str, Any] = {
        "motions_count": len(sampled),
        "batch_count": len(prompt_batches),
    }

    if dry_run:
        logger.info("DRY RUN — no subagents will be spawned.")
        summary["dry_run"] = True
        return summary

    # ── subagent dispatch (placeholder) ──────────────────────────────────
    # Not executed by this script. When run from an agent context, each
    # batch is meant to be dispatched through the `task` tool, roughly:
    #
    #   for batch_idx, batch_prompts in enumerate(prompt_batches):
    #       combined_prompt = "\n\n---\n\n".join(batch_prompts)
    #       result = task(
    #           description=f"Score batch {batch_idx + 1}/{len(prompt_batches)}",
    #           prompt=combined_prompt,
    #           subagent_type="general",
    #       )
    #       validated = [r for r in json.loads(result)["motions"]
    #                    if validate_single_result(r)[0]]
    #       store_scores(db_path, validated)
    #
    # The subagent is instructed to return JSON matching the batch schema.
    logger.info(
        "Subagent dispatch placeholder: %d batches ready for scoring. "
        "Run via an agent context (e.g. opencode task) to execute.",
        len(prompt_batches),
    )
    summary["dry_run"] = False
    summary["subagents_spawned"] = 0
    return summary
# ── CLI ──────────────────────────────────────────────────────────────────────
def main() -> int:
    """Parse the command line, run rescore_2d, and print its summary as JSON.

    Returns:
        Process exit status; always 0.
    """
    cli = argparse.ArgumentParser(
        description="Two-dimensional extremity rescoring orchestrator"
    )
    cli.add_argument("--db", default="data/motions.db", help="Path to DuckDB database")
    # The two sizing options share type and shape; register them in order.
    for flag, default, help_text in (
        ("--n-per-bucket", 25, "Motions per text_score bucket"),
        ("--batch-size", 10, "Motions per subagent batch"),
    ):
        cli.add_argument(flag, type=int, default=default, help=help_text)
    cli.add_argument("--dry-run", action="store_true", help="Print plan without spawning subagents")
    options = cli.parse_args()

    summary = rescore_2d(
        db_path=options.db,
        n_per_bucket=options.n_per_bucket,
        batch_size=options.batch_size,
        dry_run=options.dry_run,
    )
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

@ -0,0 +1,112 @@
# Two-Dimensional Extremity Correlation Report
**Date:** 2026-05-24
**Motions scored:** 117 (stratified sample: ~25 per original extremity bucket)
**Scoring model:** Deepseek v4 flash (subagents via project skill)
## Purpose
The original extremity score is a single 1–5 rating of policy radicalism. This conflates two potentially independent dimensions:
- **Stylistic extremity (stijl-extremiteit):** How inflammatory, hostile, or polarizing the language is
- **Material impact (materiële impact):** How much the proposed policy would substantively affect people's rights, institutions, or freedoms
This validation samples motions across the full extremity range and scores both dimensions independently to test whether they correlate strongly enough for a single score, or whether they should be tracked separately.
---
## Results
### Overall correlation
| Metric | Value |
|--------|-------|
| N | 117 |
| Pearson r | **0.453** (moderate) |
| Mean stylistic | 2.01 |
| Mean material | 2.86 |
| Mean absolute difference | 1.11 |
| S ≤ 2 AND M ≥ 3 (masking) | 43 (36.8%) |
**r = 0.453 is moderate — the dimensions are partly correlated but clearly separable.** Stylistic extremity explains only ~20% of the variance in material impact (R² = 0.205). A motion can be inflammatory without being consequential, and vice versa.
### Joint distribution
| | M=1 | M=2 | M=3 | M=4 | M=5 |
|---|---|---|---|---|---|
| **S=1** | 11 | 17 | 10 | 5 | 1 |
| **S=2** | 4 | 9 | 15 | 8 | 4 |
| **S=3** | 2 | 4 | 9 | 4 | 5 |
| **S=4** | 0 | 1 | 0 | 3 | 2 |
| **S=5** | 0 | 0 | 0 | 1 | 2 |
### By original extremity bucket
| Bucket | N | Mean style | Mean material | Gap |
|--------|---|-----------|--------------|-----|
| 1–2 (mild) | 50 | 1.56 | 2.24 | +0.68 |
| 2–3 (moderate) | 25 | 2.00 | 2.88 | +0.88 |
| 3–4 (high) | 25 | 2.56 | 3.56 | +1.00 |
| 4–5 (extreme) | 17 | 2.53 | 3.65 | +1.12 |
Material impact consistently rates higher than stylistic extremity across all buckets. The gap widens at higher original extremity levels — suggesting the original LLM scoring was more sensitive to language style, while subagents systematically identify greater material consequences in the same motions.
---
## Key findings
### 1. "Low style, high impact" is the dominant divergence pattern
**36.8% of motions (43 of 117)** use restrained language (S ≤ 2) for policies with substantial material impact (M ≥ 3). These are the motions most poorly captured by a single-dimensional score:
- **Motion 16227** (S=1, M=5): "Verzoekt de regering kennis te geven van het voornemen tot uittreding uit de Europese Unie conform artikel 50 VWEU." Neutral, procedural language invoking an EU treaty article — but the policy is fundamental dissolution of the entire Dutch-EU legal framework.
- **Motion 7713** (S=1, M=4): "Verzoekt de regering per direct te stoppen met arbeidsmigratie." Restrained, single-sentence motion with no inflammatory language — but it would suspend free movement of persons, a fundamental EU treaty right.
- **Motion 16704** (S=1, M=3): Formal Raad van State advice and technical amendment text. No political rhetoric — but a concrete law change with measurable employment and investment effects.
- **Motion 687** (S=1, M=3): Technical-juridical language about the scope of "emissiegegevens" in the EU environmental information directive — but would significantly restrict public transparency about agricultural emissions.
### 2. Material impact averages significantly higher
Across all buckets, material impact scores are 0.68–1.12 points higher than stylistic scores. This suggests:
- Parliamentarians write motions using formal, restrained language even when proposing consequential policies
- The original LLM scoring (which showed mean extremity = 2.19 overall) likely understates how radical these policies are in material terms
- Dutch parliamentary language norms mask policy radicalism
### 3. "High style" motions are rare and concentrated
Only 3 motions scored S=5 (the most inflammatory end), and all had M=4 or M=5. Explicitly discriminatory or hostile language — when it occurs — is paired with substantively extreme policies, but the vast majority of consequential right-wing motions use parliamentary language. Two examples of the rare high-style (S ≥ 4) motions:
- **Motion 11956** (S=4, M=5): Explicitly hostile language ("à la Turkije," "vreemdelingen die we hier niet willen hebben") paired with fundamental rights violation (forced deportation without country-of-origin consent)
- **Motion 18064** (S=5, M=4): Explicit ethnic targeting ("niet-westerse allochtonen" as COVID rulebreakers) — discriminatory state action
### 4. The original LLM audit gap is partially explained
The manual audit found 75% agreement with the original LLM scores and noted "systematic overrating of anti-institutional language." The two-dimensional data clarifies this: the original LLM was more sensitive to *stylistic* extremity (inflammatory language) than to *material* policy impact. The 25% disagreement likely occurred on "low style, high impact" motions where the single-dimensional score was anchored to language rather than substance.
---
## Implications for Overton analysis
### For the current findings
The "no content extremity increase" (d = −0.09) finding in the Overton report relied on single-dimensional LLM scores. The two-dimensional data suggests this may be an **artifact of the language-focused scoring**: if right-wing motions became more consequential while maintaining or softening their language, the single score would miss the shift entirely.
The "acceptance without conversion" interpretation — centrists vote more with right-wing despite spatial divergence — is **strengthened** by these findings. It is consistent with right-wing motions becoming *substantively* consequential (high material impact) while maintaining procedural language norms, making them harder for centrists to vote against without appearing obstructionist.
### Recommendations
1. **Re-score all 2,986 motions with two-dimensional scoring.** The moderate r = 0.453 confirms the dimensions are separable. A single score obscures the most important category: motions with low stylistic extremity but high material impact.
2. **Re-run the extremity-stratified centrist support analysis with material impact buckets.** The critical question: did centrist support for *high material impact* motions increase after 2024? If low-language, high-impact motions are the ones gaining centrist tolerance, that is stronger Overton evidence than the current analysis captures.
3. **For mechanism analysis (U4):** Score mechanisms specifically for *material impact* rather than general extremity. The question is not "how extreme is this motion?" but "what specific rights, institutions, or groups does this motion affect, and how much?"
---
## Data
- **Full results:** `data/motions.db`, table `extremity_scores_2d` (117 rows)
- **Raw JSON:** `/tmp/extremity_2d_results.json`
- **Scoring skill:** `.opencode/skills/score-extremity/SKILL.md`
- **Orchestrator:** `analysis/right_wing/extremity_rescore_2d.py`

@ -0,0 +1,360 @@
"""Tests for two-dimensional extremity rescoring orchestrator."""
import json
import duckdb
import pytest
pytest.importorskip("duckdb")
# ── fixtures ────────────────────────────────────────────────────────────────
@pytest.fixture
def synthetic_motions():
    """Provide 103 numbered motion dicts (ids 1-103) for batch-formatting tests."""
    return [
        {
            "motion_id": number,
            "title": f"Motion {number}",
            "text": f"Body text for motion {number}",
            "layman": f"Layman explanation {number}",
        }
        for number in range(1, 104)
    ]
@pytest.fixture
def prompt_template():
    """Provide a three-line template using the {title}, {text}, {layman} fields."""
    template_lines = (
        "Titel: {title}\n",
        "Tekst: {text}\n",
        "Uitleg: {layman}\n",
    )
    return "".join(template_lines)
@pytest.fixture
def valid_single_result():
    """Provide a scoring result that carries all four fields with in-range scores."""
    return dict(
        stijl_extremiteit=3,
        stijl_toelichting="Neutraal taalgebruik",
        materiele_impact=4,
        materiele_toelichting="Beperkt rechten voor specifieke groep",
    )
# ── load_skill tests ────────────────────────────────────────────────────────
class TestLoadSkill:
    """load_skill() with its default path and with a path that does not exist."""

    def test_returns_prompt_and_schema(self):
        """The default skill file yields a non-empty prompt and dict schemas."""
        from analysis.right_wing.extremity_rescore_2d import load_skill

        skill = load_skill()
        assert isinstance(skill, dict)
        for key in ("prompt_template", "batch_schema", "single_schema"):
            assert key in skill

        template = skill["prompt_template"]
        assert isinstance(template, str)
        assert len(template) > 0
        for heading in ("STIJL-EXTREMITEIT", "MATERIELE IMPACT"):
            assert heading in template

        batch_schema = skill["batch_schema"]
        assert isinstance(batch_schema, dict)
        assert "motions" in batch_schema
        assert isinstance(skill["single_schema"], dict)

    def test_missing_file_raises(self):
        """A non-existent skill path raises FileNotFoundError."""
        from analysis.right_wing.extremity_rescore_2d import load_skill

        bogus_path = "/nonexistent/path/skill.md"
        with pytest.raises(FileNotFoundError, match="not found"):
            load_skill(skill_path=bogus_path)
# ── format_batches tests ────────────────────────────────────────────────────
class TestFormatBatches:
    """format_batches() batch sizing and placeholder substitution."""

    def test_splits_into_batches(self, synthetic_motions, prompt_template):
        """100 motions at batch_size=10 give ten full batches of prompts."""
        from analysis.right_wing.extremity_rescore_2d import format_batches

        first_hundred = synthetic_motions[:100]
        batches = format_batches(first_hundred, prompt_template, batch_size=10)

        assert isinstance(batches, list)
        assert len(batches) == 10
        for batch in batches:
            assert isinstance(batch, list)
            assert len(batch) == 10
            for prompt_str in batch:
                assert "Motion" in prompt_str

    def test_uneven_batches(self, synthetic_motions, prompt_template):
        """103 motions give ten full batches plus a final batch of three."""
        from analysis.right_wing.extremity_rescore_2d import format_batches

        batches = format_batches(synthetic_motions, prompt_template, batch_size=10)

        assert len(batches) == 11
        *full_batches, remainder = batches
        for batch in full_batches:
            assert len(batch) == 10
        assert len(remainder) == 3

    def test_substitutes_placeholders(self, prompt_template):
        """Each motion field shows up in the filled prompt."""
        from analysis.right_wing.extremity_rescore_2d import format_batches

        motion = {
            "motion_id": 1,
            "title": "Test Title",
            "text": "Test Text",
            "layman": "Test Layman",
        }
        batches = format_batches([motion], prompt_template, batch_size=1)

        prompt_str = batches[0][0]
        for expected in ("Test Title", "Test Text", "Test Layman"):
            assert expected in prompt_str
# ── validate_single_result tests ────────────────────────────────────────────
class TestValidateSingleResult:
    """validate_single_result() accepts a complete result and rejects bad ones."""

    def test_valid_result(self, valid_single_result):
        """A complete, in-range result validates without an error message."""
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        ok, err = validate_single_result(valid_single_result)
        assert ok is True
        assert err is None

    def test_missing_field(self, valid_single_result):
        """Dropping materiele_impact is reported by field name."""
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        incomplete = {
            key: value
            for key, value in valid_single_result.items()
            if key != "materiele_impact"
        }
        ok, err = validate_single_result(incomplete)
        assert ok is False
        assert "materiele_impact" in err

    def test_out_of_range_high(self, valid_single_result):
        """A stylistic score of 6 is rejected."""
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        too_high = {**valid_single_result, "stijl_extremiteit": 6}
        ok, err = validate_single_result(too_high)
        assert ok is False
        assert "stijl_extremiteit" in err

    def test_out_of_range_low(self, valid_single_result):
        """A material score of 0 is rejected."""
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        too_low = {**valid_single_result, "materiele_impact": 0}
        ok, err = validate_single_result(too_low)
        assert ok is False
        assert "materiele_impact" in err

    def test_non_integer_score(self, valid_single_result):
        """A score given as the string "3" is rejected."""
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        stringly = {**valid_single_result, "stijl_extremiteit": "3"}
        ok, err = validate_single_result(stringly)
        assert ok is False
        assert "stijl_extremiteit" in err
# ── store_scores tests ──────────────────────────────────────────────────────
class TestStoreScores:
    """store_scores() inserts rows and overwrites an existing motion_id.

    Relies on a ``tmp_duckdb_path`` fixture that is not defined in this file
    (presumably provided by a conftest).
    """

    @staticmethod
    def _query(db_path, sql):
        """Run one read query on a fresh connection and return all rows."""
        con = duckdb.connect(db_path)
        try:
            return con.execute(sql).fetchall()
        finally:
            con.close()

    def test_stores_and_returns_count(self, tmp_duckdb_path):
        """Two results are stored and reported as two rows."""
        from analysis.right_wing.extremity_rescore_2d import store_scores

        first = {
            "motion_id": 1, "stijl_extremiteit": 3, "stijl_toelichting": "a",
            "materiele_impact": 4, "materiele_toelichting": "b",
        }
        second = {
            "motion_id": 2, "stijl_extremiteit": 2, "stijl_toelichting": "c",
            "materiele_impact": 1, "materiele_toelichting": "d",
        }
        assert store_scores(tmp_duckdb_path, [first, second]) == 2

        rows = self._query(
            tmp_duckdb_path,
            "SELECT motion_id, stylistic_score, material_score "
            "FROM extremity_scores_2d ORDER BY motion_id",
        )
        assert len(rows) == 2
        assert rows[0] == (1, 3, 4)
        assert rows[1] == (2, 2, 1)

    def test_replace_existing(self, tmp_duckdb_path):
        """Storing the same motion_id again replaces the earlier scores."""
        from analysis.right_wing.extremity_rescore_2d import store_scores

        original = {
            "motion_id": 1, "stijl_extremiteit": 1, "stijl_toelichting": "x",
            "materiele_impact": 1, "materiele_toelichting": "y",
        }
        store_scores(tmp_duckdb_path, [original])

        replacement = {
            "motion_id": 1, "stijl_extremiteit": 5, "stijl_toelichting": "z",
            "materiele_impact": 5, "materiele_toelichting": "w",
        }
        assert store_scores(tmp_duckdb_path, [replacement]) == 1

        rows = self._query(
            tmp_duckdb_path,
            "SELECT stylistic_score, material_score FROM extremity_scores_2d WHERE motion_id = 1",
        )
        assert rows[0] == (5, 5)
# ── sample_motions tests ────────────────────────────────────────────────────
class TestSampleMotions:
    """sample_motions() against a synthetic database with four score buckets.

    Relies on a ``tmp_duckdb_path`` fixture that is not defined in this file
    (presumably provided by a conftest).
    """

    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Create the three source tables with 15 motions per text_score in 1, 2, 4, 5."""
        con = duckdb.connect(tmp_duckdb_path)
        try:
            con.execute("""
                CREATE TABLE IF NOT EXISTS right_wing_motions (
                motion_id INTEGER PRIMARY KEY,
                classified BOOLEAN DEFAULT TRUE
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS motions (
                id INTEGER PRIMARY KEY,
                title VARCHAR,
                body_text VARCHAR,
                layman_explanation VARCHAR
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS extremity_scores (
                motion_id INTEGER PRIMARY KEY,
                text_score INTEGER,
                text_explanation VARCHAR,
                layman_score INTEGER,
                layman_explanation VARCHAR,
                error VARCHAR
                )
            """)
            # Motion ids run 1..60 in blocks of 15, one block per score.
            # Score 3 is deliberately absent, so only four buckets exist.
            for block, score in enumerate([1, 2, 4, 5]):
                for offset in range(15):
                    mid = block * 15 + offset + 1
                    con.execute(
                        "INSERT INTO motions VALUES (?, ?, ?, ?)",
                        (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
                    )
                    con.execute(
                        "INSERT INTO right_wing_motions VALUES (?, TRUE)",
                        (mid,),
                    )
                    con.execute(
                        "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
                        (mid, score, score),
                    )
            con.commit()
        finally:
            con.close()

    def test_returns_stratified_sample(self, tmp_duckdb_path):
        """Every populated bucket contributes exactly n_per_bucket complete rows."""
        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result = sample_motions(tmp_duckdb_path, n_per_bucket=5, seed=42)
        assert isinstance(result, list)
        assert len(result) == 20  # 4 buckets * 5 each

        per_bucket = {}
        for row in result:
            for key in ("motion_id", "title", "text", "layman", "text_score"):
                assert key in row
            per_bucket[row["text_score"]] = per_bucket.get(row["text_score"], 0) + 1
        # A total of 20 alone would not show the sample is stratified.
        assert per_bucket == {1: 5, 2: 5, 4: 5, 5: 5}

    def test_respects_seed(self, tmp_duckdb_path):
        """Two runs with the same seed select the same motion ids."""
        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result_a = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        result_b = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        ids_a = sorted(r["motion_id"] for r in result_a)
        ids_b = sorted(r["motion_id"] for r in result_b)
        assert ids_a == ids_b

    def test_n_per_bucket_limits(self, tmp_duckdb_path):
        """n_per_bucket caps the number of rows taken from each bucket."""
        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result = sample_motions(tmp_duckdb_path, n_per_bucket=2, seed=1)
        assert len(result) == 8  # 4 buckets * 2
# ── rescore_2d dry_run tests ────────────────────────────────────────────────
class TestRescore2dDryRun:
    """rescore_2d(dry_run=True) returns a plan summary and logs the dry run.

    Relies on a ``tmp_duckdb_path`` fixture that is not defined in this file
    (presumably provided by a conftest).
    """

    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Create the three source tables with 20 motions cycling through scores 1-5."""
        schema_statements = (
            """
                CREATE TABLE IF NOT EXISTS right_wing_motions (
                motion_id INTEGER PRIMARY KEY,
                classified BOOLEAN DEFAULT TRUE
                )
            """,
            """
                CREATE TABLE IF NOT EXISTS motions (
                id INTEGER PRIMARY KEY,
                title VARCHAR,
                body_text VARCHAR,
                layman_explanation VARCHAR
                )
            """,
            """
                CREATE TABLE IF NOT EXISTS extremity_scores (
                motion_id INTEGER PRIMARY KEY,
                text_score INTEGER,
                text_explanation VARCHAR,
                layman_score INTEGER,
                layman_explanation VARCHAR,
                error VARCHAR
                )
            """,
        )
        con = duckdb.connect(tmp_duckdb_path)
        try:
            for statement in schema_statements:
                con.execute(statement)
            for mid in range(1, 21):
                score = (mid % 5) + 1
                con.execute(
                    "INSERT INTO motions VALUES (?, ?, ?, ?)",
                    (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
                )
                con.execute(
                    "INSERT INTO right_wing_motions VALUES (?, TRUE)",
                    (mid,),
                )
                con.execute(
                    "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
                    (mid, score, score),
                )
            con.commit()
        finally:
            con.close()

    def test_dry_run_no_subagents(self, tmp_duckdb_path, caplog):
        """A dry run reports counts, flags dry_run=True and logs "dry run"."""
        import logging

        from analysis.right_wing.extremity_rescore_2d import rescore_2d

        caplog.set_level(logging.INFO)
        summary = rescore_2d(tmp_duckdb_path, n_per_bucket=3, dry_run=True)

        assert isinstance(summary, dict)
        assert summary.get("dry_run") is True
        for key in ("motions_count", "batch_count"):
            assert key in summary
        assert "dry run" in caplog.text.lower()
Loading…
Cancel
Save