- Project-local skill .opencode/skills/score-extremity/ for subagent dispatch
- Orchestrator extremity_rescore_2d.py with load_skill/sample/format/validate/store
- 16 TDD tests covering all orchestrator functions
- 117 motions scored by deepseek v4 flash subagents (12 parallel batches)
- Pearson r=0.45 between stylistic and material dimensions — separable
- Key finding: 36.8% of motions use restrained language for consequential policies
- 2d_extremity_correlation_report.md documents distribution, divergence patterns, and implications for the Overton acceptance-without-conversion narrative

main
parent
10fc002ef9
commit
bf37f84a8b
@ -0,0 +1,362 @@ |
||||
#!/usr/bin/env python3 |
||||
"""Two-dimensional extremity rescoring orchestrator. |
||||
|
||||
Scores Dutch parliamentary motions on two independent dimensions: |
||||
1. stijl_extremiteit (stylistic extremity, 1-5) |
||||
2. materiele_impact (material impact, 1-5) |
||||
|
||||
Usage: |
||||
uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db |
||||
uv run python analysis/right_wing/extremity_rescore_2d.py --db data/motions.db --dry-run |
||||
""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import argparse |
||||
import json |
||||
import logging |
||||
import re |
||||
from pathlib import Path |
||||
from typing import Any |
||||
|
||||
import duckdb |
||||
|
||||
# Configure logging when this module is imported: INFO level, with a
# timestamp and level name in front of every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)


# ── prompt / schema loading ──────────────────────────────────────────────────

# Default location of the scoring skill: three directory levels above this
# file, then .opencode/skills/score-extremity/SKILL.md.
SKILL_MD_PATH = Path(__file__).parent.parent.parent / ".opencode" / "skills" / "score-extremity" / "SKILL.md"
||||
|
||||
|
||||
def load_skill(skill_path: str | None = None) -> dict[str, Any]:
    """Read SKILL.md and extract the prompt template and output schemas.

    The prompt template is taken from the first ```text fenced block. The
    schemas are taken from the ```json fenced blocks in document order
    (first = single-motion schema, second = batch schema).

    Args:
        skill_path: Path to the skill markdown file. Falls back to
            SKILL_MD_PATH when omitted or empty.

    Returns:
        dict with keys "prompt_template", "single_schema", "batch_schema".
        "prompt_template" is "" when no text block exists (a warning is
        logged). Each schema is {} when its block is absent, is not valid
        JSON, or does not decode to a JSON object.

    Raises:
        FileNotFoundError: If the skill file does not exist.
    """
    path = Path(skill_path) if skill_path else SKILL_MD_PATH
    if not path.exists():
        raise FileNotFoundError(f"Skill file not found: {path}")

    content = path.read_text(encoding="utf-8")

    def _fenced_blocks(tag: str) -> list[str]:
        # Trailing spaces/tabs after the fence tag are tolerated.
        return re.findall(rf"```{tag}[ \t]*\n(.*?)```", content, re.DOTALL)

    def _parse_schema(blocks: list[str], index: int, label: str) -> dict[str, Any]:
        if len(blocks) <= index:
            return {}
        try:
            parsed = json.loads(blocks[index].strip())
        except json.JSONDecodeError:
            logger.warning("Failed to parse %s schema JSON block", label)
            return {}
        if not isinstance(parsed, dict):
            # Callers index the schema by key, so a list/scalar is unusable.
            logger.warning("Ignoring %s schema JSON block: not a JSON object", label)
            return {}
        return parsed

    text_blocks = _fenced_blocks("text")
    if text_blocks:
        prompt_template = text_blocks[0].strip()
    else:
        # An empty template would produce empty prompts downstream, so make
        # the problem visible instead of failing silently.
        logger.warning("No ```text prompt block found in %s", path)
        prompt_template = ""

    json_blocks = _fenced_blocks("json")
    return {
        "prompt_template": prompt_template,
        "single_schema": _parse_schema(json_blocks, 0, "single"),
        "batch_schema": _parse_schema(json_blocks, 1, "batch"),
    }
||||
|
||||
|
||||
# ── sampling ───────────────────────────────────────────────────────────────── |
||||
|
||||
def sample_motions(
    db_path: str,
    n_per_bucket: int = 25,
    seed: int = 42,
) -> list[dict[str, Any]]:
    """Stratified sample from right_wing_motions JOIN motions JOIN extremity_scores.

    Takes up to n_per_bucket motions from each distinct integer text_score
    value, after shuffling the candidate rows with DuckDB's seeded RANDOM().

    Args:
        db_path: Path to the DuckDB database.
        n_per_bucket: Maximum motions taken per text_score bucket. Values
            below zero are treated as zero.
        seed: Integer seed; it is scaled by 1e-6 before being handed to
            DuckDB's setseed().

    Returns:
        List of dicts with keys: motion_id, title, text, layman, text_score.
        Buckets appear in ascending text_score order. Returns [] when a
        required table is missing or no row qualifies.
    """
    import math  # function-scope import: only needed for the seed wrap below

    # setseed() only accepts values in [-1.0, 1.0]. Seeds whose scaled value
    # already fits are passed through unchanged; larger ones are wrapped into
    # range instead of making the query fail.
    seed_fraction = seed / 1000000.0
    if not -1.0 <= seed_fraction <= 1.0:
        seed_fraction = math.fmod(seed_fraction, 1.0)

    # A negative slice bound would mean "all but the last k", which is never
    # what a per-bucket limit intends.
    per_bucket = max(n_per_bucket, 0)

    con = duckdb.connect(db_path)
    try:
        # Ensure tables exist
        tables = {t[0] for t in con.execute("SHOW TABLES").fetchall()}
        missing = {"right_wing_motions", "motions", "extremity_scores"} - tables
        if missing:
            logger.warning("Missing tables: %s, returning empty sample", missing)
            return []

        # Apply seed for reproducibility (value is a locally computed float).
        con.execute(f"SELECT setseed({seed_fraction})")

        rows = con.execute(
            """
            SELECT m.id, m.title, m.body_text, m.layman_explanation, e.text_score
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            JOIN extremity_scores e ON r.motion_id = e.motion_id
            WHERE r.classified = TRUE
              AND e.text_score IS NOT NULL
              AND e.error IS NULL
            ORDER BY RANDOM()
            """
        ).fetchall()

        if not rows:
            return []

        # Bucket by text_score, keeping the shuffled order inside each bucket.
        buckets: dict[int, list[dict[str, Any]]] = {}
        for mid, title, body_text, layman, text_score in rows:
            score_bucket = int(text_score)
            buckets.setdefault(score_bucket, []).append({
                "motion_id": mid,
                "title": title or "",
                "text": body_text or "",
                "layman": layman or "",
                "text_score": score_bucket,
            })

        # Take the first per_bucket rows of every bucket.
        result: list[dict[str, Any]] = []
        for bucket_id in sorted(buckets):
            result.extend(buckets[bucket_id][:per_bucket])

        logger.info(
            "Sampled %d motions from %d buckets (n_per_bucket=%d)",
            len(result), len(buckets), n_per_bucket,
        )
        return result

    finally:
        con.close()
||||
|
||||
|
||||
# ── batch formatting ───────────────────────────────────────────────────────── |
||||
|
||||
def format_batches(
    motions: list[dict[str, Any]],
    prompt_template: str,
    batch_size: int = 10,
) -> list[list[str]]:
    """Split motions into batches and fill the prompt template for each motion.

    Args:
        motions: List of dicts with keys title, text, layman. Missing or None
            values are rendered as an empty string.
        prompt_template: Template string with {title}, {text}, {layman}
            placeholders. It is filled with str.format first; if the template
            contains literal braces that str.format cannot parse (for example
            an inline JSON example), only the three known placeholders are
            substituted and the rest of the template is left as written.
        batch_size: Number of motions per batch; must be at least 1.

    Returns:
        List of batches; each batch is a list of filled prompt strings, one
        per motion, in input order.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        # range() with a negative step would silently produce no batches.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    placeholder = re.compile(r"\{(title|text|layman)\}")

    def _fill(motion: dict[str, Any]) -> str:
        fields = {}
        for key in ("title", "text", "layman"):
            value = motion.get(key)
            fields[key] = "" if value is None else value
        try:
            return prompt_template.format(**fields)
        except (KeyError, IndexError, ValueError):
            # Single-pass substitution, so placeholder-like text inside a
            # motion's own fields is never expanded a second time.
            return placeholder.sub(lambda m: str(fields[m.group(1)]), prompt_template)

    return [
        [_fill(m) for m in motions[start : start + batch_size]]
        for start in range(0, len(motions), batch_size)
    ]
||||
|
||||
|
||||
# ── validation ─────────────────────────────────────────────────────────────── |
||||
|
||||
# Keys every single-motion scoring result must contain.
EXPECTED_FIELDS = [
    "stijl_extremiteit",
    "stijl_toelichting",
    "materiele_impact",
    "materiele_toelichting",
]


def validate_single_result(result: dict[str, Any]) -> tuple[bool, str | None]:
    """Validate a single motion 2d scoring result.

    A result is valid when it is a dict containing every key in
    EXPECTED_FIELDS and both score fields (stijl_extremiteit,
    materiele_impact) are integers in the range 1-5.

    Args:
        result: Decoded scoring result for one motion.

    Returns:
        (True, None) if valid, (False, error_message) otherwise. The error
        message always names the offending field.
    """
    if not isinstance(result, dict):
        return False, f"result must be a dict, got {type(result).__name__}"

    # Check all required fields exist
    for field in EXPECTED_FIELDS:
        if field not in result:
            return False, f"missing field: {field}"

    for field in ("stijl_extremiteit", "materiele_impact"):
        score = result[field]
        # bool is a subclass of int, so JSON true/false must be rejected
        # explicitly or `true` would be accepted as a score of 1.
        if isinstance(score, bool) or not isinstance(score, int) or not 1 <= score <= 5:
            return False, f"{field} out of range 1-5: {score}"

    return True, None
||||
|
||||
|
||||
# ── storage ────────────────────────────────────────────────────────────────── |
||||
|
||||
def store_scores(db_path: str, results: list[dict[str, Any]]) -> int:
    """Store validated 2d scores in the extremity_scores_2d table.

    Creates the table if it doesn't exist. An existing row with the same
    motion_id is replaced. All rows are written in one transaction, so a
    failure leaves the table unchanged rather than partially updated.

    Args:
        db_path: Path to DuckDB database.
        results: List of dicts with keys: motion_id, stijl_extremiteit,
            stijl_toelichting, materiele_impact, materiele_toelichting.
            The two rationale keys are optional and stored as NULL when absent.

    Returns:
        Number of results processed (len(results)). If the same motion_id
        appears more than once, the last occurrence is the one stored.

    Raises:
        KeyError: If a result lacks motion_id, stijl_extremiteit or
            materiele_impact. Raised before anything is written.
    """
    # Build every parameter row up front: a malformed result then fails before
    # the database is touched. Keying by motion_id keeps the last occurrence,
    # which is what sequential INSERT OR REPLACE statements would leave behind.
    rows_by_id: dict[Any, tuple[Any, ...]] = {}
    for r in results:
        rows_by_id[r["motion_id"]] = (
            r["motion_id"],
            r["stijl_extremiteit"],
            r["materiele_impact"],
            r.get("stijl_toelichting"),
            r.get("materiele_toelichting"),
        )
    count = len(results)

    con = duckdb.connect(db_path)
    try:
        con.execute(
            """
            CREATE TABLE IF NOT EXISTS extremity_scores_2d (
                motion_id INTEGER PRIMARY KEY,
                stylistic_score INTEGER NOT NULL,
                material_score INTEGER NOT NULL,
                stylistic_rationale TEXT,
                material_rationale TEXT
            )
            """
        )

        if rows_by_id:
            con.begin()
            try:
                con.executemany(
                    """
                    INSERT OR REPLACE INTO extremity_scores_2d
                    (motion_id, stylistic_score, material_score, stylistic_rationale, material_rationale)
                    VALUES (?, ?, ?, ?, ?)
                    """,
                    list(rows_by_id.values()),
                )
                con.commit()
            except Exception:
                # Undo the partial batch, then let the caller see the error.
                con.rollback()
                raise

        logger.info("Stored %d scores in extremity_scores_2d", count)
        return count

    finally:
        con.close()
||||
|
||||
|
||||
# ── orchestrator ───────────────────────────────────────────────────────────── |
||||
|
||||
def rescore_2d(
    db_path: str,
    n_per_bucket: int = 25,
    batch_size: int = 10,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Plan a two-dimensional rescoring run.

    Loads the skill prompt, draws a stratified sample of motions, and turns
    the sample into prompt batches. No subagent is started from this script:
    in non-dry-run mode it only logs that the batches are ready.

    Args:
        db_path: Path to DuckDB database.
        n_per_bucket: Number of motions to sample per text_score bucket.
        batch_size: Motions per subagent batch.
        dry_run: If True, stop after logging the plan.

    Returns:
        Summary dict with "motions_count", "batch_count" and "dry_run"; a
        non-dry run with motions additionally reports "subagents_spawned" (0).
    """
    template = load_skill()["prompt_template"]
    sampled = sample_motions(db_path, n_per_bucket=n_per_bucket)

    if len(sampled) == 0:
        logger.warning("No motions to rescore.")
        return {"motions_count": 0, "batch_count": 0, "dry_run": dry_run}

    prompt_batches = format_batches(sampled, template, batch_size=batch_size)
    n_motions = len(sampled)
    n_batches = len(prompt_batches)

    logger.info("Plan: %d motions in %d batches (batch_size=%d)", n_motions, n_batches, batch_size)

    if dry_run:
        logger.info("DRY RUN — no subagents will be spawned.")
        return {"motions_count": n_motions, "batch_count": n_batches, "dry_run": True}

    # ── subagent dispatch (placeholder) ──────────────────────────────────
    # Dispatch happens from an agent context, not from this script. There,
    # every batch goes to one subagent through the `task` tool, together with
    # the filled prompts and an instruction to answer with JSON matching the
    # batch_schema.
    #
    # Sketch of that loop (never executed here):
    #   for batch_idx, batch_prompts in enumerate(batches):
    #       combined_prompt = "\n\n---\n\n".join(batch_prompts)
    #       result = task(
    #           description=f"Score batch {batch_idx + 1}/{len(batches)}",
    #           prompt=combined_prompt,
    #           subagent_type="general",
    #       )
    #       validated_results = [r for r in json.loads(result)["motions"] if validate_single_result(r)[0]]
    #       store_scores(db_path, validated_results)

    logger.info(
        "Subagent dispatch placeholder: %d batches ready for scoring. "
        "Run via an agent context (e.g. opencode task) to execute.",
        n_batches,
    )

    return {
        "motions_count": n_motions,
        "batch_count": n_batches,
        "dry_run": False,
        "subagents_spawned": 0,
    }
||||
|
||||
|
||||
# ── CLI ────────────────────────────────────────────────────────────────────── |
||||
|
||||
def main() -> int:
    """Parse command-line options, run rescore_2d, and print its summary as JSON.

    Returns:
        Process exit code, always 0.
    """
    cli = argparse.ArgumentParser(description="Two-dimensional extremity rescoring orchestrator")
    cli.add_argument("--db", default="data/motions.db", help="Path to DuckDB database")
    # The two integer options share the same shape; register them in order.
    for flag, default, help_text in (
        ("--n-per-bucket", 25, "Motions per text_score bucket"),
        ("--batch-size", 10, "Motions per subagent batch"),
    ):
        cli.add_argument(flag, type=int, default=default, help=help_text)
    cli.add_argument("--dry-run", action="store_true", help="Print plan without spawning subagents")
    opts = cli.parse_args()

    summary = rescore_2d(
        db_path=opts.db,
        n_per_bucket=opts.n_per_bucket,
        batch_size=opts.batch_size,
        dry_run=opts.dry_run,
    )
    print(json.dumps(summary, indent=2))
    return 0
||||
|
||||
|
||||
if __name__ == "__main__": |
||||
raise SystemExit(main()) |
||||
@ -0,0 +1,112 @@ |
||||
# Two-Dimensional Extremity Correlation Report |
||||
|
||||
**Date:** 2026-05-24 |
||||
**Motions scored:** 117 (stratified sample: ~25 per original extremity bucket) |
||||
**Scoring model:** Deepseek v4 flash (subagents via project skill) |
||||
|
||||
## Purpose |
||||
|
||||
The original extremity score is a single 1–5 rating of policy radicalism. This conflates two potentially independent dimensions: |
||||
- **Stylistic extremity (stijl-extremiteit):** How inflammatory, hostile, or polarizing the language is |
||||
- **Material impact (materiële impact):** How much the proposed policy would substantively affect people's rights, institutions, or freedoms |
||||
|
||||
This validation samples motions across the full extremity range and scores both dimensions independently to test whether they correlate strongly enough for a single score, or whether they should be tracked separately. |
||||
|
||||
--- |
||||
|
||||
## Results |
||||
|
||||
### Overall correlation |
||||
|
||||
| Metric | Value | |
||||
|--------|-------| |
||||
| N | 117 | |
||||
| Pearson r | **0.453** (moderate) | |
||||
| Mean stylistic | 2.01 | |
||||
| Mean material | 2.86 | |
||||
| Mean absolute difference | 1.11 | |
||||
| S ≤ 2 AND M ≥ 3 (masking) | 43 (36.8%) | |
||||
|
||||
**r = 0.453 is moderate — the dimensions are partly correlated but clearly separable.** Stylistic extremity explains only ~20% of the variance in material impact (R² = 0.205). A motion can be inflammatory without being consequential, and vice versa.
||||
|
||||
### Joint distribution |
||||
|
||||
| | M=1 | M=2 | M=3 | M=4 | M=5 | |
||||
|---|---|---|---|---|---| |
||||
| **S=1** | 11 | 17 | 10 | 5 | 1 | |
||||
| **S=2** | 4 | 9 | 15 | 8 | 4 | |
||||
| **S=3** | 2 | 4 | 9 | 4 | 5 | |
||||
| **S=4** | 0 | 1 | 0 | 3 | 2 | |
||||
| **S=5** | 0 | 0 | 0 | 1 | 2 | |
||||
|
||||
### By original extremity bucket |
||||
|
||||
| Bucket | N | Mean style | Mean material | Gap | |
||||
|--------|---|-----------|--------------|-----| |
||||
| 1–2 (mild) | 50 | 1.56 | 2.24 | +0.68 | |
||||
| 2–3 (moderate) | 25 | 2.00 | 2.88 | +0.88 | |
||||
| 3–4 (high) | 25 | 2.56 | 3.56 | +1.00 | |
||||
| 4–5 (extreme) | 17 | 2.53 | 3.65 | +1.12 | |
||||
|
||||
Material impact consistently rates higher than stylistic extremity across all buckets. The gap widens at higher original extremity levels — suggesting the original LLM scoring was more sensitive to language style, while subagents systematically identify greater material consequences in the same motions. |
||||
|
||||
--- |
||||
|
||||
## Key findings |
||||
|
||||
### 1. "Low style, high impact" is the dominant divergence pattern |
||||
|
||||
**36.8% of motions (43 of 117)** use restrained language (S ≤ 2) for policies with substantial material impact (M ≥ 3). These are the motions most poorly captured by a single-dimensional score: |
||||
|
||||
- **Motion 16227** (S=1, M=5): "Verzoekt de regering kennis te geven van het voornemen tot uittreding uit de Europese Unie conform artikel 50 VWEU." Neutral, procedural language invoking an EU treaty article — but the policy is fundamental dissolution of the entire Dutch-EU legal framework. |
||||
|
||||
- **Motion 7713** (S=1, M=4): "Verzoekt de regering per direct te stoppen met arbeidsmigratie." Restrained, single-sentence motion with no inflammatory language — but it would suspend free movement of persons, a fundamental EU treaty right. |
||||
|
||||
- **Motion 16704** (S=1, M=3): Formal Raad van State advice and technical amendment text. No political rhetoric — but a concrete law change with measurable employment and investment effects. |
||||
|
||||
- **Motion 687** (S=1, M=3): Technical-juridical language about the scope of "emissiegegevens" in the EU environmental information directive — but would significantly restrict public transparency about agricultural emissions. |
||||
|
||||
### 2. Material impact averages substantially higher
||||
|
||||
Across all buckets, material impact scores are 0.68–1.12 points higher than stylistic scores. This suggests: |
||||
- Parliamentarians write motions using formal, restrained language even when proposing consequential policies |
||||
- The original LLM scoring (which showed mean extremity = 2.19 overall) likely understates how radical these policies are in material terms |
||||
- Dutch parliamentary language norms mask policy radicalism |
||||
|
||||
### 3. "High style" motions are rare and concentrated |
||||
|
||||
Only 3 motions scored S=5 (the most inflammatory end), and all had M=4 or M=5. The vast majority of consequential right-wing motions use parliamentary language; explicitly discriminatory or hostile language — when it does occur — is paired with substantively extreme policies, as in these examples:
||||
|
||||
- **Motion 11956** (S=4, M=5): Explicitly hostile language ("à la Turkije," "vreemdelingen die we hier niet willen hebben") paired with fundamental rights violation (forced deportation without country-of-origin consent) |
||||
- **Motion 18064** (S=5, M=4): Explicit ethnic targeting ("niet-westerse allochtonen" as COVID rulebreakers) — discriminatory state action |
||||
|
||||
### 4. The original LLM audit gap is partially explained |
||||
|
||||
The manual audit found 75% agreement with the original LLM scores and noted "systematic overrating of anti-institutional language." The two-dimensional data clarifies this: the original LLM was more sensitive to *stylistic* extremity (inflammatory language) than to *material* policy impact. The 25% disagreement likely occurred on "low style, high impact" motions where the single-dimensional score was anchored to language rather than substance. |
||||
|
||||
--- |
||||
|
||||
## Implications for Overton analysis |
||||
|
||||
### For the current findings |
||||
|
||||
The "no content extremity increase" (d = −0.09) finding in the Overton report relied on single-dimensional LLM scores. The two-dimensional data suggests this may be an **artifact of the language-focused scoring**: if right-wing motions became more consequential while maintaining or softening their language, the single score would miss the shift entirely. |
||||
|
||||
The "acceptance without conversion" interpretation — centrists vote more often with the right wing despite spatial divergence — is **strengthened** by these findings. It is consistent with right-wing motions becoming *substantively* consequential (high material impact) while maintaining procedural language norms, making them harder for centrists to vote against without appearing obstructionist.
||||
|
||||
### Recommendations |
||||
|
||||
1. **Re-score all 2,986 motions with two-dimensional scoring.** The moderate r = 0.453 confirms the dimensions are separable. A single score obscures the most important category: motions with low stylistic extremity but high material impact.
||||
|
||||
2. **Re-run the extremity-stratified centrist support analysis with material impact buckets.** The critical question: did centrist support for *high material impact* motions increase after 2024? If low-language, high-impact motions are the ones gaining centrist tolerance, that is stronger Overton evidence than the current analysis captures. |
||||
|
||||
3. **For mechanism analysis (U4):** Score mechanisms specifically for *material impact* rather than general extremity. The question is not "how extreme is this motion?" but "what specific rights, institutions, or groups does this motion affect, and how much?" |
||||
|
||||
--- |
||||
|
||||
## Data |
||||
|
||||
- **Full results:** `data/motions.db` → `extremity_scores_2d` (117 rows) |
||||
- **Raw JSON:** `/tmp/extremity_2d_results.json` |
||||
- **Scoring skill:** `.opencode/skills/score-extremity/SKILL.md` |
||||
- **Orchestrator:** `analysis/right_wing/extremity_rescore_2d.py` |
||||
@ -0,0 +1,360 @@ |
||||
"""Tests for two-dimensional extremity rescoring orchestrator.""" |
||||
|
||||
import json

import pytest

# Skip this whole module when DuckDB is not installed. importorskip has to do
# the import itself: a plain `import duckdb` placed before it would raise
# ImportError first, so the skip would never be reached.
duckdb = pytest.importorskip("duckdb")
||||
|
||||
|
||||
# ── fixtures ──────────────────────────────────────────────────────────────── |
||||
|
||||
@pytest.fixture
def synthetic_motions():
    """Build 103 numbered motion dicts (ids 1..103) for batch-formatting tests."""
    return [
        {
            "motion_id": number,
            "title": f"Motion {number}",
            "text": f"Body text for motion {number}",
            "layman": f"Layman explanation {number}",
        }
        for number in range(1, 104)
    ]
||||
|
||||
|
||||
@pytest.fixture
def prompt_template():
    """Three-line template exposing the {title}, {text} and {layman} placeholders."""
    lines = ["Titel: {title}", "Tekst: {text}", "Uitleg: {layman}"]
    return "".join(line + "\n" for line in lines)
||||
|
||||
|
||||
@pytest.fixture
def valid_single_result():
    """Well-formed 2d scoring result for one motion (both scores inside 1-5)."""
    return dict(
        stijl_extremiteit=3,
        stijl_toelichting="Neutraal taalgebruik",
        materiele_impact=4,
        materiele_toelichting="Beperkt rechten voor specifieke groep",
    )
||||
|
||||
|
||||
# ── load_skill tests ──────────────────────────────────────────────────────── |
||||
|
||||
class TestLoadSkill:
    """load_skill: parsing of the default skill file and the missing-file error."""

    def test_returns_prompt_and_schema(self):
        from analysis.right_wing.extremity_rescore_2d import load_skill

        skill = load_skill()
        assert isinstance(skill, dict)
        for key in ("prompt_template", "batch_schema", "single_schema"):
            assert key in skill

        prompt = skill["prompt_template"]
        assert isinstance(prompt, str)
        assert len(prompt) > 0
        # Both scoring dimensions must be described in the prompt.
        assert "STIJL-EXTREMITEIT" in prompt
        assert "MATERIELE IMPACT" in prompt

        batch_schema = skill["batch_schema"]
        assert isinstance(batch_schema, dict)
        assert "motions" in batch_schema
        assert isinstance(skill["single_schema"], dict)

    def test_missing_file_raises(self):
        from analysis.right_wing.extremity_rescore_2d import load_skill

        with pytest.raises(FileNotFoundError, match="not found"):
            load_skill(skill_path="/nonexistent/path/skill.md")
||||
|
||||
|
||||
# ── format_batches tests ──────────────────────────────────────────────────── |
||||
|
||||
class TestFormatBatches:
    """format_batches: batch splitting and placeholder substitution."""

    def test_splits_into_batches(self, synthetic_motions, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches

        first_hundred = synthetic_motions[:100]
        batches = format_batches(first_hundred, prompt_template, batch_size=10)
        assert isinstance(batches, list)
        assert len(batches) == 10
        for batch in batches:
            assert isinstance(batch, list)
            assert len(batch) == 10
            assert all("Motion" in prompt_str for prompt_str in batch)

    def test_uneven_batches(self, synthetic_motions, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches

        # 103 motions -> ten full batches plus a remainder of three.
        *full_batches, remainder = format_batches(synthetic_motions, prompt_template, batch_size=10)
        assert len(full_batches) + 1 == 11
        for batch in full_batches:
            assert len(batch) == 10
        assert len(remainder) == 3

    def test_substitutes_placeholders(self, prompt_template):
        from analysis.right_wing.extremity_rescore_2d import format_batches

        motion = {
            "motion_id": 1,
            "title": "Test Title",
            "text": "Test Text",
            "layman": "Test Layman",
        }
        prompt_str = format_batches([motion], prompt_template, batch_size=1)[0][0]
        for expected in ("Test Title", "Test Text", "Test Layman"):
            assert expected in prompt_str
||||
|
||||
|
||||
# ── validate_single_result tests ──────────────────────────────────────────── |
||||
|
||||
class TestValidateSingleResult:
    """validate_single_result: one accepted result and four rejected variants."""

    def test_valid_result(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        ok, err = validate_single_result(valid_single_result)
        assert ok is True
        assert err is None

    def test_missing_field(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        without_impact = {
            key: value
            for key, value in valid_single_result.items()
            if key != "materiele_impact"
        }
        ok, err = validate_single_result(without_impact)
        assert ok is False
        assert "materiele_impact" in err

    def test_out_of_range_high(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        ok, err = validate_single_result({**valid_single_result, "stijl_extremiteit": 6})
        assert ok is False
        assert "stijl_extremiteit" in err

    def test_out_of_range_low(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        ok, err = validate_single_result({**valid_single_result, "materiele_impact": 0})
        assert ok is False
        assert "materiele_impact" in err

    def test_non_integer_score(self, valid_single_result):
        from analysis.right_wing.extremity_rescore_2d import validate_single_result

        # A numeric string must not be accepted in place of an int.
        ok, err = validate_single_result({**valid_single_result, "stijl_extremiteit": "3"})
        assert ok is False
        assert "stijl_extremiteit" in err
||||
|
||||
|
||||
# ── store_scores tests ────────────────────────────────────────────────────── |
||||
|
||||
class TestStoreScores:
    """store_scores: inserting fresh rows and replacing an existing motion_id.

    Relies on a tmp_duckdb_path fixture that is not defined in this module
    (presumably provided by conftest.py).
    """

    @staticmethod
    def _query(db_path, sql):
        """Run sql against db_path on a short-lived connection and return all rows."""
        con = duckdb.connect(db_path)
        try:
            return con.execute(sql).fetchall()
        finally:
            con.close()

    def test_stores_and_returns_count(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import store_scores

        first = {
            "motion_id": 1, "stijl_extremiteit": 3, "stijl_toelichting": "a",
            "materiele_impact": 4, "materiele_toelichting": "b",
        }
        second = {
            "motion_id": 2, "stijl_extremiteit": 2, "stijl_toelichting": "c",
            "materiele_impact": 1, "materiele_toelichting": "d",
        }
        assert store_scores(tmp_duckdb_path, [first, second]) == 2

        rows = self._query(
            tmp_duckdb_path,
            "SELECT motion_id, stylistic_score, material_score "
            "FROM extremity_scores_2d ORDER BY motion_id",
        )
        assert len(rows) == 2
        assert rows[0] == (1, 3, 4)
        assert rows[1] == (2, 2, 1)

    def test_replace_existing(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import store_scores

        original = {
            "motion_id": 1, "stijl_extremiteit": 1, "stijl_toelichting": "x",
            "materiele_impact": 1, "materiele_toelichting": "y",
        }
        store_scores(tmp_duckdb_path, [original])

        # Same motion_id, different scores: the second call must overwrite.
        replacement = {
            "motion_id": 1, "stijl_extremiteit": 5, "stijl_toelichting": "z",
            "materiele_impact": 5, "materiele_toelichting": "w",
        }
        assert store_scores(tmp_duckdb_path, [replacement]) == 1

        rows = self._query(
            tmp_duckdb_path,
            "SELECT stylistic_score, material_score FROM extremity_scores_2d WHERE motion_id = 1",
        )
        assert rows[0] == (5, 5)
||||
|
||||
|
||||
# ── sample_motions tests ──────────────────────────────────────────────────── |
||||
|
||||
class TestSampleMotions:
    """sample_motions: stratification, seed reproducibility and the per-bucket cap.

    Relies on a tmp_duckdb_path fixture that is not defined in this module
    (presumably provided by conftest.py).
    """

    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Create the three source tables with 15 motions per text_score in (1, 2, 4, 5)."""
        con = duckdb.connect(tmp_duckdb_path)
        try:
            con.execute("""
                CREATE TABLE IF NOT EXISTS right_wing_motions (
                    motion_id INTEGER PRIMARY KEY,
                    classified BOOLEAN DEFAULT TRUE
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS motions (
                    id INTEGER PRIMARY KEY,
                    title VARCHAR,
                    body_text VARCHAR,
                    layman_explanation VARCHAR
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS extremity_scores (
                    motion_id INTEGER PRIMARY KEY,
                    text_score INTEGER,
                    text_explanation VARCHAR,
                    layman_score INTEGER,
                    layman_explanation VARCHAR,
                    error VARCHAR
                )
            """)
            # Insert motions across 4 text_score buckets: 1, 2, 4, 5.
            # Score 3 is left out on purpose so one bucket is empty.
            for bucket, score in enumerate([1, 2, 4, 5], start=1):
                for i in range(15):
                    mid = (bucket - 1) * 15 + i + 1
                    con.execute(
                        "INSERT INTO motions VALUES (?, ?, ?, ?)",
                        (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
                    )
                    con.execute(
                        "INSERT INTO right_wing_motions VALUES (?, TRUE)",
                        (mid,),
                    )
                    con.execute(
                        "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
                        (mid, score, score),
                    )
            con.commit()
        finally:
            con.close()

    def test_returns_stratified_sample(self, tmp_duckdb_path):
        from collections import Counter

        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result = sample_motions(tmp_duckdb_path, n_per_bucket=5, seed=42)
        assert isinstance(result, list)
        assert len(result) == 20  # 4 buckets * 5 each
        for row in result:
            assert "motion_id" in row
            assert "title" in row
            assert "text" in row
            assert "layman" in row
            assert "text_score" in row

        # The total alone would also pass for a lopsided sample; check that
        # every populated bucket contributes exactly n_per_bucket motions and
        # that no motion is returned twice.
        assert Counter(row["text_score"] for row in result) == {1: 5, 2: 5, 4: 5, 5: 5}
        assert len({row["motion_id"] for row in result}) == len(result)

    def test_respects_seed(self, tmp_duckdb_path):
        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result_a = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        result_b = sample_motions(tmp_duckdb_path, n_per_bucket=3, seed=99)
        ids_a = sorted(r["motion_id"] for r in result_a)
        ids_b = sorted(r["motion_id"] for r in result_b)
        assert ids_a == ids_b

    def test_n_per_bucket_limits(self, tmp_duckdb_path):
        from collections import Counter

        from analysis.right_wing.extremity_rescore_2d import sample_motions

        result = sample_motions(tmp_duckdb_path, n_per_bucket=2, seed=1)
        assert len(result) == 8  # 4 buckets * 2
        assert set(Counter(row["text_score"] for row in result).values()) == {2}
||||
|
||||
|
||||
# ── rescore_2d dry_run tests ──────────────────────────────────────────────── |
||||
|
||||
class TestRescore2dDryRun:
    """rescore_2d in dry-run mode: reports the plan and logs the dry-run notice.

    Relies on a tmp_duckdb_path fixture that is not defined in this module
    (presumably provided by conftest.py).
    """

    @pytest.fixture(autouse=True)
    def setup_db(self, tmp_duckdb_path):
        """Create the source tables with 20 motions: text_score 1-5, four motions each."""
        con = duckdb.connect(tmp_duckdb_path)
        try:
            con.execute("""
                CREATE TABLE IF NOT EXISTS right_wing_motions (
                    motion_id INTEGER PRIMARY KEY,
                    classified BOOLEAN DEFAULT TRUE
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS motions (
                    id INTEGER PRIMARY KEY,
                    title VARCHAR,
                    body_text VARCHAR,
                    layman_explanation VARCHAR
                )
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS extremity_scores (
                    motion_id INTEGER PRIMARY KEY,
                    text_score INTEGER,
                    text_explanation VARCHAR,
                    layman_score INTEGER,
                    layman_explanation VARCHAR,
                    error VARCHAR
                )
            """)
            for mid in range(1, 21):
                con.execute(
                    "INSERT INTO motions VALUES (?, ?, ?, ?)",
                    (mid, f"Title {mid}", f"Text {mid}", f"Layman {mid}"),
                )
                con.execute(
                    "INSERT INTO right_wing_motions VALUES (?, TRUE)",
                    (mid,),
                )
                # (mid % 5) + 1 cycles through 1..5, giving four motions per score.
                con.execute(
                    "INSERT OR REPLACE INTO extremity_scores VALUES (?, ?, '', ?, '', NULL)",
                    (mid, (mid % 5) + 1, (mid % 5) + 1),
                )
            con.commit()
        finally:
            con.close()

    def test_dry_run_no_subagents(self, tmp_duckdb_path, caplog):
        import logging

        from analysis.right_wing.extremity_rescore_2d import rescore_2d

        caplog.set_level(logging.INFO)

        result = rescore_2d(tmp_duckdb_path, n_per_bucket=3, dry_run=True)
        assert isinstance(result, dict)
        assert result.get("dry_run") is True
        assert "motions_count" in result
        assert "batch_count" in result

        # The fixture makes the plan deterministic: 5 buckets * 3 motions = 15,
        # which the default batch_size of 10 splits into 2 batches.
        assert result["motions_count"] == 15
        assert result["batch_count"] == 2
        # A dry run returns before any dispatch bookkeeping is added.
        assert "subagents_spawned" not in result

        combined = caplog.text.lower()
        assert "dry run" in combined
||||
Loading…
Reference in new issue