motief/analysis/right_wing/mechanism_validation.py

#!/usr/bin/env python3
"""Mechanism classification validation with a second classifier.

Computes inter-rater reliability (Cohen's kappa) between the original inline
classifications and a second LLM-based classification using a different prompt
template and (optionally) a different model.

Usage:
    uv run python analysis/right_wing/mechanism_validation.py
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any

import duckdb

ROOT = Path(__file__).parent.parent.parent.resolve()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from ai_provider import ProviderError, chat_completion
from analysis.config import config

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# ── mechanism taxonomy ───────────────────────────────────────────────────────

# Canonical mechanism keys. Classifier output is validated against this list,
# and its order fixes the row/column order of the report tables.
MECHANISMS = [
    "consensus_framing",
    "institutional_rule_of_law",
    "welfare_service_expansion",
    "procedural_technical",
    "local_constituency",
    "coalition_alignment",
    "symbolic_declaratory",
    "targeted_restriction",
    "system_dismantling",
    "crisis_response",
]

# Dutch display label per mechanism key; used in the report's disagreement
# table and mechanism distribution table.
MECHANISM_LABELS_NL = {
    "consensus_framing": "Consensus framing (gedeeld belang)",
    "institutional_rule_of_law": "Institutioneel/rechtsstatelijk",
    "welfare_service_expansion": "Welzijn/dienstverlening uitbreiding",
    "procedural_technical": "Procedureel/technisch",
    "local_constituency": "Lokaal/regionaal",
    "coalition_alignment": "Coalitie-afstemming",
    "symbolic_declaratory": "Symbolisch/declaratoir",
    "targeted_restriction": "Gerichte restrictie",
    "system_dismantling": "Systeemontmanteling",
    "crisis_response": "Crisisrespons",
}

# English display label per mechanism key; the report truncates these to 20
# characters for the confusion-matrix header and row labels.
MECHANISM_LABELS_EN = {
    "consensus_framing": "Consensus framing / shared interest",
    "institutional_rule_of_law": "Institutional / rule of law",
    "welfare_service_expansion": "Welfare / service expansion",
    "procedural_technical": "Procedural / technical",
    "local_constituency": "Local / regional constituency",
    "coalition_alignment": "Coalition alignment",
    "symbolic_declaratory": "Symbolic / declaratory",
    "targeted_restriction": "Targeted restriction",
    "system_dismantling": "System dismantling",
    "crisis_response": "Crisis response",
}

# Original inline classifications (from mechanism_classification.py), keyed by
# motion ID (looked up as right_wing_motions.motion_id) -> mechanism key.
ORIGINAL_CLASSIFICATIONS: dict[int, str] = {
    15458: "crisis_response",
    26477: "institutional_rule_of_law",
    9149: "consensus_framing",
    17099: "procedural_technical",
    4933: "procedural_technical",
    17751: "consensus_framing",
    20068: "procedural_technical",
    16520: "consensus_framing",
    17036: "welfare_service_expansion",
    17681: "consensus_framing",
    14554: "procedural_technical",
    21864: "procedural_technical",
    26493: "targeted_restriction",
    21982: "consensus_framing",
    14125: "crisis_response",
    13683: "welfare_service_expansion",
    16691: "procedural_technical",
    15005: "procedural_technical",
    17536: "institutional_rule_of_law",
    16999: "consensus_framing",
    8325: "procedural_technical",
    13370: "welfare_service_expansion",
    18030: "procedural_technical",
    11382: "procedural_technical",
    18616: "procedural_technical",
    12411: "crisis_response",
    22595: "crisis_response",
    15772: "system_dismantling",
    7111: "welfare_service_expansion",
    25784: "targeted_restriction",
    27731: "system_dismantling",
    15626: "crisis_response",
    20215: "welfare_service_expansion",
    16430: "symbolic_declaratory",
    25982: "local_constituency",
    17176: "targeted_restriction",
    7054: "procedural_technical",
    20323: "procedural_technical",
    18025: "system_dismantling",
    14837: "system_dismantling",
    19620: "targeted_restriction",
    21801: "consensus_framing",
    19464: "crisis_response",
    26855: "targeted_restriction",
    22280: "local_constituency",
    20115: "symbolic_declaratory",
    15082: "targeted_restriction",
    6637: "targeted_restriction",
    18691: "symbolic_declaratory",
    18062: "crisis_response",
    3784: "procedural_technical",
    10205: "procedural_technical",
    10278: "coalition_alignment",
    25079: "consensus_framing",
    2980: "targeted_restriction",
    10420: "crisis_response",
    25092: "targeted_restriction",
    25545: "institutional_rule_of_law",
    23065: "procedural_technical",
    2878: "welfare_service_expansion",
    25573: "procedural_technical",
    3298: "symbolic_declaratory",
    25061: "consensus_framing",
    4481: "consensus_framing",
    3961: "procedural_technical",
    473: "institutional_rule_of_law",
    10413: "consensus_framing",
    974: "procedural_technical",
    24009: "procedural_technical",
    9789: "institutional_rule_of_law",
    24651: "targeted_restriction",
    1890: "local_constituency",
    1191: "consensus_framing",
    3448: "targeted_restriction",
    23910: "institutional_rule_of_law",
    25566: "welfare_service_expansion",
    2070: "targeted_restriction",
    23885: "consensus_framing",
    24906: "procedural_technical",
    2496: "procedural_technical",
    25582: "targeted_restriction",
    3053: "local_constituency",
    1495: "procedural_technical",
    10178: "procedural_technical",
    1614: "procedural_technical",
    23441: "consensus_framing",
    3569: "consensus_framing",
    10285: "procedural_technical",
    23058: "procedural_technical",
    3287: "procedural_technical",
    10434: "consensus_framing",
    10089: "procedural_technical",
    22706: "consensus_framing",
    3877: "institutional_rule_of_law",
    25062: "consensus_framing",
    3687: "targeted_restriction",
    25166: "procedural_technical",
    4618: "procedural_technical",
    3468: "institutional_rule_of_law",
    24632: "institutional_rule_of_law",
    25451: "symbolic_declaratory",
    2351: "targeted_restriction",
    4227: "consensus_framing",
    22853: "consensus_framing",
    9884: "procedural_technical",
    1428: "consensus_framing",
    3629: "symbolic_declaratory",
    1572: "local_constituency",
    25493: "procedural_technical",
    1359: "procedural_technical",
    2252: "procedural_technical",
    23605: "procedural_technical",
    3760: "consensus_framing",
    1005: "consensus_framing",
    10110: "coalition_alignment",
    23301: "consensus_framing",
    24046: "symbolic_declaratory",
    651: "welfare_service_expansion",
    1491: "targeted_restriction",
    25606: "targeted_restriction",
    313: "procedural_technical",
    24008: "consensus_framing",
    754: "targeted_restriction",
    25469: "targeted_restriction",
    25091: "targeted_restriction",
    2170: "institutional_rule_of_law",
    22792: "procedural_technical",
    10597: "institutional_rule_of_law",
    23013: "institutional_rule_of_law",
    3472: "institutional_rule_of_law",
    2014: "system_dismantling",
    920: "procedural_technical",
    2143: "welfare_service_expansion",
    688: "system_dismantling",
    2290: "system_dismantling",
    4497: "targeted_restriction",
    3823: "symbolic_declaratory",
    23141: "institutional_rule_of_law",
    4436: "institutional_rule_of_law",
    25616: "targeted_restriction",
    2662: "institutional_rule_of_law",
    23287: "institutional_rule_of_law",
    4660: "consensus_framing",
    4761: "targeted_restriction",
    2264: "institutional_rule_of_law",
    4394: "institutional_rule_of_law",
    1691: "targeted_restriction",
    10601: "targeted_restriction",
    4089: "targeted_restriction",
    23206: "procedural_technical",
    22676: "institutional_rule_of_law",
    115: "system_dismantling",
    3951: "consensus_framing",
    1375: "targeted_restriction",
    3090: "targeted_restriction",
    24650: "procedural_technical",
    1772: "consensus_framing",
    3678: "system_dismantling",
    1692: "institutional_rule_of_law",
    24077: "symbolic_declaratory",
    349: "institutional_rule_of_law",
    9769: "targeted_restriction",
    4656: "symbolic_declaratory",
    23984: "system_dismantling",
    2168: "institutional_rule_of_law",
    4443: "institutional_rule_of_law",
    4489: "procedural_technical",
    10290: "targeted_restriction",
    4071: "targeted_restriction",
    4088: "targeted_restriction",
    1507: "system_dismantling",
    2870: "procedural_technical",
    1912: "system_dismantling",
    22658: "symbolic_declaratory",
    10288: "targeted_restriction",
    4080: "institutional_rule_of_law",
    1847: "targeted_restriction",
    23127: "system_dismantling",
    4367: "targeted_restriction",
    9790: "targeted_restriction",
    4150: "procedural_technical",
    741: "targeted_restriction",
    1705: "consensus_framing",
    1831: "consensus_framing",
    10600: "targeted_restriction",
    9767: "targeted_restriction",
    3830: "system_dismantling",
    4221: "system_dismantling",
    3354: "institutional_rule_of_law",
    9977: "symbolic_declaratory",
    898: "consensus_framing",
    24848: "system_dismantling",
    756: "targeted_restriction",
    24358: "institutional_rule_of_law",
    4309: "institutional_rule_of_law",
    10167: "local_constituency",
    23633: "procedural_technical",
    23030: "targeted_restriction",
    1959: "system_dismantling",
    23454: "procedural_technical",
}

# ── prompt templates ─────────────────────────────────────────────────────────

# The original classifications (from mechanism_classification.py — inline
# subagent) were made by reading the full title + body_text.
# The second classifier uses a DIFFERENT template:
#  - English wording (not Dutch)
#  - Mechanisms presented in a DIFFERENT order (roughly the reverse of
#    MECHANISMS; not alphabetical)
#  - Asks for a single PRIMARY mechanism plus a 1-5 confidence score and a
#    short reasoning (not a top-3 ranking)
#  - Includes definition context for each mechanism
#
# NOTE(review): MECHANISMS_SHUFLLED (sic) is the exact reverse of MECHANISMS,
# not a shuffle, and nothing else in this file reads it. The order the model
# sees is the hand-written numbering in MECHANISM_DEFINITIONS_EN, which swaps
# procedural_technical and coalition_alignment relative to this list. The
# misspelled name is kept in case other modules import it.

MECHANISMS_SHUFLLED = list(reversed(MECHANISMS))

# English definition of each mechanism, substituted into
# SECOND_CLASSIFIER_PROMPT through its {MECHANISM_DEFINITIONS} placeholder.
# The keys written here must stay identical to the entries of MECHANISMS,
# because classifier replies are validated against that list.
MECHANISM_DEFINITIONS_EN = """1. crisis_response — A temporary, emergency measure responding to an acute event (pandemic, natural disaster, sudden crisis). Reactive and time-limited.

2. system_dismantling — Aims to dismantle, abolish, or fundamentally restructure an existing policy, institution, or regulatory framework. Not reform but abolition/reversal.

3. targeted_restriction — Imposes specific restrictions on a defined group, behavior, or activity. Narrow scope, punitive or exclusionary intent.

4. symbolic_declaratory — Primarily sends a political signal, makes a statement, or takes a position without direct policy impact. Declaratory, symbolic, expressive.

5. procedural_technical — Technical adjustment, budget amendment, implementation detail, or administrative procedure. Bureaucratic, operational, non-ideological.

6. local_constituency — Serves a specific local/regional interest, constituency, or geographic area. NIMBY or local-advocacy pattern.

7. coalition_alignment — Reflects coalition politics: budget compromises, package deals, or alignments between coalition partners. Coalition-maintenance.

8. welfare_service_expansion — Expands government services, social welfare, public goods, or citizen entitlements. Positive provision, not restriction.

9. institutional_rule_of_law — Concerns legal frameworks, rule of law, institutional integrity, judicial process, or constitutional matters. Rule-based, institutional.

10. consensus_framing — Frames the motion as serving a broad, shared interest. Appeals to common ground, national interest, or bipartisan consensus. Inclusive, bridge-building, non-polarizing."""

# Prompt template for the second classifier. build_second_classifier_prompt
# fills the {title}, {body} and {MECHANISM_DEFINITIONS} placeholders.
# NOTE(review): the rule "Return ONLY the mechanism key" sits next to a request
# for a JSON object; chat_completion_json only accepts the JSON form.
SECOND_CLASSIFIER_PROMPT = """Classify the following Dutch parliamentary motion according to the mechanism taxonomy below.

MOTION TITLE: {title}

MOTION TEXT: {body}

TASK: Identify the PRIMARY mechanism this motion uses. Select exactly ONE mechanism from the list below. Base your decision on what the motion actually DOES (action-oriented) rather than what it merely TALKS about.

MECHANISM TAXONOMY (read carefully before choosing):

{MECHANISM_DEFINITIONS}

IMPORTANT RULES:
- Choose the mechanism that BEST describes the dominant pattern of the motion.
- If a motion could fit multiple mechanisms, pick the most specific one.
- procedural_technical should be the DEFAULT only if no other mechanism fits better.
- Return ONLY the mechanism key exactly as listed above (e.g., "system_dismantling").

Respond with a JSON object containing:
- "mechanism": the selected mechanism key
- "confidence": 1-5 (1=very uncertain, 5=very certain)
- "reasoning": brief explanation (max 2 sentences)"""


def build_second_classifier_prompt(title: str, body_text: str) -> str:
    """Render the second-classifier prompt for a single motion.

    The motion text is ``body_text``, falling back to ``title`` when the body
    is empty. Text longer than 1200 characters is cut at that length and
    suffixed with "...".
    """
    max_chars = 1200
    safe_title = title if title else ""
    motion_text = body_text if body_text else safe_title
    if len(motion_text) > max_chars:
        motion_text = f"{motion_text[:max_chars]}..."

    fields = {
        "title": safe_title,
        "body": motion_text,
        "MECHANISM_DEFINITIONS": MECHANISM_DEFINITIONS_EN,
    }
    return SECOND_CLASSIFIER_PROMPT.format(**fields)


# ── LLM call helpers ─────────────────────────────────────────────────────────


def chat_completion_json(
    messages: list[dict[str, str]],
    model: str | None = None,
    retries: int = 3,
) -> dict[str, Any] | None:
    """Call ``chat_completion`` and parse the reply as a mechanism JSON object.

    Only the ``content`` of ``messages[0]`` is used; it is sent as the user
    message behind a fixed classifier system message.

    Args:
        messages: Message list whose first entry carries the prompt.
        model: Model name; defaults to ``config.QWEN_MODEL``.
        retries: Maximum number of attempts (provider errors, undecodable
            replies and replies with an unknown mechanism all consume one).

    Returns:
        The parsed JSON object when it is a dict whose ``"mechanism"`` is one
        of ``MECHANISMS``; ``None`` when every attempt failed.
    """
    model = model or config.QWEN_MODEL
    prompt = messages[0]["content"]
    system_msg = (
        "You are a political science classifier. You classify Dutch parliamentary "
        "motions by their dominant mechanism type. Respond ONLY with valid JSON. "
        "No markdown, no code fences, no preamble — pure JSON object."
    )
    full_messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": prompt},
    ]

    backoff = 0.5
    for attempt in range(1, retries + 1):
        try:
            raw = chat_completion(full_messages, model=model)
        except ProviderError as exc:
            if attempt == retries:
                logger.error("ProviderError on attempt %d: %s", attempt, exc)
                return None
            # Exponential backoff: 0.5s, 1s, 2s, ...
            time.sleep(backoff * (2 ** (attempt - 1)))
            continue

        # NOTE(review): assumes chat_completion returns the reply text; an
        # empty/None reply is treated as an undecodable response.
        raw = (raw or "").strip()
        if raw.startswith("```"):
            # Strip a Markdown code fence (optionally tagged "json").
            raw = raw.split("```", 2)[1]
            if raw.startswith("json"):
                raw = raw[4:]
            raw = raw.strip()

        try:
            result = json.loads(raw)
        except json.JSONDecodeError:
            logger.warning("JSON decode failed on attempt %d: %s", attempt, raw[:100])
        else:
            if isinstance(result, dict):
                mechanism = result.get("mechanism")
                if mechanism in MECHANISMS:
                    return result
                logger.warning(
                    "Invalid mechanism '%s' on attempt %d", mechanism, attempt
                )
            else:
                # Valid JSON that is not an object (list, number, string...)
                # is a failed attempt, not a crash.
                logger.warning(
                    "Expected a JSON object on attempt %d, got %s",
                    attempt,
                    type(result).__name__,
                )

        if attempt < retries:
            time.sleep(backoff * (2 ** (attempt - 1)))

    return None


def chat_completion_json_parallel(
    message_batches: list[list[dict[str, str]]],
    model: str | None = None,
    max_workers: int = 5,
) -> list[dict[str, Any] | None]:
    """
    Fan several chat completions out over a thread pool.

    ``message_batches`` holds one message list per completion. The returned
    list has one entry per input, in input order: the parsed JSON dict, or
    None where that completion failed.
    """
    resolved_model = model or config.QWEN_MODEL

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for messages in message_batches:
            pending.append(
                pool.submit(chat_completion_json, messages, model=resolved_model)
            )

        # Collect in submission order so results line up with the inputs.
        outcomes = []
        for future in pending:
            outcomes.append(future.result())

    return outcomes


# ── data loading ─────────────────────────────────────────────────────────────


def load_motions(db_path: str, motion_ids: list[int]) -> list[dict[str, Any]]:
    """Load motion data from the database for the given motion IDs.

    Args:
        db_path: Path to the DuckDB database file.
        motion_ids: IDs to look up in ``right_wing_motions`` (joined to
            ``motions``).

    Returns:
        One dict per row found, ordered by ``motion_id``, with keys
        ``motion_id``, ``title``, ``body_text``, ``year`` and
        ``centrist_support_strict``. NULL titles/bodies become ``""``. IDs
        without a matching row are simply absent. An empty ``motion_ids``
        returns ``[]`` without opening the database.
    """
    ids = list(motion_ids)
    if not ids:
        # An empty "IN ()" list is rejected by most SQL parsers, and there is
        # nothing to look up anyway.
        return []

    placeholders = ",".join("?" for _ in ids)
    con = duckdb.connect(db_path)
    try:
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text, r.year, r.centrist_support_strict
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.motion_id IN ({placeholders})
            ORDER BY r.motion_id
            """,
            ids,
        ).fetchall()
    finally:
        con.close()

    return [
        {
            "motion_id": r[0],
            "title": r[1] or "",
            "body_text": r[2] or "",
            "year": r[3],
            "centrist_support_strict": r[4],
        }
        for r in rows
    ]


# ── classification ───────────────────────────────────────────────────────────


def _normalise_confidence(value: Any) -> int | float:
    """Return a model-supplied confidence as a finite number, else 0."""
    if isinstance(value, (int, float)):
        return value
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0
    if number != number or abs(number) == float("inf"):
        # NaN / infinity carry no usable confidence information.
        return 0
    return int(number) if number.is_integer() else number


def classify_motions_second_pass(
    motions: list[dict[str, Any]],
    second_model: str | None = None,
    batch_size: int = 10,
    max_workers: int = 5,
) -> dict[int, dict[str, Any]]:
    """Run the second classifier on all motions.

    Args:
        motions: Motion dicts with at least ``motion_id``, ``title`` and
            ``body_text``.
        second_model: Model name; defaults to ``config.QWEN_MODEL``.
        batch_size: Number of motions submitted per parallel batch (>= 1).
        max_workers: Thread-pool size used within a batch.

    Returns:
        ``motion_id -> {"mechanism", "confidence", "reasoning", "error"}``.
        Failed classifications have ``mechanism`` None, confidence 0 and
        ``error`` set to "classification failed". Confidence is always stored
        as a number: the model may return it as a string or null, which would
        otherwise break the later comparisons and averages.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    second_model = second_model or config.QWEN_MODEL
    results: dict[int, dict[str, Any]] = {}
    total_batches = (len(motions) - 1) // batch_size + 1

    for batch_no, start in enumerate(range(0, len(motions), batch_size), start=1):
        batch = motions[start : start + batch_size]
        logger.info(
            "Batch %d/%d (%d motions)", batch_no, total_batches, len(batch)
        )

        message_batches = [
            [
                {
                    "role": "user",
                    "content": build_second_classifier_prompt(
                        m["title"], m["body_text"]
                    ),
                }
            ]
            for m in batch
        ]

        raw_results = chat_completion_json_parallel(
            message_batches, model=second_model, max_workers=max_workers
        )

        for m, res in zip(batch, raw_results):
            mid = m["motion_id"]
            if isinstance(res, dict) and res.get("mechanism") in MECHANISMS:
                results[mid] = {
                    "mechanism": res["mechanism"],
                    "confidence": _normalise_confidence(res.get("confidence", 0)),
                    "reasoning": res.get("reasoning", ""),
                    "error": None,
                }
            else:
                results[mid] = {
                    "mechanism": None,
                    "confidence": 0,
                    "reasoning": "",
                    "error": "classification failed",
                }

        # Brief pause between batches; pointless after the last one.
        if batch_no < total_batches:
            time.sleep(0.5)

    return results


# ── agreement analysis ───────────────────────────────────────────────────────


def compute_cohens_kappa(
    rater1: dict[int, str],
    rater2: dict[int, str],
    categories: list[str],
) -> dict[str, Any]:
    """Compute Cohen's kappa for two raters.

    Uses only motion_ids present in BOTH raters.

    Args:
        rater1: ``motion_id -> label`` for the first rater.
        rater2: ``motion_id -> label`` for the second rater.
        categories: Known labels. Labels that occur in the data but are
            missing from this list are still included in the expected
            agreement, so kappa is not inflated by unknown labels.

    Returns:
        Dict with ``kappa``, ``agreement_rate``, ``n``, ``agreements``,
        ``p_o``, ``p_e`` (floats rounded to 4 decimals) and ``error``. When
        the raters share no motions, ``kappa``, ``agreement_rate``, ``p_o``
        and ``p_e`` are None, ``n`` and ``agreements`` are 0 and ``error``
        explains why; the key set is the same in both cases.
    """
    common_ids = sorted(set(rater1) & set(rater2))

    n = len(common_ids)
    if n == 0:
        return {
            "kappa": None,
            "agreement_rate": None,
            "n": 0,
            "agreements": 0,
            "p_o": None,
            "p_e": None,
            "error": "no common motions",
        }

    agreements = sum(1 for mid in common_ids if rater1[mid] == rater2[mid])
    p_o = agreements / n

    # Marginal label counts per rater, computed in one pass each.
    counts1 = Counter(rater1[mid] for mid in common_ids)
    counts2 = Counter(rater2[mid] for mid in common_ids)

    # Declared categories first (deduplicated, order kept), then any label
    # both raters used that the caller did not declare.
    labels = list(dict.fromkeys(categories))
    declared = set(labels)
    labels.extend(
        label for label in counts1 if label not in declared and label in counts2
    )

    # Expected agreement under independence of the two raters.
    p_e = 0.0
    for cat in labels:
        p_e += (counts1[cat] / n) * (counts2[cat] / n)

    if p_e >= 1.0:
        # Both raters used one and the same label throughout; kappa is
        # formally undefined (0/0) and reported as perfect agreement.
        kappa = 1.0
    else:
        kappa = (p_o - p_e) / (1.0 - p_e)

    return {
        "kappa": round(kappa, 4),
        "agreement_rate": round(p_o, 4),
        "n": n,
        "agreements": agreements,
        "p_o": round(p_o, 4),
        "p_e": round(p_e, 4),
        "error": None,
    }


def find_disagreements(
    rater1: dict[int, str],
    rater2: dict[int, str],
) -> list[dict[str, Any]]:
    """List the motions on which the two raters chose different labels.

    Only motion IDs rated by both are considered. Entries are ordered by
    motion ID and carry ``motion_id``, ``original`` (rater 1) and ``second``
    (rater 2).
    """
    shared_ids = sorted(set(rater1).intersection(rater2))
    return [
        {
            "motion_id": motion_id,
            "original": rater1[motion_id],
            "second": rater2[motion_id],
        }
        for motion_id in shared_ids
        if rater1[motion_id] != rater2[motion_id]
    ]


def build_confusion_matrix(
    rater1: dict[int, str],
    rater2: dict[int, str],
) -> dict[str, Any]:
    """Build the confusion matrix between two raters.

    Only motion IDs rated by both are counted.

    Returns:
        ``{rater1_label: {rater2_label: count}}``. Every key of ``MECHANISMS``
        has a row (possibly empty). A rater-1 label outside ``MECHANISMS``
        gets its own row instead of raising ``KeyError``.
    """
    matrix: dict[str, Counter[str]] = {m: Counter() for m in MECHANISMS}
    for mid in set(rater1) & set(rater2):
        matrix.setdefault(rater1[mid], Counter())[rater2[mid]] += 1
    return {label: dict(row) for label, row in matrix.items()}


# ── resolution ───────────────────────────────────────────────────────────────


def _confidence_as_number(value: Any) -> int | float:
    """Return *value* as a number; anything non-numeric counts as 0."""
    if isinstance(value, (int, float)):
        return value
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0
    if number != number or abs(number) == float("inf"):
        return 0
    return int(number) if number.is_integer() else number


def resolve_disagreements(
    disagreements: list[dict[str, Any]],
    second_results: dict[int, dict[str, Any]],
    motions: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Resolve each disagreement between the original and second classifier.

    Rule: the second classifier wins when its confidence is >= 4; otherwise
    the original (more carefully classified) label is kept.

    Args:
        disagreements: Entries with ``motion_id``, ``original`` and ``second``.
        second_results: ``motion_id -> result dict``; only ``confidence`` is
            read. Missing, null or non-numeric confidence counts as 0, so the
            original label is kept.
        motions: Motion dicts, used only to look up the title.

    Returns:
        One dict per disagreement with ``motion_id``, ``title`` (first 120
        characters, ``""`` when unknown), ``original``, ``second``,
        ``second_confidence``, ``resolved`` and ``winner`` ("original" or
        "second").
    """
    motion_map = {m["motion_id"]: m for m in motions}
    resolved = []
    for d in disagreements:
        mid = d["motion_id"]
        second_result = second_results.get(mid) or {}
        confidence = _confidence_as_number(second_result.get("confidence", 0))

        if confidence >= 4:
            winner, resolved_mech = "second", d["second"]
        else:
            winner, resolved_mech = "original", d["original"]

        # A stored None title must not break the slice below.
        title = motion_map.get(mid, {}).get("title") or ""
        resolved.append(
            {
                "motion_id": mid,
                "title": title[:120],
                "original": d["original"],
                "second": d["second"],
                "second_confidence": confidence,
                "resolved": resolved_mech,
                "winner": winner,
            }
        )
    return resolved


def build_validated_classifications(
    original: dict[int, str],
    second: dict[int, str],
    resolutions: list[dict[str, Any]],
) -> dict[int, str]:
    """Return a copy of ``original`` with resolved labels applied.

    Each motion keeps its original label unless ``resolutions`` holds an entry
    for it, in which case that entry's ``resolved`` label is used. Resolutions
    for motions absent from ``original`` are ignored, and ``second`` is not
    consulted.
    """
    resolved_by_id = {entry["motion_id"]: entry["resolved"] for entry in resolutions}
    return {
        motion_id: resolved_by_id.get(motion_id, label)
        for motion_id, label in original.items()
    }


# ── report generation ────────────────────────────────────────────────────────


def _kappa_strength(kappa: float) -> str:
    """Return the verbal agreement label for *kappa* used in the report."""
    bands = (
        (0.0, "Less than chance agreement"),
        (0.20, "Slight agreement"),
        (0.40, "Fair agreement"),
        (0.60, "Moderate agreement"),
        (0.80, "Substantial agreement"),
    )
    for upper_bound, label in bands:
        if kappa < upper_bound:
            return label
    return "Almost perfect agreement"


def _fmt_or_na(value: Any, spec: str) -> str:
    """Format *value* with *spec*, or return "n/a" when it is None."""
    return "n/a" if value is None else format(value, spec)


def _md_cell(text: Any) -> str:
    """Make *text* safe for one Markdown table cell (no pipes or newlines)."""
    return str(text).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def generate_report(
    kappa_result: dict[str, Any],
    disagreements: list[dict[str, Any]],
    resolutions: list[dict[str, Any]],
    confusion: dict[str, Any],
    validated_dist: dict[str, Any],
    second_results: dict[int, dict[str, Any]],
    output_path: str,
    model: str | None = None,
) -> None:
    """Generate the mechanism validation markdown report.

    Args:
        kappa_result: Output of ``compute_cohens_kappa``. The "no common
            motions" form (``kappa`` None, statistics missing or None) is
            rendered with "n/a" placeholders instead of raising.
        disagreements: Output of ``find_disagreements``.
        resolutions: Output of ``resolve_disagreements``.
        confusion: Output of ``build_confusion_matrix``.
        validated_dist: ``mechanism -> count`` of the validated labels.
        second_results: ``motion_id -> result dict`` of the second classifier.
        output_path: Destination file; parent directories are created.
        model: Model name shown in the report; defaults to
            ``config.QWEN_MODEL`` when omitted.
    """
    kappa = kappa_result.get("kappa")
    n_compared = kappa_result.get("n", 0)
    agreements = kappa_result.get("agreements", 0)
    agreement_rate = kappa_result.get("agreement_rate")
    strength = _kappa_strength(kappa) if kappa is not None else None

    classified = [v for v in second_results.values() if v.get("mechanism")]
    n_second_classified = len(classified)
    # Non-numeric confidences are skipped rather than breaking the average.
    confidence_total = sum(
        c
        for c in (v.get("confidence", 0) for v in classified)
        if isinstance(c, (int, float))
    )
    avg_confidence = confidence_total / max(n_second_classified, 1)

    lines = [
        "# Mechanism Classification Validation Report",
        "",
        "## 1. Inter-Rater Reliability",
        "",
        f"- **Motions compared:** {n_compared}",
        f"- **Agreements:** {agreements} / {n_compared}",
        f"- **Agreement rate:** {_fmt_or_na(agreement_rate, '.1%')}",
        f"- **Cohen's kappa (κ):** {kappa if kappa is not None else 'n/a'}",
        f"  - P_o (observed): {_fmt_or_na(kappa_result.get('p_o'), '.4f')}",
        f"  - P_e (expected): {_fmt_or_na(kappa_result.get('p_e'), '.4f')}",
        "",
    ]

    if strength is not None:
        lines.append(f"**Interpretation:** {strength}")
        lines.append("")

    if kappa is None:
        reason = kappa_result.get("error") or "unknown reason"
        lines.append(f"**Cohen's kappa could not be computed:** {reason}.")
        lines.append("")
    elif kappa < 0.60:
        lines.append("**The mechanism taxonomy needs revision.** The inter-rater agreement is below 0.6, suggesting the 10-mechanism framework is not being applied consistently across raters. Consider:")
        lines.append("- Simplifying or merging ambiguous mechanism pairs")
        lines.append("- Adding clearer decision rules for borderline cases")
        lines.append("- Reducing the number of mechanisms")
        lines.append("")
    else:
        lines.append("**The mechanism taxonomy appears adequate.** Inter-rater agreement is at or above 0.6, indicating reasonable consistency.")
        lines.append("")

    lines.extend([
        "## 2. Second Classifier Summary",
        "",
        f"- **Model:** {model or config.QWEN_MODEL}",
        f"- **Motions classified:** {n_second_classified}",
        f"- **Average confidence:** {avg_confidence:.1f}/5",
        "",
    ])

    conf_dist = Counter()
    for v in second_results.values():
        conf_dist[v.get("confidence", 0)] += 1
    lines.append("### Confidence Distribution")
    lines.append("| Confidence | Count |")
    lines.append("|------------|-------|")
    for level in range(1, 6):
        lines.append(f"| {level} | {conf_dist.get(level, 0)} |")
    lines.append("")

    lines.extend([
        "## 3. Disagreement Table",
        "",
        f"**Total disagreements:** {len(disagreements)} / {n_compared} ({len(disagreements) / max(n_compared, 1) * 100:.1f}%)",
        "",
        "| Motion ID | Title | Original | Second | Confidence | Resolved | Winner |",
        "|-----------|-------|----------|--------|------------|----------|--------|",
    ])

    for r in resolutions:
        orig_label = MECHANISM_LABELS_NL.get(r["original"], r["original"])
        second_label = MECHANISM_LABELS_NL.get(r["second"], r["second"])
        res_label = MECHANISM_LABELS_NL.get(r["resolved"], r["resolved"])
        # Titles are free text: a "|" or newline would break the table row.
        title_cell = _md_cell(r["title"][:80])
        lines.append(
            f"| {r['motion_id']} | {title_cell} | {orig_label} | {second_label} | {r['second_confidence']} | {res_label} | {r['winner']} |"
        )

    lines.extend([
        "",
        "## 4. Mechanism Distribution Comparison",
        "",
        "| Mechanism | Original Count | Second Count | Validated Count |",
        "|-----------|---------------|--------------|-----------------|",
    ])

    orig_dist = Counter(ORIGINAL_CLASSIFICATIONS.values())
    second_dist = Counter(
        v.get("mechanism") for v in second_results.values() if v.get("mechanism")
    )

    for mech in MECHANISMS:
        label = MECHANISM_LABELS_NL.get(mech, mech)
        o_cnt = orig_dist.get(mech, 0)
        s_cnt = second_dist.get(mech, 0)
        v_cnt = validated_dist.get(mech, 0)
        lines.append(f"| {label} | {o_cnt} | {s_cnt} | {v_cnt} |")

    lines.extend([
        "",
        "## 5. Confusion Matrix (Top Rows)",
        "",
        "| Original \\ Second | " + " | ".join(MECHANISM_LABELS_EN[m][:20] for m in MECHANISMS) + " |",
        "|" + "---|" * (len(MECHANISMS) + 1),
    ])

    for mech in MECHANISMS:
        label = MECHANISM_LABELS_EN[mech][:20]
        row_data = confusion.get(mech, {})
        cells = [str(row_data.get(m, 0)) for m in MECHANISMS]
        lines.append(f"| {label} | {' | '.join(cells)} |")

    if strength is not None:
        conclusion = f"Cohen's kappa of **{kappa}** indicates **{strength.lower()}** between the original inline classification and the independent second classifier."
    else:
        conclusion = "Cohen's kappa could not be computed, so no conclusion about inter-rater reliability can be drawn."

    lines.extend([
        "",
        "## 6. Conclusion",
        "",
        conclusion,
        "",
        "### Key findings:",
        f"- {agreements} out of {n_compared} motions agreed ({_fmt_or_na(agreement_rate, '.1%')})",
        f"- {len(disagreements)} disagreements resolved: {sum(1 for r in resolutions if r['winner'] == 'original')} kept original, {sum(1 for r in resolutions if r['winner'] == 'second')} adopted second",
        "",
    ])

    top_disagreement_pairs = Counter(
        f"{d['original']} / {d['second']}" for d in disagreements
    )

    if top_disagreement_pairs:
        lines.append("### Most common disagreement pairs:")
        for pair, cnt in top_disagreement_pairs.most_common(5):
            lines.append(f"- {pair}: {cnt} times")
        lines.append("")

    lines.append("### Revised mechanism taxonomy recommendation:")
    if kappa is None:
        lines.append("- No recommendation: inter-rater reliability could not be computed.")
    elif kappa < 0.60:
        lines.append("- Taxonomy needs revision to improve inter-rater reliability.")
        if top_disagreement_pairs:
            top_pair = top_disagreement_pairs.most_common(1)[0][0]
            lines.append(f"- Most confused pair: {top_pair} — consider merging or clarifying distinction.")
    else:
        lines.append("- Taxonomy is sufficiently reliable. Minor clarifications may be helpful for borderline cases.")
    lines.append("")

    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    logger.info("Report written to %s", out_path)


# ── main ─────────────────────────────────────────────────────────────────────


def main() -> int:
    """Run the validation pipeline from the command line.

    Loads the originally classified motions, runs the second classifier,
    computes agreement statistics, resolves disagreements, optionally saves
    the raw results as JSON and writes the markdown report.

    Returns:
        0 on success; 1 when no motion could be compared (for example when
        every second-pass classification failed), in which case no report is
        written.
    """
    parser = argparse.ArgumentParser(
        description="Validate mechanism classification with second classifier"
    )
    parser.add_argument("--db", default="data/motions.db", help="Path to DuckDB database")
    parser.add_argument(
        "--model",
        default=None,
        help=f"Second classifier model (default: {config.QWEN_MODEL})",
    )
    parser.add_argument("--batch-size", type=int, default=10, help="Motions per batch")
    parser.add_argument("--max-workers", type=int, default=3, help="Max parallel workers")
    parser.add_argument(
        "--output",
        default="reports/overton_window/mechanism_validation.md",
        help="Output report path",
    )
    parser.add_argument(
        "--save-results",
        default=None,
        help="Save full second classification results to JSON path",
    )
    args = parser.parse_args()

    second_model = args.model or config.QWEN_MODEL
    logger.info("Second classifier model: %s", second_model)

    motion_ids = list(ORIGINAL_CLASSIFICATIONS.keys())
    logger.info("Loading %d motions from database...", len(motion_ids))

    motions = load_motions(args.db, motion_ids)
    logger.info("Loaded %d motions", len(motions))

    # Motions that were classified originally but are absent from the database
    # silently shrink the comparison set, so make that visible.
    missing_ids = sorted(set(motion_ids) - {m["motion_id"] for m in motions})
    if missing_ids:
        logger.warning(
            "%d motion IDs not found in database (first 10: %s)",
            len(missing_ids),
            missing_ids[:10],
        )

    logger.info("Running second classifier...")
    second_results = classify_motions_second_pass(
        motions,
        second_model=second_model,
        batch_size=args.batch_size,
        max_workers=args.max_workers,
    )

    # Extract mechanism-only dict for agreement analysis
    second_classifications: dict[int, str] = {
        mid: res["mechanism"]
        for mid, res in second_results.items()
        if res.get("mechanism") and res["mechanism"] in MECHANISMS
    }

    n_second_classified = len(second_classifications)
    logger.info(
        "Second classifier completed: %d/%d motions classified",
        n_second_classified,
        len(motions),
    )

    # Filter original to only include motions with second classification
    original_filtered = {
        mid: ORIGINAL_CLASSIFICATIONS[mid]
        for mid in second_classifications
        if mid in ORIGINAL_CLASSIFICATIONS
    }

    # Compute Cohen's kappa
    kappa_result = compute_cohens_kappa(
        original_filtered, second_classifications, MECHANISMS
    )
    logger.info("Cohen's kappa: %s", kappa_result["kappa"])
    logger.info("Agreement rate: %s", kappa_result["agreement_rate"])

    # Find disagreements
    disagreements = find_disagreements(original_filtered, second_classifications)
    logger.info("Disagreements: %d", len(disagreements))

    # Build confusion matrix
    confusion = build_confusion_matrix(original_filtered, second_classifications)

    # Resolve disagreements
    resolutions = resolve_disagreements(disagreements, second_results, motions)

    # Build validated classifications
    validated = build_validated_classifications(
        ORIGINAL_CLASSIFICATIONS, second_classifications, resolutions
    )
    validated_dist = Counter(validated.values())

    # Save results if requested (also useful for diagnosing a failed run)
    if args.save_results:
        save_path = Path(args.save_results)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_data = {
            "kappa": kappa_result["kappa"],
            "agreement_rate": kappa_result["agreement_rate"],
            "n_motions": kappa_result["n"],
            "n_disagreements": len(disagreements),
            "second_results": {
                str(mid): res for mid, res in second_results.items()
            },
            "resolutions": resolutions,
        }
        save_path.write_text(json.dumps(save_data, indent=2, ensure_ascii=False), encoding="utf-8")
        logger.info("Results saved to %s", save_path)

    if kappa_result["kappa"] is None:
        # Nothing to compare: the statistics are None and cannot be formatted
        # or interpreted, so stop with a failure status instead of crashing.
        logger.error(
            "No motions could be compared (%s); skipping report generation",
            kappa_result.get("error"),
        )
        return 1

    # Generate report
    # NOTE(review): generate_report is not told which model was used; its
    # "Model" line reads config.QWEN_MODEL, so a --model override is not
    # reflected in the report.
    generate_report(
        kappa_result=kappa_result,
        disagreements=disagreements,
        resolutions=resolutions,
        confusion=confusion,
        validated_dist=dict(validated_dist),
        second_results=second_results,
        output_path=args.output,
    )

    print(f"\nCohen's kappa: {kappa_result['kappa']}")
    print(f"Agreement rate: {kappa_result['agreement_rate']:.1%}")
    print(f"Disagreements: {len(disagreements)}/{kappa_result['n']}")
    print(f"Report: {args.output}")

    if kappa_result["kappa"] < 0.60:
        print("TAXONOMY NEEDS REVISION: kappa < 0.6 indicates poor reliability")
    else:
        print("TAXONOMY ADEQUATE: kappa >= 0.6 indicates acceptable reliability")

    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())