You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
946 lines
34 KiB
946 lines
34 KiB
#!/usr/bin/env python3
|
|
"""Mechanism classification validation with a second classifier.
|
|
|
|
Computes inter-rater reliability (Cohen's kappa) between the original inline
|
|
classifications and a second LLM-based classification using a different prompt
|
|
template and (optionally) a different model.
|
|
|
|
Usage:
|
|
uv run python analysis/right_wing/mechanism_validation.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import sys
|
|
import time
|
|
from collections import Counter
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import duckdb
|
|
|
|
ROOT = Path(__file__).parent.parent.parent.resolve()
|
|
if str(ROOT) not in sys.path:
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from ai_provider import ProviderError, chat_completion
|
|
from analysis.config import config
|
|
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── mechanism taxonomy ───────────────────────────────────────────────────────
|
|
|
|
# Canonical list of mechanism keys. It is used both as the set of valid
# classifier outputs and as the row/column order of the report tables.
MECHANISMS = [
    "consensus_framing",
    "institutional_rule_of_law",
    "welfare_service_expansion",
    "procedural_technical",
    "local_constituency",
    "coalition_alignment",
    "symbolic_declaratory",
    "targeted_restriction",
    "system_dismantling",
    "crisis_response",
]

# Dutch display labels, keyed by mechanism key. Used in the disagreement and
# distribution tables of the generated report.
MECHANISM_LABELS_NL = {
    "consensus_framing": "Consensus framing (gedeeld belang)",
    "institutional_rule_of_law": "Institutioneel/rechtsstatelijk",
    "welfare_service_expansion": "Welzijn/dienstverlening uitbreiding",
    "procedural_technical": "Procedureel/technisch",
    "local_constituency": "Lokaal/regionaal",
    "coalition_alignment": "Coalitie-afstemming",
    "symbolic_declaratory": "Symbolisch/declaratoir",
    "targeted_restriction": "Gerichte restrictie",
    "system_dismantling": "Systeemontmanteling",
    "crisis_response": "Crisisrespons",
}

# English display labels, keyed by mechanism key. Used (truncated to 20
# characters) for the confusion-matrix headers of the generated report.
MECHANISM_LABELS_EN = {
    "consensus_framing": "Consensus framing / shared interest",
    "institutional_rule_of_law": "Institutional / rule of law",
    "welfare_service_expansion": "Welfare / service expansion",
    "procedural_technical": "Procedural / technical",
    "local_constituency": "Local / regional constituency",
    "coalition_alignment": "Coalition alignment",
    "symbolic_declaratory": "Symbolic / declaratory",
    "targeted_restriction": "Targeted restriction",
    "system_dismantling": "System dismantling",
    "crisis_response": "Crisis response",
}
|
|
|
|
# Original inline classifications (from mechanism_classification.py).
# Maps motion_id -> mechanism key. This is "rater 1" in the agreement
# analysis, and its keys determine which motions main() loads and re-classifies.
ORIGINAL_CLASSIFICATIONS: dict[int, str] = {
    15458: "crisis_response",
    26477: "institutional_rule_of_law",
    9149: "consensus_framing",
    17099: "procedural_technical",
    4933: "procedural_technical",
    17751: "consensus_framing",
    20068: "procedural_technical",
    16520: "consensus_framing",
    17036: "welfare_service_expansion",
    17681: "consensus_framing",
    14554: "procedural_technical",
    21864: "procedural_technical",
    26493: "targeted_restriction",
    21982: "consensus_framing",
    14125: "crisis_response",
    13683: "welfare_service_expansion",
    16691: "procedural_technical",
    15005: "procedural_technical",
    17536: "institutional_rule_of_law",
    16999: "consensus_framing",
    8325: "procedural_technical",
    13370: "welfare_service_expansion",
    18030: "procedural_technical",
    11382: "procedural_technical",
    18616: "procedural_technical",
    12411: "crisis_response",
    22595: "crisis_response",
    15772: "system_dismantling",
    7111: "welfare_service_expansion",
    25784: "targeted_restriction",
    27731: "system_dismantling",
    15626: "crisis_response",
    20215: "welfare_service_expansion",
    16430: "symbolic_declaratory",
    25982: "local_constituency",
    17176: "targeted_restriction",
    7054: "procedural_technical",
    20323: "procedural_technical",
    18025: "system_dismantling",
    14837: "system_dismantling",
    19620: "targeted_restriction",
    21801: "consensus_framing",
    19464: "crisis_response",
    26855: "targeted_restriction",
    22280: "local_constituency",
    20115: "symbolic_declaratory",
    15082: "targeted_restriction",
    6637: "targeted_restriction",
    18691: "symbolic_declaratory",
    18062: "crisis_response",
    3784: "procedural_technical",
    10205: "procedural_technical",
    10278: "coalition_alignment",
    25079: "consensus_framing",
    2980: "targeted_restriction",
    10420: "crisis_response",
    25092: "targeted_restriction",
    25545: "institutional_rule_of_law",
    23065: "procedural_technical",
    2878: "welfare_service_expansion",
    25573: "procedural_technical",
    3298: "symbolic_declaratory",
    25061: "consensus_framing",
    4481: "consensus_framing",
    3961: "procedural_technical",
    473: "institutional_rule_of_law",
    10413: "consensus_framing",
    974: "procedural_technical",
    24009: "procedural_technical",
    9789: "institutional_rule_of_law",
    24651: "targeted_restriction",
    1890: "local_constituency",
    1191: "consensus_framing",
    3448: "targeted_restriction",
    23910: "institutional_rule_of_law",
    25566: "welfare_service_expansion",
    2070: "targeted_restriction",
    23885: "consensus_framing",
    24906: "procedural_technical",
    2496: "procedural_technical",
    25582: "targeted_restriction",
    3053: "local_constituency",
    1495: "procedural_technical",
    10178: "procedural_technical",
    1614: "procedural_technical",
    23441: "consensus_framing",
    3569: "consensus_framing",
    10285: "procedural_technical",
    23058: "procedural_technical",
    3287: "procedural_technical",
    10434: "consensus_framing",
    10089: "procedural_technical",
    22706: "consensus_framing",
    3877: "institutional_rule_of_law",
    25062: "consensus_framing",
    3687: "targeted_restriction",
    25166: "procedural_technical",
    4618: "procedural_technical",
    3468: "institutional_rule_of_law",
    24632: "institutional_rule_of_law",
    25451: "symbolic_declaratory",
    2351: "targeted_restriction",
    4227: "consensus_framing",
    22853: "consensus_framing",
    9884: "procedural_technical",
    1428: "consensus_framing",
    3629: "symbolic_declaratory",
    1572: "local_constituency",
    25493: "procedural_technical",
    1359: "procedural_technical",
    2252: "procedural_technical",
    23605: "procedural_technical",
    3760: "consensus_framing",
    1005: "consensus_framing",
    10110: "coalition_alignment",
    23301: "consensus_framing",
    24046: "symbolic_declaratory",
    651: "welfare_service_expansion",
    1491: "targeted_restriction",
    25606: "targeted_restriction",
    313: "procedural_technical",
    24008: "consensus_framing",
    754: "targeted_restriction",
    25469: "targeted_restriction",
    25091: "targeted_restriction",
    2170: "institutional_rule_of_law",
    22792: "procedural_technical",
    10597: "institutional_rule_of_law",
    23013: "institutional_rule_of_law",
    3472: "institutional_rule_of_law",
    2014: "system_dismantling",
    920: "procedural_technical",
    2143: "welfare_service_expansion",
    688: "system_dismantling",
    2290: "system_dismantling",
    4497: "targeted_restriction",
    3823: "symbolic_declaratory",
    23141: "institutional_rule_of_law",
    4436: "institutional_rule_of_law",
    25616: "targeted_restriction",
    2662: "institutional_rule_of_law",
    23287: "institutional_rule_of_law",
    4660: "consensus_framing",
    4761: "targeted_restriction",
    2264: "institutional_rule_of_law",
    4394: "institutional_rule_of_law",
    1691: "targeted_restriction",
    10601: "targeted_restriction",
    4089: "targeted_restriction",
    23206: "procedural_technical",
    22676: "institutional_rule_of_law",
    115: "system_dismantling",
    3951: "consensus_framing",
    1375: "targeted_restriction",
    3090: "targeted_restriction",
    24650: "procedural_technical",
    1772: "consensus_framing",
    3678: "system_dismantling",
    1692: "institutional_rule_of_law",
    24077: "symbolic_declaratory",
    349: "institutional_rule_of_law",
    9769: "targeted_restriction",
    4656: "symbolic_declaratory",
    23984: "system_dismantling",
    2168: "institutional_rule_of_law",
    4443: "institutional_rule_of_law",
    4489: "procedural_technical",
    10290: "targeted_restriction",
    4071: "targeted_restriction",
    4088: "targeted_restriction",
    1507: "system_dismantling",
    2870: "procedural_technical",
    1912: "system_dismantling",
    22658: "symbolic_declaratory",
    10288: "targeted_restriction",
    4080: "institutional_rule_of_law",
    1847: "targeted_restriction",
    23127: "system_dismantling",
    4367: "targeted_restriction",
    9790: "targeted_restriction",
    4150: "procedural_technical",
    741: "targeted_restriction",
    1705: "consensus_framing",
    1831: "consensus_framing",
    10600: "targeted_restriction",
    9767: "targeted_restriction",
    3830: "system_dismantling",
    4221: "system_dismantling",
    3354: "institutional_rule_of_law",
    9977: "symbolic_declaratory",
    898: "consensus_framing",
    24848: "system_dismantling",
    756: "targeted_restriction",
    24358: "institutional_rule_of_law",
    4309: "institutional_rule_of_law",
    10167: "local_constituency",
    23633: "procedural_technical",
    23030: "targeted_restriction",
    1959: "system_dismantling",
    23454: "procedural_technical",
}
|
|
|
|
# ── prompt templates ─────────────────────────────────────────────────────────
|
|
|
|
# Original prompt (from mechanism_classification.py — inline subagent):
# classifications were done by reading the full title + body_text.
# The second classifier uses a DIFFERENT template:
# - English wording (not Dutch)
# - Mechanisms presented in a different order (roughly the reverse of MECHANISMS)
# - Asks for a single PRIMARY mechanism plus a 1-5 confidence score and a
#   short reasoning (it does not ask for a ranking)
# - Includes definition context for each mechanism

# Reversed copy of MECHANISMS. NOTE(review): the name is misspelled
# ("SHUFLLED") and it is not referenced elsewhere in the visible part of this
# module; the name is left unchanged in case other code imports it.
MECHANISMS_SHUFLLED = list(reversed(MECHANISMS))

# Numbered English definitions, substituted into SECOND_CLASSIFIER_PROMPT as
# {MECHANISM_DEFINITIONS}.
MECHANISM_DEFINITIONS_EN = """1. crisis_response — A temporary, emergency measure responding to an acute event (pandemic, natural disaster, sudden crisis). Reactive and time-limited.

2. system_dismantling — Aims to dismantle, abolish, or fundamentally restructure an existing policy, institution, or regulatory framework. Not reform but abolition/reversal.

3. targeted_restriction — Imposes specific restrictions on a defined group, behavior, or activity. Narrow scope, punitive or exclusionary intent.

4. symbolic_declaratory — Primarily sends a political signal, makes a statement, or takes a position without direct policy impact. Declaratory, symbolic, expressive.

5. procedural_technical — Technical adjustment, budget amendment, implementation detail, or administrative procedure. Bureaucratic, operational, non-ideological.

6. local_constituency — Serves a specific local/regional interest, constituency, or geographic area. NIMBY or local-advocacy pattern.

7. coalition_alignment — Reflects coalition politics: budget compromises, package deals, or alignments between coalition partners. Coalition-maintenance.

8. welfare_service_expansion — Expands government services, social welfare, public goods, or citizen entitlements. Positive provision, not restriction.

9. institutional_rule_of_law — Concerns legal frameworks, rule of law, institutional integrity, judicial process, or constitutional matters. Rule-based, institutional.

10. consensus_framing — Frames the motion as serving a broad, shared interest. Appeals to common ground, national interest, or bipartisan consensus. Inclusive, bridge-building, non-polarizing."""

# Prompt template for the second classifier. Filled with str.format() using
# the fields {title}, {body} and {MECHANISM_DEFINITIONS}.
SECOND_CLASSIFIER_PROMPT = """Classify the following Dutch parliamentary motion according to the mechanism taxonomy below.

MOTION TITLE: {title}

MOTION TEXT: {body}

TASK: Identify the PRIMARY mechanism this motion uses. Select exactly ONE mechanism from the list below. Base your decision on what the motion actually DOES (action-oriented) rather than what it merely TALKS about.

MECHANISM TAXONOMY (read carefully before choosing):

{MECHANISM_DEFINITIONS}

IMPORTANT RULES:
- Choose the mechanism that BEST describes the dominant pattern of the motion.
- If a motion could fit multiple mechanisms, pick the most specific one.
- procedural_technical should be the DEFAULT only if no other mechanism fits better.
- Return ONLY the mechanism key exactly as listed above (e.g., "system_dismantling").

Respond with a JSON object containing:
- "mechanism": the selected mechanism key
- "confidence": 1-5 (1=very uncertain, 5=very certain)
- "reasoning": brief explanation (max 2 sentences)"""
|
|
|
|
|
|
def build_second_classifier_prompt(title: str, body_text: str) -> str:
    """Render the second-classifier prompt for one motion.

    The motion text is ``body_text`` when it is non-empty, otherwise the
    title (or an empty string). Text longer than 1200 characters is cut to
    1200 characters and suffixed with ``"..."``.
    """
    limit = 1200
    if body_text:
        excerpt = body_text
    elif title:
        excerpt = title
    else:
        excerpt = ""

    excerpt = excerpt if len(excerpt) <= limit else excerpt[:limit] + "..."

    fields = {
        "title": title if title else "",
        "body": excerpt,
        "MECHANISM_DEFINITIONS": MECHANISM_DEFINITIONS_EN,
    }
    return SECOND_CLASSIFIER_PROMPT.format(**fields)
|
|
|
|
|
|
# ── LLM call helpers ─────────────────────────────────────────────────────────
|
|
|
|
|
|
def chat_completion_json(
    messages: list[dict[str, str]],
    model: str | None = None,
    retries: int = 3,
) -> dict[str, Any] | None:
    """Call ``chat_completion`` and parse a validated JSON classification.

    Only ``messages[0]["content"]`` is used: it is sent as the user message
    behind a fixed system message that demands a pure-JSON answer.

    Args:
        messages: Chat messages; the first one carries the prompt.
        model: Model name; defaults to ``config.QWEN_MODEL``.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON object when it is a dict whose ``"mechanism"`` value
        is one of ``MECHANISMS``; ``None`` when every attempt failed
        (provider error, undecodable JSON, non-object JSON, or an unknown
        mechanism key).
    """
    model = model or config.QWEN_MODEL
    prompt = messages[0]["content"]
    system_msg = (
        "You are a political science classifier. You classify Dutch parliamentary "
        "motions by their dominant mechanism type. Respond ONLY with valid JSON. "
        "No markdown, no code fences, no preamble — pure JSON object."
    )
    full_messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": prompt},
    ]

    backoff = 0.5
    for attempt in range(1, retries + 1):
        try:
            raw = chat_completion(full_messages, model=model)
        except ProviderError as exc:
            if attempt == retries:
                logger.error("ProviderError on attempt %d: %s", attempt, exc)
                return None
            # Exponential backoff: 0.5s, 1s, 2s, ...
            time.sleep(backoff * (2 ** (attempt - 1)))
            continue

        # Models sometimes wrap the answer in a ```json ... ``` fence despite
        # the instructions; keep only the fenced payload.
        raw = raw.strip()
        if raw.startswith("```"):
            raw = raw.split("```", 2)[1]
            if raw.startswith("json"):
                raw = raw[4:]
            raw = raw.strip()

        try:
            result = json.loads(raw)
        except json.JSONDecodeError:
            logger.warning("JSON decode failed on attempt %d: %s", attempt, raw[:100])
        else:
            # json.loads may legally return a list, string or number; treat
            # anything that is not an object as a failed attempt instead of
            # crashing on ``.get``.
            if not isinstance(result, dict):
                logger.warning(
                    "Expected a JSON object on attempt %d, got %s",
                    attempt,
                    type(result).__name__,
                )
            elif result.get("mechanism") in MECHANISMS:
                return result
            else:
                logger.warning(
                    "Invalid mechanism '%s' on attempt %d", result.get("mechanism"), attempt
                )

        if attempt < retries:
            time.sleep(backoff * (2 ** (attempt - 1)))

    return None
|
|
|
|
|
|
def chat_completion_json_parallel(
    message_batches: list[list[dict[str, str]]],
    model: str | None = None,
    max_workers: int = 5,
) -> list[dict[str, Any] | None]:
    """Classify several prompts concurrently on a thread pool.

    Every entry of ``message_batches`` is the message list of one
    completion and is handed to ``chat_completion_json``. The returned list
    has one item per entry, in input order; an item is ``None`` when that
    completion failed.
    """
    chosen_model = model or config.QWEN_MODEL

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for conversation in message_batches:
            pending.append(pool.submit(chat_completion_json, conversation, model=chosen_model))
        # Collect in submission order so results line up with the inputs.
        return [task.result() for task in pending]
|
|
|
|
|
|
# ── data loading ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
def load_motions(db_path: str, motion_ids: list[int]) -> list[dict[str, Any]]:
    """Load motion data from the database for the given motion IDs.

    Args:
        db_path: Path to the DuckDB database file.
        motion_ids: IDs to look up in ``right_wing_motions``.

    Returns:
        One dict per matching row, ordered by ``motion_id``, with the keys
        ``motion_id``, ``title``, ``body_text``, ``year`` and
        ``centrist_support_strict``. NULL titles and bodies become ``""``.
        IDs without a matching row are simply absent from the result.
    """
    ids = list(motion_ids)
    if not ids:
        # "IN ()" is not valid SQL, and there is nothing to fetch anyway, so
        # do not even open the database.
        return []

    con = duckdb.connect(db_path)
    try:
        # Only the "?" placeholders are interpolated; the IDs themselves are
        # bound as query parameters.
        placeholders = ",".join("?" for _ in ids)
        rows = con.execute(
            f"""
            SELECT r.motion_id, m.title, m.body_text, r.year, r.centrist_support_strict
            FROM right_wing_motions r
            JOIN motions m ON r.motion_id = m.id
            WHERE r.motion_id IN ({placeholders})
            ORDER BY r.motion_id
            """,
            ids,
        ).fetchall()
    finally:
        con.close()

    return [
        {
            "motion_id": r[0],
            "title": r[1] or "",
            "body_text": r[2] or "",
            "year": r[3],
            "centrist_support_strict": r[4],
        }
        for r in rows
    ]
|
|
|
|
|
|
# ── classification ───────────────────────────────────────────────────────────
|
|
|
|
|
|
def _normalize_confidence(value: Any) -> int:
    """Coerce an LLM-reported confidence to an int in 1..5, or 0 if unusable."""
    try:
        score = int(float(value))
    except (TypeError, ValueError, OverflowError):
        return 0
    return score if 1 <= score <= 5 else 0


def classify_motions_second_pass(
    motions: list[dict[str, Any]],
    second_model: str | None = None,
    batch_size: int = 10,
    max_workers: int = 5,
) -> dict[int, dict[str, Any]]:
    """Run the second classifier on all motions.

    Motions are processed in slices of ``batch_size``; each slice is
    classified concurrently with up to ``max_workers`` threads, with a
    0.5 s pause between slices.

    Args:
        motions: Dicts with at least ``motion_id``, ``title``, ``body_text``.
        second_model: Model name; defaults to ``config.QWEN_MODEL``.
        batch_size: Number of motions per slice (must be >= 1).
        max_workers: Thread-pool size per slice.

    Returns:
        ``motion_id -> {"mechanism", "confidence", "reasoning", "error"}``.
        Failed classifications have ``mechanism=None``, ``confidence=0`` and
        ``error="classification failed"``. ``confidence`` is always an int
        (0 when the model reported something unusable).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    second_model = second_model or config.QWEN_MODEL
    results: dict[int, dict[str, Any]] = {}
    total_batches = (len(motions) - 1) // batch_size + 1

    for start in range(0, len(motions), batch_size):
        batch = motions[start : start + batch_size]
        logger.info(
            "Batch %d/%d (%d motions)",
            start // batch_size + 1,
            total_batches,
            len(batch),
        )

        message_batches = [
            [{"role": "user", "content": build_second_classifier_prompt(m["title"], m["body_text"])}]
            for m in batch
        ]

        raw_results = chat_completion_json_parallel(
            message_batches, model=second_model, max_workers=max_workers
        )

        for m, res in zip(batch, raw_results):
            mid = m["motion_id"]
            if res and res.get("mechanism") in MECHANISMS:
                results[mid] = {
                    "mechanism": res["mechanism"],
                    # The model may answer "4", 4.0 or null; downstream code
                    # compares and sums this value, so store a clean int.
                    "confidence": _normalize_confidence(res.get("confidence", 0)),
                    "reasoning": res.get("reasoning", ""),
                    "error": None,
                }
            else:
                results[mid] = {
                    "mechanism": None,
                    "confidence": 0,
                    "reasoning": "",
                    "error": "classification failed",
                }

        # Pause between slices only; sleeping after the last one is wasted time.
        if start + batch_size < len(motions):
            time.sleep(0.5)

    return results
|
|
|
|
|
|
# ── agreement analysis ───────────────────────────────────────────────────────
|
|
|
|
|
|
def compute_cohens_kappa(
    rater1: dict[int, str],
    rater2: dict[int, str],
    categories: list[str],
) -> dict[str, Any]:
    """Compute Cohen's kappa for two raters.

    Only motion IDs present in BOTH raters are compared.

    Args:
        rater1: ``motion_id -> category`` for the first rater.
        rater2: ``motion_id -> category`` for the second rater.
        categories: Categories over which chance agreement is computed.

    Returns:
        A dict with the keys ``kappa``, ``agreement_rate``, ``n``,
        ``agreements``, ``p_o``, ``p_e`` and ``error``. Numeric values are
        rounded to 4 decimals. When there are no common IDs, ``n`` and
        ``agreements`` are 0, the statistics are ``None`` and ``error`` is
        ``"no common motions"``; the key set is the same in both cases.
    """
    common_ids = sorted(set(rater1) & set(rater2))
    n = len(common_ids)
    if n == 0:
        return {
            "kappa": None,
            "agreement_rate": None,
            "n": 0,
            "agreements": 0,
            "p_o": None,
            "p_e": None,
            "error": "no common motions",
        }

    agreements = sum(1 for mid in common_ids if rater1[mid] == rater2[mid])
    p_o = agreements / n

    # Expected (chance) agreement from each rater's marginal distribution.
    counts1 = Counter(rater1[mid] for mid in common_ids)
    counts2 = Counter(rater2[mid] for mid in common_ids)
    p_e = 0.0
    for cat in categories:
        p_e += (counts1[cat] / n) * (counts2[cat] / n)

    # p_e == 1 means both raters used one single identical category, which
    # makes kappa mathematically undefined; report perfect agreement.
    kappa = 1.0 if p_e >= 1.0 else (p_o - p_e) / (1.0 - p_e)

    return {
        "kappa": round(kappa, 4),
        "agreement_rate": round(p_o, 4),
        "n": n,
        "agreements": agreements,
        "p_o": round(p_o, 4),
        "p_e": round(p_e, 4),
        "error": None,
    }
|
|
|
|
|
|
def find_disagreements(
    rater1: dict[int, str],
    rater2: dict[int, str],
) -> list[dict[str, Any]]:
    """List the motions on which the two raters assigned different labels.

    Only IDs rated by both raters are considered. The result is ordered by
    motion ID; each entry holds ``motion_id``, ``original`` (rater1's label)
    and ``second`` (rater2's label).
    """
    shared_ids = sorted(rater1.keys() & rater2.keys())
    return [
        {"motion_id": mid, "original": rater1[mid], "second": rater2[mid]}
        for mid in shared_ids
        if rater1[mid] != rater2[mid]
    ]
|
|
|
|
|
|
def build_confusion_matrix(
    rater1: dict[int, str],
    rater2: dict[int, str],
) -> dict[str, Any]:
    """Cross-tabulate the labels of two raters.

    Returns ``{rater1_label: {rater2_label: count}}`` with one (possibly
    empty) row per entry of ``MECHANISMS``. Only motion IDs rated by both
    raters are counted.
    """
    table: dict[str, Counter[str]] = {}
    for mechanism in MECHANISMS:
        table[mechanism] = Counter()

    for mid in set(rater1) & set(rater2):
        row_label, col_label = rater1[mid], rater2[mid]
        table[row_label][col_label] += 1

    return {row_label: dict(columns) for row_label, columns in table.items()}
|
|
|
|
|
|
# ── resolution ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _numeric_confidence(raw: Any) -> int | float:
    """Return ``raw`` as a number; unusable values (None, text, NaN, inf) become 0."""
    if isinstance(raw, (int, float)):
        return raw
    if isinstance(raw, str):
        try:
            parsed = float(raw.strip())
        except ValueError:
            return 0
        if parsed != parsed or parsed in (float("inf"), float("-inf")):
            return 0
        return int(parsed) if parsed.is_integer() else parsed
    return 0


def resolve_disagreements(
    disagreements: list[dict[str, Any]],
    second_results: dict[int, dict[str, Any]],
    motions: list[dict[str, Any]],
    confidence_threshold: int = 4,
) -> list[dict[str, Any]]:
    """Resolve disagreements by preferring a confident second classification.

    Rule: when the second classifier's confidence is at least
    ``confidence_threshold`` its label wins; otherwise the original label
    is kept.

    Args:
        disagreements: Entries with ``motion_id``, ``original``, ``second``.
        second_results: ``motion_id -> result dict`` of the second
            classifier; only its ``confidence`` is read.
        motions: Motion dicts, used to look up titles.
        confidence_threshold: Minimum confidence for the second label to win.

    Returns:
        One dict per disagreement with ``motion_id``, ``title`` (at most 120
        characters), ``original``, ``second``, ``second_confidence``,
        ``resolved`` and ``winner`` (``"original"`` or ``"second"``).
    """
    motion_map = {m["motion_id"]: m for m in motions}
    resolved = []
    for d in disagreements:
        mid = d["motion_id"]
        # LLM output may carry the confidence as text or null; normalise it
        # so the comparison below cannot raise TypeError.
        confidence = _numeric_confidence(second_results.get(mid, {}).get("confidence", 0))

        if confidence >= confidence_threshold:
            winner, resolved_mech = "second", d["second"]
        else:
            winner, resolved_mech = "original", d["original"]

        title = motion_map.get(mid, {}).get("title") or ""
        resolved.append(
            {
                "motion_id": mid,
                "title": title[:120],
                "original": d["original"],
                "second": d["second"],
                "second_confidence": confidence,
                "resolved": resolved_mech,
                "winner": winner,
            }
        )
    return resolved
|
|
|
|
|
|
def build_validated_classifications(
    original: dict[int, str],
    second: dict[int, str],
    resolutions: list[dict[str, Any]],
) -> dict[int, str]:
    """Return the original classifications with resolved labels applied.

    The result is a new dict with the same keys as ``original``; any motion
    that has an entry in ``resolutions`` takes that entry's ``resolved``
    label. ``second`` is accepted for signature compatibility but not read.
    """
    overrides = {entry["motion_id"]: entry["resolved"] for entry in resolutions}
    return {
        mid: (overrides[mid] if mid in overrides else label)
        for mid, label in original.items()
    }
|
|
|
|
|
|
# ── report generation ────────────────────────────────────────────────────────
|
|
|
|
|
|
def generate_report(
    kappa_result: dict[str, Any],
    disagreements: list[dict[str, Any]],
    resolutions: list[dict[str, Any]],
    confusion: dict[str, Any],
    validated_dist: dict[str, Any],
    second_results: dict[int, dict[str, Any]],
    output_path: str,
) -> None:
    """Write the mechanism validation report as Markdown.

    Args:
        kappa_result: Output of ``compute_cohens_kappa``. ``kappa`` may be
            ``None`` (nothing comparable); statistics are then shown as
            ``n/a`` and no reliability verdict is given.
        disagreements: Output of ``find_disagreements``.
        resolutions: Output of ``resolve_disagreements``.
        confusion: Output of ``build_confusion_matrix``.
        validated_dist: ``mechanism -> count`` of the validated labels.
        second_results: Raw per-motion results of the second classifier.
        output_path: Destination file; parent directories are created.
    """

    def _fmt(value: Any, spec: str = "") -> str:
        # Statistics are None when no motion could be compared.
        return "n/a" if value is None else format(value, spec)

    kappa = kappa_result.get("kappa")
    n_compared = kappa_result.get("n", 0)
    n_agreements = kappa_result.get("agreements", 0)
    agreement_rate = kappa_result.get("agreement_rate")

    classified = [v for v in second_results.values() if v.get("mechanism")]
    n_second_classified = len(classified)
    # Ignore non-numeric confidences rather than failing the whole report.
    confidence_total = sum(
        c for c in (v.get("confidence", 0) for v in classified) if isinstance(c, (int, float))
    )
    avg_confidence = confidence_total / max(n_second_classified, 1)

    lines = [
        "# Mechanism Classification Validation Report",
        "",
        "## 1. Inter-Rater Reliability",
        "",
        f"- **Motions compared:** {n_compared}",
        f"- **Agreements:** {n_agreements} / {n_compared}",
        f"- **Agreement rate:** {_fmt(agreement_rate, '.1%')}",
        f"- **Cohen's kappa (κ):** {_fmt(kappa)}",
        f" - P_o (observed): {_fmt(kappa_result.get('p_o'), '.4f')}",
        f" - P_e (expected): {_fmt(kappa_result.get('p_e'), '.4f')}",
        "",
    ]

    # Verbal interpretation bands for kappa.
    strength = None
    if kappa is not None:
        if kappa < 0.0:
            strength = "Less than chance agreement"
        elif kappa < 0.20:
            strength = "Slight agreement"
        elif kappa < 0.40:
            strength = "Fair agreement"
        elif kappa < 0.60:
            strength = "Moderate agreement"
        elif kappa < 0.80:
            strength = "Substantial agreement"
        else:
            strength = "Almost perfect agreement"
        lines.append(f"**Interpretation:** {strength}")
        lines.append("")

    if kappa is None:
        lines.append(
            "**Cohen's kappa could not be computed:** no motion was classified by both raters."
        )
        lines.append("")
    elif kappa < 0.60:
        lines.append("**The mechanism taxonomy needs revision.** The inter-rater agreement is below 0.6, suggesting the 10-mechanism framework is not being applied consistently across raters. Consider:")
        lines.append("- Simplifying or merging ambiguous mechanism pairs")
        lines.append("- Adding clearer decision rules for borderline cases")
        lines.append("- Reducing the number of mechanisms")
        lines.append("")
    else:
        lines.append("**The mechanism taxonomy appears adequate.** Inter-rater agreement is at or above 0.6, indicating reasonable consistency.")
        lines.append("")

    # NOTE(review): this prints the configured default model, which may
    # differ from the model actually used when one is passed explicitly.
    lines.extend([
        "## 2. Second Classifier Summary",
        "",
        f"- **Model:** {config.QWEN_MODEL}",
        f"- **Motions classified:** {n_second_classified}",
        f"- **Average confidence:** {avg_confidence:.1f}/5",
        "",
    ])

    conf_dist = Counter(v.get("confidence", 0) for v in second_results.values())
    lines.append("### Confidence Distribution")
    lines.append("| Confidence | Count |")
    lines.append("|------------|-------|")
    for level in range(1, 6):
        lines.append(f"| {level} | {conf_dist.get(level, 0)} |")
    lines.append("")

    lines.extend([
        "## 3. Disagreement Table",
        "",
        f"**Total disagreements:** {len(disagreements)} / {n_compared} ({len(disagreements) / max(n_compared, 1) * 100:.1f}%)",
        "",
        "| Motion ID | Title | Original | Second | Confidence | Resolved | Winner |",
        "|-----------|-------|----------|--------|------------|----------|--------|",
    ])

    for r in resolutions:
        orig_label = MECHANISM_LABELS_NL.get(r["original"], r["original"])
        second_label = MECHANISM_LABELS_NL.get(r["second"], r["second"])
        res_label = MECHANISM_LABELS_NL.get(r["resolved"], r["resolved"])
        # A raw "|" or newline in a title would break the Markdown table row.
        title_cell = str(r.get("title") or "")[:80].replace("|", "\\|").replace("\n", " ")
        lines.append(
            f"| {r['motion_id']} | {title_cell} | {orig_label} | {second_label} | {r['second_confidence']} | {res_label} | {r['winner']} |"
        )

    lines.extend([
        "",
        "## 4. Mechanism Distribution Comparison",
        "",
        "| Mechanism | Original Count | Second Count | Validated Count |",
        "|-----------|---------------|--------------|-----------------|",
    ])

    orig_dist = Counter(ORIGINAL_CLASSIFICATIONS.values())
    second_dist = Counter(v["mechanism"] for v in classified)

    for mech in MECHANISMS:
        label = MECHANISM_LABELS_NL.get(mech, mech)
        o_cnt = orig_dist.get(mech, 0)
        s_cnt = second_dist.get(mech, 0)
        v_cnt = validated_dist.get(mech, 0)
        lines.append(f"| {label} | {o_cnt} | {s_cnt} | {v_cnt} |")

    lines.extend([
        "",
        "## 5. Confusion Matrix (Top Rows)",
        "",
        "| Original \\ Second | " + " | ".join(MECHANISM_LABELS_EN[m][:20] for m in MECHANISMS) + " |",
        "|" + "---|" * (len(MECHANISMS) + 1),
    ])

    for mech in MECHANISMS:
        label = MECHANISM_LABELS_EN[mech][:20]
        row_data = confusion.get(mech, {})
        cells = [str(row_data.get(m, 0)) for m in MECHANISMS]
        lines.append(f"| {label} | {' | '.join(cells)} |")

    if strength is None:
        conclusion = (
            "Cohen's kappa could not be computed because no motion was classified by both raters."
        )
    else:
        conclusion = f"Cohen's kappa of **{kappa}** indicates **{strength.lower()}** between the original inline classification and the independent second classifier."

    kept_original = sum(1 for r in resolutions if r["winner"] == "original")
    adopted_second = sum(1 for r in resolutions if r["winner"] == "second")
    lines.extend([
        "",
        "## 6. Conclusion",
        "",
        conclusion,
        "",
        "### Key findings:",
        f"- {n_agreements} out of {n_compared} motions agreed ({_fmt(agreement_rate, '.1%')})",
        f"- {len(disagreements)} disagreements resolved: {kept_original} kept original, {adopted_second} adopted second",
        "",
    ])

    top_disagreement_pairs = Counter(
        f"{d['original']} / {d['second']}" for d in disagreements
    )

    if top_disagreement_pairs:
        lines.append("### Most common disagreement pairs:")
        for pair, cnt in top_disagreement_pairs.most_common(5):
            lines.append(f"- {pair}: {cnt} times")
        lines.append("")

    lines.append("### Revised mechanism taxonomy recommendation:")
    if kappa is None:
        lines.append("- No recommendation possible: inter-rater reliability could not be measured.")
    elif kappa < 0.60:
        lines.append("- Taxonomy needs revision to improve inter-rater reliability.")
        if top_disagreement_pairs:
            top_pair = top_disagreement_pairs.most_common(1)[0][0]
            lines.append(f"- Most confused pair: {top_pair} — consider merging or clarifying distinction.")
    else:
        lines.append("- Taxonomy is sufficiently reliable. Minor clarifications may be helpful for borderline cases.")
    lines.append("")

    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    logger.info("Report written to %s", out_path)
|
|
|
|
|
|
# ── main ─────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def main() -> int:
    """Run the validation pipeline from the command line.

    Steps: load the originally classified motions, classify them again with
    the second classifier, compute agreement statistics, resolve
    disagreements, optionally dump the raw results as JSON, and write the
    Markdown report.

    Returns:
        ``0`` on success, ``1`` when no motion could be compared (for
        example when every second classification failed); in that case no
        report is written.
    """
    parser = argparse.ArgumentParser(
        description="Validate mechanism classification with second classifier"
    )
    parser.add_argument("--db", default="data/motions.db", help="Path to DuckDB database")
    parser.add_argument(
        "--model",
        default=None,
        help=f"Second classifier model (default: {config.QWEN_MODEL})",
    )
    parser.add_argument("--batch-size", type=int, default=10, help="Motions per batch")
    parser.add_argument("--max-workers", type=int, default=3, help="Max parallel workers")
    parser.add_argument(
        "--output",
        default="reports/overton_window/mechanism_validation.md",
        help="Output report path",
    )
    parser.add_argument(
        "--save-results",
        default=None,
        help="Save full second classification results to JSON path",
    )
    args = parser.parse_args()

    second_model = args.model or config.QWEN_MODEL
    logger.info("Second classifier model: %s", second_model)

    motion_ids = list(ORIGINAL_CLASSIFICATIONS.keys())
    logger.info("Loading %d motions from database...", len(motion_ids))

    motions = load_motions(args.db, motion_ids)
    logger.info("Loaded %d motions", len(motions))
    n_missing = len(set(motion_ids) - {m["motion_id"] for m in motions})
    if n_missing:
        # These motions silently drop out of the comparison; make it visible.
        logger.warning("%d classified motions were not found in the database", n_missing)

    logger.info("Running second classifier...")
    second_results = classify_motions_second_pass(
        motions,
        second_model=second_model,
        batch_size=args.batch_size,
        max_workers=args.max_workers,
    )

    # Extract mechanism-only dict for agreement analysis
    second_classifications: dict[int, str] = {
        mid: res["mechanism"]
        for mid, res in second_results.items()
        if res.get("mechanism") and res["mechanism"] in MECHANISMS
    }

    logger.info(
        "Second classifier completed: %d/%d motions classified",
        len(second_classifications),
        len(motions),
    )

    # Filter original to only include motions with second classification
    original_filtered = {
        mid: ORIGINAL_CLASSIFICATIONS[mid]
        for mid in second_classifications
        if mid in ORIGINAL_CLASSIFICATIONS
    }

    # Compute Cohen's kappa
    kappa_result = compute_cohens_kappa(
        original_filtered, second_classifications, MECHANISMS
    )
    logger.info("Cohen's kappa: %s", kappa_result["kappa"])
    logger.info("Agreement rate: %s", kappa_result["agreement_rate"])

    # Find disagreements
    disagreements = find_disagreements(original_filtered, second_classifications)
    logger.info("Disagreements: %d", len(disagreements))

    # Build confusion matrix
    confusion = build_confusion_matrix(original_filtered, second_classifications)

    # Resolve disagreements
    resolutions = resolve_disagreements(disagreements, second_results, motions)

    # Build validated classifications
    validated = build_validated_classifications(
        ORIGINAL_CLASSIFICATIONS, second_classifications, resolutions
    )
    validated_dist = Counter(validated.values())

    # Save results if requested (also on failure: the raw results are what
    # is needed to debug a run in which nothing could be classified).
    if args.save_results:
        save_path = Path(args.save_results)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_data = {
            "kappa": kappa_result["kappa"],
            "agreement_rate": kappa_result["agreement_rate"],
            "n_motions": kappa_result["n"],
            "n_disagreements": len(disagreements),
            "second_results": {
                str(mid): res for mid, res in second_results.items()
            },
            "resolutions": resolutions,
        }
        save_path.write_text(json.dumps(save_data, indent=2, ensure_ascii=False), encoding="utf-8")
        logger.info("Results saved to %s", save_path)

    if kappa_result["kappa"] is None:
        # Without a single comparable motion there are no statistics to
        # report or format; fail clearly instead of crashing further down.
        logger.error("No motion was classified by both raters; no report written")
        return 1

    # Generate report
    generate_report(
        kappa_result=kappa_result,
        disagreements=disagreements,
        resolutions=resolutions,
        confusion=confusion,
        validated_dist=dict(validated_dist),
        second_results=second_results,
        output_path=args.output,
    )

    print(f"\nCohen's kappa: {kappa_result['kappa']}")
    print(f"Agreement rate: {kappa_result['agreement_rate']:.1%}")
    print(f"Disagreements: {len(disagreements)}/{kappa_result['n']}")
    print(f"Report: {args.output}")

    if kappa_result["kappa"] < 0.60:
        print("TAXONOMY NEEDS REVISION: kappa < 0.6 indicates poor reliability")
    else:
        print("TAXONOMY ADEQUATE: kappa >= 0.6 indicates acceptable reliability")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
|