#!/usr/bin/env python3 """Mechanism classification validation with a second classifier. Computes inter-rater reliability (Cohen's kappa) between the original inline classifications and a second LLM-based classification using a different prompt template and (optionally) a different model. Usage: uv run python analysis/right_wing/mechanism_validation.py """ from __future__ import annotations import argparse import json import logging import sys import time from collections import Counter from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any import duckdb ROOT = Path(__file__).parent.parent.parent.resolve() if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) from ai_provider import ProviderError, chat_completion from analysis.config import config logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logger = logging.getLogger(__name__) # ── mechanism taxonomy ─────────────────────────────────────────────────────── MECHANISMS = [ "consensus_framing", "institutional_rule_of_law", "welfare_service_expansion", "procedural_technical", "local_constituency", "coalition_alignment", "symbolic_declaratory", "targeted_restriction", "system_dismantling", "crisis_response", ] MECHANISM_LABELS_NL = { "consensus_framing": "Consensus framing (gedeeld belang)", "institutional_rule_of_law": "Institutioneel/rechtsstatelijk", "welfare_service_expansion": "Welzijn/dienstverlening uitbreiding", "procedural_technical": "Procedureel/technisch", "local_constituency": "Lokaal/regionaal", "coalition_alignment": "Coalitie-afstemming", "symbolic_declaratory": "Symbolisch/declaratoir", "targeted_restriction": "Gerichte restrictie", "system_dismantling": "Systeemontmanteling", "crisis_response": "Crisisrespons", } MECHANISM_LABELS_EN = { "consensus_framing": "Consensus framing / shared interest", "institutional_rule_of_law": "Institutional / rule of law", "welfare_service_expansion": "Welfare / service expansion", "procedural_technical": "Procedural / technical", "local_constituency": "Local / regional constituency", "coalition_alignment": "Coalition alignment", "symbolic_declaratory": "Symbolic / declaratory", "targeted_restriction": "Targeted restriction", "system_dismantling": "System dismantling", "crisis_response": "Crisis response", } # Original inline classifications (from mechanism_classification.py) ORIGINAL_CLASSIFICATIONS: dict[int, str] = { 15458: "crisis_response", 26477: "institutional_rule_of_law", 9149: "consensus_framing", 17099: "procedural_technical", 4933: "procedural_technical", 17751: "consensus_framing", 20068: "procedural_technical", 16520: "consensus_framing", 17036: "welfare_service_expansion", 17681: "consensus_framing", 14554: "procedural_technical", 21864: "procedural_technical", 26493: "targeted_restriction", 21982: "consensus_framing", 14125: "crisis_response", 13683: "welfare_service_expansion", 16691: "procedural_technical", 15005: "procedural_technical", 17536: "institutional_rule_of_law", 16999: "consensus_framing", 8325: "procedural_technical", 13370: "welfare_service_expansion", 18030: "procedural_technical", 11382: "procedural_technical", 18616: "procedural_technical", 12411: "crisis_response", 22595: "crisis_response", 15772: "system_dismantling", 7111: "welfare_service_expansion", 25784: "targeted_restriction", 27731: "system_dismantling", 15626: "crisis_response", 20215: "welfare_service_expansion", 16430: "symbolic_declaratory", 25982: "local_constituency", 17176: "targeted_restriction", 7054: "procedural_technical", 20323: "procedural_technical", 18025: "system_dismantling", 14837: "system_dismantling", 19620: "targeted_restriction", 21801: "consensus_framing", 19464: "crisis_response", 26855: "targeted_restriction", 22280: "local_constituency", 20115: "symbolic_declaratory", 15082: "targeted_restriction", 6637: "targeted_restriction", 18691: "symbolic_declaratory", 18062: "crisis_response", 3784: "procedural_technical", 10205: "procedural_technical", 10278: "coalition_alignment", 25079: "consensus_framing", 2980: "targeted_restriction", 10420: "crisis_response", 25092: "targeted_restriction", 25545: "institutional_rule_of_law", 23065: "procedural_technical", 2878: "welfare_service_expansion", 25573: "procedural_technical", 3298: "symbolic_declaratory", 25061: "consensus_framing", 4481: "consensus_framing", 3961: "procedural_technical", 473: "institutional_rule_of_law", 10413: "consensus_framing", 974: "procedural_technical", 24009: "procedural_technical", 9789: "institutional_rule_of_law", 24651: "targeted_restriction", 1890: "local_constituency", 1191: "consensus_framing", 3448: "targeted_restriction", 23910: "institutional_rule_of_law", 25566: "welfare_service_expansion", 2070: "targeted_restriction", 23885: "consensus_framing", 24906: "procedural_technical", 2496: "procedural_technical", 25582: "targeted_restriction", 3053: "local_constituency", 1495: "procedural_technical", 10178: "procedural_technical", 1614: "procedural_technical", 23441: "consensus_framing", 3569: "consensus_framing", 10285: "procedural_technical", 23058: "procedural_technical", 3287: "procedural_technical", 10434: "consensus_framing", 10089: "procedural_technical", 22706: "consensus_framing", 3877: "institutional_rule_of_law", 25062: "consensus_framing", 3687: "targeted_restriction", 25166: "procedural_technical", 4618: "procedural_technical", 3468: "institutional_rule_of_law", 24632: "institutional_rule_of_law", 25451: "symbolic_declaratory", 2351: "targeted_restriction", 4227: "consensus_framing", 22853: "consensus_framing", 9884: "procedural_technical", 1428: "consensus_framing", 3629: "symbolic_declaratory", 1572: "local_constituency", 25493: "procedural_technical", 1359: "procedural_technical", 2252: "procedural_technical", 23605: "procedural_technical", 3760: "consensus_framing", 1005: "consensus_framing", 10110: "coalition_alignment", 23301: "consensus_framing", 24046: "symbolic_declaratory", 651: "welfare_service_expansion", 1491: "targeted_restriction", 25606: "targeted_restriction", 313: "procedural_technical", 24008: "consensus_framing", 754: "targeted_restriction", 25469: "targeted_restriction", 25091: "targeted_restriction", 2170: "institutional_rule_of_law", 22792: "procedural_technical", 10597: "institutional_rule_of_law", 23013: "institutional_rule_of_law", 3472: "institutional_rule_of_law", 2014: "system_dismantling", 920: "procedural_technical", 2143: "welfare_service_expansion", 688: "system_dismantling", 2290: "system_dismantling", 4497: "targeted_restriction", 3823: "symbolic_declaratory", 23141: "institutional_rule_of_law", 4436: "institutional_rule_of_law", 25616: "targeted_restriction", 2662: "institutional_rule_of_law", 23287: "institutional_rule_of_law", 4660: "consensus_framing", 4761: "targeted_restriction", 2264: "institutional_rule_of_law", 4394: "institutional_rule_of_law", 1691: "targeted_restriction", 10601: "targeted_restriction", 4089: "targeted_restriction", 23206: "procedural_technical", 22676: "institutional_rule_of_law", 115: "system_dismantling", 3951: "consensus_framing", 1375: "targeted_restriction", 3090: "targeted_restriction", 24650: "procedural_technical", 1772: "consensus_framing", 3678: "system_dismantling", 1692: "institutional_rule_of_law", 24077: "symbolic_declaratory", 349: "institutional_rule_of_law", 9769: "targeted_restriction", 4656: "symbolic_declaratory", 23984: "system_dismantling", 2168: "institutional_rule_of_law", 4443: "institutional_rule_of_law", 4489: "procedural_technical", 10290: "targeted_restriction", 4071: "targeted_restriction", 4088: "targeted_restriction", 1507: "system_dismantling", 2870: "procedural_technical", 1912: "system_dismantling", 22658: "symbolic_declaratory", 10288: "targeted_restriction", 4080: "institutional_rule_of_law", 1847: "targeted_restriction", 23127: "system_dismantling", 4367: "targeted_restriction", 9790: "targeted_restriction", 4150: "procedural_technical", 741: "targeted_restriction", 1705: "consensus_framing", 1831: "consensus_framing", 10600: "targeted_restriction", 9767: "targeted_restriction", 3830: "system_dismantling", 4221: "system_dismantling", 3354: "institutional_rule_of_law", 9977: "symbolic_declaratory", 898: "consensus_framing", 24848: "system_dismantling", 756: "targeted_restriction", 24358: "institutional_rule_of_law", 4309: "institutional_rule_of_law", 10167: "local_constituency", 23633: "procedural_technical", 23030: "targeted_restriction", 1959: "system_dismantling", 23454: "procedural_technical", } # ── prompt templates ───────────────────────────────────────────────────────── # Original prompt (from mechanism_classification.py — inline subagent) # Classifications were done by reading full title + body_text. # The second classifier uses a DIFFERENT template: # - English wording (not Dutch) # - Mechanisms presented in DIFFERENT order (reverse alphabetical) # - Asks for RANKING (top 3) instead of single pick # - Includes definition context for each mechanism MECHANISMS_SHUFLLED = list(reversed(MECHANISMS)) MECHANISM_DEFINITIONS_EN = """1. crisis_response — A temporary, emergency measure responding to an acute event (pandemic, natural disaster, sudden crisis). Reactive and time-limited. 2. system_dismantling — Aims to dismantle, abolish, or fundamentally restructure an existing policy, institution, or regulatory framework. Not reform but abolition/reversal. 3. targeted_restriction — Imposes specific restrictions on a defined group, behavior, or activity. Narrow scope, punitive or exclusionary intent. 4. symbolic_declaratory — Primarily sends a political signal, makes a statement, or takes a position without direct policy impact. Declaratory, symbolic, expressive. 5. procedural_technical — Technical adjustment, budget amendment, implementation detail, or administrative procedure. Bureaucratic, operational, non-ideological. 6. local_constituency — Serves a specific local/regional interest, constituency, or geographic area. NIMBY or local-advocacy pattern. 7. coalition_alignment — Reflects coalition politics: budget compromises, package deals, or alignments between coalition partners. Coalition-maintenance. 8. welfare_service_expansion — Expands government services, social welfare, public goods, or citizen entitlements. Positive provision, not restriction. 9. institutional_rule_of_law — Concerns legal frameworks, rule of law, institutional integrity, judicial process, or constitutional matters. Rule-based, institutional. 10. consensus_framing — Frames the motion as serving a broad, shared interest. Appeals to common ground, national interest, or bipartisan consensus. Inclusive, bridge-building, non-polarizing.""" SECOND_CLASSIFIER_PROMPT = """Classify the following Dutch parliamentary motion according to the mechanism taxonomy below. MOTION TITLE: {title} MOTION TEXT: {body} TASK: Identify the PRIMARY mechanism this motion uses. Select exactly ONE mechanism from the list below. Base your decision on what the motion actually DOES (action-oriented) rather than what it merely TALKS about. MECHANISM TAXONOMY (read carefully before choosing): {MECHANISM_DEFINITIONS} IMPORTANT RULES: - Choose the mechanism that BEST describes the dominant pattern of the motion. - If a motion could fit multiple mechanisms, pick the most specific one. - procedural_technical should be the DEFAULT only if no other mechanism fits better. - Return ONLY the mechanism key exactly as listed above (e.g., "system_dismantling"). Respond with a JSON object containing: - "mechanism": the selected mechanism key - "confidence": 1-5 (1=very uncertain, 5=very certain) - "reasoning": brief explanation (max 2 sentences)""" def build_second_classifier_prompt(title: str, body_text: str) -> str: text = body_text or title or "" if len(text) > 1200: text = text[:1200] + "..." return SECOND_CLASSIFIER_PROMPT.format( title=title or "", body=text, MECHANISM_DEFINITIONS=MECHANISM_DEFINITIONS_EN ) # ── LLM call helpers ───────────────────────────────────────────────────────── def chat_completion_json( messages: list[dict[str, str]], model: str | None = None, retries: int = 3, ) -> dict[str, Any] | None: """Call chat_completion and parse JSON response with retries.""" model = model or config.QWEN_MODEL prompt = messages[0]["content"] system_msg = ( "You are a political science classifier. You classify Dutch parliamentary " "motions by their dominant mechanism type. Respond ONLY with valid JSON. " "No markdown, no code fences, no preamble — pure JSON object." ) full_messages = [ {"role": "system", "content": system_msg}, {"role": "user", "content": prompt}, ] backoff = 0.5 for attempt in range(1, retries + 1): try: raw = chat_completion(full_messages, model=model) except ProviderError as exc: if attempt == retries: logger.error("ProviderError on attempt %d: %s", attempt, exc) return None time.sleep(backoff * (2 ** (attempt - 1))) continue raw = raw.strip() if raw.startswith("```"): raw = raw.split("```", 2)[1] if raw.startswith("json"): raw = raw[4:] raw = raw.strip() try: result = json.loads(raw) if "mechanism" in result and result["mechanism"] in MECHANISMS: return result logger.warning( "Invalid mechanism '%s' on attempt %d", result.get("mechanism"), attempt ) except json.JSONDecodeError: logger.warning("JSON decode failed on attempt %d: %s", attempt, raw[:100]) if attempt < retries: time.sleep(backoff * (2 ** (attempt - 1))) return None def chat_completion_json_parallel( message_batches: list[list[dict[str, str]]], model: str | None = None, max_workers: int = 5, ) -> list[dict[str, Any] | None]: """ Run multiple chat completions in parallel using ThreadPoolExecutor. Each element in message_batches is a list of messages for one completion. Returns a list of parsed JSON dicts (or None for failures), same order. """ model = model or config.QWEN_MODEL def _fetch_one(messages: list[dict[str, str]]) -> dict[str, Any] | None: return chat_completion_json(messages, model=model) with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [executor.submit(_fetch_one, batch) for batch in message_batches] return [f.result() for f in futures] # ── data loading ───────────────────────────────────────────────────────────── def load_motions(db_path: str, motion_ids: list[int]) -> list[dict[str, Any]]: """Load motion data from the database for the given motion IDs.""" con = duckdb.connect(db_path) try: placeholders = ",".join("?" for _ in motion_ids) rows = con.execute( f""" SELECT r.motion_id, m.title, m.body_text, r.year, r.centrist_support_strict FROM right_wing_motions r JOIN motions m ON r.motion_id = m.id WHERE r.motion_id IN ({placeholders}) ORDER BY r.motion_id """, motion_ids, ).fetchall() return [ { "motion_id": r[0], "title": r[1] or "", "body_text": r[2] or "", "year": r[3], "centrist_support_strict": r[4], } for r in rows ] finally: con.close() # ── classification ─────────────────────────────────────────────────────────── def classify_motions_second_pass( motions: list[dict[str, Any]], second_model: str | None = None, batch_size: int = 10, max_workers: int = 5, ) -> dict[int, dict[str, Any]]: """Run second classifier on all motions, return motion_id -> result dict.""" second_model = second_model or config.QWEN_MODEL results: dict[int, dict[str, Any]] = {} for i in range(0, len(motions), batch_size): batch = motions[i : i + batch_size] logger.info( "Batch %d/%d (%d motions)", i // batch_size + 1, (len(motions) - 1) // batch_size + 1, len(batch), ) message_batches = [] for m in batch: prompt = build_second_classifier_prompt(m["title"], m["body_text"]) message_batches.append([{"role": "user", "content": prompt}]) raw_results = chat_completion_json_parallel( message_batches, model=second_model, max_workers=max_workers ) for m, res in zip(batch, raw_results): mid = m["motion_id"] if res and res.get("mechanism") in MECHANISMS: results[mid] = { "mechanism": res["mechanism"], "confidence": res.get("confidence", 0), "reasoning": res.get("reasoning", ""), "error": None, } else: results[mid] = { "mechanism": None, "confidence": 0, "reasoning": "", "error": "classification failed", } time.sleep(0.5) return results # ── agreement analysis ─────────────────────────────────────────────────────── def compute_cohens_kappa( rater1: dict[int, str], rater2: dict[int, str], categories: list[str], ) -> dict[str, Any]: """Compute Cohen's kappa for two raters. Uses only motion_ids present in BOTH raters. """ common_ids = sorted(set(rater1) & set(rater2)) n = len(common_ids) if n == 0: return {"kappa": None, "agreement_rate": None, "n": 0, "error": "no common motions"} agreements = 0 for mid in common_ids: if rater1[mid] == rater2[mid]: agreements += 1 p_o = agreements / n # Expected agreement p_e = 0.0 for cat in categories: p1 = sum(1 for mid in common_ids if rater1[mid] == cat) / n p2 = sum(1 for mid in common_ids if rater2[mid] == cat) / n p_e += p1 * p2 if p_e >= 1.0: kappa = 1.0 else: kappa = (p_o - p_e) / (1.0 - p_e) if p_e < 1.0 else 0.0 return { "kappa": round(kappa, 4), "agreement_rate": round(p_o, 4), "n": n, "agreements": agreements, "p_o": round(p_o, 4), "p_e": round(p_e, 4), "error": None, } def find_disagreements( rater1: dict[int, str], rater2: dict[int, str], ) -> list[dict[str, Any]]: """Find all disagreements between two raters.""" common_ids = sorted(set(rater1) & set(rater2)) disagreements = [] for mid in common_ids: c1 = rater1[mid] c2 = rater2[mid] if c1 != c2: disagreements.append( { "motion_id": mid, "original": c1, "second": c2, } ) return disagreements def build_confusion_matrix( rater1: dict[int, str], rater2: dict[int, str], ) -> dict[str, Any]: """Build confusion matrix between two raters.""" common_ids = set(rater1) & set(rater2) matrix: dict[str, Counter[str]] = {m: Counter() for m in MECHANISMS} for mid in common_ids: c1 = rater1[mid] c2 = rater2[mid] matrix[c1][c2] += 1 return {k: dict(v) for k, v in matrix.items()} # ── resolution ─────────────────────────────────────────────────────────────── def resolve_disagreements( disagreements: list[dict[str, Any]], second_results: dict[int, dict[str, Any]], motions: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Resolve disagreements by preferring higher-confidence classification.""" motion_map = {m["motion_id"]: m for m in motions} resolved = [] for d in disagreements: mid = d["motion_id"] sr = second_results.get(mid, {}) confidence = sr.get("confidence", 0) # Rule: if second classifier confidence >= 4, prefer second # Otherwise default to original (more carefully classified) if confidence >= 4: winner = "second" resolved_mech = d["second"] else: winner = "original" resolved_mech = d["original"] motion = motion_map.get(mid, {}) resolved.append( { "motion_id": mid, "title": motion.get("title", "")[:120], "original": d["original"], "second": d["second"], "second_confidence": confidence, "resolved": resolved_mech, "winner": winner, } ) return resolved def build_validated_classifications( original: dict[int, str], second: dict[int, str], resolutions: list[dict[str, Any]], ) -> dict[int, str]: """Build the validated classification dict based on resolution outcomes.""" resolution_map = {r["motion_id"]: r["resolved"] for r in resolutions} validated = dict(original) for mid in validated: if mid in resolution_map: validated[mid] = resolution_map[mid] return validated # ── report generation ──────────────────────────────────────────────────────── def generate_report( kappa_result: dict[str, Any], disagreements: list[dict[str, Any]], resolutions: list[dict[str, Any]], confusion: dict[str, Any], validated_dist: dict[str, Any], second_results: dict[int, dict[str, Any]], output_path: str, ) -> None: """Generate mechanism validation markdown report.""" n_second_classified = sum(1 for v in second_results.values() if v.get("mechanism")) avg_confidence = ( sum(v.get("confidence", 0) for v in second_results.values() if v.get("mechanism")) / max(n_second_classified, 1) ) lines = [ "# Mechanism Classification Validation Report", "", "## 1. Inter-Rater Reliability", "", f"- **Motions compared:** {kappa_result['n']}", f"- **Agreements:** {kappa_result['agreements']} / {kappa_result['n']}", f"- **Agreement rate:** {kappa_result['agreement_rate']:.1%}", f"- **Cohen's kappa (κ):** {kappa_result['kappa']}", f" - P_o (observed): {kappa_result['p_o']:.4f}", f" - P_e (expected): {kappa_result['p_e']:.4f}", "", ] kappa = kappa_result["kappa"] if kappa is not None: if kappa < 0.0: strength = "Less than chance agreement" elif kappa < 0.20: strength = "Slight agreement" elif kappa < 0.40: strength = "Fair agreement" elif kappa < 0.60: strength = "Moderate agreement" elif kappa < 0.80: strength = "Substantial agreement" else: strength = "Almost perfect agreement" lines.append(f"**Interpretation:** {strength}") lines.append("") if kappa is not None and kappa < 0.60: lines.append("**The mechanism taxonomy needs revision.** The inter-rater agreement is below 0.6, suggesting the 10-mechanism framework is not being applied consistently across raters. Consider:") lines.append("- Simplifying or merging ambiguous mechanism pairs") lines.append("- Adding clearer decision rules for borderline cases") lines.append("- Reducing the number of mechanisms") lines.append("") elif kappa is not None: lines.append("**The mechanism taxonomy appears adequate.** Inter-rater agreement is at or above 0.6, indicating reasonable consistency.") lines.append("") lines.extend([ "## 2. Second Classifier Summary", "", f"- **Model:** {config.QWEN_MODEL}", f"- **Motions classified:** {n_second_classified}", f"- **Average confidence:** {avg_confidence:.1f}/5", "", ]) conf_dist = Counter() for v in second_results.values(): conf_dist[v.get("confidence", 0)] += 1 lines.append("### Confidence Distribution") lines.append("| Confidence | Count |") lines.append("|------------|-------|") for level in range(1, 6): lines.append(f"| {level} | {conf_dist.get(level, 0)} |") lines.append("") lines.extend([ "## 3. Disagreement Table", "", f"**Total disagreements:** {len(disagreements)} / {kappa_result['n']} ({len(disagreements) / max(kappa_result['n'], 1) * 100:.1f}%)", "", "| Motion ID | Title | Original | Second | Confidence | Resolved | Winner |", "|-----------|-------|----------|--------|------------|----------|--------|", ]) for r in resolutions: orig_label = MECHANISM_LABELS_NL.get(r["original"], r["original"]) second_label = MECHANISM_LABELS_NL.get(r["second"], r["second"]) res_label = MECHANISM_LABELS_NL.get(r["resolved"], r["resolved"]) lines.append( f"| {r['motion_id']} | {r['title'][:80]} | {orig_label} | {second_label} | {r['second_confidence']} | {res_label} | {r['winner']} |" ) lines.extend([ "", "## 4. Mechanism Distribution Comparison", "", "| Mechanism | Original Count | Second Count | Validated Count |", "|-----------|---------------|--------------|-----------------|", ]) orig_dist = Counter(ORIGINAL_CLASSIFICATIONS.values()) second_dist = Counter() for v in second_results.values(): m = v.get("mechanism") if m: second_dist[m] += 1 for mech in MECHANISMS: label = MECHANISM_LABELS_NL.get(mech, mech) o_cnt = orig_dist.get(mech, 0) s_cnt = second_dist.get(mech, 0) v_cnt = validated_dist.get(mech, 0) lines.append(f"| {label} | {o_cnt} | {s_cnt} | {v_cnt} |") lines.extend([ "", "## 5. Confusion Matrix (Top Rows)", "", "| Original \\ Second | " + " | ".join(MECHANISM_LABELS_EN[m][:20] for m in MECHANISMS) + " |", "|" + "---|" * (len(MECHANISMS) + 1), ]) for mech in MECHANISMS: label = MECHANISM_LABELS_EN[mech][:20] row_data = confusion.get(mech, {}) cells = [str(row_data.get(m, 0)) for m in MECHANISMS] lines.append(f"| {label} | {' | '.join(cells)} |") lines.extend([ "", "## 6. Conclusion", "", f"Cohen's kappa of **{kappa}** indicates **{strength.lower()}** between the original inline classification and the independent second classifier.", "", "### Key findings:", f"- {kappa_result['agreements']} out of {kappa_result['n']} motions agreed ({kappa_result['agreement_rate']:.1%})", f"- {len(disagreements)} disagreements resolved: {sum(1 for r in resolutions if r['winner'] == 'original')} kept original, {sum(1 for r in resolutions if r['winner'] == 'second')} adopted second", "", ]) top_disagreement_pairs = Counter() for d in disagreements: pair = f"{d['original']} / {d['second']}" top_disagreement_pairs[pair] += 1 if top_disagreement_pairs: lines.append("### Most common disagreement pairs:") for pair, cnt in top_disagreement_pairs.most_common(5): lines.append(f"- {pair}: {cnt} times") lines.append("") lines.append("### Revised mechanism taxonomy recommendation:") if kappa is not None and kappa < 0.60: lines.append("- Taxonomy needs revision to improve inter-rater reliability.") if top_disagreement_pairs: top_pair = top_disagreement_pairs.most_common(1)[0][0] lines.append(f"- Most confused pair: {top_pair} — consider merging or clarifying distinction.") else: lines.append("- Taxonomy is sufficiently reliable. Minor clarifications may be helpful for borderline cases.") lines.append("") out_path = Path(output_path) out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text("\n".join(lines) + "\n", encoding="utf-8") logger.info("Report written to %s", out_path) # ── main ───────────────────────────────────────────────────────────────────── def main() -> int: parser = argparse.ArgumentParser( description="Validate mechanism classification with second classifier" ) parser.add_argument("--db", default="data/motions.db", help="Path to DuckDB database") parser.add_argument( "--model", default=None, help=f"Second classifier model (default: {config.QWEN_MODEL})", ) parser.add_argument("--batch-size", type=int, default=10, help="Motions per batch") parser.add_argument("--max-workers", type=int, default=3, help="Max parallel workers") parser.add_argument( "--output", default="reports/overton_window/mechanism_validation.md", help="Output report path", ) parser.add_argument( "--save-results", default=None, help="Save full second classification results to JSON path", ) args = parser.parse_args() second_model = args.model or config.QWEN_MODEL logger.info("Second classifier model: %s", second_model) motion_ids = list(ORIGINAL_CLASSIFICATIONS.keys()) logger.info("Loading %d motions from database...", len(motion_ids)) motions = load_motions(args.db, motion_ids) logger.info("Loaded %d motions", len(motions)) logger.info("Running second classifier...") second_results = classify_motions_second_pass( motions, second_model=second_model, batch_size=args.batch_size, max_workers=args.max_workers, ) # Extract mechanism-only dict for agreement analysis second_classifications: dict[int, str] = {} for mid, res in second_results.items(): if res.get("mechanism") and res["mechanism"] in MECHANISMS: second_classifications[mid] = res["mechanism"] n_second_classified = len(second_classifications) logger.info( "Second classifier completed: %d/%d motions classified", n_second_classified, len(motions), ) # Filter original to only include motions with second classification original_filtered = { mid: ORIGINAL_CLASSIFICATIONS[mid] for mid in second_classifications if mid in ORIGINAL_CLASSIFICATIONS } # Compute Cohen's kappa kappa_result = compute_cohens_kappa( original_filtered, second_classifications, MECHANISMS ) logger.info("Cohen's kappa: %s", kappa_result["kappa"]) logger.info("Agreement rate: %s", kappa_result["agreement_rate"]) # Find disagreements disagreements = find_disagreements(original_filtered, second_classifications) logger.info("Disagreements: %d", len(disagreements)) # Build confusion matrix confusion = build_confusion_matrix(original_filtered, second_classifications) # Resolve disagreements resolutions = resolve_disagreements(disagreements, second_results, motions) # Build validated classifications validated = build_validated_classifications( ORIGINAL_CLASSIFICATIONS, second_classifications, resolutions ) validated_dist = Counter(validated.values()) # Save results if requested if args.save_results: save_path = Path(args.save_results) save_path.parent.mkdir(parents=True, exist_ok=True) save_data = { "kappa": kappa_result["kappa"], "agreement_rate": kappa_result["agreement_rate"], "n_motions": kappa_result["n"], "n_disagreements": len(disagreements), "second_results": { str(mid): res for mid, res in second_results.items() }, "resolutions": resolutions, } save_path.write_text(json.dumps(save_data, indent=2, ensure_ascii=False), encoding="utf-8") logger.info("Results saved to %s", save_path) # Generate report generate_report( kappa_result=kappa_result, disagreements=disagreements, resolutions=resolutions, confusion=confusion, validated_dist=dict(validated_dist), second_results=second_results, output_path=args.output, ) print(f"\nCohen's kappa: {kappa_result['kappa']}") print(f"Agreement rate: {kappa_result['agreement_rate']:.1%}") print(f"Disagreements: {len(disagreements)}/{kappa_result['n']}") print(f"Report: {args.output}") if kappa_result["kappa"] is not None: if kappa_result["kappa"] < 0.60: print("TAXONOMY NEEDS REVISION: kappa < 0.6 indicates poor reliability") else: print("TAXONOMY ADEQUATE: kappa >= 0.6 indicates acceptable reliability") return 0 if __name__ == "__main__": raise SystemExit(main())