From 2c60f41f295a17146cd08ea9346c675d27d1e1d9 Mon Sep 17 00:00:00 2001 From: Sven Geboers Date: Fri, 1 May 2026 12:22:55 +0200 Subject: [PATCH] cleanup: archive stale scripts and delete orphaned generate_extra_charts Archives 8 one-off/backfill/research scripts to scripts/archive/: - compare_svd_exclude_parties.py (diagnostic) - compute_test_batch.py (test utility) - fill_mp_votes_parties.py (backfill) - generate_compass.py (generates to deleted outputs/) - inspect_axis.py (diagnostic) - qa_similarity.py (QA script, references deleted thoughts/ledgers/) - recompute_svd.py (one-off recompute) - semantic_gravity_examples.py (research) Deletes: - generate_extra_charts.py (0 references, generates to deleted outputs/) - tests/test_qa_similarity.py (test for archived script) Adds: - scripts/archive/README.md explaining archive purpose - docs/plans/2026-05-01-001-scripts-audit-cleanup-plan.md --- ...26-05-01-001-scripts-audit-cleanup-plan.md | 137 ++++++++++++++ scripts/archive/README.md | 7 + .../compare_svd_exclude_parties.py | 0 scripts/{ => archive}/compute_test_batch.py | 0 .../{ => archive}/fill_mp_votes_parties.py | 0 scripts/{ => archive}/generate_compass.py | 0 scripts/{ => archive}/inspect_axis.py | 0 scripts/{ => archive}/qa_similarity.py | 0 scripts/{ => archive}/recompute_svd.py | 0 .../semantic_gravity_examples.py | 0 scripts/generate_extra_charts.py | 172 ------------------ tests/test_qa_similarity.py | 51 ------ 12 files changed, 144 insertions(+), 223 deletions(-) create mode 100644 docs/plans/2026-05-01-001-scripts-audit-cleanup-plan.md create mode 100644 scripts/archive/README.md rename scripts/{ => archive}/compare_svd_exclude_parties.py (100%) rename scripts/{ => archive}/compute_test_batch.py (100%) rename scripts/{ => archive}/fill_mp_votes_parties.py (100%) rename scripts/{ => archive}/generate_compass.py (100%) rename scripts/{ => archive}/inspect_axis.py (100%) rename scripts/{ => archive}/qa_similarity.py (100%) rename scripts/{ => 
archive}/recompute_svd.py (100%) rename scripts/{ => archive}/semantic_gravity_examples.py (100%) delete mode 100644 scripts/generate_extra_charts.py delete mode 100644 tests/test_qa_similarity.py diff --git a/docs/plans/2026-05-01-001-scripts-audit-cleanup-plan.md b/docs/plans/2026-05-01-001-scripts-audit-cleanup-plan.md new file mode 100644 index 0000000..103b20d --- /dev/null +++ b/docs/plans/2026-05-01-001-scripts-audit-cleanup-plan.md @@ -0,0 +1,137 @@ +--- +title: Scripts Directory Audit and Cleanup Plan +type: refactor +status: active +date: 2026-05-01 +--- + +# Scripts Directory Audit and Cleanup Plan + +## Overview + +The `scripts/` directory contains 20 Python files (~4,900 lines total). Many are one-off diagnostics, research utilities, or data backfill scripts from early pipeline development. Several are no longer needed, some generate outputs to now-deleted directories, and a few have overlapping functionality. This plan establishes a clear taxonomy and cleanup path. + +--- + +## Current Inventory + +| Script | Lines | Last Commit | References | Status | +|--------|-------|-------------|------------|--------| +| `download_past_year.py` | 295 | 2026-04-30 | 11 | **Keep** — Active data ingestion | +| `health_check.py` | 98 | 2026-05-01 | 21 | **Keep** — Active health check CLI | +| `validate_svd_themes.py` | 343 | 2026-04-30 | 13 | **Keep** — Active validation | +| `generate_svd_json.py` | 594 | 2026-04-13 | 12 | **Keep** — Generates `thoughts/explorer/top_svd_top_motions.json` | +| `motion_drift.py` | 1,207 | 2026-04-05 | 42 | **Keep** — Referenced in active plans | +| `sync_motion_content.py` | 704 | 2026-03-23 | 8 | **Keep** — Content enrichment pipeline | +| `rerun_embeddings.py` | 233 | 2026-03-23 | 15 | **Keep** — Embedding rebuild utility | +| `derive_svd_labels.py` | 423 | 2026-04-13 | 5 | **Keep** — SVD label derivation | +| `diagnose_trajectories_cli.py` | 234 | 2026-03-31 | 5 | **Keep** — Diagnostic utility | +| `svd_diagnostics.py` | 214 | 
2026-03-22 | 9 | **Keep** — SVD diagnostics | +| `recompute_svd.py` | 172 | 2026-04-16 | 2 | **Archive** — One-off recompute | +| `semantic_gravity_examples.py` | 286 | 2026-04-05 | 6 | **Archive** — Research script | +| `qa_similarity.py` | 150 | 2026-03-23 | 4 | **Archive** — QA script (references deleted `thoughts/ledgers/`) | +| `fill_mp_votes_parties.py` | 277 | 2026-03-22 | 2 | **Archive** — Backfill script | +| `inspect_axis.py` | 137 | 2026-03-22 | 3 | **Archive** — Diagnostic | +| `compare_svd_exclude_parties.py` | 204 | 2026-03-22 | 1 | **Archive** — Diagnostic | +| `generate_compass.py` | 157 | 2026-03-22 | 2 | **Archive** — Generates to deleted `outputs/` | +| `compute_test_batch.py` | 128 | 2026-03-20 | 3 | **Archive** — Test batch | +| `generate_extra_charts.py` | 172 | 2026-03-22 | 0 | **Delete** — Generates to deleted `outputs/`, 0 references | + +--- + +## Categorization Rules + +### Keep (10 scripts) +Scripts that are: +- Imported or invoked by active code/tests +- Referenced in active plans (docs/plans/) +- Run regularly as part of pipeline or diagnostics +- Updated recently (April 2026+) + +### Archive (8 scripts) +Scripts that are: +- One-off diagnostics or backfill utilities +- Research/exploration scripts with no active plan references +- Superseded by pipeline code but kept for historical reference +- Generating outputs to `outputs/` (deleted) or `thoughts/ledgers/` (deleted) + +**Archive location:** `scripts/archive/` — not imported, not tested, preserved for reference. + +### Delete (1 script) +Scripts that are: +- Completely orphaned (0 references) +- Superseded with no unique value +- Generating outputs to non-existent directories + +--- + +## Implementation Units + +- [ ] U1. **Create `scripts/archive/` directory** + - Files: `scripts/archive/` (new directory) + - Verification: Directory exists + +- [ ] U2. 
**Move archive scripts to `scripts/archive/`** + - Files to move: + - `scripts/recompute_svd.py` + - `scripts/semantic_gravity_examples.py` + - `scripts/qa_similarity.py` + - `scripts/fill_mp_votes_parties.py` + - `scripts/inspect_axis.py` + - `scripts/compare_svd_exclude_parties.py` + - `scripts/generate_compass.py` + - `scripts/compute_test_batch.py` + - Verification: Scripts are in `scripts/archive/`, not in `scripts/` + +- [ ] U3. **Delete orphaned scripts** + - Files to delete: + - `scripts/generate_extra_charts.py` + - Verification: File no longer exists + +- [ ] U4. **Update `.gitignore` for archive** + - Add: `scripts/archive/` (optional — if we don't want to track archived scripts) + - Or add README in archive explaining purpose + - Verification: Archive is handled appropriately + +- [ ] U5. **Run test suite** + - Command: `uv run pytest tests/ -q` + - Verification: All tests pass, no import errors from moved scripts + +--- + +## Risks + +| Risk | Mitigation | +|------|-----------| +| A test imports an archived script | Check all test imports before moving | +| A plan references an archived script | Plans already checked — none reference archive candidates exclusively | +| Future need for archived script | Git history preserves everything; archive is just convenience | + +--- + +## Post-Cleanup State + +``` +scripts/ +├── archive/ # 8 archived scripts (reference only) +│ ├── compare_svd_exclude_parties.py +│ ├── compute_test_batch.py +│ ├── fill_mp_votes_parties.py +│ ├── generate_compass.py +│ ├── inspect_axis.py +│ ├── qa_similarity.py +│ ├── recompute_svd.py +│ └── semantic_gravity_examples.py +├── download_past_year.py +├── health_check.py +├── derive_svd_labels.py +├── diagnose_trajectories_cli.py +├── generate_svd_json.py +├── motion_drift.py +├── rerun_embeddings.py +├── sync_motion_content.py +├── svd_diagnostics.py +└── validate_svd_themes.py +``` + +**Result:** 10 active scripts + 8 archived. ~1,700 lines removed from active directory. 
diff --git a/scripts/archive/README.md b/scripts/archive/README.md new file mode 100644 index 0000000..d682081 --- /dev/null +++ b/scripts/archive/README.md @@ -0,0 +1,7 @@ +# Archived scripts + +These scripts are preserved for reference but are no longer actively +maintained or run. They include one-off diagnostics, backfill utilities, +and research scripts from early pipeline development. + +Git history preserves everything; this directory is just a convenience. diff --git a/scripts/compare_svd_exclude_parties.py b/scripts/archive/compare_svd_exclude_parties.py similarity index 100% rename from scripts/compare_svd_exclude_parties.py rename to scripts/archive/compare_svd_exclude_parties.py diff --git a/scripts/compute_test_batch.py b/scripts/archive/compute_test_batch.py similarity index 100% rename from scripts/compute_test_batch.py rename to scripts/archive/compute_test_batch.py diff --git a/scripts/fill_mp_votes_parties.py b/scripts/archive/fill_mp_votes_parties.py similarity index 100% rename from scripts/fill_mp_votes_parties.py rename to scripts/archive/fill_mp_votes_parties.py diff --git a/scripts/generate_compass.py b/scripts/archive/generate_compass.py similarity index 100% rename from scripts/generate_compass.py rename to scripts/archive/generate_compass.py diff --git a/scripts/inspect_axis.py b/scripts/archive/inspect_axis.py similarity index 100% rename from scripts/inspect_axis.py rename to scripts/archive/inspect_axis.py diff --git a/scripts/qa_similarity.py b/scripts/archive/qa_similarity.py similarity index 100% rename from scripts/qa_similarity.py rename to scripts/archive/qa_similarity.py diff --git a/scripts/recompute_svd.py b/scripts/archive/recompute_svd.py similarity index 100% rename from scripts/recompute_svd.py rename to scripts/archive/recompute_svd.py diff --git a/scripts/semantic_gravity_examples.py b/scripts/archive/semantic_gravity_examples.py similarity index 100% rename from scripts/semantic_gravity_examples.py rename to 
scripts/archive/semantic_gravity_examples.py diff --git a/scripts/generate_extra_charts.py b/scripts/generate_extra_charts.py deleted file mode 100644 index cc554a7..0000000 --- a/scripts/generate_extra_charts.py +++ /dev/null @@ -1,172 +0,0 @@ -"""Generate additional blog charts: controversy trend + party alignment heatmap.""" - -from __future__ import annotations -import os, sys - -ROOT = os.path.dirname(os.path.abspath(__file__)) -if ROOT not in sys.path: - sys.path.insert(0, ROOT) - -import duckdb -import plotly.graph_objects as go -import plotly.express as px -import numpy as np - -DB = "data/motions.db" -OUT = "outputs/blog-charts" -os.makedirs(OUT, exist_ok=True) - -con = duckdb.connect(DB, read_only=True) - -# ─── 1. Controversy trend (bar chart, 2019-2026, quarterly) ────────────────── -rows = con.execute(""" - SELECT - YEAR(date) || '-Q' || QUARTER(date) as wid, - YEAR(date) as yr, - QUARTER(date) as q, - COUNT(*) as n, - ROUND(AVG(controversy_score), 3) as avg_c, - COUNT(*) FILTER (WHERE controversy_score >= 0.7) as high_c - FROM motions - WHERE controversy_score IS NOT NULL - AND date >= '2019-01-01' AND date < '2026-04-01' - GROUP BY wid, yr, q - ORDER BY yr, q -""").fetchall() - -windows = [r[0] for r in rows] -avg_c = [r[4] for r in rows] -high_pct = [round(100.0 * r[5] / r[3], 1) if r[3] else 0 for r in rows] - -fig = go.Figure() -fig.add_trace( - go.Bar( - x=windows, - y=high_pct, - name="% highly contested (score ≥ 0.7)", - marker_color="#00d9a3", - opacity=0.85, - ) -) -fig.add_trace( - go.Scatter( - x=windows, - y=[v * 100 for v in avg_c], - name="avg controversy × 100", - mode="lines+markers", - line=dict(color="#e6edf3", width=2), - marker=dict(size=4), - ) -) -fig.update_layout( - title="Political controversy per quarter (Tweede Kamer, 2019–2026)", - xaxis_title="Quarter", - yaxis_title="% of motions", - plot_bgcolor="#161b22", - paper_bgcolor="#0d1117", - font=dict(color="#e6edf3", family="Inter, system-ui"), - 
legend=dict(bgcolor="rgba(0,0,0,0)", bordercolor="#30363d", borderwidth=1), - xaxis=dict(tickangle=-45, gridcolor="#30363d"), - yaxis=dict(gridcolor="#30363d", range=[0, 55]), - bargap=0.15, -) -out1 = os.path.join(OUT, "controversy_trend.html") -fig.write_html(out1, include_plotlyjs="cdn", full_html=True) -print(f"Wrote {out1}") - -# ─── 2. Party alignment heatmap ────────────────────────────────────────────── -# Only include major parties with sufficient data -MAJOR = [ - "VVD", - "PVV", - "D66", - "CDA", - "PvdA", - "GroenLinks", - "SP", - "ChristenUnie", - "SGP", - "FVD", - "BBB", - "PvdD", - "Volt", - "GroenLinks-PvdA", - "Nieuw Sociaal Contract", - "DENK", - "JA21", -] - -rows = con.execute(""" - WITH pv AS ( - SELECT motion_id, party, - CASE - WHEN SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) THEN 'voor' - WHEN SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) THEN 'tegen' - ELSE 'split' - END as pv - FROM mp_votes WHERE party IS NOT NULL AND vote IN ('voor','tegen') - GROUP BY motion_id, party - ), - d AS (SELECT * FROM pv WHERE pv != 'split') - SELECT a.party, b.party, - COUNT(*) as shared, - ROUND(100.0 * SUM(CASE WHEN a.pv = b.pv THEN 1 ELSE 0 END) / COUNT(*), 1) as pct - FROM d a JOIN d b ON a.motion_id = b.motion_id AND a.party != b.party - GROUP BY a.party, b.party - HAVING COUNT(*) >= 100 -""").fetchall() - -# Build matrix -agree = {} -for a, b, _, pct in rows: - agree[(a, b)] = pct - -# Filter to parties that have data -present = set() -for a, b in agree: - if a in MAJOR: - present.add(a) - if b in MAJOR: - present.add(b) -parties = [p for p in MAJOR if p in present] - -n = len(parties) -matrix = np.full((n, n), np.nan) -for i, a in enumerate(parties): - matrix[i, i] = 100.0 - for j, b in enumerate(parties): - if i != j and (a, b) in agree: - matrix[i, j] = agree[(a, b)] - -fig2 = go.Figure( - data=go.Heatmap( - z=matrix, - x=parties, - y=parties, - 
colorscale=[[0, "#6e40c9"], [0.5, "#30363d"], [1, "#00d9a3"]], - zmid=70, - zmin=35, - zmax=100, - text=[[f"{v:.0f}%" if not np.isnan(v) else "" for v in row] for row in matrix], - texttemplate="%{text}", - textfont=dict(size=9), - hoverongaps=False, - showscale=True, - colorbar=dict(title="Agreement %", tickfont=dict(color="#e6edf3")), - ) -) -fig2.update_layout( - title="Cross-party vote alignment (all years combined)", - plot_bgcolor="#161b22", - paper_bgcolor="#0d1117", - font=dict(color="#e6edf3", family="Inter, system-ui", size=11), - xaxis=dict(tickangle=-45, side="bottom", gridcolor="#30363d"), - yaxis=dict(autorange="reversed", gridcolor="#30363d"), - height=600, -) -out2 = os.path.join(OUT, "party_alignment.html") -fig2.write_html(out2, include_plotlyjs="cdn", full_html=True) -print(f"Wrote {out2}") - -con.close() -print("Done.") diff --git a/tests/test_qa_similarity.py b/tests/test_qa_similarity.py deleted file mode 100644 index 7c8d614..0000000 --- a/tests/test_qa_similarity.py +++ /dev/null @@ -1,51 +0,0 @@ -import json -from pathlib import Path - - -def test_qa_similarity_creates_ledger(tmp_path, monkeypatch): - # Prepare monkeypatched database.db - class DummyDB: - def sample_motions(self, sample_size): - assert sample_size == 2 - return [1, 2] - - def get_cached_similarities(self, motion_id, top_k): - # return deterministic neighbors - return [ - {"id": motion_id * 10 + i, "score": 1.0 - i * 0.1} for i in range(top_k) - ] - - dummy = DummyDB() - - # Monkeypatch the database module to provide .db — use monkeypatch.setitem - # so the override is active for this test and auto-reverts after. 
- import types - - fake_db_module = types.SimpleNamespace(db=dummy) - - import sys - - monkeypatch.setitem(sys.modules, "database", fake_db_module) - - # Ensure thoughts/ledgers inside tmp_path - base = tmp_path - (base / "thoughts" / "ledgers").mkdir(parents=True) - - # Monkeypatch cwd so ledger writes to tmp_path/thoughts - monkeypatch.chdir(base) - - from scripts.qa_similarity import main - - summary = main(db_path=":memory:", sample_size=2, top_k=3) - - assert summary["sample_size"] == 2 - assert summary["top_k"] == 3 - assert 1 in summary["motions"] - assert 2 in summary["motions"] - - ledger_path = Path(summary["ledger_path"]) - assert ledger_path.exists() - - data = json.loads(ledger_path.read_text(encoding="utf-8")) - assert "motions" in data - assert len(data["motions"]) == 2