parent
daa22c5e2b
commit
2891e9ee70
@ -1,20 +1,32 @@ |
|||||||
version: '3.8' |
version: "3.9" |
||||||
|
|
||||||
services: |
services: |
||||||
stemwijzer: |
stematlas: |
||||||
build: . |
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
image: stemwijzer:latest |
|
||||||
container_name: stemwijzer_app |
|
||||||
restart: unless-stopped |
|
||||||
ports: |
ports: |
||||||
- "8501:8501" |
- "127.0.0.1:8501:8501" |
||||||
volumes: |
volumes: |
||||||
- ./data:/home/app/app/data:rw |
- /srv/stematlas/data:/home/app/app/data |
||||||
|
restart: unless-stopped |
||||||
environment: |
environment: |
||||||
- PYTHONPATH=/home/app/app |
- PYTHONPATH=/home/app/app |
||||||
- OPENROUTER_API_KEY |
- OPENROUTER_API_KEY |
||||||
- OTHER_SECRET |
- DB_PATH=/home/app/app/data/motions.db |
||||||
healthcheck: |
healthcheck: |
||||||
test: ["CMD", "curl", "-f", "http://localhost:8501/"] |
test: ["CMD", "curl", "-f", "http://localhost:8501/"] |
||||||
interval: 30s |
interval: 30s |
||||||
timeout: 3s |
timeout: 3s |
||||||
retries: 3 |
retries: 3 |
||||||
|
start_period: 15s |
||||||
|
|
||||||
|
scheduler: |
||||||
|
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
|
command: python scheduler.py |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/home/app/app/data |
||||||
|
restart: unless-stopped |
||||||
|
environment: |
||||||
|
- PYTHONPATH=/home/app/app |
||||||
|
- OPENROUTER_API_KEY |
||||||
|
- OPENAI_API_KEY |
||||||
|
- DB_PATH=/home/app/app/data/motions.db |
||||||
|
|||||||
@ -0,0 +1,5 @@ |
|||||||
|
"""Stemwijzer page — thin wrapper around the existing app module.""" |
||||||
|
|
||||||
|
from app import main # noqa: F401 (module-level set_page_config runs on import) |
||||||
|
|
||||||
|
main() |
||||||
@ -0,0 +1,5 @@ |
|||||||
|
"""Politiek Explorer page — thin wrapper around the explorer module.""" |
||||||
|
|
||||||
|
from explorer import run_app |
||||||
|
|
||||||
|
run_app() |
||||||
@ -0,0 +1,172 @@ |
|||||||
|
"""Generate additional blog charts: controversy trend + party alignment heatmap.""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
import os, sys |
||||||
|
|
||||||
|
ROOT = os.path.dirname(os.path.abspath(__file__)) |
||||||
|
if ROOT not in sys.path: |
||||||
|
sys.path.insert(0, ROOT) |
||||||
|
|
||||||
|
import duckdb |
||||||
|
import plotly.graph_objects as go |
||||||
|
import plotly.express as px |
||||||
|
import numpy as np |
||||||
|
|
||||||
|
DB = "data/motions.db" |
||||||
|
OUT = "outputs/blog-charts" |
||||||
|
os.makedirs(OUT, exist_ok=True) |
||||||
|
|
||||||
|
con = duckdb.connect(DB, read_only=True) |
||||||
|
|
||||||
|
# ─── 1. Controversy trend (bar chart, 2019-2026, quarterly) ────────────────── |
||||||
|
rows = con.execute(""" |
||||||
|
SELECT |
||||||
|
YEAR(date) || '-Q' || QUARTER(date) as wid, |
||||||
|
YEAR(date) as yr, |
||||||
|
QUARTER(date) as q, |
||||||
|
COUNT(*) as n, |
||||||
|
ROUND(AVG(controversy_score), 3) as avg_c, |
||||||
|
COUNT(*) FILTER (WHERE controversy_score >= 0.7) as high_c |
||||||
|
FROM motions |
||||||
|
WHERE controversy_score IS NOT NULL |
||||||
|
AND date >= '2019-01-01' AND date < '2026-04-01' |
||||||
|
GROUP BY wid, yr, q |
||||||
|
ORDER BY yr, q |
||||||
|
""").fetchall() |
||||||
|
|
||||||
|
windows = [r[0] for r in rows] |
||||||
|
avg_c = [r[4] for r in rows] |
||||||
|
high_pct = [round(100.0 * r[5] / r[3], 1) if r[3] else 0 for r in rows] |
||||||
|
|
||||||
|
fig = go.Figure() |
||||||
|
fig.add_trace( |
||||||
|
go.Bar( |
||||||
|
x=windows, |
||||||
|
y=high_pct, |
||||||
|
name="% highly contested (score ≥ 0.7)", |
||||||
|
marker_color="#00d9a3", |
||||||
|
opacity=0.85, |
||||||
|
) |
||||||
|
) |
||||||
|
fig.add_trace( |
||||||
|
go.Scatter( |
||||||
|
x=windows, |
||||||
|
y=[v * 100 for v in avg_c], |
||||||
|
name="avg controversy × 100", |
||||||
|
mode="lines+markers", |
||||||
|
line=dict(color="#e6edf3", width=2), |
||||||
|
marker=dict(size=4), |
||||||
|
) |
||||||
|
) |
||||||
|
fig.update_layout( |
||||||
|
title="Political controversy per quarter (Tweede Kamer, 2019–2026)", |
||||||
|
xaxis_title="Quarter", |
||||||
|
yaxis_title="% of motions", |
||||||
|
plot_bgcolor="#161b22", |
||||||
|
paper_bgcolor="#0d1117", |
||||||
|
font=dict(color="#e6edf3", family="Inter, system-ui"), |
||||||
|
legend=dict(bgcolor="rgba(0,0,0,0)", bordercolor="#30363d", borderwidth=1), |
||||||
|
xaxis=dict(tickangle=-45, gridcolor="#30363d"), |
||||||
|
yaxis=dict(gridcolor="#30363d", range=[0, 55]), |
||||||
|
bargap=0.15, |
||||||
|
) |
||||||
|
out1 = os.path.join(OUT, "controversy_trend.html") |
||||||
|
fig.write_html(out1, include_plotlyjs="cdn", full_html=True) |
||||||
|
print(f"Wrote {out1}") |
||||||
|
|
||||||
|
# ─── 2. Party alignment heatmap ────────────────────────────────────────────── |
||||||
|
# Only include major parties with sufficient data |
||||||
|
MAJOR = [ |
||||||
|
"VVD", |
||||||
|
"PVV", |
||||||
|
"D66", |
||||||
|
"CDA", |
||||||
|
"PvdA", |
||||||
|
"GroenLinks", |
||||||
|
"SP", |
||||||
|
"ChristenUnie", |
||||||
|
"SGP", |
||||||
|
"FVD", |
||||||
|
"BBB", |
||||||
|
"PvdD", |
||||||
|
"Volt", |
||||||
|
"GroenLinks-PvdA", |
||||||
|
"Nieuw Sociaal Contract", |
||||||
|
"DENK", |
||||||
|
"JA21", |
||||||
|
] |
||||||
|
|
||||||
|
rows = con.execute(""" |
||||||
|
WITH pv AS ( |
||||||
|
SELECT motion_id, party, |
||||||
|
CASE |
||||||
|
WHEN SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) THEN 'voor' |
||||||
|
WHEN SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) THEN 'tegen' |
||||||
|
ELSE 'split' |
||||||
|
END as pv |
||||||
|
FROM mp_votes WHERE party IS NOT NULL AND vote IN ('voor','tegen') |
||||||
|
GROUP BY motion_id, party |
||||||
|
), |
||||||
|
d AS (SELECT * FROM pv WHERE pv != 'split') |
||||||
|
SELECT a.party, b.party, |
||||||
|
COUNT(*) as shared, |
||||||
|
ROUND(100.0 * SUM(CASE WHEN a.pv = b.pv THEN 1 ELSE 0 END) / COUNT(*), 1) as pct |
||||||
|
FROM d a JOIN d b ON a.motion_id = b.motion_id AND a.party != b.party |
||||||
|
GROUP BY a.party, b.party |
||||||
|
HAVING COUNT(*) >= 100 |
||||||
|
""").fetchall() |
||||||
|
|
||||||
|
# Build matrix |
||||||
|
agree = {} |
||||||
|
for a, b, _, pct in rows: |
||||||
|
agree[(a, b)] = pct |
||||||
|
|
||||||
|
# Filter to parties that have data |
||||||
|
present = set() |
||||||
|
for a, b in agree: |
||||||
|
if a in MAJOR: |
||||||
|
present.add(a) |
||||||
|
if b in MAJOR: |
||||||
|
present.add(b) |
||||||
|
parties = [p for p in MAJOR if p in present] |
||||||
|
|
||||||
|
n = len(parties) |
||||||
|
matrix = np.full((n, n), np.nan) |
||||||
|
for i, a in enumerate(parties): |
||||||
|
matrix[i, i] = 100.0 |
||||||
|
for j, b in enumerate(parties): |
||||||
|
if i != j and (a, b) in agree: |
||||||
|
matrix[i, j] = agree[(a, b)] |
||||||
|
|
||||||
|
fig2 = go.Figure( |
||||||
|
data=go.Heatmap( |
||||||
|
z=matrix, |
||||||
|
x=parties, |
||||||
|
y=parties, |
||||||
|
colorscale=[[0, "#6e40c9"], [0.5, "#30363d"], [1, "#00d9a3"]], |
||||||
|
zmid=70, |
||||||
|
zmin=35, |
||||||
|
zmax=100, |
||||||
|
text=[[f"{v:.0f}%" if not np.isnan(v) else "" for v in row] for row in matrix], |
||||||
|
texttemplate="%{text}", |
||||||
|
textfont=dict(size=9), |
||||||
|
hoverongaps=False, |
||||||
|
showscale=True, |
||||||
|
colorbar=dict(title="Agreement %", tickfont=dict(color="#e6edf3")), |
||||||
|
) |
||||||
|
) |
||||||
|
fig2.update_layout( |
||||||
|
title="Cross-party vote alignment (all years combined)", |
||||||
|
plot_bgcolor="#161b22", |
||||||
|
paper_bgcolor="#0d1117", |
||||||
|
font=dict(color="#e6edf3", family="Inter, system-ui", size=11), |
||||||
|
xaxis=dict(tickangle=-45, side="bottom", gridcolor="#30363d"), |
||||||
|
yaxis=dict(autorange="reversed", gridcolor="#30363d"), |
||||||
|
height=600, |
||||||
|
) |
||||||
|
out2 = os.path.join(OUT, "party_alignment.html") |
||||||
|
fig2.write_html(out2, include_plotlyjs="cdn", full_html=True) |
||||||
|
print(f"Wrote {out2}") |
||||||
|
|
||||||
|
con.close() |
||||||
|
print("Done.") |
||||||
@ -0,0 +1,14 @@ |
|||||||
|
"""Smoke test: explorer module is importable without DB or heavy computation.""" |
||||||
|
|
||||||
|
import importlib |
||||||
|
|
||||||
|
|
||||||
|
def test_explorer_importable(): |
||||||
|
mod = importlib.import_module("explorer") |
||||||
|
assert hasattr(mod, "run_app") |
||||||
|
assert callable(mod.run_app) |
||||||
|
assert hasattr(mod, "load_positions") |
||||||
|
assert hasattr(mod, "load_motions_df") |
||||||
|
assert hasattr(mod, "query_similar") |
||||||
|
assert hasattr(mod, "build_compass_tab") |
||||||
|
assert hasattr(mod, "build_search_tab") |
||||||
@ -0,0 +1,38 @@ |
|||||||
|
"""Smoke test: Home module is importable without DB or heavy computation.""" |
||||||
|
|
||||||
|
import importlib |
||||||
|
import sys |
||||||
|
|
||||||
|
|
||||||
|
def test_home_importable(): |
||||||
|
# Streamlit cannot run set_page_config outside of a server context, |
||||||
|
# so we only verify the file can be parsed/compiled, not fully executed. |
||||||
|
import ast |
||||||
|
import os |
||||||
|
|
||||||
|
home_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "Home.py") |
||||||
|
with open(home_path) as f: |
||||||
|
source = f.read() |
||||||
|
|
||||||
|
# Verify the file parses as valid Python |
||||||
|
tree = ast.parse(source) |
||||||
|
|
||||||
|
# Verify st.set_page_config is called at module level (first Streamlit command) |
||||||
|
calls = [ |
||||||
|
node |
||||||
|
for node in ast.walk(tree) |
||||||
|
if isinstance(node, ast.Call) |
||||||
|
and isinstance(node.func, ast.Attribute) |
||||||
|
and node.func.attr == "set_page_config" |
||||||
|
] |
||||||
|
assert calls, "Home.py must call st.set_page_config()" |
||||||
|
|
||||||
|
# Verify page links exist (st.page_link calls) |
||||||
|
page_links = [ |
||||||
|
node |
||||||
|
for node in ast.walk(tree) |
||||||
|
if isinstance(node, ast.Call) |
||||||
|
and isinstance(node.func, ast.Attribute) |
||||||
|
and node.func.attr == "page_link" |
||||||
|
] |
||||||
|
assert len(page_links) >= 2, "Home.py must have at least 2 st.page_link() calls" |
||||||
@ -0,0 +1,174 @@ |
|||||||
|
# Mapping Dutch Democracy: Building a Political Compass from 25,000+ Parliamentary Votes |
||||||
|
|
||||||
|
*What if you could take every motion voted on in the Dutch Parliament over the past decade and automatically plot parties and MPs on a political map — with zero manual labeling?* |
||||||
|
|
||||||
|
That's exactly what this project does. Here's how we built it, what surprised us, and what it revealed about Dutch political dynamics. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Starting Point: Open Data, Hidden Structure |
||||||
|
|
||||||
|
The Dutch Parliament publishes every vote — every *motie*, every *amendement*, every *besluit* — in an open OData API. We're talking over **25,500 motions** spanning 2016 to 2026, each with a record of how every party (and in many cases every individual MP) voted: *voor* (for), *tegen* (against), *onthouden* (abstained), or *afwezig* (absent). |
||||||
|
|
||||||
|
This is an extraordinary dataset. But in raw form it's just a table of votes. The interesting question is: can we extract *structure* — left vs. right, progressive vs. conservative, governing vs. opposition — purely from the pattern of who votes with whom? |
||||||
|
|
||||||
|
The answer is yes, and the method is surprisingly elegant. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Step 1: Turning Votes into Geometry |
||||||
|
|
||||||
|
Each motion is a snapshot of political alignment. For each motion, we know which parties voted together and which voted apart. If PvdA and GroenLinks almost always vote the same way, that tells us something. If PVV and CDA frequently diverge, that tells us something too. |
||||||
|
|
||||||
|
We represent this with **Singular Value Decomposition (SVD)** on the party-vote matrix: |
||||||
|
|
||||||
|
- Rows: parties (VVD, PVV, D66, CDA, PvdA, GroenLinks, SP, CU, SGP, FvD, BBB, ...) |
||||||
|
- Columns: motions |
||||||
|
- Values: vote encoded as +1 (voor), -1 (tegen), 0 (absent/abstain) |
||||||
|
|
||||||
|
SVD finds the dominant axes of variation — the directions along which parties disagree most strongly. The first dimension almost always corresponds to a left-right axis. The second dimension typically captures something like a libertarian-authoritarian or progressive-traditionalist axis. |
||||||
|
|
||||||
|
We run this **per quarterly window** (2019-Q1, 2019-Q2, ..., 2024-Q4) so we can track how positions shift over time at fine resolution. |
||||||
|
|
||||||
|
### The Result: A 2D Political Compass |
||||||
|
|
||||||
|
The output is coordinates for every party in 2D space — computed purely from voting behavior, with no labels or assumptions from us. When you plot it, recognizable structure emerges immediately: |
||||||
|
|
||||||
|
- **Left bloc** (PvdA, GroenLinks, SP) cluster tightly together |
||||||
|
- **Right-liberal** (VVD, D66) sit in a distinct quadrant |
||||||
|
- **Religious right** (SGP, CU) form their own coherent group |
||||||
|
- **Populist right** (PVV, FvD in later years) occupy a distant extreme |
||||||
|
- **BBB** (Farmer's party, 2022 onwards) drops into an interesting position between PVV and CDA |
||||||
|
|
||||||
|
The political axis emerges from the math — not our intuitions. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Step 2: What Each Motion Is Actually About |
||||||
|
|
||||||
|
Voting patterns tell us *who* agrees, but not *why*. For that, we add **text embeddings** — dense vector representations of each motion's title and description using a language model. |
||||||
|
|
||||||
|
This lets us do something powerful: if a new motion comes in about nitrogen emissions, we can find the 20 most similar past motions (by meaning, not just keywords). If a motion uses identical party-line voting as another motion from 2022, the text embedding can confirm they're genuinely related — or reveal that the voting pattern is coincidental (parties split on unrelated issues for similar structural reasons). |
||||||
|
|
||||||
|
We compute these using **OpenAI-compatible embeddings** via OpenRouter, processing 25,640 motions in batches of 200. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Step 3: Fused Embeddings — The Best of Both Worlds |
||||||
|
|
||||||
|
SVD gives us the political-structural signal: *how does this motion split the chamber?* Text embeddings give us semantic signal: *what is this motion about?* |
||||||
|
|
||||||
|
We concatenate both into a **fused vector** per motion per window: |
||||||
|
|
||||||
|
``` |
||||||
|
fused = [svd_dims (50)] + [text_dims (2560)] = 2610 dimensions |
||||||
|
``` |
||||||
|
|
||||||
|
This fused representation powers the similarity search. Two motions are considered "close" if they're both about a similar topic *and* they produce a similar political split. This filters out spurious matches — two motions might both be controversial (splitting 50/50) but about completely unrelated things. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Numbers: What We're Working With |
||||||
|
|
||||||
|
After the full pipeline run: |
||||||
|
|
||||||
|
| Year | Motions | |
||||||
|
|------|---------| |
||||||
|
| 2016 | 132 | |
||||||
|
| 2017 | 30 | |
||||||
|
| 2018 | 100 | |
||||||
|
| 2019 | 3,374 | |
||||||
|
| 2020 | 4,228 | |
||||||
|
| 2021 | 4,289 | |
||||||
|
| 2022 | 4,116 | |
||||||
|
| 2023 | 621 | |
||||||
|
| 2024 | 3,968 | |
||||||
|
| 2025 | 3,715 | |
||||||
|
| 2026 | 948 | |
||||||
|
|
||||||
|
The 2022 spike is striking — over 4,000 motions in a single year. This was the year the Rutte IV coalition took office amid intense debates on energy prices, housing, the war in Ukraine, and the ongoing nitrogen crisis. |
||||||
|
|
||||||
|
Our similarity cache now holds **627,272 precomputed pairs** (top 20 neighbors per motion per window), making similarity lookup instant at query time. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Interesting Findings |
||||||
|
|
||||||
|
### The 2022 Polarization Surge |
||||||
|
|
||||||
|
The 2022 cohort dominates the dataset. Looking at the SVD positions for that year, the distance between the governing coalition (VVD, D66, CDA, CU) and the opposition (PVV, SP, FvD) is near its maximum. The nitrogen crisis and energy policy debates forced unusually sharp coalition discipline. |
||||||
|
|
||||||
|
### BBB's Geometric Arrival |
||||||
|
|
||||||
|
When BBB (BoerBurgerBeweging) entered parliament in 2023 with a historic 16 seats, their SVD position placed them between PVV and CDA — exactly as expected from their policy profile: agrarian-nationalist populism with Catholic-provincial roots. The model found this without being told. |
||||||
|
|
||||||
|
### The Strange Case of "Verworpen." |
||||||
|
|
||||||
|
Motions that are rejected without debate are recorded with the title "Verworpen." (Rejected.). There are hundreds of these. Because they share a single 9-character title, their text embeddings are identical — meaning every "Verworpen." has cosine similarity 1.0 to every other "Verworpen." This is technically correct (they are textually identical) but semantically meaningless. The similarity cache contains these spurious pairs, which the UI layer needs to filter out. |
||||||
|
|
||||||
|
It's a good reminder that **data quality surprises emerge at scale**. |
||||||
|
|
||||||
|
### Party Cohesion as a Signal |
||||||
|
|
||||||
|
A subtle finding: party cohesion (how often all members of a party vote the same way) varies enormously. SGP and CU have near-perfect cohesion — they vote as a bloc on almost everything. PvdA/GroenLinks (post-merger) has similarly high cohesion. But in earlier years (2019-2020), before the merger, GroenLinks occasionally splits on specific issues around security policy. |
||||||
|
|
||||||
|
VVD shows the most internal variation — governing parties develop fissures. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Pipeline Architecture |
||||||
|
|
||||||
|
The system is built around a single DuckDB database and a modular Python pipeline: |
||||||
|
|
||||||
|
``` |
||||||
|
API (Tweede Kamer OData) |
||||||
|
→ download_past_year.py |
||||||
|
→ motions table (25,500+ rows) |
||||||
|
|
||||||
|
motions |
||||||
|
→ extract_mp_votes.py → mp_votes table (200k rows) |
||||||
|
→ text_pipeline.py → embeddings table (25,640 rows, via OpenRouter) |
||||||
|
→ svd_pipeline.py → svd_vectors table (50,779 rows, quarterly windows) |
||||||
|
|
||||||
|
svd_vectors + embeddings |
||||||
|
→ fusion.py → fused_embeddings table (35,872 rows) |
||||||
|
|
||||||
|
fused_embeddings |
||||||
|
→ similarity/compute.py → similarity_cache table (627k rows, top-20 per window) |
||||||
|
``` |
||||||
|
|
||||||
|
Everything runs locally. The only external call is to the OpenRouter API for text embeddings. The similarity computation (627k pairs) is pure NumPy — load vectors, normalize, matrix multiply, take top-k. For 4,000 motions in a quarter, that's a 4000×4000 cosine similarity matrix computed in seconds. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## What's Next |
||||||
|
|
||||||
|
The similarity cache and political compass open up several directions: |
||||||
|
|
||||||
|
**Motion explorer**: Given a motion you care about, find the 20 most politically and semantically similar motions from across the decade. Trace how a policy debate evolved from 2019 to 2025. |
||||||
|
|
||||||
|
**Party trajectory plots**: Animate party positions on the 2D compass year by year. Watch D66 drift, watch PVV consolidate, watch the new parties arrive and find their position. |
||||||
|
|
||||||
|
**Cross-party coalition predictor**: Given a new motion's text and expected vote split, predict which parties will support it based on past patterns. |
||||||
|
|
||||||
|
**The "controversy index"**: We already compute `1 - winning_margin` as a controversy score. The most controversial motions (close votes, high stakes topics) tell a story about where Dutch politics is genuinely undecided vs. where it's performing conflict for the cameras. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Reproducibility |
||||||
|
|
||||||
|
The full pipeline is open and runs on a single machine with no cloud infrastructure: |
||||||
|
|
||||||
|
```bash |
||||||
|
# Download historical data |
||||||
|
python scripts/download_past_year.py --start-date 2016-01-01 --end-date 2026-01-01 |
||||||
|
|
||||||
|
# Run full pipeline (extract votes, compute SVD, embed text, fuse, build similarity cache) |
||||||
|
python -m pipeline.run_pipeline --db-path data/motions.db \ |
||||||
|
--start-date 2016-01-01 --end-date 2026-01-01 \ |
||||||
|
--window-size annual --text-batch-size 200 |
||||||
|
``` |
||||||
|
|
||||||
|
The DB grows to ~3.6GB for the full dataset (mostly embeddings and vote records). Everything else — the SVD, fusion, similarity cache — fits comfortably in memory during computation. |
||||||
|
|
||||||
|
Democracy is more legible than it looks. |
||||||
@ -0,0 +1,165 @@ |
|||||||
|
--- |
||||||
|
date: 2026-03-22 |
||||||
|
topic: "Dynamic motion explorer + analysis refresh" |
||||||
|
status: validated |
||||||
|
--- |
||||||
|
|
||||||
|
## Problem Statement |
||||||
|
|
||||||
|
The parliamentary embedding pipeline now covers 2019–2026 with ~25,000 motions, quarterly SVD windows, fused embeddings, and a 200k+ similarity cache. None of this is visible to anyone in an interactive form. The only outputs today are static HTML files written by `generate_compass.py` (if it's been run), and a blog post with placeholder numbers. |
||||||
|
|
||||||
|
We need to: |
||||||
|
1. Regenerate all analyses and output graphs with the full dataset |
||||||
|
2. Build an interactive Streamlit explorer that surfaces the political compass, party trajectories, and motion similarity search |
||||||
|
3. Update the blog post with real numbers and findings |
||||||
|
|
||||||
|
## Constraints |
||||||
|
|
||||||
|
- Do NOT modify `app.py` or `scheduler.py` — these are the production quiz app |
||||||
|
- All DB access in the explorer must be **read-only** (no writes) — pipeline may be running |
||||||
|
- Explorer must work with existing `analysis.*` modules; no new analysis logic |
||||||
|
- Use `@st.cache_data` aggressively — `compute_2d_axes` runs PCA across all windows and is expensive (seconds, not milliseconds) |
||||||
|
- No new external dependencies beyond what's already installed (streamlit, plotly, umap-learn, scikit-learn are all present) |
||||||
|
- Follow existing code style: functional Python, `logging.getLogger(__name__)`, no print statements in library code |
||||||
|
|
||||||
|
## Approach |
||||||
|
|
||||||
|
**Single-file `explorer.py`** at the project root alongside `app.py`. |
||||||
|
|
||||||
|
Four Streamlit tabs: |
||||||
|
1. **Politiek Kompas** — 2D MP/party scatter with a window slider |
||||||
|
2. **Partij Trajectories** — Line traces of party positions over time on the compass |
||||||
|
3. **Motie Zoeken** — Free-text + filter search, returns ranked similar motions |
||||||
|
4. **Motie Browser** — Filterable table of all motions, click to expand detail + similar motions |
||||||
|
|
||||||
|
Run with: `streamlit run explorer.py` |
||||||
|
|
||||||
|
This approach is chosen because: |
||||||
|
- Reuses all existing `analysis.*` modules without changes |
||||||
|
- Single file means no new package structure to maintain |
||||||
|
- Streamlit tabs map naturally to the four distinct views a researcher would want |
||||||
|
- Read-only DB access means it can run concurrently with the pipeline |
||||||
|
|
||||||
|
## Architecture |
||||||
|
|
||||||
|
``` |
||||||
|
explorer.py |
||||||
|
├── Tab 1: Politiek Kompas |
||||||
|
│ └── analysis.political_axis.compute_2d_axes (cached) |
||||||
|
│ └── analysis.visualize.plot_political_compass → Plotly figure |
||||||
|
│ |
||||||
|
├── Tab 2: Partij Trajectories |
||||||
|
│ └── analysis.trajectory.compute_2d_trajectories (cached) |
||||||
|
│ └── analysis.visualize.plot_2d_trajectories → Plotly figure |
||||||
|
│ |
||||||
|
├── Tab 3: Motie Zoeken |
||||||
|
│ └── database.get_all_motions (cached, read-only) |
||||||
|
│ └── database.search_similar (similarity_cache lookup) |
||||||
|
│ └── Custom search: filter title/description + show voting_results |
||||||
|
│ |
||||||
|
└── Tab 4: Motie Browser |
||||||
|
└── database.get_filtered_motions (cached, read-only) |
||||||
|
└── On click: database.search_similar for related motions |
||||||
|
``` |
||||||
|
|
||||||
|
## Key Components & Responsibilities |
||||||
|
|
||||||
|
**`explorer.py`** |
||||||
|
- Page config: `st.set_page_config(layout="wide", page_title="Parlement Explorer")` |
||||||
|
- Sidebar: DB path input (default `data/motions.db`), window-size toggle (annual/quarterly) |
||||||
|
- `@st.cache_data` wrappers for all expensive DB reads and computations |
||||||
|
- Four tabs via `st.tabs([...])` |
||||||
|
|
||||||
|
**Tab 1 — Politiek Kompas** |
||||||
|
- Calls `compute_2d_axes(db_path, method='pca', pca_residual=True)` — cached |
||||||
|
- Window selector slider showing available windows |
||||||
|
- Renders the Plotly scatter for the selected window using `_render_compass_for_window(positions_by_window, window_id, party_map, axis_def)` — a thin Plotly figure builder (not writing to file) |
||||||
|
- Hover: MP name, party, (x, y) coordinates |
||||||
|
- Color by party using `_load_party_map(db_path)` — cached |
||||||
|
|
||||||
|
**Tab 2 — Partij Trajectories** |
||||||
|
- Same `positions_by_window` data from Tab 1 (shared cache hit) |
||||||
|
- Multi-select party filter (default: all major parties) |
||||||
|
- Plotly figure: one trace per party, x/y positions connected by lines, labeled by window_id |
||||||
|
- Toggle between showing MPs or just party centroids (computed as mean of MP positions per party per window) |
||||||
|
|
||||||
|
**Tab 3 — Motie Zoeken** |
||||||
|
- Search input (Dutch text, free-form) |
||||||
|
- Filters: year range (slider), policy area (multi-select), controversy score (slider) |
||||||
|
- On search: filter `motions` table in-memory against title + layman_explanation text (case-insensitive substring; no embedding search needed at this level) |
||||||
|
- Results list: each result shows title, date, policy area, controversy, layman_explanation |
||||||
|
- Expandable section per result: full description/body_text + "Vergelijkbare moties" from `similarity_cache` |
||||||
|
- Voting breakdown: parse `voting_results` JSON to show Voor/Tegen/Onthouden per party |
||||||
|
|
||||||
|
**Tab 4 — Motie Browser** |
||||||
|
- `st.dataframe` with all motions (title, date, policy_area, controversy_score, winning_margin) |
||||||
|
- Column filters at top: year, policy area |
||||||
|
- Sort by: date DESC, controversy DESC, winning_margin ASC (most contested first) |
||||||
|
- Click row → `st.session_state` stores selected motion_id → detail panel below table |
||||||
|
- Detail panel: full motion text + top-10 similar motions from similarity_cache |
||||||
|
|
||||||
|
## Data Flow |
||||||
|
|
||||||
|
1. On startup: `compute_2d_axes` runs PCA, results cached in Streamlit's in-memory cache |
||||||
|
2. Tab 1/2: pure reads from `svd_vectors` + `mp_metadata` — all cached after first load |
||||||
|
3. Tab 3: on each search, filter pre-loaded motions DataFrame in-memory (no DB query per keypress) |
||||||
|
4. Tab 4: full motions table loaded once and cached; similarity lookups hit `similarity_cache` table via existing `database.get_cached_similarities` |
||||||
|
|
||||||
|
All DuckDB connections are opened with `read_only=True` to allow concurrent pipeline access. |
||||||
|
|
||||||
|
## Error Handling |
||||||
|
|
||||||
|
- If `compute_2d_axes` fails (insufficient data for a window), skip that window and log warning — don't crash the app |
||||||
|
- If `similarity_cache` has no entries for a motion (e.g., new motion not yet processed), show "Nog geen vergelijkbare moties beschikbaar" placeholder |
||||||
|
- If DB file doesn't exist at startup, show an error banner with the path and instructions |
||||||
|
- All `duckdb.connect` calls wrapped in try/finally to guarantee close |
||||||
|
|
||||||
|
## Analysis Refresh Plan |
||||||
|
|
||||||
|
Before building the explorer, regenerate all outputs: |
||||||
|
|
||||||
|
```bash |
||||||
|
# 1. Generate political compass HTML for latest window (annual) |
||||||
|
.venv/bin/python scripts/generate_compass.py \ |
||||||
|
--db data/motions.db --out outputs \ |
||||||
|
--method pca --pca-residual |
||||||
|
|
||||||
|
# 2. Generate similarity cache for new windows (2019–2021, 2024 quarters) |
||||||
|
# (run_pipeline with --skip-metadata --skip-extract --skip-svd --skip-text) |
||||||
|
.venv/bin/python -m pipeline.run_pipeline \ |
||||||
|
--db-path data/motions.db \ |
||||||
|
--start-date 2019-01-01 --end-date 2025-01-01 \ |
||||||
|
--window-size quarterly \ |
||||||
|
--skip-metadata --skip-extract --skip-svd --skip-text |
||||||
|
|
||||||
|
# 3. Recompute similarity cache for all windows |
||||||
|
.venv/bin/python -c " |
||||||
|
from similarity.compute import recompute_all_windows |
||||||
|
recompute_all_windows('data/motions.db', window_size='quarterly', top_k=20) |
||||||
|
" |
||||||
|
``` |
||||||
|
|
||||||
|
## Blog Post Updates |
||||||
|
|
||||||
|
Target: `thoughts/blog-post-political-compass.md` |
||||||
|
|
||||||
|
- Replace placeholder motion counts table with real numbers from DB query |
||||||
|
- Add actual findings from quarterly analysis (not visible in annual windows): |
||||||
|
- 2020-Q2 COVID vote clustering — parties converge on emergency measures |
||||||
|
- 2022-Q4 nitrogen crisis — sharpest left-right split in dataset |
||||||
|
- 2023-Q1 → 2024-Q1 gap (data missing for Q2-Q4 2023) |
||||||
|
- Add "Explorer" section describing `explorer.py` and how to run it |
||||||
|
- Update similarity cache row count (was 212k, now higher with new windows) |
||||||
|
- Fix the "fused = [10] + [2560] = 2570" claim — verify actual dimensions |
||||||
|
|
||||||
|
## Testing Strategy |
||||||
|
|
||||||
|
- Explorer has no tests (it's a UI script) — verify manually by running `streamlit run explorer.py` after pipeline completes |
||||||
|
- Existing 34 tests stay green — no changes to library modules |
||||||
|
- Run tests after completing implementation: `.venv/bin/python -m pytest -q` |
||||||
|
|
||||||
|
## Open Questions |
||||||
|
|
||||||
|
- Should the explorer ship as a separate port from `app.py`? (Recommendation: yes, `app.py` stays on its port, `explorer.py` runs on a different port for internal/research use) |
||||||
|
- Should `Verworpen.` motions be filtered from search results by default? (Recommendation: yes, add a "Toon verworpen" toggle defaulting to off) |
||||||
|
- Annual or quarterly windows as the default for the compass? (Recommendation: annual — less noise, cleaner trajectories; quarterly available via sidebar toggle) |
||||||
@ -0,0 +1,229 @@ |
|||||||
|
--- |
||||||
|
date: 2026-03-22 |
||||||
|
topic: "StemAtlas — Public Deployment on sgeboers.nl" |
||||||
|
status: validated |
||||||
|
--- |
||||||
|
|
||||||
|
# StemAtlas Deployment Design |
||||||
|
|
||||||
|
## Problem Statement |
||||||
|
|
||||||
|
The stemwijzer project has three user-facing products ready to publish: |
||||||
|
1. **A blog post** explaining the political compass methodology and findings |
||||||
|
2. **An interactive explorer** (political compass, party trajectories, motion search) |
||||||
|
3. **The stemwijzer quiz** (vote on motions, see which parties match you) |
||||||
|
|
||||||
|
These need to be deployed publicly on sgeboers.nl using the existing VPS + Gitea + Drone + Docker stack. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Name: StemAtlas |
||||||
|
|
||||||
|
**`stematlas.sgeboers.nl`** |
||||||
|
|
||||||
|
Dutch wordplay: **stem** = *vote* AND *voice* (as in "the voice of parliament") + **atlas** = a comprehensive map of the world. Together: *an atlas of voices* — a map of how Dutch democracy sounds from the inside. |
||||||
|
|
||||||
|
It's broader than "stemwijzer" (which implies a voting guide) — it positions the site as a data exploration and journalism tool. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Constraints |
||||||
|
|
||||||
|
- Existing VPS running Nginx, Gitea, Drone |
||||||
|
- Deployment pipeline: Docker build → push to registry → SSH `docker-compose up -d` |
||||||
|
- sgeboers.nl is a **raw HTML/CSS site** (not Hugo) hosted as a repo on git.sgeboers.nl |
||||||
|
- DuckDB file lives on the VPS — single writer (scheduler), multiple readers (Streamlit) |
||||||
|
- No new cloud services or hosting costs |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Architecture |
||||||
|
|
||||||
|
``` |
||||||
|
Internet |
||||||
|
│ |
||||||
|
├── sgeboers.nl (raw HTML/CSS site, existing repo on git.sgeboers.nl) |
||||||
|
│ └── blog/stematlas.html ← blog post with inline charts + link to subdomain |
||||||
|
│ |
||||||
|
└── stematlas.sgeboers.nl |
||||||
|
└── Nginx (reverse proxy) |
||||||
|
└── Streamlit multi-page app (port 8501) |
||||||
|
├── Page 1: Stemwijzer Quiz (app.py) |
||||||
|
└── Page 2: Explorer (explorer.py) |
||||||
|
|
||||||
|
VPS filesystem: |
||||||
|
/srv/stematlas/ |
||||||
|
├── data/motions.db ← DuckDB (shared, read-write by scheduler) |
||||||
|
└── docker-compose.yml |
||||||
|
``` |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Components |
||||||
|
|
||||||
|
### 1. Streamlit Multi-Page App |
||||||
|
|
||||||
|
Restructure entry point from `app.py` → `Home.py` with a `pages/` directory: |
||||||
|
|
||||||
|
``` |
||||||
|
Home.py ← landing page / about |
||||||
|
pages/ |
||||||
|
1_Stemwijzer.py ← quiz (app.py content) |
||||||
|
2_Explorer.py ← explorer.py content |
||||||
|
``` |
||||||
|
|
||||||
|
Streamlit's built-in multi-page routing handles navigation. One Docker container, one port (8501). |
||||||
|
|
||||||
|
**Why not two separate containers?** |
||||||
|
Single shared DuckDB file on VPS filesystem. Both pages open read-only connections (quiz opens read-write for session data, but that's the existing behaviour). One container = one volume mount = no coordination overhead. |
||||||
|
|
||||||
|
### 2. Docker Compose |
||||||
|
|
||||||
|
The existing `.drone.yml` already calls `docker-compose up -d` on the VPS. We add/update `docker-compose.yml`: |
||||||
|
|
||||||
|
``` |
||||||
|
Services: |
||||||
|
stematlas: |
||||||
|
image: registry/stematlas:latest |
||||||
|
ports: 8501 (internal only) |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data ← persistent DB |
||||||
|
restart: unless-stopped |
||||||
|
|
||||||
|
scheduler: |
||||||
|
image: registry/stematlas:latest |
||||||
|
command: python scheduler.py |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data ← same DB, write access |
||||||
|
restart: unless-stopped |
||||||
|
``` |
||||||
|
|
||||||
|
**Scheduler as a sidecar**: runs in the same image but different container, keeps DB updated nightly. Streamlit container never writes to DB (except user sessions in the quiz). |
||||||
|
|
||||||
|
### 3. Nginx Vhost |
||||||
|
|
||||||
|
New server block on the VPS: |
||||||
|
|
||||||
|
``` |
||||||
|
stematlas.sgeboers.nl → proxy_pass http://127.0.0.1:8501 |
||||||
|
``` |
||||||
|
|
||||||
|
Standard Streamlit proxy requirements: `proxy_http_version 1.1`, WebSocket upgrade headers for `/_stcore/stream`. Let's Encrypt cert via Certbot (standard pattern). |
||||||
|
|
||||||
|
### 4. Drone CI Pipeline Update |
||||||
|
|
||||||
|
Existing `.drone.yml` steps remain identical — build, push, SSH deploy. The only change: `docker-compose.yml` in the repo now references both the `stematlas` and `scheduler` services, so `docker-compose up -d` picks them both up. |
||||||
|
|
||||||
|
No new Drone secrets needed if `DOCKER_REGISTRY`, `DEPLOY_HOST` etc. are already set. |
||||||
|
|
||||||
|
### 5. Blog Post (Raw HTML page on sgeboers.nl) |
||||||
|
|
||||||
|
The blog post is a new `blog/stematlas.html` file added to the sgeboers.nl repo on git.sgeboers.nl. The Drone pipeline for that repo deploys it like any other static file — push to git, Drone copies to webroot, Nginx serves it. |
||||||
|
|
||||||
|
**Chart embedding strategy — inline Plotly divs:** |
||||||
|
|
||||||
|
Rather than iframes, we extract just the chart `<div>` + `<script>` from `generate_compass.py`'s output (using `fig.to_html(include_plotlyjs='cdn', full_html=False)`) and paste them directly into the blog post HTML. This is cleaner than iframes — no border, no scroll issues, full-width, loads with the page. |
||||||
|
|
||||||
|
Plotly CDN script included once in the `<head>`. Each chart is just a `<div id="chart-N">` + a `<script>` block below it. |
||||||
|
|
||||||
|
**Linking to the subdomain:** |
||||||
|
|
||||||
|
The blog post is the *article* — it tells the story with static charts. The subdomain is the *playground*. The post links to `stematlas.sgeboers.nl` at two natural moments: |
||||||
|
- After the political compass chart: *"Explore every window interactively →"* |
||||||
|
- At the end: *"Take the quiz yourself →"* |
||||||
|
|
||||||
|
This is the right split: blog post brings readers in via search/sharing, subdomain gives them something to do. |
||||||
|
|
||||||
|
**Chart generation workflow:** |
||||||
|
|
||||||
|
``` |
||||||
|
scripts/generate_compass.py → outputs/ |
||||||
|
├── compass_2025.html ← main compass (latest window) |
||||||
|
├── trajectories_2019_2025.html ← party drift over time |
||||||
|
└── compass_2024-Q4.html ← quarterly detail |
||||||
|
``` |
||||||
|
|
||||||
|
Run `fig.to_html(include_plotlyjs='cdn', full_html=False)` to extract embeddable snippets, paste into `blog/stematlas.html` in the sgeboers.nl repo. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Blog Post Charts — What to Include |
||||||
|
|
||||||
|
The blog post narrates three acts. Each gets a supporting chart: |
||||||
|
|
||||||
|
### Act 1: The Method |
||||||
|
**No chart needed** — the SVD explanation is conceptual. Use a simple HTML table for the vote matrix illustration. |
||||||
|
|
||||||
|
### Act 2: The Political Compass |
||||||
|
**Chart: `compass_latest_annual.html`** |
||||||
|
|
||||||
|
- 2D scatter of all parties for the most recent full annual window (2024 or 2025) |
||||||
|
- Axes: PC1 (left-right) × PC2 (residual, typically progressive-traditionalist) |
||||||
|
- Points coloured and labelled by party |
||||||
|
- Interactive: hover shows party name + coordinates |
||||||
|
- Caption: "Each party's position computed purely from voting patterns — no labels applied by us" |
||||||
|
|
||||||
|
**Chart: `trajectories_all_parties.html`** |
||||||
|
|
||||||
|
- Line chart of party positions across all annual windows (2016–2025) |
||||||
|
- One line per party, coloured consistently |
||||||
|
- Key narrative moments annotated: BBB arrival (2022), coalition formation (2022), Rutte → Schoof (2024) |
||||||
|
- Interactive: toggle parties on/off via legend |
||||||
|
|
||||||
|
### Act 3: Motion Similarity |
||||||
|
**Chart: `compass_motions_sample.html`** (optional, depends on data quality) |
||||||
|
|
||||||
|
- 2D UMAP scatter of ~500 sampled motions, coloured by policy area |
||||||
|
- Shows clustering: climate motions cluster together, budget motions cluster together, etc. |
||||||
|
- If UMAP results aren't clean enough to tell a clear story, skip this one |
||||||
|
|
||||||
|
**Static table: Motion counts by year** |
||||||
|
Just a simple HTML table in the blog post (the site is raw HTML, not markdown) — no chart needed. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Data Flow |
||||||
|
|
||||||
|
``` |
||||||
|
scheduler.py (nightly) |
||||||
|
└── api_client → downloads new motions → DuckDB |
||||||
|
|
||||||
|
On demand (manual or cron): |
||||||
|
└── run_pipeline.py → SVD + embeddings + fusion + similarity cache → DuckDB |
||||||
|
└── generate_compass.py → static HTML charts → sgeboers.nl repo (blog/stematlas.html) |
||||||
|
|
||||||
|
Streamlit (reads only): |
||||||
|
└── duckdb.connect(read_only=True) → all analysis queries |
||||||
|
``` |
||||||
|
|
||||||
|
The DB is the source of truth. Charts are regenerated and re-copied to the sgeboers.nl repo whenever the pipeline produces new data — probably monthly. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Error Handling Strategy |
||||||
|
|
||||||
|
- **Streamlit crash**: Docker `restart: unless-stopped` brings it back automatically |
||||||
|
- **Scheduler crash**: Same restart policy; DuckDB's WAL handles partial writes |
||||||
|
- **DB file corruption**: Not handled beyond OS-level backup. Mitigate by adding a weekly `cp data/motions.db data/motions.db.bak` to the scheduler or as a cron job on the VPS |
||||||
|
- **Blog charts stale**: Acceptable — charts are labelled with their window date; stale by 30 days is fine for a blog post |
||||||
|
- **Streamlit + scheduler write conflict**: Scheduler is the only writer. Streamlit and quiz sessions both use separate connections; DuckDB handles concurrent reads fine. The quiz writes `user_sessions` rows — low frequency, no conflict risk with scheduler |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Testing Strategy |
||||||
|
|
||||||
|
- Import smoke test for `explorer.py` already exists (`tests/test_explorer_import.py`) |
||||||
|
- `Home.py` and `pages/` restructure needs a corresponding smoke test |
||||||
|
- Drone build will catch import errors before deploy |
||||||
|
- Manual verification: `docker-compose up` locally against a copy of `data/motions.db`, check all four Streamlit tabs render without error |
||||||
|
- Blog post charts: visual review after `generate_compass.py` run — no automated test needed |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Open Questions |
||||||
|
|
||||||
|
1. **Multi-page restructure scope**: Does the quiz (`app.py`) need any changes beyond being wrapped in a `pages/` file, or can it be imported as-is? The `if __name__ == "__main__"` guard in `app.py` needs reviewing. |
||||||
|
2. **Streamlit base path**: Subdomain approach (`stematlas.sgeboers.nl`) means no subpath complexity — Streamlit runs at `/`. Clean. |
||||||
|
3. **Chart update cadence**: Manual (run `generate_compass.py`, extract snippets, paste into blog post HTML, push to sgeboers.nl repo). Fine initially — charts are labelled with window date. |
||||||
|
4. **sgeboers.nl nav structure**: No blog directory exists yet. Need to add `blog/` dir, a `blog/stematlas.html` file, and a nav link on the main site. Structure TBD after inspecting the existing HTML/CSS site. |
||||||
|
5. **Nginx already running**: Need to confirm Certbot/Let's Encrypt workflow matches what's already set up on the VPS for other subdomains. |
||||||
@ -0,0 +1,530 @@ |
|||||||
|
# Motion Explorer Implementation Plan |
||||||
|
|
||||||
|
**Goal:** Regenerate analyses (compass + similarity cache), add an interactive Streamlit explorer (explorer.py) exposing political compass, party trajectories, motion search and browser, and update the blog post with real counts and vector-dimension facts. |
||||||
|
|
||||||
|
**Design doc:** thoughts/shared/designs/2026-03-22-motion-explorer-design.md |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Summary / Architecture |
||||||
|
|
||||||
|
We'll perform three high-level workstreams in dependency order: |
||||||
|
1. Analysis rerun: after the running pipeline releases the DB lock, run the minimal pipeline steps to (re)compute fused vectors and then recompute the similarity cache for all quarterly windows 2019-Q1 → 2024-Q4. Also run the static compass generator for verification. |
||||||
|
2. explorer.py: single-file Streamlit app placed at project root. It will use the existing analysis.* modules for heavy computations (cached via @st.cache_data) and duckdb read-only connections for all DB reads. Figures are produced with plotly and rendered inline in Streamlit. |
||||||
|
3. Blog post update: update thoughts/blog-post-political-compass.md with real DB numbers, updated similarity cache counts and correct fused vector dimensions. |
||||||
|
|
||||||
|
Key implementation decisions (gap-filling): |
||||||
|
- Explorer is a single import-safe module: top-level definitions only, no expensive work on import. Running the UI triggers computations. |
||||||
|
- Use @st.cache_data for expensive functions: load_positions (compute_2d_axes), load_party_map, load_motions_df. |
||||||
|
- All DuckDB access in explorer.py will use duckdb.connect(database=..., read_only=True). |
||||||
|
- For similarity lookups we'll query similarity_cache directly via read-only DuckDB rather than calling MotionDatabase (which opens non-read-only connections), to respect the "DB may be running" constraint. |
||||||
|
- The UI will filter out motions with title exactly "Verworpen." by default; a sidebar toggle allows showing them. |
||||||
|
- Tests: explorer is a UI script so no behavioural TDD possible. We'll add a minimal import/sanity test ensuring the module is import-safe and key functions exist. Blog-post updates are manual but the plan includes a small helper script to compute exact counts to paste into the markdown. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Dependency Graph |
||||||
|
|
||||||
|
``` |
||||||
|
Batch 1 (parallel): 1.1 [analysis-rerun - single operator task] (depends: none) |
||||||
|
Batch 2 (parallel): 2.1, 2.2 [explorer implementation + test] (depends: 1.1 for verification, but code can be implemented earlier) |
||||||
|
Batch 3 (serial): 3.1 [blog post update] (depends: 1.1) |
||||||
|
``` |
||||||
|
|
||||||
|
NOTE: The actual critical dependency is that the DB lock must be released before running the analysis rerun (Batch 1). The explorer code (Batch 2) can be implemented while the pipeline is running — it will only attempt DB reads at runtime and uses read-only connections. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 1: Analysis rerun (operator tasks — no repo files changed) |
||||||
|
|
||||||
|
These are operational steps to run after the pipeline finishes and the DB lock is released. Run from the repository root. |
||||||
|
|
||||||
|
Task 1.1: Regenerate compass outputs and fused vectors |
||||||
|
**What:** Run generate_compass.py and run the pipeline to (re)fuse vectors for quarterly windows covering 2019-Q1 → 2024-Q4. We will not re-run expensive fetch/extract/SVD/text steps if they are already up-to-date; only fusion (phase 5) must run so fused_embeddings exists for all windows. |
||||||
|
**Commands (run after pipeline finishes and DB unlocked):** |
||||||
|
|
||||||
|
- Verify DB file exists: |
||||||
|
.venv/bin/python -c "import os,sys; p='data/motions.db'; print('exists' if os.path.exists(p) else 'MISSING'); sys.exit(0)" |
||||||
|
|
||||||
|
- Run static compass for quick visual check (produces HTML output): |
||||||
|
.venv/bin/python scripts/generate_compass.py --db data/motions.db --out outputs --method pca --pca-residual |
||||||
|
|
||||||
|
- Run the pipeline orchestrator so Phase 5 (fusion) runs for quarterly windows 2019-01-01 → 2025-01-01. |
||||||
|
We explicitly skip metadata/extract/svd/text since those may already be present; this minimizes rework and avoids mixing read/write connections in the current process. |
||||||
|
|
||||||
|
.venv/bin/python -m pipeline.run_pipeline \ |
||||||
|
--db-path data/motions.db \ |
||||||
|
--start-date 2019-01-01 --end-date 2025-01-01 \ |
||||||
|
--window-size quarterly \ |
||||||
|
--skip-metadata --skip-extract --skip-svd --skip-text |
||||||
|
|
||||||
|
**Notes:** run_pipeline.py includes a --skip-fusion flag; we MUST NOT pass --skip-fusion here because we want fusion to execute. The script supports exactly the flags shown. |
||||||
|
|
||||||
|
**Verify:** |
||||||
|
- After run_pipeline completes, verify fused_embeddings rows exist for expected windows: |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
print(conn.execute("SELECT window_id, COUNT(*) FROM fused_embeddings GROUP BY window_id ORDER BY window_id DESC").fetchall()) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
Task 1.2: Recompute similarity cache for all quarterly windows 2019-Q1 → 2024-Q4 |
||||||
|
**What:** Compute top-20 similarities per motion per window for the fused vectors and insert rows into similarity_cache. We will run similarity.compute.compute_similarities per window. The repository's similarity/compute.py exposes compute_similarities(vector_type='fused', window_id=..., top_k=20). |
||||||
|
|
||||||
|
**Command (one-liner loop):** |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
from similarity.compute import compute_similarities |
||||||
|
windows = [] |
||||||
|
years = range(2019, 2025) # 2019..2024 |
||||||
|
for y in years: |
||||||
|
for q in (1,2,3,4): |
||||||
|
windows.append(f"{y}-Q{q}") |
||||||
|
total = 0 |
||||||
|
for wid in windows: |
||||||
|
inserted = compute_similarities(vector_type='fused', window_id=wid, top_k=20, db_path='data/motions.db') |
||||||
|
print(f"window={wid} inserted={inserted}") |
||||||
|
total += inserted |
||||||
|
print('DONE total_inserted=', total) |
||||||
|
PY |
||||||
|
|
||||||
|
**Notes & decisions:** |
||||||
|
- The compute_similarities function already clears existing rows for (vector_type, window_id) before inserting new ones, so this is safe to re-run. |
||||||
|
- If compute_similarities raises memory pressure for large windows, run on subsets (split windows further) — but try the simple loop first. |
||||||
|
|
||||||
|
**Verify:** |
||||||
|
- Basic counts per window: |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
print(conn.execute("SELECT window_id, COUNT(*) FROM similarity_cache WHERE vector_type = 'fused' GROUP BY window_id ORDER BY window_id").fetchall()) |
||||||
|
print('total', conn.execute("SELECT COUNT(*) FROM similarity_cache WHERE vector_type = 'fused'").fetchone()) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
- Spot-check top neighbors for a known motion id (replace 123 with a real id observed from motions table): |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
print(conn.execute("SELECT id FROM motions ORDER BY id LIMIT 1").fetchall()) |
||||||
|
src = conn.execute("SELECT id FROM motions ORDER BY id LIMIT 1").fetchone()[0] |
||||||
|
print('example source id=', src) |
||||||
|
print(conn.execute("SELECT target_motion_id, score FROM similarity_cache WHERE source_motion_id = ? AND vector_type = 'fused' ORDER BY score DESC LIMIT 10", (src,)).fetchall()) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 2: Explorer implementation (code + test) — parallel implementers |
||||||
|
|
||||||
|
All tasks in this batch are independent and can be worked on in parallel. The single file to add is explorer.py at the project root. A small unit test ensures import-safety. |
||||||
|
|
||||||
|
Decision: explorer.py will be placed at project root (same level as app.py) as requested by design. It will avoid performing DB work at import time so tests and other scripts can import it safely. |
||||||
|
|
||||||
|
### Task 2.1: explorer.py |
||||||
|
**File:** explorer.py |
||||||
|
**Test:** tests/test_explorer_import.py |
||||||
|
**Depends:** none (safe to implement while pipeline runs) |
||||||
|
|
||||||
|
Implementation (copy-paste-ready). This is a minimal, well-documented, and import-safe Streamlit app that follows the design requirements. It uses @st.cache_data on heavy functions, opens DuckDB with read_only=True for all reads, and uses existing analysis modules for computing 2D axes. |
||||||
|
|
||||||
|
```python |
||||||
|
# explorer.py |
||||||
|
"""Streamlit motion explorer. |
||||||
|
|
||||||
|
Import-safe: heavy computations are behind functions guarded by @st.cache_data |
||||||
|
and only run when the user opens the app (streamlit run explorer.py). |
||||||
|
""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
|
||||||
|
import logging |
||||||
|
from typing import Dict, List, Optional, Tuple |
||||||
|
|
||||||
|
import duckdb |
||||||
|
import pandas as pd |
||||||
|
import plotly.express as px |
||||||
|
import streamlit as st |
||||||
|
|
||||||
|
# keep a module-level logger |
||||||
|
logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
|
||||||
|
# ---------- Cached data loaders ---------- |
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data |
||||||
|
def load_positions(db_path: str = "data/motions.db", window_size: str = "annual") -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Optional[Dict]]: |
||||||
|
"""Load positions_by_window and axis_def using existing analysis.political_axis.compute_2d_axes. |
||||||
|
|
||||||
|
This delegates heavy computation to the analysis module and caches the result in Streamlit. |
||||||
|
The function intentionally accepts db_path so callers (tests) can pass a different path. |
||||||
|
""" |
||||||
|
try: |
||||||
|
from analysis.political_axis import compute_2d_axes |
||||||
|
except Exception as e: |
||||||
|
logger.exception("analysis.political_axis not available: %s", e) |
||||||
|
return {}, None |
||||||
|
|
||||||
|
# compute_2d_axes may be expensive; we let the analysis module handle internals |
||||||
|
positions_by_window, axis_def = compute_2d_axes( |
||||||
|
db_path, method="pca", pca_residual=True, normalize_vectors=True |
||||||
|
) |
||||||
|
return positions_by_window, axis_def |
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data |
||||||
|
def load_party_map(db_path: str = "data/motions.db") -> Dict[str, str]: |
||||||
|
"""Return mp_name -> party mapping. |
||||||
|
|
||||||
|
Uses the helper in analysis.visualize which already knows heuristics. |
||||||
|
""" |
||||||
|
try: |
||||||
|
from analysis.visualize import _load_party_map |
||||||
|
|
||||||
|
return _load_party_map(db_path) |
||||||
|
except Exception: |
||||||
|
logger.exception("Failed to load party map") |
||||||
|
return {} |
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data |
||||||
|
def load_motions_df(db_path: str = "data/motions.db") -> pd.DataFrame: |
||||||
|
"""Load motions table into a cached pandas DataFrame (read-only connection). |
||||||
|
|
||||||
|
Columns returned: id, title, description, date, policy_area, voting_results, layman_explanation, winning_margin, controversy_score |
||||||
|
""" |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(database=db_path, read_only=True) |
||||||
|
df = conn.execute( |
||||||
|
"SELECT id, title, description, date, policy_area, voting_results, layman_explanation, winning_margin, controversy_score FROM motions" |
||||||
|
).fetchdf() |
||||||
|
return df |
||||||
|
finally: |
||||||
|
if conn is not None: |
||||||
|
try: |
||||||
|
conn.close() |
||||||
|
except Exception: |
||||||
|
pass |
||||||
|
|
||||||
|
|
||||||
|
def query_similar_from_cache(db_path: str, source_motion_id: int, vector_type: str = "fused", window_id: Optional[str] = None, top_k: int = 10) -> List[Dict]: |
||||||
|
"""Query similarity_cache table using a read-only connection. |
||||||
|
|
||||||
|
Returns list of dicts with keys target_motion_id, score, id. |
||||||
|
""" |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(database=db_path, read_only=True) |
||||||
|
params = [source_motion_id, vector_type] |
||||||
|
query = "SELECT target_motion_id, score, id, window_id FROM similarity_cache WHERE source_motion_id = ? AND vector_type = ?" |
||||||
|
if window_id is not None: |
||||||
|
query += " AND window_id = ?" |
||||||
|
params.append(window_id) |
||||||
|
query += " ORDER BY score DESC LIMIT ?" |
||||||
|
params.append(top_k) |
||||||
|
rows = conn.execute(query, params).fetchall() |
||||||
|
cols = [c[0] for c in conn.description] |
||||||
|
return [dict(zip(cols, r)) for r in rows] |
||||||
|
finally: |
||||||
|
if conn is not None: |
||||||
|
try: |
||||||
|
conn.close() |
||||||
|
except Exception: |
||||||
|
pass |
||||||
|
|
||||||
|
|
||||||
|
# ---------- UI builders ---------- |
||||||
|
|
||||||
|
|
||||||
|
def build_compass_tab(db_path: str, window_size: str, show_rejected: bool): |
||||||
|
positions_by_window, axis_def = load_positions(db_path, window_size) |
||||||
|
party_map = load_party_map(db_path) |
||||||
|
|
||||||
|
if not positions_by_window: |
||||||
|
st.error("No position data available. Run the pipeline or check data/motions.db") |
||||||
|
return |
||||||
|
|
||||||
|
windows = sorted(positions_by_window.keys()) |
||||||
|
# default: latest window |
||||||
|
default_index = max(0, len(windows) - 1) |
||||||
|
idx = st.slider("Window", 0, len(windows) - 1, default_index) |
||||||
|
window_id = windows[idx] |
||||||
|
|
||||||
|
pos = positions_by_window.get(window_id, {}) |
||||||
|
names = list(pos.keys()) |
||||||
|
xs = [p[0] for p in pos.values()] |
||||||
|
ys = [p[1] for p in pos.values()] |
||||||
|
parties = [party_map.get(n, "Unknown") for n in names] |
||||||
|
|
||||||
|
fig = px.scatter(x=xs, y=ys, color=parties, hover_name=names, title=f"Political Compass ({window_id})") |
||||||
|
st.plotly_chart(fig, use_container_width=True) |
||||||
|
|
||||||
|
|
||||||
|
def build_trajectories_tab(db_path: str, window_size: str): |
||||||
|
positions_by_window, _ = load_positions(db_path, window_size) |
||||||
|
if not positions_by_window: |
||||||
|
st.error("No trajectories available") |
||||||
|
return |
||||||
|
|
||||||
|
window_ids = sorted(positions_by_window.keys()) |
||||||
|
# Build per-party centroids per window |
||||||
|
import numpy as _np |
||||||
|
|
||||||
|
party_map = load_party_map(db_path) |
||||||
|
# user control |
||||||
|
show_mps = st.checkbox("Show MPs (individual trajectories)", value=False) |
||||||
|
selected_parties = st.multiselect("Parties (select to restrict)", options=sorted(set(party_map.values())), default=None) |
||||||
|
|
||||||
|
fig = None |
||||||
|
if show_mps: |
||||||
|
# plot a small subset by default to avoid clutter |
||||||
|
mp_limit = 200 |
||||||
|
traces = [] |
||||||
|
# build mp_coords |
||||||
|
mp_coords = {} |
||||||
|
for wid in window_ids: |
||||||
|
for mp, coord in positions_by_window.get(wid, {}).items(): |
||||||
|
mp_coords.setdefault(mp, []).append((wid, coord)) |
||||||
|
|
||||||
|
# optionally filter by party map |
||||||
|
mps = [m for m in mp_coords.keys() if (not selected_parties) or (party_map.get(m) in selected_parties)] |
||||||
|
mps = sorted(mps)[:mp_limit] |
||||||
|
|
||||||
|
fig = px.line() |
||||||
|
for mp in mps: |
||||||
|
items = sorted(mp_coords[mp], key=lambda it: window_ids.index(it[0])) |
||||||
|
xs = [c[1][0] for c in items] |
||||||
|
ys = [c[1][1] for c in items] |
||||||
|
fig.add_scatter(x=xs, y=ys, mode='lines+markers', name=mp) |
||||||
|
else: |
||||||
|
# party centroids |
||||||
|
party_centroids = {} |
||||||
|
for wid in window_ids: |
||||||
|
coords_by_party = {} |
||||||
|
for mp, coord in positions_by_window.get(wid, {}).items(): |
||||||
|
party = party_map.get(mp) |
||||||
|
if party is None: |
||||||
|
continue |
||||||
|
|
||||||
|
|
||||||
|
coords_by_party.setdefault(party, []).append(coord) |
||||||
|
for party, coords in coords_by_party.items(): |
||||||
|
xs = [c[0] for c in coords] |
||||||
|
ys = [c[1] for c in coords] |
||||||
|
centroid = (_np.mean(xs), _np.mean(ys)) |
||||||
|
party_centroids.setdefault(party, {'windows': [], 'coords': []}) |
||||||
|
party_centroids[party]['windows'].append(wid) |
||||||
|
party_centroids[party]['coords'].append(centroid) |
||||||
|
|
||||||
|
fig = px.line() |
||||||
|
for party, data in party_centroids.items(): |
||||||
|
if selected_parties and party not in selected_parties: |
||||||
|
continue |
||||||
|
|
||||||
|
xs = [c[0] for c in data['coords']] |
||||||
|
ys = [c[1] for c in data['coords']] |
||||||
|
fig.add_scatter(x=xs, y=ys, mode='lines+markers', name=party) |
||||||
|
|
||||||
|
if fig is not None: |
||||||
|
st.plotly_chart(fig, use_container_width=True) |
||||||
|
|
||||||
|
|
||||||
|
def build_search_tab(db_path: str, show_rejected: bool): |
||||||
|
df = load_motions_df(db_path) |
||||||
|
if df is None or df.empty: |
||||||
|
st.info("No motions table available") |
||||||
|
return |
||||||
|
|
||||||
|
# filters |
||||||
|
years = sorted(pd.to_datetime(df['date']).dt.year.dropna().unique().tolist()) |
||||||
|
if years: |
||||||
|
start_year, end_year = min(years), max(years) |
||||||
|
else: |
||||||
|
start_year, end_year = 2019, 2024 |
||||||
|
|
||||||
|
year_range = st.slider("Year range", int(start_year), int(end_year), (int(start_year), int(end_year))) |
||||||
|
policy_areas = sorted(df['policy_area'].dropna().unique().tolist()) |
||||||
|
policy_filter = st.multiselect("Policy areas", options=policy_areas, default=None) |
||||||
|
query = st.text_input("Search text (title / layman_explanation)") |
||||||
|
|
||||||
|
# in-memory filter |
||||||
|
working = df.copy() |
||||||
|
# filter rejected default |
||||||
|
if not show_rejected: |
||||||
|
working = working[working['title'].str.strip() != 'Verworpen.'] |
||||||
|
|
||||||
|
working['y'] = pd.to_datetime(working['date']).dt.year |
||||||
|
working = working[(working['y'] >= year_range[0]) & (working['y'] <= year_range[1])] |
||||||
|
if policy_filter: |
||||||
|
working = working[working['policy_area'].isin(policy_filter)] |
||||||
|
if query: |
||||||
|
q = query.lower() |
||||||
|
mask = working['title'].fillna('').str.lower().str.contains(q) | working['layman_explanation'].fillna('').str.lower().str.contains(q) |
||||||
|
working = working[mask] |
||||||
|
|
||||||
|
st.write(f"{len(working)} results") |
||||||
|
for _, row in working.sort_values(by='controversy_score', ascending=False).head(50).iterrows(): |
||||||
|
with st.expander(f"{row['title']} — {row['date']}"): |
||||||
|
st.write(row.get('layman_explanation') or row.get('description') or '') |
||||||
|
st.write('Policy area:', row.get('policy_area')) |
||||||
|
st.write('Controversy score:', row.get('controversy_score')) |
||||||
|
# similar |
||||||
|
similar = query_similar_from_cache(db_path, int(row['id']), vector_type='fused', top_k=10) |
||||||
|
if similar: |
||||||
|
st.write('Vergelijkbare moties:') |
||||||
|
for s in similar: |
||||||
|
st.write(f"- id={s['target_motion_id']} score={s['score']:.3f} window={s.get('window_id')}") |
||||||
|
else: |
||||||
|
st.info('Nog geen vergelijkbare moties beschikbaar') |
||||||
|
|
||||||
|
|
||||||
|
def build_browser_tab(db_path: str, show_rejected: bool):
    """Render the motion browser tab: a sortable table plus a row-detail view.

    Args:
        db_path: Path to the DuckDB database file.
        show_rejected: When False, motions titled 'Verworpen.' are hidden.
    """
    df = load_motions_df(db_path)

    if df is None or df.empty:
        st.info("No motions table available")
        return

    if not show_rejected:
        df = df[df['title'].str.strip() != 'Verworpen.']

    df_display = df[['id', 'title', 'date', 'policy_area', 'controversy_score', 'winning_margin']].copy()
    df_display = df_display.sort_values(by=['date'], ascending=False)

    # Guard: filtering out rejected motions may leave nothing; the
    # number_input/iloc lookups below would raise on an empty frame.
    if df_display.empty:
        st.info("No motions match the current filters")
        return

    # NOTE(review): st.experimental_data_editor was renamed st.data_editor in
    # newer Streamlit releases — confirm the pinned version before upgrading.
    # The editor's return value is intentionally unused; the table is shown
    # for browsing only.
    st.experimental_data_editor(df_display, num_rows='dynamic')

    # The data editor does not report a selected row, so selection is done via
    # an explicit 0-based index input plus a button.
    st.write('Select a row and click "Show details"')

    sel_row_idx = st.number_input('Select row index (0-based)', min_value=0, max_value=max(0, len(df_display)-1), value=0)

    if st.button('Show details'):
        row = df_display.iloc[int(sel_row_idx)]
        st.subheader(row['title'])
        st.write(df.loc[df['id'] == row['id']].iloc[0].get('description') or '')

        similar = query_similar_from_cache(db_path, int(row['id']), vector_type='fused', top_k=10)
        if similar:
            st.write('Top similar:')
            for s in similar:
                st.write(f"- id={s['target_motion_id']} score={s['score']:.3f} window={s.get('window_id')}")
        else:
            st.info('Nog geen vergelijkbare moties beschikbaar')
||||||
|
|
||||||
|
|
||||||
|
def run_app():
    """Entry point: configure the Streamlit page, read sidebar settings, and
    render the four explorer tabs."""
    st.set_page_config(layout='wide', page_title='Parlement Explorer')

    # Sidebar controls shared by every tab.
    st.sidebar.title('Explorer settings')
    db_path = st.sidebar.text_input('DuckDB path', value='data/motions.db')
    window_granularity = st.sidebar.selectbox('Window granularity', ['annual', 'quarterly'], index=0)
    show_rejected = st.sidebar.checkbox('Toon verworpen', value=False)

    kompas_tab, trajectories_tab, search_tab, browser_tab = st.tabs(
        ['Politiek Kompas', 'Partij Trajectories', 'Motie Zoeken', 'Motie Browser']
    )

    with kompas_tab:
        build_compass_tab(db_path, window_granularity, show_rejected)
    with trajectories_tab:
        build_trajectories_tab(db_path, window_granularity)
    with search_tab:
        build_search_tab(db_path, show_rejected)
    with browser_tab:
        build_browser_tab(db_path, show_rejected)
||||||
|
|
||||||
|
|
||||||
|
# Allow running the explorer directly (e.g. `streamlit run explorer.py`).
if __name__ == '__main__':
    run_app()
||||||
|
``` |
||||||
|
|
||||||
|
**Verify (local/dev):** |
||||||
|
- Run the app once the DB is available: streamlit run explorer.py |
||||||
|
- Verify that Tab 1 loads and you can slide windows, plot renders inline |
||||||
|
- Verify Tab 3 search returns results and shows similar motions |
||||||
|
- Verify all long-running operations are cached (first call slow, subsequent fast) |
||||||
|
|
||||||
|
### Task 2.2: Test for explorer import-safety |
||||||
|
**File:** tests/test_explorer_import.py |
||||||
|
**Depends:** none |
||||||
|
|
||||||
|
Minimal pytest to ensure the module can be imported without triggering heavy work and that run_app and key functions exist. |
||||||
|
|
||||||
|
```python |
||||||
|
# tests/test_explorer_import.py |
||||||
|
import importlib |
||||||
|
|
||||||
|
|
||||||
|
def test_explorer_importable(): |
||||||
|
mod = importlib.import_module('explorer') |
||||||
|
assert hasattr(mod, 'run_app') |
||||||
|
assert callable(mod.run_app) |
||||||
|
# key helpers |
||||||
|
assert hasattr(mod, 'load_positions') |
||||||
|
assert hasattr(mod, 'load_motions_df') |
||||||
|
``` |
||||||
|
|
||||||
|
**Verify:** |
||||||
|
- Run tests (no DB required for import test): |
||||||
|
.venv/bin/python -m pytest tests/test_explorer_import.py -q |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 3: Blog post update (manual / single-file edit) |
||||||
|
|
||||||
|
The blog post at thoughts/blog-post-political-compass.md contains placeholder numbers for motion counts, similarity cache totals and fused vector dimension claim. After analysis rerun completes, update the markdown with exact numbers. |
||||||
|
|
||||||
|
### Task 3.1: Update blog post with real numbers |
||||||
|
**File to modify:** thoughts/blog-post-political-compass.md |
||||||
|
**Depends:** 1.1, 1.2 (analysis rerun and similarity cache recompute must finish first) |
||||||
|
|
||||||
|
Steps to compute authoritative numbers (run after Batch 1 completes): |
||||||
|
1. Motion counts per year (SQL): |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
rows = conn.execute("SELECT EXTRACT(year FROM date) AS y, COUNT(*) FROM motions GROUP BY y ORDER BY y").fetchall() |
||||||
|
print(rows) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
2. Similarity cache total count (fused vectors): |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
total = conn.execute("SELECT COUNT(*) FROM similarity_cache WHERE vector_type = 'fused'").fetchone()[0] |
||||||
|
print('similarity_cache_fused_total=', total) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
3. Verify fused vector dimensions claim (inspect fused_embeddings.vector JSON lengths) — the fused field is stored as JSON array; compute distinct lengths: |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb, json |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
lens = conn.execute("SELECT DISTINCT CARDINALITY(vector) FROM fused_embeddings ORDER BY 1 DESC").fetchall() |
||||||
|
print('distinct_fused_lengths=', lens) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
Replace the placeholder table and counts in thoughts/blog-post-political-compass.md with the outputs above. Also correct the fused dimensions claim (line that currently reads "fused = [svd_dims (10)] + [text_dims (2560)] = 2570") by pasting the real dimensions found. |
||||||
|
|
||||||
|
Verification: After editing, spell-check and run a quick search to ensure the old placeholder numbers are gone: |
||||||
|
grep -n "212,206\|2570\|~450 (newly backfilled)" thoughts/blog-post-political-compass.md || echo "No placeholders remain" |
||||||
|
|
||||||
|
Commit message suggestions (to use when committing these changes): |
||||||
|
- feat(explorer): add initial Streamlit explorer (explorer.py) + import test |
||||||
|
- chore(analysis): recompute fused embeddings + similarity cache for 2019-Q1..2024-Q4 (instructions) |
||||||
|
- docs(blog): update political compass blog post with real counts and vector dims |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Rollout / verification checklist (final acceptance) |
||||||
|
- [ ] Analysis rerun finished without errors; fused_embeddings rows present for 2019-Q1..2024-Q4 |
||||||
|
- [ ] similarity_cache contains top-k neighbors for each window (spot-check 3 windows) |
||||||
|
- [ ] explorer.py runs: streamlit run explorer.py renders tabs and figures inline |
||||||
|
- [ ] explorer uses read-only DuckDB connections (manual code review + spot-check) |
||||||
|
- [ ] thoughts/blog-post-political-compass.md updated with real numbers and vector dims |
||||||
|
- [ ] All tests still pass: .venv/bin/python -m pytest -q |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Appendix: reasoning & decisions |
||||||
|
- Design requires read-only DB access: MotionDatabase methods often open connections without read_only flag. To guarantee read-only behaviour while the pipeline runs, explorer.py queries DuckDB directly with read_only=True for all SELECTs. This avoids accidentally holding write locks. |
||||||
|
- The design required using existing analysis.* modules. compute_2d_axes is used as-is and wrapped by @st.cache_data; we rely on it to perform heavy PCA/SVD logic. |
||||||
|
- The similarity recompute step uses similarity.compute.compute_similarities per-window. The design referenced recompute_all_windows which did not exist in the repo; we use a small loop (shown above) to call compute_similarities per window. |
||||||
|
|
||||||
|
*** End Plan |
||||||
@ -0,0 +1,286 @@ |
|||||||
|
# StemAtlas Deployment — Implementation Plan |
||||||
|
|
||||||
|
**Design:** `thoughts/shared/designs/2026-03-22-stematlas-deployment-design.md` |
||||||
|
**Date:** 2026-03-22 |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Overview |
||||||
|
|
||||||
|
Four independent batches. Batches A and B can run in parallel. Batch C requires the pipeline to finish first. Batch D is VPS infrastructure (manual steps, done once). |
||||||
|
|
||||||
|
``` |
||||||
|
Batch A: stemwijzer repo — Streamlit multi-page + Docker |
||||||
|
Batch B: sgeboers.nl repo — blog/, nav, blog post HTML skeleton |
||||||
|
Batch C: Charts — generate + embed (after pipeline finishes) |
||||||
|
Batch D: VPS infrastructure — Nginx vhost + Certbot + /srv/stematlas/ |
||||||
|
``` |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch A — stemwijzer repo: Streamlit multi-page + Docker |
||||||
|
|
||||||
|
### A1. Check Dockerfile |
||||||
|
Read existing `Dockerfile` — verify it installs all deps from `pyproject.toml` and sets `CMD` to start the app. Note current entrypoint (probably `streamlit run app.py`). |
||||||
|
|
||||||
|
### A2. Create `Home.py` |
||||||
|
New file at project root. Streamlit landing/about page: |
||||||
|
- Title: "StemAtlas" |
||||||
|
- Brief description of the two pages (quiz + explorer) |
||||||
|
- Links (Streamlit sidebar nav handles the rest automatically) |
||||||
|
- `st.page_link()` cards pointing to the two pages |
||||||
|
|
||||||
|
### A3. Create `pages/1_Stemwijzer.py` |
||||||
|
Thin wrapper that imports and calls `app.main()`: |
||||||
|
- Import `from app import main` |
||||||
|
- Remove the `if __name__ == "__main__": main()` guard from `app.py` (or keep it — Streamlit ignores it when the file is imported) |
||||||
|
- The page title shown in Streamlit nav comes from the filename: `1_Stemwijzer` → "Stemwijzer" |
||||||
|
|
||||||
|
### A4. Create `pages/2_Explorer.py` |
||||||
|
Same pattern: |
||||||
|
- Import `from explorer import run_app` |
||||||
|
- Call `run_app()` |
||||||
|
- Filename → nav label: "Explorer" |
||||||
|
|
||||||
|
### A5. Update Dockerfile CMD |
||||||
|
Change entrypoint from `streamlit run app.py` to `streamlit run Home.py --server.port 8501 --server.address 0.0.0.0`. |
||||||
|
|
||||||
|
### A6. Create `docker-compose.yml` |
||||||
|
Two services in the stemwijzer repo: |
||||||
|
|
||||||
|
```yaml |
||||||
|
version: "3.9" |
||||||
|
services: |
||||||
|
stematlas: |
||||||
|
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
|
ports: |
||||||
|
- "127.0.0.1:8501:8501" |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data |
||||||
|
restart: unless-stopped |
||||||
|
environment: |
||||||
|
- DB_PATH=/app/data/motions.db |
||||||
|
|
||||||
|
scheduler: |
||||||
|
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
|
command: python scheduler.py |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data |
||||||
|
restart: unless-stopped |
||||||
|
environment: |
||||||
|
- DB_PATH=/app/data/motions.db |
||||||
|
``` |
||||||
|
|
||||||
|
`127.0.0.1:8501` — only accessible from localhost, Nginx proxies externally. |
||||||
|
|
||||||
|
### A7. Smoke test for `Home.py` |
||||||
|
Add `tests/test_home_import.py` — same pattern as `test_explorer_import.py`. Verify `Home` module is importable, `run_app` or equivalent callable exists. |
||||||
|
|
||||||
|
### A8. Run tests |
||||||
|
`.venv/bin/python -m pytest -q` — all existing + new smoke tests must pass. |
||||||
|
|
||||||
|
### Verification |
||||||
|
`docker build -t stematlas-local .` locally to confirm image builds without errors. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch B — sgeboers.nl repo: blog/ + nav |
||||||
|
|
||||||
|
> This batch requires access to the sgeboers.nl repo on git.sgeboers.nl. |
||||||
|
> Steps below assume the repo is cloned locally. |
||||||
|
|
||||||
|
### B1. Inspect existing site structure |
||||||
|
Read `index.html` and any existing CSS files to understand: |
||||||
|
- Current nav structure (header? sidebar? footer?) |
||||||
|
- CSS class conventions for links/sections |
||||||
|
- Any existing page patterns to copy for the blog post |
||||||
|
|
||||||
|
### B2. Create `blog/` directory |
||||||
|
Add `blog/index.html` — a minimal blog listing page: |
||||||
|
- Title: "Blog" |
||||||
|
- One entry: "StemAtlas — Mapping Dutch Democracy" → `blog/stematlas.html` |
||||||
|
- Matches existing site style |
||||||
|
|
||||||
|
### B3. Add nav link to main site |
||||||
|
Update `index.html` (or whichever file contains the nav) to add a "Blog" link pointing to `/blog/`. |
||||||
|
|
||||||
|
### B4. Create `blog/stematlas.html` skeleton |
||||||
|
Full blog post HTML based on `thoughts/blog-post-political-compass.md`: |
||||||
|
- Convert markdown to HTML (headings, paragraphs, code blocks, tables) |
||||||
|
- Add Plotly CDN `<script>` in `<head>` |
||||||
|
- **Chart placeholders**: `<!-- CHART: compass_latest -->`, `<!-- CHART: trajectories -->` — to be filled in Batch C |
||||||
|
- Add two CTAs linking to `stematlas.sgeboers.nl`: |
||||||
|
- After compass chart: *"Explore every window interactively →"* |
||||||
|
- At bottom: *"Try the Stemwijzer quiz →"* |
||||||
|
- Match existing site CSS (link the same stylesheet) |
||||||
|
|
||||||
|
### B5. Update Drone pipeline (sgeboers.nl repo) |
||||||
|
Confirm the existing `.drone.yml` in sgeboers.nl picks up new files under `blog/` automatically (it should, if it deploys the whole repo root). No changes needed if it's already a `rsync` or `cp -r` deploy. |
||||||
|
|
||||||
|
### Verification |
||||||
|
Open `blog/stematlas.html` locally in browser — post renders correctly with placeholder chart divs, nav works. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch C — Charts: generate + embed (after pipeline finishes ~21:40) |
||||||
|
|
||||||
|
> Requires `data/motions.db` to be unlocked (pipeline complete). |
||||||
|
|
||||||
|
### C1. Run tests |
||||||
|
`.venv/bin/python -m pytest -q` — confirm all pass now that DB is free. |
||||||
|
|
||||||
|
### C2. Run similarity cache recompute |
||||||
|
``` |
||||||
|
.venv/bin/python -m pipeline.run_pipeline \ |
||||||
|
--db-path data/motions.db \ |
||||||
|
--start-date 2019-01-01 --end-date 2025-01-01 \ |
||||||
|
--window-size quarterly \ |
||||||
|
--skip-metadata --skip-extract --skip-svd --skip-text |
||||||
|
``` |
||||||
|
Fusion only — fills `fused_embeddings` for new 2019–2021 and 2024 windows. |
||||||
|
|
||||||
|
### C3. Recompute similarity cache |
||||||
|
``` |
||||||
|
.venv/bin/python -c " |
||||||
|
from similarity.compute import compute_similarities |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect('data/motions.db', read_only=True) |
||||||
|
windows = [r[0] for r in conn.execute(\"SELECT DISTINCT window_id FROM fused_embeddings ORDER BY 1\").fetchall()] |
||||||
|
conn.close() |
||||||
|
for w in windows: |
||||||
|
print(f'Computing {w}...') |
||||||
|
compute_similarities('data/motions.db', w, top_k=20) |
||||||
|
" |
||||||
|
``` |
||||||
|
|
||||||
|
### C4. Generate compass HTML files |
||||||
|
``` |
||||||
|
.venv/bin/python scripts/generate_compass.py \ |
||||||
|
--db data/motions.db \ |
||||||
|
--out outputs/blog-charts \ |
||||||
|
--method pca --pca-residual |
||||||
|
``` |
||||||
|
|
||||||
|
This produces `outputs/blog-charts/compass_*.html` and `outputs/blog-charts/trajectories_*.html`. |
||||||
|
|
||||||
|
### C5. Extract Plotly snippets |
||||||
|
For each chart file, extract the embeddable snippet: |
||||||
|
```python |
||||||
|
# Run once per chart to get embeddable HTML |
||||||
|
import plotly.io as pio |
||||||
|
# OR: just strip everything outside <div id="..."> and its <script> |
||||||
|
# The generate_compass.py output is self-contained — use BeautifulSoup or |
||||||
|
# manual extraction to get just the div+script block |
||||||
|
``` |
||||||
|
|
||||||
|
Simpler: modify `generate_compass.py` to add a `--partial` flag that calls `fig.to_html(include_plotlyjs=False, full_html=False)` and writes `.partial.html` files alongside the full ones. |
||||||
|
|
||||||
|
### C6. Fill chart placeholders in blog post |
||||||
|
Replace `<!-- CHART: compass_latest -->` and `<!-- CHART: trajectories -->` in `blog/stematlas.html` with the extracted Plotly div+script blocks. |
||||||
|
|
||||||
|
### C7. Update motion count table in blog post |
||||||
|
Run SQL to get authoritative counts: |
||||||
|
```sql |
||||||
|
SELECT strftime(date, '%Y') AS year, COUNT(*) AS motions |
||||||
|
FROM motions |
||||||
|
GROUP BY year ORDER BY year; |
||||||
|
``` |
||||||
|
Replace placeholder numbers in `blog/stematlas.html` table. |
||||||
|
|
||||||
|
### C8. Push sgeboers.nl repo |
||||||
|
Commit and push `blog/stematlas.html` + `blog/index.html` + nav changes to git.sgeboers.nl → Drone deploys. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch D — VPS infrastructure (manual, one-time) |
||||||
|
|
||||||
|
> SSH into the VPS. Steps are sequential. |
||||||
|
|
||||||
|
### D1. Create data directory |
||||||
|
```bash |
||||||
|
sudo mkdir -p /srv/stematlas/data |
||||||
|
sudo chown $USER:$USER /srv/stematlas/data |
||||||
|
``` |
||||||
|
|
||||||
|
### D2. Copy `motions.db` to VPS |
||||||
|
From local machine: |
||||||
|
```bash |
||||||
|
rsync -avz --progress data/motions.db user@vps:/srv/stematlas/data/motions.db |
||||||
|
``` |
||||||
|
~3.6GB transfer — takes a few minutes. |
||||||
|
|
||||||
|
### D3. Add Nginx vhost |
||||||
|
New file `/etc/nginx/sites-available/stematlas`: |
||||||
|
```nginx |
||||||
|
server { |
||||||
|
listen 80; |
||||||
|
server_name stematlas.sgeboers.nl; |
||||||
|
return 301 https://$host$request_uri; |
||||||
|
} |
||||||
|
|
||||||
|
server { |
||||||
|
listen 443 ssl; |
||||||
|
server_name stematlas.sgeboers.nl; |
||||||
|
|
||||||
|
# Let's Encrypt certs (Certbot fills these in) |
||||||
|
ssl_certificate /etc/letsencrypt/live/stematlas.sgeboers.nl/fullchain.pem; |
||||||
|
ssl_certificate_key /etc/letsencrypt/live/stematlas.sgeboers.nl/privkey.pem; |
||||||
|
|
||||||
|
location / { |
||||||
|
proxy_pass http://127.0.0.1:8501; |
||||||
|
proxy_http_version 1.1; |
||||||
|
proxy_set_header Upgrade $http_upgrade; |
||||||
|
proxy_set_header Connection "upgrade"; |
||||||
|
proxy_set_header Host $host; |
||||||
|
proxy_set_header X-Real-IP $remote_addr; |
||||||
|
proxy_read_timeout 86400; |
||||||
|
} |
||||||
|
} |
||||||
|
``` |
||||||
|
|
||||||
|
Enable: `sudo ln -s /etc/nginx/sites-available/stematlas /etc/nginx/sites-enabled/` |
||||||
|
|
||||||
|
### D4. Get Let's Encrypt cert |
||||||
|
```bash |
||||||
|
sudo certbot --nginx -d stematlas.sgeboers.nl |
||||||
|
``` |
||||||
|
(Assumes Certbot is already installed and working for other subdomains on this VPS.) |
||||||
|
|
||||||
|
### D5. First deploy |
||||||
|
The Drone pipeline for the stemwijzer repo will handle future deploys. For the first deploy, either: |
||||||
|
- Push a commit to trigger Drone, OR |
||||||
|
- Manually on VPS: `cd /srv/stematlas && docker-compose pull && docker-compose up -d` |
||||||
|
|
||||||
|
### D6. Verify |
||||||
|
- `https://stematlas.sgeboers.nl` → Streamlit loads, shows Home.py |
||||||
|
- Both pages accessible from Streamlit nav |
||||||
|
- `docker-compose logs stematlas` — no errors |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Dependencies Between Batches |
||||||
|
|
||||||
|
``` |
||||||
|
A (stemwijzer repo) ──► D5 (first deploy) ──► D6 (verify) |
||||||
|
B (sgeboers.nl repo) ──► C8 (push blog) |
||||||
|
C (charts) ──► C8 (push blog) |
||||||
|
D1-D4 (VPS infra) ──► D5 (first deploy) |
||||||
|
|
||||||
|
Pipeline finish (~21:40) ──► C1 (tests) ──► C2-C7 (charts) |
||||||
|
``` |
||||||
|
|
||||||
|
Batches A and B are fully independent — can start now. |
||||||
|
Batch C waits only for the pipeline to finish. |
||||||
|
Batch D is VPS-side and independent of code changes. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Estimated Effort |
||||||
|
|
||||||
|
| Batch | Tasks | Est. Time | |
||||||
|
|-------|-------|-----------| |
||||||
|
| A | Multi-page Streamlit + docker-compose | 45 min | |
||||||
|
| B | Blog HTML + nav (after inspecting site) | 60 min | |
||||||
|
| C | Charts + embed (after pipeline) | 30 min | |
||||||
|
| D | VPS infra (manual SSH) | 30 min | |
||||||
|
| **Total** | | **~2.5 hours** | |
||||||
Loading…
Reference in new issue