diff --git a/.drone.yml b/.drone.yml
index 276f176..24e2fa4 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -28,7 +28,7 @@ steps:
password: ${DEPLOY_PASSWORD}
script: |
set -e
- cd /srv/stemwijzer
+ cd /srv/stematlas
docker pull ${DOCKER_REGISTRY}/${DRONE_REPO_OWNER}/${DRONE_REPO_NAME}:latest
docker-compose pull
docker-compose up -d
diff --git a/.gitignore b/.gitignore
index 8630cd8..ab1308d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,6 @@ data/*.json
# Generated output files
outputs/
outputs_*/
+
+# Stray temp files
+dummy
diff --git a/Dockerfile b/Dockerfile
index 4e7198a..10f6c1b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,13 +13,9 @@ WORKDIR /home/app/app
# Copy project files
COPY . /home/app/app
-# Upgrade pip and install either pinned requirements or runtime defaults
+# Upgrade pip and install all project dependencies from pyproject.toml
RUN python -m pip install --upgrade pip
-RUN if [ -f requirements.txt ]; then \
- pip install -r requirements.txt; \
- else \
- pip install uv streamlit duckdb; \
- fi
+RUN pip install .
# Fix permissions
RUN chown -R app:app /home/app
@@ -32,5 +28,5 @@ EXPOSE 8501
# Simple healthcheck that queries the Streamlit root
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s CMD curl -f http://localhost:8501/ || exit 1
-# Run the Streamlit app via uv as preferred in this project
-CMD ["uv", "run", "streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+# Run the multi-page Streamlit app
+CMD ["streamlit", "run", "Home.py", "--server.port=8501", "--server.address=0.0.0.0"]
diff --git a/Home.py b/Home.py
new file mode 100644
index 0000000..e036c4b
--- /dev/null
+++ b/Home.py
@@ -0,0 +1,53 @@
+"""StemAtlas β home page.
+
+Entry point for the Streamlit multi-page app. Shows a landing page with
+brief descriptions of and links to the two sub-pages.
+"""
+
+import streamlit as st
+
+st.set_page_config(
+ page_title="StemAtlas",
+ page_icon="πΊοΈ",
+ layout="centered",
+ initial_sidebar_state="expanded",
+)
+
+
+def main() -> None:
+ st.title("πΊοΈ StemAtlas")
+ st.markdown(
+ "**StemAtlas** brengt de Nederlandse Tweede Kamer in kaart op basis van "
+ "echte stemmingen over moties. Gebruik de Stemwijzer om te ontdekken welke "
+ "partij het beste bij jouw standpunten past, of verken de politieke ruimte "
+ "zelf in de Explorer."
+ )
+
+ st.divider()
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.subheader("π³οΈ Stemwijzer")
+ st.markdown(
+ "Stem op echte Tweede Kamer moties en zie welke partij het "
+ "dichtst bij jouw keuzes staat."
+ )
+ st.page_link("pages/1_Stemwijzer.py", label="Open Stemwijzer", icon="π³οΈ")
+
+ with col2:
+ st.subheader("π Politiek Explorer")
+ st.markdown(
+ "Verken het politieke kompas, partijtrajecten door de tijd, "
+ "en zoek vergelijkbare moties op in het archief."
+ )
+ st.page_link("pages/2_Explorer.py", label="Open Explorer", icon="π")
+
+ st.divider()
+ st.caption(
+ "Data: Tweede Kamer API Β· Embeddings: OpenAI Β· "
+ "Gemaakt door [Sebastiaan Geboers](https://sgeboers.nl)"
+ )
+
+
+main()
diff --git a/database.py b/database.py
index 5ff5ae4..28898d4 100644
--- a/database.py
+++ b/database.py
@@ -305,6 +305,104 @@ class MotionDatabase:
conn.close()
return False
+ def batch_insert_motions(self, motions_data: List[Dict]) -> Tuple[int, int]:
+ """Batch-insert motions and their mp_votes using a single DuckDB connection.
+
+ Returns (inserted_count, duplicate_count).
+ """
+ if not motions_data:
+ return 0, 0
+
+ try:
+ conn = duckdb.connect(self.db_path)
+
+ # 1. Find which URLs already exist β single query
+ urls = [m["url"] for m in motions_data]
+ placeholders = ", ".join("?" * len(urls))
+ existing_urls = set(
+ row[0]
+ for row in conn.execute(
+ f"SELECT url FROM motions WHERE url IN ({placeholders})", urls
+ ).fetchall()
+ )
+
+ new_motions = [m for m in motions_data if m["url"] not in existing_urls]
+ duplicates = len(motions_data) - len(new_motions)
+
+ if not new_motions:
+ conn.close()
+ return 0, duplicates
+
+ # 2. Bulk-insert motions
+ motion_rows = [
+ (
+ m["title"],
+ m["description"] or "",
+ m["date"],
+ m["policy_area"],
+ json.dumps(m["voting_results"]),
+ m["winning_margin"],
+ 1 - m["winning_margin"],
+ m["url"],
+ m.get("externe_identifier"),
+ m.get("body_text"),
+ )
+ for m in new_motions
+ ]
+ conn.executemany(
+ """
+ INSERT INTO motions
+ (title, description, date, policy_area, voting_results,
+ winning_margin, controversy_score, url, externe_identifier,
+ body_text, created_at)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+ """,
+ motion_rows,
+ )
+
+ # 3. Fetch the newly-assigned IDs in one query
+ new_urls = [m["url"] for m in new_motions]
+ np = ", ".join("?" * len(new_urls))
+ url_to_id = {
+ row[1]: row[0]
+ for row in conn.execute(
+ f"SELECT id, url FROM motions WHERE url IN ({np})", new_urls
+ ).fetchall()
+ }
+
+ # 4. Bulk-insert mp_votes
+ vote_rows = []
+ for m in new_motions:
+ motion_id = url_to_id.get(m["url"])
+ if motion_id is None:
+ continue
+ mp_vote_parties = m.get("mp_vote_parties", {})
+ voting_results_raw = m.get("voting_results", {})
+ motion_date = m.get("date", "")
+ for mp_name, party in mp_vote_parties.items():
+ vote = voting_results_raw.get(mp_name, "afwezig")
+ vote_rows.append((motion_id, mp_name, party, vote, motion_date))
+
+ if vote_rows:
+ conn.executemany(
+ """
+ INSERT INTO mp_votes (motion_id, mp_name, party, vote, date, created_at)
+ VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+ """,
+ vote_rows,
+ )
+
+ conn.close()
+ return len(new_motions), duplicates
+
+ except Exception as e:
+ _logger.error(f"Error in batch_insert_motions: {e}")
+ try:
+ conn.close()
+ except Exception:
+ pass
+ raise
+
def get_filtered_motions(
self,
policy_area: str = "Alle",
@@ -675,6 +773,43 @@ class MotionDatabase:
pass
return -1
+ def batch_store_svd_vectors(
+ self,
+ window_id: str,
+ rows: List[Tuple], # each: (entity_type, entity_id, vector_list, model_or_None)
+ ) -> int:
+ """Batch-upsert SVD vectors for a window using a single connection.
+
+ Deletes all existing rows for the window first, then inserts the new batch.
+ Returns number of rows inserted.
+ """
+ if not rows:
+ return 0
+ try:
+ conn = duckdb.connect(self.db_path)
+ conn.execute("DELETE FROM svd_vectors WHERE window_id = ?", (window_id,))
+ insert_rows = [
+ (window_id, entity_type, entity_id, json.dumps(vector), model)
+ for entity_type, entity_id, vector, model in rows
+ ]
+ conn.executemany(
+ """
+ INSERT INTO svd_vectors
+ (window_id, entity_type, entity_id, vector, model, created_at)
+ VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+ """,
+ insert_rows,
+ )
+ conn.close()
+ return len(insert_rows)
+ except Exception as e:
+ _logger.error(f"Error in batch_store_svd_vectors: {e}")
+ try:
+ conn.close()
+ except Exception:
+ pass
+ raise
+
def store_fused_embedding(
self,
motion_id: int,
diff --git a/docker-compose.yml b/docker-compose.yml
index 73ddb7d..9082d64 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,20 +1,32 @@
-version: '3.8'
+version: "3.9"
+
services:
- stemwijzer:
- build: .
- image: stemwijzer:latest
- container_name: stemwijzer_app
- restart: unless-stopped
+ stematlas:
+ image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest
ports:
- - "8501:8501"
+ - "127.0.0.1:8501:8501"
volumes:
- - ./data:/home/app/app/data:rw
+ - /srv/stematlas/data:/home/app/app/data
+ restart: unless-stopped
environment:
- PYTHONPATH=/home/app/app
- OPENROUTER_API_KEY
- - OTHER_SECRET
+ - DB_PATH=/home/app/app/data/motions.db
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8501/"]
interval: 30s
timeout: 3s
retries: 3
+ start_period: 15s
+
+ scheduler:
+ image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest
+ command: python scheduler.py
+ volumes:
+ - /srv/stematlas/data:/home/app/app/data
+ restart: unless-stopped
+ environment:
+ - PYTHONPATH=/home/app/app
+ - OPENROUTER_API_KEY
+ - OPENAI_API_KEY
+ - DB_PATH=/home/app/app/data/motions.db
diff --git a/explorer.py b/explorer.py
new file mode 100644
index 0000000..9e709a3
--- /dev/null
+++ b/explorer.py
@@ -0,0 +1,586 @@
+"""Parlement Explorer β Streamlit data analysis app.
+
+Four tabs:
+ 1. Politiek Kompas β 2D scatter of MPs/parties, window slider
+ 2. Partij Trajectories β party centroid lines over time
+ 3. Motie Zoeken β text search + similarity lookup
+ 4. Motie Browser β sortable table + detail panel
+
+Run with: streamlit run explorer.py
+
+Import-safe: heavy computation is behind @st.cache_data and only runs at UI time.
+All DuckDB connections are read_only=True so the app can run alongside the pipeline.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from typing import Dict, List, Optional, Tuple
+
+import duckdb
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+import streamlit as st
+
+logger = logging.getLogger(__name__)
+
+# Party colour palette (consistent across tabs)
+PARTY_COLOURS: Dict[str, str] = {
+ "VVD": "#1E73BE",
+ "PVV": "#002366",
+ "D66": "#00A36C",
+ "CDA": "#4CAF50",
+ "SP": "#E53935",
+ "PvdA": "#D32F2F",
+ "GroenLinks": "#388E3C",
+ "GroenLinks-PvdA": "#2E7D32",
+ "CU": "#0288D1",
+ "SGP": "#F4511E",
+ "PvdD": "#43A047",
+ "FVD": "#6A1B9A",
+ "JA21": "#7B1FA2",
+ "BBB": "#8D6E63",
+ "NSC": "#FF8F00",
+ "DENK": "#00897B",
+ "50PLUS": "#7E57C2",
+ "Unknown": "#9E9E9E",
+}
+
+
+# ---------------------------------------------------------------------------
+# Cached loaders
+# ---------------------------------------------------------------------------
+
+
+@st.cache_data(show_spinner="Beschikbare tijdsvensters ladenβ¦")
+def get_available_windows(db_path: str) -> List[str]:
+ """Return sorted list of distinct window_ids from svd_vectors."""
+ con = duckdb.connect(database=db_path, read_only=True)
+ try:
+ rows = con.execute(
+ "SELECT DISTINCT window_id FROM svd_vectors ORDER BY window_id"
+ ).fetchall()
+ return [r[0] for r in rows]
+ except Exception:
+ logger.exception("Failed to query available windows")
+ return []
+ finally:
+ con.close()
+
+
+@st.cache_data(show_spinner="2D posities berekenen (kan even duren)β¦")
+def load_positions(
+ db_path: str, window_size: str = "quarterly"
+) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict]:
+ """Compute 2D positions per window using PCA on aligned SVD vectors.
+
+ Returns:
+ positions_by_window: {window_id: {entity_name: (x, y)}}
+ axis_def: dict with x_axis, y_axis, method keys
+ """
+ from analysis.political_axis import compute_2d_axes
+
+ available = get_available_windows(db_path)
+ if window_size == "annual":
+ # Keep only Q4 windows (one representative window per year)
+ available = [w for w in available if w.endswith("-Q4")]
+
+ if not available:
+ return {}, {}
+
+ positions_by_window, axis_def = compute_2d_axes(
+ db_path,
+ window_ids=available,
+ method="pca",
+ pca_residual=True,
+ normalize_vectors=True,
+ )
+ return positions_by_window, axis_def
+
+
+@st.cache_data(show_spinner="Partijkaart ladenβ¦")
+def load_party_map(db_path: str) -> Dict[str, str]:
+ """Return {mp_name: party} mapping from mp_metadata (with vote-based fallback)."""
+ from analysis.visualize import _load_party_map
+
+ try:
+ return _load_party_map(db_path)
+ except Exception:
+ logger.exception("Failed to load party map")
+ return {}
+
+
+@st.cache_data(show_spinner="Moties ladenβ¦")
+def load_motions_df(db_path: str) -> pd.DataFrame:
+ """Load the full motions table as a pandas DataFrame (read-only)."""
+ con = duckdb.connect(database=db_path, read_only=True)
+ try:
+ df = con.execute(
+ """
+ SELECT id, title, description, date, policy_area,
+ voting_results, layman_explanation,
+ winning_margin, controversy_score
+ FROM motions
+ """
+ ).fetchdf()
+ df["date"] = pd.to_datetime(df["date"], errors="coerce")
+ df["year"] = df["date"].dt.year
+ return df
+ except Exception:
+ logger.exception("Failed to load motions")
+ return pd.DataFrame()
+ finally:
+ con.close()
+
+
+def query_similar(
+ db_path: str,
+ source_motion_id: int,
+ vector_type: str = "fused",
+ top_k: int = 10,
+) -> pd.DataFrame:
+ """Return top-k similar motions from similarity_cache (read-only)."""
+ con = duckdb.connect(database=db_path, read_only=True)
+ try:
+ rows = con.execute(
+ """
+ SELECT sc.target_motion_id, sc.score, sc.window_id,
+ m.title, m.date, m.policy_area
+ FROM similarity_cache sc
+ JOIN motions m ON m.id = sc.target_motion_id
+ WHERE sc.source_motion_id = ?
+ AND sc.vector_type = ?
+ ORDER BY sc.score DESC
+ LIMIT ?
+ """,
+ [source_motion_id, vector_type, top_k],
+ ).fetchdf()
+ return rows
+ except Exception:
+ logger.exception(
+ "Failed to query similarity cache for motion %s", source_motion_id
+ )
+ return pd.DataFrame()
+ finally:
+ con.close()
+
+
+# ---------------------------------------------------------------------------
+# Tab 1: Politiek Kompas
+# ---------------------------------------------------------------------------
+
+
+def build_compass_tab(db_path: str, window_size: str) -> None:
+ st.subheader("Politiek Kompas")
+ st.markdown(
+ "2D projectie van Kamerlid posities op basis van stemgedrag (PCA op SVD-vectoren)."
+ )
+
+ positions_by_window, axis_def = load_positions(db_path, window_size)
+ if not positions_by_window:
+ st.warning(
+ "Geen positiedata beschikbaar. Controleer of de pipeline is gedraaid."
+ )
+ return
+
+ party_map = load_party_map(db_path)
+ windows = sorted(positions_by_window.keys())
+
+ col1, col2 = st.columns([3, 1])
+ with col2:
+ window_idx = st.select_slider(
+ "Tijdsvenster", options=windows, value=windows[-1]
+ )
+ show_names = st.checkbox("Toon namen", value=False)
+ min_size = st.slider("Min. MPs per partij", 0, 20, 3)
+
+ pos = positions_by_window.get(window_idx, {})
+ if not pos:
+ st.info(f"Geen data voor venster {window_idx}")
+ return
+
+ rows = []
+ for name, (x, y) in pos.items():
+ party = party_map.get(name, "Unknown")
+ rows.append({"name": name, "x": x, "y": y, "party": party})
+
+ df_pos = pd.DataFrame(rows)
+
+ # Filter to parties with enough MPs
+ party_counts = df_pos["party"].value_counts()
+ valid_parties = party_counts[party_counts >= min_size].index
+ df_pos = df_pos[df_pos["party"].isin(valid_parties)]
+
+ colour_map = {p: PARTY_COLOURS.get(p, "#9E9E9E") for p in df_pos["party"].unique()}
+
+ fig = px.scatter(
+ df_pos,
+ x="x",
+ y="y",
+ color="party",
+ hover_name="name",
+ hover_data={"party": True, "x": ":.3f", "y": ":.3f"},
+ color_discrete_map=colour_map,
+ title=f"Politiek Kompas β {window_idx}",
+ labels={"x": "Links β β Rechts", "y": "Progressief β / Conservatief β"},
+ )
+ if show_names:
+ fig.update_traces(text=df_pos["name"], textposition="top center")
+ fig.update_layout(height=600, legend_title_text="Partij")
+
+ with col1:
+ st.plotly_chart(fig, use_container_width=True)
+
+ # Axis info
+ if axis_def:
+ evr = axis_def.get("explained_variance_ratio", [])
+ if evr:
+ st.caption(
+ f"PCA verklaarde variantie: as 1 = {evr[0] * 100:.1f}%, as 2 = {evr[1] * 100:.1f}%"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Tab 2: Partij Trajectories
+# ---------------------------------------------------------------------------
+
+
+def build_trajectories_tab(db_path: str, window_size: str) -> None:
+ st.subheader("Partij Trajectories")
+ st.markdown("Hoe bewegen partijen over de tijdsvensters heen?")
+
+ positions_by_window, _ = load_positions(db_path, window_size)
+ if not positions_by_window:
+ st.warning("Geen positiedata beschikbaar.")
+ return
+
+ party_map = load_party_map(db_path)
+ windows = sorted(positions_by_window.keys())
+
+ # Compute party centroids per window
+ centroids: Dict[str, Dict[str, Tuple[float, float]]] = {}
+ all_parties: set = set()
+ for wid in windows:
+ pos = positions_by_window.get(wid, {})
+ per_party: Dict[str, List[Tuple[float, float]]] = {}
+ for mp_name, (x, y) in pos.items():
+ party = party_map.get(mp_name, "Unknown")
+ if party == "Unknown":
+ continue
+ per_party.setdefault(party, []).append((x, y))
+ for party, coords in per_party.items():
+ all_parties.add(party)
+ xs = [c[0] for c in coords]
+ ys = [c[1] for c in coords]
+ centroids.setdefault(party, {})[wid] = (
+ float(np.mean(xs)),
+ float(np.mean(ys)),
+ )
+
+ all_parties_sorted = sorted(all_parties)
+ major_parties = [
+ p
+ for p in all_parties_sorted
+ if len(centroids.get(p, {})) >= max(2, len(windows) // 2)
+ ]
+
+ selected_parties = st.multiselect(
+ "Selecteer partijen",
+ options=all_parties_sorted,
+ default=major_parties[:12] if major_parties else all_parties_sorted[:8],
+ )
+
+ fig = go.Figure()
+ for party in selected_parties:
+ if party not in centroids:
+ continue
+ wids_sorted = sorted(centroids[party].keys())
+ xs = [centroids[party][w][0] for w in wids_sorted]
+ ys = [centroids[party][w][1] for w in wids_sorted]
+ colour = PARTY_COLOURS.get(party, "#9E9E9E")
+ fig.add_trace(
+ go.Scatter(
+ x=xs,
+ y=ys,
+ mode="lines+markers+text",
+ name=party,
+ text=[w.replace("-Q4", "") for w in wids_sorted],
+ textposition="top center",
+ line=dict(color=colour),
+ marker=dict(color=colour, size=8),
+                    hovertemplate=(
+                        f"{party}<br>"
+                        "venster: %{text}<br>"
+                        "x: %{x:.3f}<br>y: %{y:.3f}"
+                    ),
+ )
+ )
+
+ fig.update_layout(
+ title="Partij trajectories",
+ xaxis_title="Links β β Rechts",
+ yaxis_title="Progressief β / Conservatief β",
+ height=600,
+ legend_title_text="Partij",
+ )
+ st.plotly_chart(fig, use_container_width=True)
+
+
+# ---------------------------------------------------------------------------
+# Tab 3: Motie Zoeken
+# ---------------------------------------------------------------------------
+
+
+def build_search_tab(db_path: str, show_rejected: bool) -> None:
+ st.subheader("Motie Zoeken")
+
+ df = load_motions_df(db_path)
+ if df.empty:
+ st.warning("Geen moties beschikbaar.")
+ return
+
+ if not show_rejected:
+ df = df[df["title"].fillna("").str.strip() != "Verworpen."]
+
+ # Sidebar-style controls in the main area
+ col1, col2, col3 = st.columns([2, 1, 1])
+ with col1:
+ query = st.text_input(
+ "Zoek op titel of uitleg", placeholder="bijv. stikstof, klimaat, wonen"
+ )
+ with col2:
+ years = sorted(df["year"].dropna().astype(int).unique().tolist())
+ if years:
+ year_range = st.select_slider(
+ "Jaar", options=years, value=(years[0], years[-1])
+ )
+ else:
+ year_range = (2019, 2024)
+ with col3:
+ policy_areas = ["(Alle)"] + sorted(df["policy_area"].dropna().unique().tolist())
+ policy_filter = st.selectbox("Beleidsterrein", options=policy_areas)
+
+ # Apply filters in-memory
+ working = df.copy()
+ working = working[
+ (working["year"] >= year_range[0]) & (working["year"] <= year_range[1])
+ ]
+ if policy_filter != "(Alle)":
+ working = working[working["policy_area"] == policy_filter]
+ if query:
+ q = query.lower()
+ mask = working["title"].fillna("").str.lower().str.contains(
+ q, regex=False
+ ) | working["layman_explanation"].fillna("").str.lower().str.contains(
+ q, regex=False
+ )
+ working = working[mask]
+
+ working = working.sort_values(by="controversy_score", ascending=False)
+ st.caption(f"{len(working)} resultaten (top 50 getoond)")
+
+ for _, row in working.head(50).iterrows():
+ title = row.get("title") or f"Motie #{row['id']}"
+ date_str = row["date"].strftime("%d %b %Y") if pd.notna(row["date"]) else "?"
+ with st.expander(f"**{title}** β {date_str} β {row.get('policy_area') or ''}"):
+ explanation = row.get("layman_explanation")
+ if explanation and str(explanation).strip():
+ st.markdown(explanation)
+ elif row.get("description") and str(row["description"]).strip():
+ st.markdown(str(row["description"])[:600] + "β¦")
+ else:
+ st.caption("_Geen samenvatting beschikbaar_")
+
+ cols = st.columns(3)
+ cols[0].metric("Controverse", f"{row.get('controversy_score', 0):.2f}")
+ cols[1].metric("Marge", f"{row.get('winning_margin', 0):.2f}")
+ cols[2].metric("Jaar", int(row["year"]) if pd.notna(row["year"]) else "?")
+
+ # Similar motions
+ sim = query_similar(db_path, int(row["id"]), top_k=5)
+ if not sim.empty:
+ st.markdown("**Vergelijkbare moties:**")
+ for _, s in sim.iterrows():
+ s_date = (
+ pd.to_datetime(s["date"]).strftime("%Y")
+ if pd.notna(s.get("date"))
+ else ""
+ )
+ st.markdown(
+ f"- {s.get('title', 'Onbekend')} *(score: {s['score']:.3f}, {s_date})*"
+ )
+ else:
+ st.caption("_Nog geen vergelijkbare moties beschikbaar_")
+
+
+# ---------------------------------------------------------------------------
+# Tab 4: Motie Browser
+# ---------------------------------------------------------------------------
+
+
+def build_browser_tab(db_path: str, show_rejected: bool) -> None:
+ st.subheader("Motie Browser")
+
+ df = load_motions_df(db_path)
+ if df.empty:
+ st.warning("Geen moties beschikbaar.")
+ return
+
+ if not show_rejected:
+ df = df[df["title"].fillna("").str.strip() != "Verworpen."]
+
+ # Controls
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ years = sorted(df["year"].dropna().astype(int).unique().tolist())
+ year_filter = st.selectbox("Jaar", ["(Alle)"] + [str(y) for y in years])
+ with col2:
+ policy_areas = ["(Alle)"] + sorted(df["policy_area"].dropna().unique().tolist())
+ pa_filter = st.selectbox(
+ "Beleidsterrein", options=policy_areas, key="browser_pa"
+ )
+ with col3:
+ sort_by = st.selectbox("Sorteren op", ["Datum (nieuw)", "Controverse", "Marge"])
+
+ # Filter
+ working = df.copy()
+ if year_filter != "(Alle)":
+ working = working[working["year"] == int(year_filter)]
+ if pa_filter != "(Alle)":
+ working = working[working["policy_area"] == pa_filter]
+
+ sort_map = {
+ "Datum (nieuw)": ("date", False),
+ "Controverse": ("controversy_score", False),
+ "Marge": ("winning_margin", True),
+ }
+ sort_col, sort_asc = sort_map[sort_by]
+ working = working.sort_values(by=sort_col, ascending=sort_asc)
+
+ # Display table
+ display_cols = [
+ "id",
+ "title",
+ "date",
+ "policy_area",
+ "controversy_score",
+ "winning_margin",
+ ]
+ available_display = [c for c in display_cols if c in working.columns]
+ st.dataframe(
+ working[available_display].reset_index(drop=True),
+ use_container_width=True,
+ height=350,
+ )
+
+ st.divider()
+
+ # Detail panel
+ st.markdown("**Detail weergave** β vul een motie-ID in:")
+ sel_id = st.number_input(
+ "Motie ID",
+ min_value=int(working["id"].min()) if not working.empty else 1,
+ max_value=int(working["id"].max()) if not working.empty else 99999,
+ value=int(working["id"].iloc[0]) if not working.empty else 1,
+ step=1,
+ )
+ motion_row = df[df["id"] == sel_id]
+ if not motion_row.empty:
+ row = motion_row.iloc[0]
+ st.markdown(f"### {row.get('title') or 'Onbekend'}")
+        st.caption(
+            f"📅 {row['date'].strftime('%d %b %Y') if pd.notna(row['date']) else '?'} "
+            f"| 🏷️ {row.get('policy_area') or ''} "
+            f"| 🔥 Controverse: {row.get('controversy_score', 0):.2f}"
+        )
+
+ if row.get("layman_explanation") and str(row["layman_explanation"]).strip():
+ st.markdown(row["layman_explanation"])
+ elif row.get("description") and str(row["description"]).strip():
+ st.markdown(str(row["description"]))
+
+ # Parse voting results
+ try:
+ vr = row.get("voting_results")
+ if vr and str(vr).strip() not in ("", "null", "None"):
+ vdata = json.loads(vr) if isinstance(vr, str) else vr
+ if isinstance(vdata, dict):
+ st.markdown("**Stemuitslag:**")
+ for category, actors in vdata.items():
+ if actors:
+ st.markdown(
+ f"- **{category}**: {', '.join(str(a) for a in actors)}"
+ )
+ except Exception:
+ pass
+
+ # Similar motions
+ sim = query_similar(db_path, int(sel_id), top_k=10)
+ if not sim.empty:
+ st.markdown("**Vergelijkbare moties:**")
+ st.dataframe(
+ sim[["title", "score", "date", "policy_area"]],
+ use_container_width=True,
+ )
+ else:
+ st.caption("_Nog geen vergelijkbare moties beschikbaar voor deze motie_")
+
+
+# ---------------------------------------------------------------------------
+# App entry
+# ---------------------------------------------------------------------------
+
+
+def run_app() -> None:
+ st.set_page_config(
+ layout="wide",
+ page_title="Parlement Explorer",
+ page_icon="ποΈ",
+ )
+ st.title("ποΈ Parlement Explorer")
+
+ # Sidebar
+ st.sidebar.title("Instellingen")
+ db_path = st.sidebar.text_input("DuckDB pad", value="data/motions.db")
+ window_size = st.sidebar.radio("Venstergrootte", ["quarterly", "annual"], index=0)
+ show_rejected = st.sidebar.checkbox("Toon verworpen moties", value=False)
+
+ # About section
+ with st.sidebar.expander("βΉοΈ Over", expanded=False):
+ try:
+ con = duckdb.connect(database=db_path, read_only=True)
+ n_motions = con.execute("SELECT COUNT(*) FROM motions").fetchone()[0]
+ n_fused = con.execute("SELECT COUNT(*) FROM fused_embeddings").fetchone()[0]
+ n_sim = con.execute("SELECT COUNT(*) FROM similarity_cache").fetchone()[0]
+ con.close()
+ st.markdown(
+ f"**Moties:** {n_motions:,} \n"
+ f"**Fused embeddings:** {n_fused:,} \n"
+ f"**Similarity cache:** {n_sim:,}"
+ )
+ except Exception as e:
+ st.warning(f"DB niet bereikbaar: {e}")
+
+ # Main tabs
+ tab1, tab2, tab3, tab4 = st.tabs(
+ ["π§ Politiek Kompas", "π Trajectories", "π Motie Zoeken", "π Motie Browser"]
+ )
+ with tab1:
+ build_compass_tab(db_path, window_size)
+ with tab2:
+ build_trajectories_tab(db_path, window_size)
+ with tab3:
+ build_search_tab(db_path, show_rejected)
+ with tab4:
+ build_browser_tab(db_path, show_rejected)
+
+
+if __name__ == "__main__":
+ logging.basicConfig(
+ level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s"
+ )
+ run_app()
diff --git a/pages/1_Stemwijzer.py b/pages/1_Stemwijzer.py
new file mode 100644
index 0000000..94ea0fd
--- /dev/null
+++ b/pages/1_Stemwijzer.py
@@ -0,0 +1,5 @@
+"""Stemwijzer page β thin wrapper around the existing app module."""
+
+from app import main # noqa: F401 (module-level set_page_config runs on import)
+
+main()
diff --git a/pages/2_Explorer.py b/pages/2_Explorer.py
new file mode 100644
index 0000000..e0101f2
--- /dev/null
+++ b/pages/2_Explorer.py
@@ -0,0 +1,5 @@
+"""Politiek Explorer page β thin wrapper around the explorer module."""
+
+from explorer import run_app
+
+run_app()
diff --git a/pipeline/run_pipeline.py b/pipeline/run_pipeline.py
index c16c0ce..d443669 100644
--- a/pipeline/run_pipeline.py
+++ b/pipeline/run_pipeline.py
@@ -29,6 +29,7 @@ import argparse
import calendar
import logging
import sys
+from concurrent.futures import ThreadPoolExecutor
from datetime import date, timedelta
from typing import List, Tuple
@@ -143,27 +144,55 @@ def run(args: argparse.Namespace) -> int:
# ββ Phase 3: SVD per window ββββββββββββββββββββββββββββββββββββββββββββββ
if not args.skip_svd:
windows = _generate_windows(start_date, end_date, args.window_size)
- _logger.info("Phase 3: SVD for %d windows (k=%d)", len(windows), args.svd_k)
- from pipeline.svd_pipeline import run_svd_for_window
+ _logger.info(
+ "Phase 3: SVD for %d windows (k=%d, parallel)", len(windows), args.svd_k
+ )
+ from pipeline.svd_pipeline import compute_svd_for_window
- for window_id, w_start, w_end in windows:
- _logger.info(" window %s: %s β %s", window_id, w_start, w_end)
- if not dry_run:
- result = run_svd_for_window(
- db=db,
- window_id=window_id,
- start_date=w_start,
- end_date=w_end,
- k=args.svd_k,
- )
+ if dry_run:
+ for window_id, w_start, w_end in windows:
+ _logger.info(" [dry-run] would run SVD for window %s", window_id)
+ else:
+ # Compute all windows in parallel (numpy/scipy SVD releases the GIL).
+ # IMPORTANT: collect ALL results before writing β DuckDB rejects mixing
+ # read-only and read-write connections in the same process.
+ # The `with` block waits for all threads to finish before we exit it,
+ # ensuring all read-only connections are closed before writes begin.
+ futures = {}
+ max_workers = min(len(windows), (args.svd_workers or 4))
+ with ThreadPoolExecutor(max_workers=max_workers) as pool:
+ for window_id, w_start, w_end in windows:
+ fut = pool.submit(
+ compute_svd_for_window,
+ db.db_path,
+ window_id,
+ w_start,
+ w_end,
+ args.svd_k,
+ )
+ futures[fut] = window_id
+ # All threads are done here β all read-only connections are closed.
+ # Now write results sequentially.
+ for fut, window_id in futures.items():
+ try:
+ result = fut.result()
+ except Exception as exc:
+ _logger.error(" window %s raised: %s", window_id, exc)
+ continue
+
+ if result["k_used"] == 0:
+ _logger.info(" window %s: no data, skipped", window_id)
+ continue
+
+ rows = result["mp_rows"] + result["motion_rows"]
+ db.batch_store_svd_vectors(window_id, rows)
_logger.info(
- " k_used=%d stored_mp=%d stored_motion=%d",
+ " window %s: k_used=%d stored_mp=%d stored_motion=%d",
+ window_id,
result["k_used"],
- result["stored_mp"],
- result["stored_motion"],
+ len(result["mp_rows"]),
+ len(result["motion_rows"]),
)
- else:
- _logger.info(" [dry-run] would run SVD for window %s", window_id)
else:
_logger.info("Phase 3: skipped (--skip-svd)")
@@ -235,6 +264,12 @@ def build_parser() -> argparse.ArgumentParser:
help="Time window granularity",
)
parser.add_argument("--svd-k", type=int, default=50, help="SVD dimensions")
+ parser.add_argument(
+ "--svd-workers",
+ type=int,
+ default=None,
+ help="Parallel workers for SVD (default: min(windows, 4))",
+ )
parser.add_argument(
"--text-model",
default=None,
diff --git a/pipeline/svd_pipeline.py b/pipeline/svd_pipeline.py
index 4635fb7..13fcbb6 100644
--- a/pipeline/svd_pipeline.py
+++ b/pipeline/svd_pipeline.py
@@ -150,57 +150,115 @@ def _procrustes_align(
return current_anchor
-def run_svd_for_window(
- db: MotionDatabase,
+def compute_svd_for_window(
+ db_path: str,
window_id: str,
start_date: str,
end_date: str,
k: int = 50,
) -> Dict:
- """Run SVD on votes in given date window and store vectors in DB.
+ """Pure-compute SVD for a window. Safe to run in a subprocess.
- Returns metadata dict with keys: k_used, stored_mp, stored_motion
+ Opens the DB in read-only mode (allows concurrent parallel workers).
+ Does NOT write to the DB β caller is responsible for persisting results.
+
+ Returns dict with keys:
+ window_id, k_used, mp_rows, motion_rows
+ where *_rows are List[Tuple[entity_type, entity_id, vector, model]]
"""
- mat, mp_names, motion_ids = _build_vote_matrix(db, start_date, end_date)
+ empty = {"window_id": window_id, "k_used": 0, "mp_rows": [], "motion_rows": []}
+
+ # Read vote matrix using a read-only connection β safe to run in parallel.
+ conn = duckdb.connect(db_path, read_only=True)
+ try:
+ rows = conn.execute(
+ "SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?",
+ (start_date, end_date),
+ ).fetchall()
+ finally:
+ conn.close()
+
+ if not rows:
+ return empty
+
+ motion_ids = sorted({int(r[0]) for r in rows})
+ mp_names = sorted({r[1] for r in rows})
+
+ m_count = len(mp_names)
+ n_count = len(motion_ids)
+ mat = np.zeros((m_count, n_count), dtype=float)
+
+ mp_index = {name: i for i, name in enumerate(mp_names)}
+ motion_index = {mid: j for j, mid in enumerate(motion_ids)}
+
+ for motion_id, mp_name, vote in rows:
+ i = mp_index[mp_name]
+ j = motion_index[int(motion_id)]
+ val = VOTE_MAP.get(
+ vote, VOTE_MAP.get(vote.strip() if isinstance(vote, str) else vote, 0.0)
+ )
+ try:
+ mat[i, j] = float(val)
+ except Exception:
+ mat[i, j] = 0.0
if mat.size == 0 or mat.shape[0] == 0 or mat.shape[1] == 0:
- return {"k_used": 0, "stored_mp": 0, "stored_motion": 0}
+ return empty
k_used = _safe_k(mat, k)
-
if k_used <= 0:
- return {"k_used": 0, "stored_mp": 0, "stored_motion": 0}
+ return empty
- # use sparse svds for efficiency
try:
A = csr_matrix(mat)
U, s, Vt = svds(A, k=k_used)
- # svds does not guarantee ordering of singular values; sort descending
idx = np.argsort(s)[::-1]
s = s[idx]
U = U[:, idx]
Vt = Vt[idx, :]
- # weight by singular values
- mp_vecs = (U * s.reshape(1, -1)).tolist() # m x k
- motion_vecs = (Vt.T * s.reshape(1, -1)).tolist() # n x k
-
- stored_mp = 0
- stored_motion = 0
- for i, mp_name in enumerate(mp_names):
- db.store_svd_vector(window_id, "mp", mp_name, mp_vecs[i])
- stored_mp += 1
+ mp_vecs = (U * s.reshape(1, -1)).tolist()
+ motion_vecs = (Vt.T * s.reshape(1, -1)).tolist()
- for j, motion_id in enumerate(motion_ids):
- db.store_svd_vector(window_id, "motion", str(motion_id), motion_vecs[j])
- stored_motion += 1
+ mp_rows = [
+ ("mp", mp_name, mp_vecs[i], None) for i, mp_name in enumerate(mp_names)
+ ]
+ motion_rows = [
+ ("motion", str(mid), motion_vecs[j], None)
+ for j, mid in enumerate(motion_ids)
+ ]
return {
+ "window_id": window_id,
"k_used": k_used,
- "stored_mp": stored_mp,
- "stored_motion": stored_motion,
+ "mp_rows": mp_rows,
+ "motion_rows": motion_rows,
}
except Exception:
- _logger.exception("SVD failed for window")
+ _logger.exception("SVD failed for window %s", window_id)
+ return empty
+
+
+def run_svd_for_window(
+ db: MotionDatabase,
+ window_id: str,
+ start_date: str,
+ end_date: str,
+ k: int = 50,
+) -> Dict:
+ """Run SVD on votes in given date window and store vectors in DB.
+
+ Returns metadata dict with keys: k_used, stored_mp, stored_motion
+ """
+ result = compute_svd_for_window(db.db_path, window_id, start_date, end_date, k)
+ if result["k_used"] == 0:
return {"k_used": 0, "stored_mp": 0, "stored_motion": 0}
+
+ rows = result["mp_rows"] + result["motion_rows"]
+ stored = db.batch_store_svd_vectors(window_id, rows)
+ return {
+ "k_used": result["k_used"],
+ "stored_mp": len(result["mp_rows"]),
+ "stored_motion": len(result["motion_rows"]),
+ }
diff --git a/scripts/download_past_year.py b/scripts/download_past_year.py
index 5206f6c..8ccc6f7 100644
--- a/scripts/download_past_year.py
+++ b/scripts/download_past_year.py
@@ -1,7 +1,7 @@
"""download_past_year.py β One-shot data download: parliamentary motions for a date range.
Fetches Stemming records from the OData API in chunks (default 90-day windows),
-stores motions into data/motions.db using MotionDatabase.insert_motion().
+stores motions into data/motions.db using MotionDatabase.batch_insert_motions().
Skips AI summarisation β this is a raw data fetch for the embedding pipeline.
@@ -105,11 +105,7 @@ def main():
inserted = 0
duplicates = 0
- for m in motions:
- if db.insert_motion(m):
- inserted += 1
- else:
- duplicates += 1
+ inserted, duplicates = db.batch_insert_motions(motions)
total_inserted += inserted
total_duplicates += duplicates
diff --git a/scripts/generate_extra_charts.py b/scripts/generate_extra_charts.py
new file mode 100644
index 0000000..cc554a7
--- /dev/null
+++ b/scripts/generate_extra_charts.py
@@ -0,0 +1,172 @@
+"""Generate additional blog charts: controversy trend + party alignment heatmap."""
+
+from __future__ import annotations
+import os, sys
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+if ROOT not in sys.path:
+ sys.path.insert(0, ROOT)
+
+import duckdb
+import plotly.graph_objects as go
+import plotly.express as px
+import numpy as np
+
+DB = "data/motions.db"
+OUT = "outputs/blog-charts"
+os.makedirs(OUT, exist_ok=True)
+
+con = duckdb.connect(DB, read_only=True)
+
+# βββ 1. Controversy trend (bar chart, 2019-2026, quarterly) ββββββββββββββββββ
+rows = con.execute("""
+ SELECT
+ YEAR(date) || '-Q' || QUARTER(date) as wid,
+ YEAR(date) as yr,
+ QUARTER(date) as q,
+ COUNT(*) as n,
+ ROUND(AVG(controversy_score), 3) as avg_c,
+ COUNT(*) FILTER (WHERE controversy_score >= 0.7) as high_c
+ FROM motions
+ WHERE controversy_score IS NOT NULL
+ AND date >= '2019-01-01' AND date < '2026-04-01'
+ GROUP BY wid, yr, q
+ ORDER BY yr, q
+""").fetchall()
+
+windows = [r[0] for r in rows]
+avg_c = [r[4] for r in rows]
+high_pct = [round(100.0 * r[5] / r[3], 1) if r[3] else 0 for r in rows]
+
+fig = go.Figure()
+fig.add_trace(
+ go.Bar(
+ x=windows,
+ y=high_pct,
+ name="% highly contested (score β₯ 0.7)",
+ marker_color="#00d9a3",
+ opacity=0.85,
+ )
+)
+fig.add_trace(
+ go.Scatter(
+ x=windows,
+ y=[v * 100 for v in avg_c],
+ name="avg controversy Γ 100",
+ mode="lines+markers",
+ line=dict(color="#e6edf3", width=2),
+ marker=dict(size=4),
+ )
+)
+fig.update_layout(
+ title="Political controversy per quarter (Tweede Kamer, 2019β2026)",
+ xaxis_title="Quarter",
+ yaxis_title="% of motions",
+ plot_bgcolor="#161b22",
+ paper_bgcolor="#0d1117",
+ font=dict(color="#e6edf3", family="Inter, system-ui"),
+ legend=dict(bgcolor="rgba(0,0,0,0)", bordercolor="#30363d", borderwidth=1),
+ xaxis=dict(tickangle=-45, gridcolor="#30363d"),
+ yaxis=dict(gridcolor="#30363d", range=[0, 55]),
+ bargap=0.15,
+)
+out1 = os.path.join(OUT, "controversy_trend.html")
+fig.write_html(out1, include_plotlyjs="cdn", full_html=True)
+print(f"Wrote {out1}")
+
+# βββ 2. Party alignment heatmap ββββββββββββββββββββββββββββββββββββββββββββββ
+# Only include major parties with sufficient data
+MAJOR = [
+ "VVD",
+ "PVV",
+ "D66",
+ "CDA",
+ "PvdA",
+ "GroenLinks",
+ "SP",
+ "ChristenUnie",
+ "SGP",
+ "FVD",
+ "BBB",
+ "PvdD",
+ "Volt",
+ "GroenLinks-PvdA",
+ "Nieuw Sociaal Contract",
+ "DENK",
+ "JA21",
+]
+
+rows = con.execute("""
+ WITH pv AS (
+ SELECT motion_id, party,
+ CASE
+ WHEN SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) THEN 'voor'
+ WHEN SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) THEN 'tegen'
+ ELSE 'split'
+ END as pv
+ FROM mp_votes WHERE party IS NOT NULL AND vote IN ('voor','tegen')
+ GROUP BY motion_id, party
+ ),
+ d AS (SELECT * FROM pv WHERE pv != 'split')
+ SELECT a.party, b.party,
+ COUNT(*) as shared,
+ ROUND(100.0 * SUM(CASE WHEN a.pv = b.pv THEN 1 ELSE 0 END) / COUNT(*), 1) as pct
+ FROM d a JOIN d b ON a.motion_id = b.motion_id AND a.party != b.party
+ GROUP BY a.party, b.party
+ HAVING COUNT(*) >= 100
+""").fetchall()
+
+# Build matrix
+agree = {}
+for a, b, _, pct in rows:
+ agree[(a, b)] = pct
+
+# Filter to parties that have data
+present = set()
+for a, b in agree:
+ if a in MAJOR:
+ present.add(a)
+ if b in MAJOR:
+ present.add(b)
+parties = [p for p in MAJOR if p in present]
+
+n = len(parties)
+matrix = np.full((n, n), np.nan)
+for i, a in enumerate(parties):
+ matrix[i, i] = 100.0
+ for j, b in enumerate(parties):
+ if i != j and (a, b) in agree:
+ matrix[i, j] = agree[(a, b)]
+
+fig2 = go.Figure(
+ data=go.Heatmap(
+ z=matrix,
+ x=parties,
+ y=parties,
+ colorscale=[[0, "#6e40c9"], [0.5, "#30363d"], [1, "#00d9a3"]],
+ zmid=70,
+ zmin=35,
+ zmax=100,
+ text=[[f"{v:.0f}%" if not np.isnan(v) else "" for v in row] for row in matrix],
+ texttemplate="%{text}",
+ textfont=dict(size=9),
+ hoverongaps=False,
+ showscale=True,
+ colorbar=dict(title="Agreement %", tickfont=dict(color="#e6edf3")),
+ )
+)
+fig2.update_layout(
+ title="Cross-party vote alignment (all years combined)",
+ plot_bgcolor="#161b22",
+ paper_bgcolor="#0d1117",
+ font=dict(color="#e6edf3", family="Inter, system-ui", size=11),
+ xaxis=dict(tickangle=-45, side="bottom", gridcolor="#30363d"),
+ yaxis=dict(autorange="reversed", gridcolor="#30363d"),
+ height=600,
+)
+out2 = os.path.join(OUT, "party_alignment.html")
+fig2.write_html(out2, include_plotlyjs="cdn", full_html=True)
+print(f"Wrote {out2}")
+
+con.close()
+print("Done.")
diff --git a/tests/test_explorer_import.py b/tests/test_explorer_import.py
new file mode 100644
index 0000000..4c3b8be
--- /dev/null
+++ b/tests/test_explorer_import.py
@@ -0,0 +1,14 @@
+"""Smoke test: explorer module is importable without DB or heavy computation."""
+
+import importlib
+
+
+def test_explorer_importable():
+ mod = importlib.import_module("explorer")
+ assert hasattr(mod, "run_app")
+ assert callable(mod.run_app)
+ assert hasattr(mod, "load_positions")
+ assert hasattr(mod, "load_motions_df")
+ assert hasattr(mod, "query_similar")
+ assert hasattr(mod, "build_compass_tab")
+ assert hasattr(mod, "build_search_tab")
diff --git a/tests/test_home_import.py b/tests/test_home_import.py
new file mode 100644
index 0000000..086dfd4
--- /dev/null
+++ b/tests/test_home_import.py
@@ -0,0 +1,38 @@
+"""Smoke test: Home module is importable without DB or heavy computation."""
+
+import importlib
+import sys
+
+
+def test_home_importable():
+ # Streamlit cannot run set_page_config outside of a server context,
+ # so we only verify the file can be parsed/compiled, not fully executed.
+ import ast
+ import os
+
+ home_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "Home.py")
+ with open(home_path) as f:
+ source = f.read()
+
+ # Verify the file parses as valid Python
+ tree = ast.parse(source)
+
+ # Verify st.set_page_config is called at module level (first Streamlit command)
+ calls = [
+ node
+ for node in ast.walk(tree)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Attribute)
+ and node.func.attr == "set_page_config"
+ ]
+ assert calls, "Home.py must call st.set_page_config()"
+
+ # Verify page links exist (st.page_link calls)
+ page_links = [
+ node
+ for node in ast.walk(tree)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Attribute)
+ and node.func.attr == "page_link"
+ ]
+ assert len(page_links) >= 2, "Home.py must have at least 2 st.page_link() calls"
diff --git a/thoughts/blog-post-political-compass.md b/thoughts/blog-post-political-compass.md
new file mode 100644
index 0000000..72a1584
--- /dev/null
+++ b/thoughts/blog-post-political-compass.md
@@ -0,0 +1,174 @@
+# Mapping Dutch Democracy: Building a Political Compass from 25,000+ Parliamentary Votes
+
+*What if you could take every motion voted on in the Dutch Parliament over the past decade and automatically plot parties and MPs on a political map β with zero manual labeling?*
+
+That's exactly what this project does. Here's how we built it, what surprised us, and what it revealed about Dutch political dynamics.
+
+---
+
+## The Starting Point: Open Data, Hidden Structure
+
+The Dutch Parliament publishes every vote β every *motie*, every *amendement*, every *besluit* β in an open OData API. We're talking over **25,500 motions** spanning 2016 to 2026, each with a record of how every party (and in many cases every individual MP) voted: *voor* (for), *tegen* (against), *onthouden* (abstained), or *afwezig* (absent).
+
+This is an extraordinary dataset. But in raw form it's just a table of votes. The interesting question is: can we extract *structure* β left vs. right, progressive vs. conservative, governing vs. opposition β purely from the pattern of who votes with whom?
+
+The answer is yes, and the method is surprisingly elegant.
+
+---
+
+## Step 1: Turning Votes into Geometry
+
+Each motion is a snapshot of political alignment. For each motion, we know which parties voted together and which voted apart. If PvdA and GroenLinks almost always vote the same way, that tells us something. If PVV and CDA frequently diverge, that tells us something too.
+
+We represent this with **Singular Value Decomposition (SVD)** on the party-vote matrix:
+
+- Rows: parties (VVD, PVV, D66, CDA, PvdA, GroenLinks, SP, CU, SGP, FvD, BBB, ...)
+- Columns: motions
+- Values: vote encoded as +1 (voor), -1 (tegen), 0 (absent/abstain)
+
+SVD finds the dominant axes of variation β the directions along which parties disagree most strongly. The first dimension almost always corresponds to a left-right axis. The second dimension typically captures something like a libertarian-authoritarian or progressive-traditionalist axis.
+
+We run this **per quarterly window** (2019-Q1, 2019-Q2, ..., 2024-Q4) so we can track how positions shift over time at fine resolution.
+
+### The Result: A 2D Political Compass
+
+The output is coordinates for every party in 2D space β computed purely from voting behavior, with no labels or assumptions from us. When you plot it, recognizable structure emerges immediately:
+
+- **Left bloc** (PvdA, GroenLinks, SP) cluster tightly together
+- **Right-liberal** (VVD, D66) sit in a distinct quadrant
+- **Religious right** (SGP, CU) form their own coherent group
+- **Populist right** (PVV, FvD in later years) occupy a distant extreme
+- **BBB** (the Farmer-Citizen Movement, 2022 onwards) drops into an interesting position between PVV and CDA
+
+The political axis emerges from the math β not our intuitions.
+
+---
+
+## Step 2: What Each Motion Is Actually About
+
+Voting patterns tell us *who* agrees, but not *why*. For that, we add **text embeddings** β dense vector representations of each motion's title and description using a language model.
+
+This lets us do something powerful: if a new motion comes in about nitrogen emissions, we can find the 20 most similar past motions (by meaning, not just keywords). If a motion uses identical party-line voting as another motion from 2022, the text embedding can confirm they're genuinely related β or reveal that the voting pattern is coincidental (parties split on unrelated issues for similar structural reasons).
+
+We compute these using **OpenAI-compatible embeddings** via OpenRouter, processing 25,640 motions in batches of 200.
+
+---
+
+## Step 3: Fused Embeddings β The Best of Both Worlds
+
+SVD gives us the political-structural signal: *how does this motion split the chamber?* Text embeddings give us semantic signal: *what is this motion about?*
+
+We concatenate both into a **fused vector** per motion per window:
+
+```
+fused = [svd_dims (50)] + [text_dims (2560)] = 2610 dimensions
+```
+
+This fused representation powers the similarity search. Two motions are considered "close" if they're both about a similar topic *and* they produce a similar political split. This filters out spurious matches β two motions might both be controversial (splitting 50/50) but about completely unrelated things.
+
+---
+
+## The Numbers: What We're Working With
+
+After the full pipeline run:
+
+| Year | Motions |
+|------|---------|
+| 2016 | 132 |
+| 2017 | 30 |
+| 2018 | 100 |
+| 2019 | 3,374 |
+| 2020 | 4,228 |
+| 2021 | 4,289 |
+| 2022 | 4,116 |
+| 2023 | 621 |
+| 2024 | 3,968 |
+| 2025 | 3,715 |
+| 2026 | 948 |
+
+The 2022 spike is striking β over 4,000 motions in a single year. This was the year the Rutte IV coalition took office amid intense debates on energy prices, housing, the war in Ukraine, and the ongoing nitrogen crisis.
+
+Our similarity cache now holds **627,272 precomputed pairs** (top 20 neighbors per motion per window), making similarity lookup instant at query time.
+
+---
+
+## Interesting Findings
+
+### The 2022 Polarization Surge
+
+The 2022 cohort dominates the dataset. Looking at the SVD positions for that year, the distance between the governing coalition (VVD, D66, CDA, CU) and the opposition (PVV, SP, FvD) is near its maximum. The nitrogen crisis and energy policy debates forced unusually sharp coalition discipline.
+
+### BBB's Geometric Arrival
+
+When BBB (BoerBurgerBeweging) entered parliament in 2023 with a historic 16 seats, their SVD position placed them between PVV and CDA β exactly as expected from their policy profile: agrarian-nationalist populism with Catholic-provincial roots. The model found this without being told.
+
+### The Strange Case of "Verworpen."
+
+Motions that are rejected without debate are recorded with the title "Verworpen." (Rejected.). There are hundreds of these. Because they share a single identical 10-character title, their text embeddings are identical β meaning every "Verworpen." has cosine similarity 1.0 to every other "Verworpen." This is technically correct (they are textually identical) but semantically meaningless. The similarity cache contains these spurious pairs, which the UI layer needs to filter out.
+
+It's a good reminder that **data quality surprises emerge at scale**.
+
+### Party Cohesion as a Signal
+
+A subtle finding: party cohesion (how often all members of a party vote the same way) varies enormously. SGP and CU have near-perfect cohesion β they vote as a bloc on almost everything. PvdA/GroenLinks (post-merger) has similarly high cohesion. But in earlier years (2019-2020), before the merger, GroenLinks occasionally splits on specific issues around security policy.
+
+VVD shows the most internal variation β governing parties develop fissures.
+
+---
+
+## The Pipeline Architecture
+
+The system is built around a single DuckDB database and a modular Python pipeline:
+
+```
+API (Tweede Kamer OData)
+ β download_past_year.py
+ β motions table (25,500+ rows)
+
+motions
+ β extract_mp_votes.py β mp_votes table (200k rows)
+ β text_pipeline.py β embeddings table (25,640 rows, via OpenRouter)
+ β svd_pipeline.py β svd_vectors table (50,779 rows, quarterly windows)
+
+svd_vectors + embeddings
+ β fusion.py β fused_embeddings table (35,872 rows)
+
+fused_embeddings
+ β similarity/compute.py β similarity_cache table (627k rows, top-20 per window)
+```
+
+Everything runs locally. The only external call is to the OpenRouter API for text embeddings. The similarity computation (627k pairs) is pure NumPy β load vectors, normalize, matrix multiply, take top-k. For 4,000 motions in a quarter, that's a 4000Γ4000 cosine similarity matrix computed in seconds.
+
+---
+
+## What's Next
+
+The similarity cache and political compass open up several directions:
+
+**Motion explorer**: Given a motion you care about, find the 20 most politically and semantically similar motions from across the decade. Trace how a policy debate evolved from 2019 to 2025.
+
+**Party trajectory plots**: Animate party positions on the 2D compass year by year. Watch D66 drift, watch PVV consolidate, watch the new parties arrive and find their position.
+
+**Cross-party coalition predictor**: Given a new motion's text and expected vote split, predict which parties will support it based on past patterns.
+
+**The "controversy index"**: We already compute `1 - winning_margin` as a controversy score. The most controversial motions (close votes, high stakes topics) tell a story about where Dutch politics is genuinely undecided vs. where it's performing conflict for the cameras.
+
+---
+
+## Reproducibility
+
+The full pipeline is open and runs on a single machine with no cloud infrastructure:
+
+```bash
+# Download historical data
+python scripts/download_past_year.py --start-date 2016-01-01 --end-date 2026-01-01
+
+# Run full pipeline (extract votes, compute SVD, embed text, fuse, build similarity cache)
+python -m pipeline.run_pipeline --db-path data/motions.db \
+ --start-date 2016-01-01 --end-date 2026-01-01 \
+ --window-size annual --text-batch-size 200
+```
+
+The DB grows to ~3.6GB for the full dataset (mostly embeddings and vote records). Everything else β the SVD, fusion, similarity cache β fits comfortably in memory during computation.
+
+Democracy is more legible than it looks.
diff --git a/thoughts/shared/designs/2026-03-22-motion-explorer-design.md b/thoughts/shared/designs/2026-03-22-motion-explorer-design.md
new file mode 100644
index 0000000..8b3bedf
--- /dev/null
+++ b/thoughts/shared/designs/2026-03-22-motion-explorer-design.md
@@ -0,0 +1,165 @@
+---
+date: 2026-03-22
+topic: "Dynamic motion explorer + analysis refresh"
+status: validated
+---
+
+## Problem Statement
+
+The parliamentary embedding pipeline now covers 2019β2026 with ~25,000 motions, quarterly SVD windows, fused embeddings, and a 200k+ similarity cache. None of this is visible to anyone in an interactive form. The only outputs today are static HTML files written by `generate_compass.py` (if it's been run), and a blog post with placeholder numbers.
+
+We need to:
+1. Regenerate all analyses and output graphs with the full dataset
+2. Build an interactive Streamlit explorer that surfaces the political compass, party trajectories, and motion similarity search
+3. Update the blog post with real numbers and findings
+
+## Constraints
+
+- Do NOT modify `app.py` or `scheduler.py` β these are the production quiz app
+- All DB access in the explorer must be **read-only** (no writes) β pipeline may be running
+- Explorer must work with existing `analysis.*` modules; no new analysis logic
+- Use `@st.cache_data` aggressively β `compute_2d_axes` runs PCA across all windows and is expensive (seconds, not milliseconds)
+- No new external dependencies beyond what's already installed (streamlit, plotly, umap-learn, scikit-learn are all present)
+- Follow existing code style: functional Python, `logging.getLogger(__name__)`, no print statements in library code
+
+## Approach
+
+**Single-file `explorer.py`** at the project root alongside `app.py`.
+
+Four Streamlit tabs:
+1. **Politiek Kompas** β 2D MP/party scatter with a window slider
+2. **Partij Trajectories** β Line traces of party positions over time on the compass
+3. **Motie Zoeken** β Free-text + filter search, returns ranked similar motions
+4. **Motie Browser** β Filterable table of all motions, click to expand detail + similar motions
+
+Run with: `streamlit run explorer.py`
+
+This approach is chosen because:
+- Reuses all existing `analysis.*` modules without changes
+- Single file means no new package structure to maintain
+- Streamlit tabs map naturally to the four distinct views a researcher would want
+- Read-only DB access means it can run concurrently with the pipeline
+
+## Architecture
+
+```
+explorer.py
+ βββ Tab 1: Politiek Kompas
+ β βββ analysis.political_axis.compute_2d_axes (cached)
+ β βββ analysis.visualize.plot_political_compass β Plotly figure
+ β
+ βββ Tab 2: Partij Trajectories
+ β βββ analysis.trajectory.compute_2d_trajectories (cached)
+ β βββ analysis.visualize.plot_2d_trajectories β Plotly figure
+ β
+ βββ Tab 3: Motie Zoeken
+ β βββ database.get_all_motions (cached, read-only)
+ β βββ database.search_similar (similarity_cache lookup)
+ β βββ Custom search: filter title/description + show voting_results
+ β
+ βββ Tab 4: Motie Browser
+ βββ database.get_filtered_motions (cached, read-only)
+ βββ On click: database.search_similar for related motions
+```
+
+## Key Components & Responsibilities
+
+**`explorer.py`**
+- Page config: `st.set_page_config(layout="wide", page_title="Parlement Explorer")`
+- Sidebar: DB path input (default `data/motions.db`), window-size toggle (annual/quarterly)
+- `@st.cache_data` wrappers for all expensive DB reads and computations
+- Four tabs via `st.tabs([...])`
+
+**Tab 1 β Politiek Kompas**
+- Calls `compute_2d_axes(db_path, method='pca', pca_residual=True)` β cached
+- Window selector slider showing available windows
+- Renders the Plotly scatter for the selected window using `_render_compass_for_window(positions_by_window, window_id, party_map, axis_def)` β a thin Plotly figure builder (not writing to file)
+- Hover: MP name, party, (x, y) coordinates
+- Color by party using `_load_party_map(db_path)` β cached
+
+**Tab 2 β Partij Trajectories**
+- Same `positions_by_window` data from Tab 1 (shared cache hit)
+- Multi-select party filter (default: all major parties)
+- Plotly figure: one trace per party, x/y positions connected by lines, labeled by window_id
+- Toggle between showing MPs or just party centroids (computed as mean of MP positions per party per window)
+
+**Tab 3 β Motie Zoeken**
+- Search input (Dutch text, free-form)
+- Filters: year range (slider), policy area (multi-select), controversy score (slider)
+- On search: filter `motions` table in-memory against title + layman_explanation text (case-insensitive substring; no embedding search needed at this level)
+- Results list: each result shows title, date, policy area, controversy, layman_explanation
+- Expandable section per result: full description/body_text + "Vergelijkbare moties" from `similarity_cache`
+- Voting breakdown: parse `voting_results` JSON to show Voor/Tegen/Onthouden per party
+
+**Tab 4 β Motie Browser**
+- `st.dataframe` with all motions (title, date, policy_area, controversy_score, winning_margin)
+- Column filters at top: year, policy area
+- Sort by: date DESC, controversy DESC, winning_margin ASC (most contested first)
+- Click row β `st.session_state` stores selected motion_id β detail panel below table
+- Detail panel: full motion text + top-10 similar motions from similarity_cache
+
+## Data Flow
+
+1. On startup: `compute_2d_axes` runs PCA, results cached in Streamlit's in-memory cache
+2. Tab 1/2: pure reads from `svd_vectors` + `mp_metadata` β all cached after first load
+3. Tab 3: on each search, filter pre-loaded motions DataFrame in-memory (no DB query per keypress)
+4. Tab 4: full motions table loaded once and cached; similarity lookups hit `similarity_cache` table via existing `database.get_cached_similarities`
+
+All DuckDB connections are opened with `read_only=True` to allow concurrent pipeline access.
+
+## Error Handling
+
+- If `compute_2d_axes` fails (insufficient data for a window), skip that window and log warning β don't crash the app
+- If `similarity_cache` has no entries for a motion (e.g., new motion not yet processed), show "Nog geen vergelijkbare moties beschikbaar" placeholder
+- If DB file doesn't exist at startup, show an error banner with the path and instructions
+- All `duckdb.connect` calls wrapped in try/finally to guarantee close
+
+## Analysis Refresh Plan
+
+Before building the explorer, regenerate all outputs:
+
+```bash
+# 1. Generate political compass HTML for latest window (annual)
+.venv/bin/python scripts/generate_compass.py \
+ --db data/motions.db --out outputs \
+ --method pca --pca-residual
+
+# 2. Generate similarity cache for new windows (2019β2021, 2024 quarters)
+# (run_pipeline with --skip-metadata --skip-extract --skip-svd --skip-text)
+.venv/bin/python -m pipeline.run_pipeline \
+ --db-path data/motions.db \
+ --start-date 2019-01-01 --end-date 2025-01-01 \
+ --window-size quarterly \
+ --skip-metadata --skip-extract --skip-svd --skip-text
+
+# 3. Recompute similarity cache for all windows
+.venv/bin/python -c "
+from similarity.compute import recompute_all_windows
+recompute_all_windows('data/motions.db', window_size='quarterly', top_k=20)
+"
+```
+
+## Blog Post Updates
+
+Target: `thoughts/blog-post-political-compass.md`
+
+- Replace placeholder motion counts table with real numbers from DB query
+- Add actual findings from quarterly analysis (not visible in annual windows):
+ - 2020-Q2 COVID vote clustering β parties converge on emergency measures
+ - 2022-Q4 nitrogen crisis β sharpest left-right split in dataset
+ - 2023-Q1 β 2024-Q1 gap (data missing for Q2-Q4 2023)
+- Add "Explorer" section describing `explorer.py` and how to run it
+- Update similarity cache row count (was 212k, now higher with new windows)
+- Fix the "fused = [10] + [2560] = 2570" claim β verify actual dimensions
+
+## Testing Strategy
+
+- Explorer has no tests (it's a UI script) β verify manually by running `streamlit run explorer.py` after pipeline completes
+- Existing 34 tests stay green β no changes to library modules
+- Run tests after completing implementation: `.venv/bin/python -m pytest -q`
+
+## Open Questions
+
+- Should the explorer be served on a separate port from `app.py`? (Recommendation: yes, `app.py` stays on its port, `explorer.py` runs on a different port for internal/research use)
+- Should `Verworpen.` motions be filtered from search results by default? (Recommendation: yes, add a "Toon verworpen" toggle defaulting to off)
+- Annual or quarterly windows as the default for the compass? (Recommendation: annual β less noise, cleaner trajectories; quarterly available via sidebar toggle)
diff --git a/thoughts/shared/designs/2026-03-22-stematlas-deployment-design.md b/thoughts/shared/designs/2026-03-22-stematlas-deployment-design.md
new file mode 100644
index 0000000..d1182eb
--- /dev/null
+++ b/thoughts/shared/designs/2026-03-22-stematlas-deployment-design.md
@@ -0,0 +1,229 @@
+---
+date: 2026-03-22
+topic: "StemAtlas β Public Deployment on sgeboers.nl"
+status: validated
+---
+
+# StemAtlas Deployment Design
+
+## Problem Statement
+
+The stemwijzer project has three user-facing products ready to publish:
+1. **A blog post** explaining the political compass methodology and findings
+2. **An interactive explorer** (political compass, party trajectories, motion search)
+3. **The stemwijzer quiz** (vote on motions, see which parties match you)
+
+These need to be deployed publicly on sgeboers.nl using the existing VPS + Gitea + Drone + Docker stack.
+
+---
+
+## The Name: StemAtlas
+
+**`stematlas.sgeboers.nl`**
+
+Dutch wordplay: **stem** = *vote* AND *voice* (as in "the voice of parliament") + **atlas** = a comprehensive map of the world. Together: *an atlas of voices* β a map of how Dutch democracy sounds from the inside.
+
+It's broader than "stemwijzer" (which implies a voting guide) β it positions the site as a data exploration and journalism tool.
+
+---
+
+## Constraints
+
+- Existing VPS running Nginx, Gitea, Drone
+- Deployment pipeline: Docker build β push to registry β SSH `docker-compose up -d`
+- sgeboers.nl is a **raw HTML/CSS site** (not Hugo) hosted as a repo on git.sgeboers.nl
+- DuckDB file lives on the VPS β single writer (scheduler), multiple readers (Streamlit)
+- No new cloud services or hosting costs
+
+---
+
+## Architecture
+
+```
+Internet
+ β
+ βββ sgeboers.nl (raw HTML/CSS site, existing repo on git.sgeboers.nl)
+ β βββ blog/stematlas.html β blog post with inline charts + link to subdomain
+ β
+ βββ stematlas.sgeboers.nl
+ βββ Nginx (reverse proxy)
+ βββ Streamlit multi-page app (port 8501)
+ βββ Page 1: Stemwijzer Quiz (app.py)
+ βββ Page 2: Explorer (explorer.py)
+
+VPS filesystem:
+ /srv/stematlas/
+ βββ data/motions.db β DuckDB (shared, read-write by scheduler)
+ βββ docker-compose.yml
+```
+
+---
+
+## Components
+
+### 1. Streamlit Multi-Page App
+
+Restructure entry point from `app.py` β `Home.py` with a `pages/` directory:
+
+```
+Home.py β landing page / about
+pages/
+ 1_Stemwijzer.py β quiz (app.py content)
+ 2_Explorer.py β explorer.py content
+```
+
+Streamlit's built-in multi-page routing handles navigation. One Docker container, one port (8501).
+
+**Why not two separate containers?**
+Single shared DuckDB file on VPS filesystem. Both pages open read-only connections (quiz opens read-write for session data, but that's the existing behaviour). One container = one volume mount = no coordination overhead.
+
+### 2. Docker Compose
+
+The existing `.drone.yml` already calls `docker-compose up -d` on the VPS. We add/update `docker-compose.yml`:
+
+```
+Services:
+ stematlas:
+ image: registry/stematlas:latest
+ ports: 8501 (internal only)
+ volumes:
+ - /srv/stematlas/data:/app/data β persistent DB
+ restart: unless-stopped
+
+ scheduler:
+ image: registry/stematlas:latest
+ command: python scheduler.py
+ volumes:
+ - /srv/stematlas/data:/app/data β same DB, write access
+ restart: unless-stopped
+```
+
+**Scheduler as a sidecar**: runs in the same image but different container, keeps DB updated nightly. Streamlit container never writes to DB (except user sessions in the quiz).
+
+### 3. Nginx Vhost
+
+New server block on the VPS:
+
+```
+stematlas.sgeboers.nl β proxy_pass http://127.0.0.1:8501
+```
+
+Standard Streamlit proxy requirements: `proxy_http_version 1.1`, WebSocket upgrade headers for `/_stcore/stream`. Let's Encrypt cert via Certbot (standard pattern).
+
+### 4. Drone CI Pipeline Update
+
+Existing `.drone.yml` steps remain identical β build, push, SSH deploy. The only change: `docker-compose.yml` in the repo now references both the `stematlas` and `scheduler` services, so `docker-compose up -d` picks them both up.
+
+No new Drone secrets needed if `DOCKER_REGISTRY`, `DEPLOY_HOST` etc. are already set.
+
+### 5. Blog Post (Raw HTML page on sgeboers.nl)
+
+The blog post is a new `blog/stematlas.html` file added to the sgeboers.nl repo on git.sgeboers.nl. The Drone pipeline for that repo deploys it like any other static file β push to git, Drone copies to webroot, Nginx serves it.
+
+**Chart embedding strategy β inline Plotly divs:**
+
+Rather than iframes, we extract just the chart `
` + `