parent
daa22c5e2b
commit
2891e9ee70
@ -1,20 +1,32 @@ |
|||||||
version: '3.8' |
version: "3.9" |
||||||
|
|
||||||
services: |
services: |
||||||
stemwijzer: |
stematlas: |
||||||
build: . |
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
image: stemwijzer:latest |
|
||||||
container_name: stemwijzer_app |
|
||||||
restart: unless-stopped |
|
||||||
ports: |
ports: |
||||||
- "8501:8501" |
- "127.0.0.1:8501:8501" |
||||||
volumes: |
volumes: |
||||||
- ./data:/home/app/app/data:rw |
- /srv/stematlas/data:/home/app/app/data |
||||||
|
restart: unless-stopped |
||||||
environment: |
environment: |
||||||
- PYTHONPATH=/home/app/app |
- PYTHONPATH=/home/app/app |
||||||
- OPENROUTER_API_KEY |
- OPENROUTER_API_KEY |
||||||
- OTHER_SECRET |
- DB_PATH=/home/app/app/data/motions.db |
||||||
healthcheck: |
healthcheck: |
||||||
test: ["CMD", "curl", "-f", "http://localhost:8501/"] |
test: ["CMD", "curl", "-f", "http://localhost:8501/"] |
||||||
interval: 30s |
interval: 30s |
||||||
timeout: 3s |
timeout: 3s |
||||||
retries: 3 |
retries: 3 |
||||||
|
start_period: 15s |
||||||
|
|
||||||
|
scheduler: |
||||||
|
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
|
command: python scheduler.py |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/home/app/app/data |
||||||
|
restart: unless-stopped |
||||||
|
environment: |
||||||
|
- PYTHONPATH=/home/app/app |
||||||
|
- OPENROUTER_API_KEY |
||||||
|
- OPENAI_API_KEY |
||||||
|
- DB_PATH=/home/app/app/data/motions.db |
||||||
|
|||||||
@ -0,0 +1,5 @@ |
|||||||
|
"""Stemwijzer page — thin wrapper around the existing app module.""" |
||||||
|
|
||||||
|
from app import main # noqa: F401 (module-level set_page_config runs on import) |
||||||
|
|
||||||
|
main() |
||||||
@ -0,0 +1,5 @@ |
|||||||
|
"""Politiek Explorer page — thin wrapper around the explorer module.""" |
||||||
|
|
||||||
|
from explorer import run_app |
||||||
|
|
||||||
|
run_app() |
||||||
@ -0,0 +1,172 @@ |
|||||||
|
"""Generate additional blog charts: controversy trend + party alignment heatmap.""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
import os, sys |
||||||
|
|
||||||
|
ROOT = os.path.dirname(os.path.abspath(__file__)) |
||||||
|
if ROOT not in sys.path: |
||||||
|
sys.path.insert(0, ROOT) |
||||||
|
|
||||||
|
import duckdb |
||||||
|
import plotly.graph_objects as go |
||||||
|
import plotly.express as px |
||||||
|
import numpy as np |
||||||
|
|
||||||
|
DB = "data/motions.db" |
||||||
|
OUT = "outputs/blog-charts" |
||||||
|
os.makedirs(OUT, exist_ok=True) |
||||||
|
|
||||||
|
con = duckdb.connect(DB, read_only=True) |
||||||
|
|
||||||
|
# ─── 1. Controversy trend (bar chart, 2019-2026, quarterly) ────────────────── |
||||||
|
rows = con.execute(""" |
||||||
|
SELECT |
||||||
|
YEAR(date) || '-Q' || QUARTER(date) as wid, |
||||||
|
YEAR(date) as yr, |
||||||
|
QUARTER(date) as q, |
||||||
|
COUNT(*) as n, |
||||||
|
ROUND(AVG(controversy_score), 3) as avg_c, |
||||||
|
COUNT(*) FILTER (WHERE controversy_score >= 0.7) as high_c |
||||||
|
FROM motions |
||||||
|
WHERE controversy_score IS NOT NULL |
||||||
|
AND date >= '2019-01-01' AND date < '2026-04-01' |
||||||
|
GROUP BY wid, yr, q |
||||||
|
ORDER BY yr, q |
||||||
|
""").fetchall() |
||||||
|
|
||||||
|
windows = [r[0] for r in rows] |
||||||
|
avg_c = [r[4] for r in rows] |
||||||
|
high_pct = [round(100.0 * r[5] / r[3], 1) if r[3] else 0 for r in rows] |
||||||
|
|
||||||
|
fig = go.Figure() |
||||||
|
fig.add_trace( |
||||||
|
go.Bar( |
||||||
|
x=windows, |
||||||
|
y=high_pct, |
||||||
|
name="% highly contested (score ≥ 0.7)", |
||||||
|
marker_color="#00d9a3", |
||||||
|
opacity=0.85, |
||||||
|
) |
||||||
|
) |
||||||
|
fig.add_trace( |
||||||
|
go.Scatter( |
||||||
|
x=windows, |
||||||
|
y=[v * 100 for v in avg_c], |
||||||
|
name="avg controversy × 100", |
||||||
|
mode="lines+markers", |
||||||
|
line=dict(color="#e6edf3", width=2), |
||||||
|
marker=dict(size=4), |
||||||
|
) |
||||||
|
) |
||||||
|
fig.update_layout( |
||||||
|
title="Political controversy per quarter (Tweede Kamer, 2019–2026)", |
||||||
|
xaxis_title="Quarter", |
||||||
|
yaxis_title="% of motions", |
||||||
|
plot_bgcolor="#161b22", |
||||||
|
paper_bgcolor="#0d1117", |
||||||
|
font=dict(color="#e6edf3", family="Inter, system-ui"), |
||||||
|
legend=dict(bgcolor="rgba(0,0,0,0)", bordercolor="#30363d", borderwidth=1), |
||||||
|
xaxis=dict(tickangle=-45, gridcolor="#30363d"), |
||||||
|
yaxis=dict(gridcolor="#30363d", range=[0, 55]), |
||||||
|
bargap=0.15, |
||||||
|
) |
||||||
|
out1 = os.path.join(OUT, "controversy_trend.html") |
||||||
|
fig.write_html(out1, include_plotlyjs="cdn", full_html=True) |
||||||
|
print(f"Wrote {out1}") |
||||||
|
|
||||||
|
# ─── 2. Party alignment heatmap ────────────────────────────────────────────── |
||||||
|
# Only include major parties with sufficient data |
||||||
|
MAJOR = [ |
||||||
|
"VVD", |
||||||
|
"PVV", |
||||||
|
"D66", |
||||||
|
"CDA", |
||||||
|
"PvdA", |
||||||
|
"GroenLinks", |
||||||
|
"SP", |
||||||
|
"ChristenUnie", |
||||||
|
"SGP", |
||||||
|
"FVD", |
||||||
|
"BBB", |
||||||
|
"PvdD", |
||||||
|
"Volt", |
||||||
|
"GroenLinks-PvdA", |
||||||
|
"Nieuw Sociaal Contract", |
||||||
|
"DENK", |
||||||
|
"JA21", |
||||||
|
] |
||||||
|
|
||||||
|
rows = con.execute(""" |
||||||
|
WITH pv AS ( |
||||||
|
SELECT motion_id, party, |
||||||
|
CASE |
||||||
|
WHEN SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) THEN 'voor' |
||||||
|
WHEN SUM(CASE WHEN vote='tegen' THEN 1 ELSE 0 END) > SUM(CASE WHEN vote='voor' THEN 1 ELSE 0 END) THEN 'tegen' |
||||||
|
ELSE 'split' |
||||||
|
END as pv |
||||||
|
FROM mp_votes WHERE party IS NOT NULL AND vote IN ('voor','tegen') |
||||||
|
GROUP BY motion_id, party |
||||||
|
), |
||||||
|
d AS (SELECT * FROM pv WHERE pv != 'split') |
||||||
|
SELECT a.party, b.party, |
||||||
|
COUNT(*) as shared, |
||||||
|
ROUND(100.0 * SUM(CASE WHEN a.pv = b.pv THEN 1 ELSE 0 END) / COUNT(*), 1) as pct |
||||||
|
FROM d a JOIN d b ON a.motion_id = b.motion_id AND a.party != b.party |
||||||
|
GROUP BY a.party, b.party |
||||||
|
HAVING COUNT(*) >= 100 |
||||||
|
""").fetchall() |
||||||
|
|
||||||
|
# Build matrix |
||||||
|
agree = {} |
||||||
|
for a, b, _, pct in rows: |
||||||
|
agree[(a, b)] = pct |
||||||
|
|
||||||
|
# Filter to parties that have data |
||||||
|
present = set() |
||||||
|
for a, b in agree: |
||||||
|
if a in MAJOR: |
||||||
|
present.add(a) |
||||||
|
if b in MAJOR: |
||||||
|
present.add(b) |
||||||
|
parties = [p for p in MAJOR if p in present] |
||||||
|
|
||||||
|
n = len(parties) |
||||||
|
matrix = np.full((n, n), np.nan) |
||||||
|
for i, a in enumerate(parties): |
||||||
|
matrix[i, i] = 100.0 |
||||||
|
for j, b in enumerate(parties): |
||||||
|
if i != j and (a, b) in agree: |
||||||
|
matrix[i, j] = agree[(a, b)] |
||||||
|
|
||||||
|
fig2 = go.Figure( |
||||||
|
data=go.Heatmap( |
||||||
|
z=matrix, |
||||||
|
x=parties, |
||||||
|
y=parties, |
||||||
|
colorscale=[[0, "#6e40c9"], [0.5, "#30363d"], [1, "#00d9a3"]], |
||||||
|
zmid=70, |
||||||
|
zmin=35, |
||||||
|
zmax=100, |
||||||
|
text=[[f"{v:.0f}%" if not np.isnan(v) else "" for v in row] for row in matrix], |
||||||
|
texttemplate="%{text}", |
||||||
|
textfont=dict(size=9), |
||||||
|
hoverongaps=False, |
||||||
|
showscale=True, |
||||||
|
colorbar=dict(title="Agreement %", tickfont=dict(color="#e6edf3")), |
||||||
|
) |
||||||
|
) |
||||||
|
fig2.update_layout( |
||||||
|
title="Cross-party vote alignment (all years combined)", |
||||||
|
plot_bgcolor="#161b22", |
||||||
|
paper_bgcolor="#0d1117", |
||||||
|
font=dict(color="#e6edf3", family="Inter, system-ui", size=11), |
||||||
|
xaxis=dict(tickangle=-45, side="bottom", gridcolor="#30363d"), |
||||||
|
yaxis=dict(autorange="reversed", gridcolor="#30363d"), |
||||||
|
height=600, |
||||||
|
) |
||||||
|
out2 = os.path.join(OUT, "party_alignment.html") |
||||||
|
fig2.write_html(out2, include_plotlyjs="cdn", full_html=True) |
||||||
|
print(f"Wrote {out2}") |
||||||
|
|
||||||
|
con.close() |
||||||
|
print("Done.") |
||||||
@ -0,0 +1,14 @@ |
|||||||
|
"""Smoke test: explorer module is importable without DB or heavy computation.""" |
||||||
|
|
||||||
|
import importlib |
||||||
|
|
||||||
|
|
||||||
|
def test_explorer_importable(): |
||||||
|
mod = importlib.import_module("explorer") |
||||||
|
assert hasattr(mod, "run_app") |
||||||
|
assert callable(mod.run_app) |
||||||
|
assert hasattr(mod, "load_positions") |
||||||
|
assert hasattr(mod, "load_motions_df") |
||||||
|
assert hasattr(mod, "query_similar") |
||||||
|
assert hasattr(mod, "build_compass_tab") |
||||||
|
assert hasattr(mod, "build_search_tab") |
||||||
@ -0,0 +1,38 @@ |
|||||||
|
"""Smoke test: Home module is importable without DB or heavy computation.""" |
||||||
|
|
||||||
|
import importlib |
||||||
|
import sys |
||||||
|
|
||||||
|
|
||||||
|
def test_home_importable(): |
||||||
|
# Streamlit cannot run set_page_config outside of a server context, |
||||||
|
# so we only verify the file can be parsed/compiled, not fully executed. |
||||||
|
import ast |
||||||
|
import os |
||||||
|
|
||||||
|
home_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "Home.py") |
||||||
|
with open(home_path) as f: |
||||||
|
source = f.read() |
||||||
|
|
||||||
|
# Verify the file parses as valid Python |
||||||
|
tree = ast.parse(source) |
||||||
|
|
||||||
|
# Verify st.set_page_config is called at module level (first Streamlit command) |
||||||
|
calls = [ |
||||||
|
node |
||||||
|
for node in ast.walk(tree) |
||||||
|
if isinstance(node, ast.Call) |
||||||
|
and isinstance(node.func, ast.Attribute) |
||||||
|
and node.func.attr == "set_page_config" |
||||||
|
] |
||||||
|
assert calls, "Home.py must call st.set_page_config()" |
||||||
|
|
||||||
|
# Verify page links exist (st.page_link calls) |
||||||
|
page_links = [ |
||||||
|
node |
||||||
|
for node in ast.walk(tree) |
||||||
|
if isinstance(node, ast.Call) |
||||||
|
and isinstance(node.func, ast.Attribute) |
||||||
|
and node.func.attr == "page_link" |
||||||
|
] |
||||||
|
assert len(page_links) >= 2, "Home.py must have at least 2 st.page_link() calls" |
||||||
@ -0,0 +1,174 @@ |
|||||||
|
# Mapping Dutch Democracy: Building a Political Compass from 25,000+ Parliamentary Votes |
||||||
|
|
||||||
|
*What if you could take every motion voted on in the Dutch Parliament over the past decade and automatically plot parties and MPs on a political map — with zero manual labeling?* |
||||||
|
|
||||||
|
That's exactly what this project does. Here's how we built it, what surprised us, and what it revealed about Dutch political dynamics. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Starting Point: Open Data, Hidden Structure |
||||||
|
|
||||||
|
The Dutch Parliament publishes every vote — every *motie*, every *amendement*, every *besluit* — in an open OData API. We're talking over **25,500 motions** spanning 2016 to 2026, each with a record of how every party (and in many cases every individual MP) voted: *voor* (for), *tegen* (against), *onthouden* (abstained), or *afwezig* (absent). |
||||||
|
|
||||||
|
This is an extraordinary dataset. But in raw form it's just a table of votes. The interesting question is: can we extract *structure* — left vs. right, progressive vs. conservative, governing vs. opposition — purely from the pattern of who votes with whom? |
||||||
|
|
||||||
|
The answer is yes, and the method is surprisingly elegant. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Step 1: Turning Votes into Geometry |
||||||
|
|
||||||
|
Each motion is a snapshot of political alignment. For each motion, we know which parties voted together and which voted apart. If PvdA and GroenLinks almost always vote the same way, that tells us something. If PVV and CDA frequently diverge, that tells us something too. |
||||||
|
|
||||||
|
We represent this with **Singular Value Decomposition (SVD)** on the party-vote matrix: |
||||||
|
|
||||||
|
- Rows: parties (VVD, PVV, D66, CDA, PvdA, GroenLinks, SP, CU, SGP, FvD, BBB, ...) |
||||||
|
- Columns: motions |
||||||
|
- Values: vote encoded as +1 (voor), -1 (tegen), 0 (absent/abstain) |
||||||
|
|
||||||
|
SVD finds the dominant axes of variation — the directions along which parties disagree most strongly. The first dimension almost always corresponds to a left-right axis. The second dimension typically captures something like a libertarian-authoritarian or progressive-traditionalist axis. |
||||||
|
|
||||||
|
We run this **per quarterly window** (2019-Q1, 2019-Q2, ..., 2024-Q4) so we can track how positions shift over time at fine resolution. |
||||||
|
|
||||||
|
### The Result: A 2D Political Compass |
||||||
|
|
||||||
|
The output is coordinates for every party in 2D space — computed purely from voting behavior, with no labels or assumptions from us. When you plot it, recognizable structure emerges immediately: |
||||||
|
|
||||||
|
- **Left bloc** (PvdA, GroenLinks, SP) cluster tightly together |
||||||
|
- **Right-liberal** (VVD, D66) sit in a distinct quadrant |
||||||
|
- **Religious right** (SGP, CU) form their own coherent group |
||||||
|
- **Populist right** (PVV, FvD in later years) occupy a distant extreme |
||||||
|
- **BBB** (Farmer's party, 2022 onwards) drops into an interesting position between PVV and CDA |
||||||
|
|
||||||
|
The political axis emerges from the math — not our intuitions. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Step 2: What Each Motion Is Actually About |
||||||
|
|
||||||
|
Voting patterns tell us *who* agrees, but not *why*. For that, we add **text embeddings** — dense vector representations of each motion's title and description using a language model. |
||||||
|
|
||||||
|
This lets us do something powerful: if a new motion comes in about nitrogen emissions, we can find the 20 most similar past motions (by meaning, not just keywords). If a motion uses identical party-line voting as another motion from 2022, the text embedding can confirm they're genuinely related — or reveal that the voting pattern is coincidental (parties split on unrelated issues for similar structural reasons). |
||||||
|
|
||||||
|
We compute these using **OpenAI-compatible embeddings** via OpenRouter, processing 25,640 motions in batches of 200. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Step 3: Fused Embeddings — The Best of Both Worlds |
||||||
|
|
||||||
|
SVD gives us the political-structural signal: *how does this motion split the chamber?* Text embeddings give us semantic signal: *what is this motion about?* |
||||||
|
|
||||||
|
We concatenate both into a **fused vector** per motion per window: |
||||||
|
|
||||||
|
``` |
||||||
|
fused = [svd_dims (50)] + [text_dims (2560)] = 2610 dimensions |
||||||
|
``` |
||||||
|
|
||||||
|
This fused representation powers the similarity search. Two motions are considered "close" if they're both about a similar topic *and* they produce a similar political split. This filters out spurious matches — two motions might both be controversial (splitting 50/50) but about completely unrelated things. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Numbers: What We're Working With |
||||||
|
|
||||||
|
After the full pipeline run: |
||||||
|
|
||||||
|
| Year | Motions | |
||||||
|
|------|---------| |
||||||
|
| 2016 | 132 | |
||||||
|
| 2017 | 30 | |
||||||
|
| 2018 | 100 | |
||||||
|
| 2019 | 3,374 | |
||||||
|
| 2020 | 4,228 | |
||||||
|
| 2021 | 4,289 | |
||||||
|
| 2022 | 4,116 | |
||||||
|
| 2023 | 621 | |
||||||
|
| 2024 | 3,968 | |
||||||
|
| 2025 | 3,715 | |
||||||
|
| 2026 | 948 | |
||||||
|
|
||||||
|
The 2022 spike is striking — over 4,000 motions in a single year. This was the year the Rutte IV coalition took office amid intense debates on energy prices, housing, the war in Ukraine, and the ongoing nitrogen crisis. |
||||||
|
|
||||||
|
Our similarity cache now holds **627,272 precomputed pairs** (top 20 neighbors per motion per window), making similarity lookup instant at query time. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Interesting Findings |
||||||
|
|
||||||
|
### The 2022 Polarization Surge |
||||||
|
|
||||||
|
The 2022 cohort dominates the dataset. Looking at the SVD positions for that year, the distance between the governing coalition (VVD, D66, CDA, CU) and the opposition (PVV, SP, FvD) is near its maximum. The nitrogen crisis and energy policy debates forced unusually sharp coalition discipline. |
||||||
|
|
||||||
|
### BBB's Geometric Arrival |
||||||
|
|
||||||
|
When BBB (BoerBurgerBeweging) entered parliament in 2023 with a historic 16 seats, their SVD position placed them between PVV and CDA — exactly as expected from their policy profile: agrarian-nationalist populism with Catholic-provincial roots. The model found this without being told. |
||||||
|
|
||||||
|
### The Strange Case of "Verworpen." |
||||||
|
|
||||||
|
Motions that are rejected without debate are recorded with the title "Verworpen." (Rejected.). There are hundreds of these. Because they share a single 9-character title, their text embeddings are identical — meaning every "Verworpen." has cosine similarity 1.0 to every other "Verworpen." This is technically correct (they are textually identical) but semantically meaningless. The similarity cache contains these spurious pairs, which the UI layer needs to filter out. |
||||||
|
|
||||||
|
It's a good reminder that **data quality surprises emerge at scale**. |
||||||
|
|
||||||
|
### Party Cohesion as a Signal |
||||||
|
|
||||||
|
A subtle finding: party cohesion (how often all members of a party vote the same way) varies enormously. SGP and CU have near-perfect cohesion — they vote as a bloc on almost everything. PvdA/GroenLinks (post-merger) has similarly high cohesion. But in earlier years (2019-2020), before the merger, GroenLinks occasionally splits on specific issues around security policy. |
||||||
|
|
||||||
|
VVD shows the most internal variation — governing parties develop fissures. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Pipeline Architecture |
||||||
|
|
||||||
|
The system is built around a single DuckDB database and a modular Python pipeline: |
||||||
|
|
||||||
|
``` |
||||||
|
API (Tweede Kamer OData) |
||||||
|
→ download_past_year.py |
||||||
|
→ motions table (25,500+ rows) |
||||||
|
|
||||||
|
motions |
||||||
|
→ extract_mp_votes.py → mp_votes table (200k rows) |
||||||
|
→ text_pipeline.py → embeddings table (25,640 rows, via OpenRouter) |
||||||
|
→ svd_pipeline.py → svd_vectors table (50,779 rows, quarterly windows) |
||||||
|
|
||||||
|
svd_vectors + embeddings |
||||||
|
→ fusion.py → fused_embeddings table (35,872 rows) |
||||||
|
|
||||||
|
fused_embeddings |
||||||
|
→ similarity/compute.py → similarity_cache table (627k rows, top-20 per window) |
||||||
|
``` |
||||||
|
|
||||||
|
Everything runs locally. The only external call is to the OpenRouter API for text embeddings. The similarity computation (627k pairs) is pure NumPy — load vectors, normalize, matrix multiply, take top-k. For 4,000 motions in a quarter, that's a 4000×4000 cosine similarity matrix computed in seconds. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## What's Next |
||||||
|
|
||||||
|
The similarity cache and political compass open up several directions: |
||||||
|
|
||||||
|
**Motion explorer**: Given a motion you care about, find the 20 most politically and semantically similar motions from across the decade. Trace how a policy debate evolved from 2019 to 2025. |
||||||
|
|
||||||
|
**Party trajectory plots**: Animate party positions on the 2D compass year by year. Watch D66 drift, watch PVV consolidate, watch the new parties arrive and find their position. |
||||||
|
|
||||||
|
**Cross-party coalition predictor**: Given a new motion's text and expected vote split, predict which parties will support it based on past patterns. |
||||||
|
|
||||||
|
**The "controversy index"**: We already compute `1 - winning_margin` as a controversy score. The most controversial motions (close votes, high stakes topics) tell a story about where Dutch politics is genuinely undecided vs. where it's performing conflict for the cameras. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Reproducibility |
||||||
|
|
||||||
|
The full pipeline is open and runs on a single machine with no cloud infrastructure: |
||||||
|
|
||||||
|
```bash |
||||||
|
# Download historical data |
||||||
|
python scripts/download_past_year.py --start-date 2016-01-01 --end-date 2026-01-01 |
||||||
|
|
||||||
|
# Run full pipeline (extract votes, compute SVD, embed text, fuse, build similarity cache) |
||||||
|
python -m pipeline.run_pipeline --db-path data/motions.db \ |
||||||
|
--start-date 2016-01-01 --end-date 2026-01-01 \ |
||||||
|
--window-size annual --text-batch-size 200 |
||||||
|
``` |
||||||
|
|
||||||
|
The DB grows to ~3.6GB for the full dataset (mostly embeddings and vote records). Everything else — the SVD, fusion, similarity cache — fits comfortably in memory during computation. |
||||||
|
|
||||||
|
Democracy is more legible than it looks. |
||||||
@ -0,0 +1,165 @@ |
|||||||
|
--- |
||||||
|
date: 2026-03-22 |
||||||
|
topic: "Dynamic motion explorer + analysis refresh" |
||||||
|
status: validated |
||||||
|
--- |
||||||
|
|
||||||
|
## Problem Statement |
||||||
|
|
||||||
|
The parliamentary embedding pipeline now covers 2019–2026 with ~25,000 motions, quarterly SVD windows, fused embeddings, and a 200k+ similarity cache. None of this is visible to anyone in an interactive form. The only outputs today are static HTML files written by `generate_compass.py` (if it's been run), and a blog post with placeholder numbers. |
||||||
|
|
||||||
|
We need to: |
||||||
|
1. Regenerate all analyses and output graphs with the full dataset |
||||||
|
2. Build an interactive Streamlit explorer that surfaces the political compass, party trajectories, and motion similarity search |
||||||
|
3. Update the blog post with real numbers and findings |
||||||
|
|
||||||
|
## Constraints |
||||||
|
|
||||||
|
- Do NOT modify `app.py` or `scheduler.py` — these are the production quiz app |
||||||
|
- All DB access in the explorer must be **read-only** (no writes) — pipeline may be running |
||||||
|
- Explorer must work with existing `analysis.*` modules; no new analysis logic |
||||||
|
- Use `@st.cache_data` aggressively — `compute_2d_axes` runs PCA across all windows and is expensive (seconds, not milliseconds) |
||||||
|
- No new external dependencies beyond what's already installed (streamlit, plotly, umap-learn, scikit-learn are all present) |
||||||
|
- Follow existing code style: functional Python, `logging.getLogger(__name__)`, no print statements in library code |
||||||
|
|
||||||
|
## Approach |
||||||
|
|
||||||
|
**Single-file `explorer.py`** at the project root alongside `app.py`. |
||||||
|
|
||||||
|
Four Streamlit tabs: |
||||||
|
1. **Politiek Kompas** — 2D MP/party scatter with a window slider |
||||||
|
2. **Partij Trajectories** — Line traces of party positions over time on the compass |
||||||
|
3. **Motie Zoeken** — Free-text + filter search, returns ranked similar motions |
||||||
|
4. **Motie Browser** — Filterable table of all motions, click to expand detail + similar motions |
||||||
|
|
||||||
|
Run with: `streamlit run explorer.py` |
||||||
|
|
||||||
|
This approach is chosen because: |
||||||
|
- Reuses all existing `analysis.*` modules without changes |
||||||
|
- Single file means no new package structure to maintain |
||||||
|
- Streamlit tabs map naturally to the four distinct views a researcher would want |
||||||
|
- Read-only DB access means it can run concurrently with the pipeline |
||||||
|
|
||||||
|
## Architecture |
||||||
|
|
||||||
|
``` |
||||||
|
explorer.py |
||||||
|
├── Tab 1: Politiek Kompas |
||||||
|
│ └── analysis.political_axis.compute_2d_axes (cached) |
||||||
|
│ └── analysis.visualize.plot_political_compass → Plotly figure |
||||||
|
│ |
||||||
|
├── Tab 2: Partij Trajectories |
||||||
|
│ └── analysis.trajectory.compute_2d_trajectories (cached) |
||||||
|
│ └── analysis.visualize.plot_2d_trajectories → Plotly figure |
||||||
|
│ |
||||||
|
├── Tab 3: Motie Zoeken |
||||||
|
│ └── database.get_all_motions (cached, read-only) |
||||||
|
│ └── database.search_similar (similarity_cache lookup) |
||||||
|
│ └── Custom search: filter title/description + show voting_results |
||||||
|
│ |
||||||
|
└── Tab 4: Motie Browser |
||||||
|
└── database.get_filtered_motions (cached, read-only) |
||||||
|
└── On click: database.search_similar for related motions |
||||||
|
``` |
||||||
|
|
||||||
|
## Key Components & Responsibilities |
||||||
|
|
||||||
|
**`explorer.py`** |
||||||
|
- Page config: `st.set_page_config(layout="wide", page_title="Parlement Explorer")` |
||||||
|
- Sidebar: DB path input (default `data/motions.db`), window-size toggle (annual/quarterly) |
||||||
|
- `@st.cache_data` wrappers for all expensive DB reads and computations |
||||||
|
- Four tabs via `st.tabs([...])` |
||||||
|
|
||||||
|
**Tab 1 — Politiek Kompas** |
||||||
|
- Calls `compute_2d_axes(db_path, method='pca', pca_residual=True)` — cached |
||||||
|
- Window selector slider showing available windows |
||||||
|
- Renders the Plotly scatter for the selected window using `_render_compass_for_window(positions_by_window, window_id, party_map, axis_def)` — a thin Plotly figure builder (not writing to file) |
||||||
|
- Hover: MP name, party, (x, y) coordinates |
||||||
|
- Color by party using `_load_party_map(db_path)` — cached |
||||||
|
|
||||||
|
**Tab 2 — Partij Trajectories** |
||||||
|
- Same `positions_by_window` data from Tab 1 (shared cache hit) |
||||||
|
- Multi-select party filter (default: all major parties) |
||||||
|
- Plotly figure: one trace per party, x/y positions connected by lines, labeled by window_id |
||||||
|
- Toggle between showing MPs or just party centroids (computed as mean of MP positions per party per window) |
||||||
|
|
||||||
|
**Tab 3 — Motie Zoeken** |
||||||
|
- Search input (Dutch text, free-form) |
||||||
|
- Filters: year range (slider), policy area (multi-select), controversy score (slider) |
||||||
|
- On search: filter `motions` table in-memory against title + layman_explanation text (case-insensitive substring; no embedding search needed at this level) |
||||||
|
- Results list: each result shows title, date, policy area, controversy, layman_explanation |
||||||
|
- Expandable section per result: full description/body_text + "Vergelijkbare moties" from `similarity_cache` |
||||||
|
- Voting breakdown: parse `voting_results` JSON to show Voor/Tegen/Onthouden per party |
||||||
|
|
||||||
|
**Tab 4 — Motie Browser** |
||||||
|
- `st.dataframe` with all motions (title, date, policy_area, controversy_score, winning_margin) |
||||||
|
- Column filters at top: year, policy area |
||||||
|
- Sort by: date DESC, controversy DESC, winning_margin ASC (most contested first) |
||||||
|
- Click row → `st.session_state` stores selected motion_id → detail panel below table |
||||||
|
- Detail panel: full motion text + top-10 similar motions from similarity_cache |
||||||
|
|
||||||
|
## Data Flow |
||||||
|
|
||||||
|
1. On startup: `compute_2d_axes` runs PCA, results cached in Streamlit's in-memory cache |
||||||
|
2. Tab 1/2: pure reads from `svd_vectors` + `mp_metadata` — all cached after first load |
||||||
|
3. Tab 3: on each search, filter pre-loaded motions DataFrame in-memory (no DB query per keypress) |
||||||
|
4. Tab 4: full motions table loaded once and cached; similarity lookups hit `similarity_cache` table via existing `database.get_cached_similarities` |
||||||
|
|
||||||
|
All DuckDB connections are opened with `read_only=True` to allow concurrent pipeline access. |
||||||
|
|
||||||
|
## Error Handling |
||||||
|
|
||||||
|
- If `compute_2d_axes` fails (insufficient data for a window), skip that window and log warning — don't crash the app |
||||||
|
- If `similarity_cache` has no entries for a motion (e.g., new motion not yet processed), show "Nog geen vergelijkbare moties beschikbaar" placeholder |
||||||
|
- If DB file doesn't exist at startup, show an error banner with the path and instructions |
||||||
|
- All `duckdb.connect` calls wrapped in try/finally to guarantee close |
||||||
|
|
||||||
|
## Analysis Refresh Plan |
||||||
|
|
||||||
|
Before building the explorer, regenerate all outputs: |
||||||
|
|
||||||
|
```bash |
||||||
|
# 1. Generate political compass HTML for latest window (annual) |
||||||
|
.venv/bin/python scripts/generate_compass.py \ |
||||||
|
--db data/motions.db --out outputs \ |
||||||
|
--method pca --pca-residual |
||||||
|
|
||||||
|
# 2. Generate similarity cache for new windows (2019–2021, 2024 quarters) |
||||||
|
# (run_pipeline with --skip-metadata --skip-extract --skip-svd --skip-text) |
||||||
|
.venv/bin/python -m pipeline.run_pipeline \ |
||||||
|
--db-path data/motions.db \ |
||||||
|
--start-date 2019-01-01 --end-date 2025-01-01 \ |
||||||
|
--window-size quarterly \ |
||||||
|
--skip-metadata --skip-extract --skip-svd --skip-text |
||||||
|
|
||||||
|
# 3. Recompute similarity cache for all windows |
||||||
|
.venv/bin/python -c " |
||||||
|
from similarity.compute import recompute_all_windows |
||||||
|
recompute_all_windows('data/motions.db', window_size='quarterly', top_k=20) |
||||||
|
" |
||||||
|
``` |
||||||
|
|
||||||
|
## Blog Post Updates |
||||||
|
|
||||||
|
Target: `thoughts/blog-post-political-compass.md` |
||||||
|
|
||||||
|
- Replace placeholder motion counts table with real numbers from DB query |
||||||
|
- Add actual findings from quarterly analysis (not visible in annual windows): |
||||||
|
- 2020-Q2 COVID vote clustering — parties converge on emergency measures |
||||||
|
- 2022-Q4 nitrogen crisis — sharpest left-right split in dataset |
||||||
|
- 2023-Q1 → 2024-Q1 gap (data missing for Q2-Q4 2023) |
||||||
|
- Add "Explorer" section describing `explorer.py` and how to run it |
||||||
|
- Update similarity cache row count (was 212k, now higher with new windows) |
||||||
|
- Fix the "fused = [10] + [2560] = 2570" claim — verify actual dimensions |
||||||
|
|
||||||
|
## Testing Strategy |
||||||
|
|
||||||
|
- Explorer has no tests (it's a UI script) — verify manually by running `streamlit run explorer.py` after pipeline completes |
||||||
|
- Existing 34 tests stay green — no changes to library modules |
||||||
|
- Run tests after completing implementation: `.venv/bin/python -m pytest -q` |
||||||
|
|
||||||
|
## Open Questions |
||||||
|
|
||||||
|
- Should the explorer ship as a separate port from `app.py`? (Recommendation: yes, `app.py` stays on its port, `explorer.py` runs on a different port for internal/research use) |
||||||
|
- Should `Verworpen.` motions be filtered from search results by default? (Recommendation: yes, add a "Toon verworpen" toggle defaulting to off) |
||||||
|
- Annual or quarterly windows as the default for the compass? (Recommendation: annual — less noise, cleaner trajectories; quarterly available via sidebar toggle) |
||||||
@ -0,0 +1,229 @@ |
|||||||
|
--- |
||||||
|
date: 2026-03-22 |
||||||
|
topic: "StemAtlas — Public Deployment on sgeboers.nl" |
||||||
|
status: validated |
||||||
|
--- |
||||||
|
|
||||||
|
# StemAtlas Deployment Design |
||||||
|
|
||||||
|
## Problem Statement |
||||||
|
|
||||||
|
The stemwijzer project has three user-facing products ready to publish: |
||||||
|
1. **A blog post** explaining the political compass methodology and findings |
||||||
|
2. **An interactive explorer** (political compass, party trajectories, motion search) |
||||||
|
3. **The stemwijzer quiz** (vote on motions, see which parties match you) |
||||||
|
|
||||||
|
These need to be deployed publicly on sgeboers.nl using the existing VPS + Gitea + Drone + Docker stack. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## The Name: StemAtlas |
||||||
|
|
||||||
|
**`stematlas.sgeboers.nl`** |
||||||
|
|
||||||
|
Dutch wordplay: **stem** = *vote* AND *voice* (as in "the voice of parliament") + **atlas** = a comprehensive map of the world. Together: *an atlas of voices* — a map of how Dutch democracy sounds from the inside. |
||||||
|
|
||||||
|
It's broader than "stemwijzer" (which implies a voting guide) — it positions the site as a data exploration and journalism tool. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Constraints |
||||||
|
|
||||||
|
- Existing VPS running Nginx, Gitea, Drone |
||||||
|
- Deployment pipeline: Docker build → push to registry → SSH `docker-compose up -d` |
||||||
|
- sgeboers.nl is a **raw HTML/CSS site** (not Hugo) hosted as a repo on git.sgeboers.nl |
||||||
|
- DuckDB file lives on the VPS — single writer (scheduler), multiple readers (Streamlit) |
||||||
|
- No new cloud services or hosting costs |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Architecture |
||||||
|
|
||||||
|
``` |
||||||
|
Internet |
||||||
|
│ |
||||||
|
├── sgeboers.nl (raw HTML/CSS site, existing repo on git.sgeboers.nl) |
||||||
|
│ └── blog/stematlas.html ← blog post with inline charts + link to subdomain |
||||||
|
│ |
||||||
|
└── stematlas.sgeboers.nl |
||||||
|
└── Nginx (reverse proxy) |
||||||
|
└── Streamlit multi-page app (port 8501) |
||||||
|
├── Page 1: Stemwijzer Quiz (app.py) |
||||||
|
└── Page 2: Explorer (explorer.py) |
||||||
|
|
||||||
|
VPS filesystem: |
||||||
|
/srv/stematlas/ |
||||||
|
├── data/motions.db ← DuckDB (shared, read-write by scheduler) |
||||||
|
└── docker-compose.yml |
||||||
|
``` |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Components |
||||||
|
|
||||||
|
### 1. Streamlit Multi-Page App |
||||||
|
|
||||||
|
Restructure entry point from `app.py` → `Home.py` with a `pages/` directory: |
||||||
|
|
||||||
|
``` |
||||||
|
Home.py ← landing page / about |
||||||
|
pages/ |
||||||
|
1_Stemwijzer.py ← quiz (app.py content) |
||||||
|
2_Explorer.py ← explorer.py content |
||||||
|
``` |
||||||
|
|
||||||
|
Streamlit's built-in multi-page routing handles navigation. One Docker container, one port (8501). |
||||||
|
|
||||||
|
**Why not two separate containers?** |
||||||
|
Single shared DuckDB file on VPS filesystem. Both pages open read-only connections (quiz opens read-write for session data, but that's the existing behaviour). One container = one volume mount = no coordination overhead. |
||||||
|
|
||||||
|
### 2. Docker Compose |
||||||
|
|
||||||
|
The existing `.drone.yml` already calls `docker-compose up -d` on the VPS. We add/update `docker-compose.yml`: |
||||||
|
|
||||||
|
``` |
||||||
|
Services: |
||||||
|
stematlas: |
||||||
|
image: registry/stematlas:latest |
||||||
|
ports: 8501 (internal only) |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data ← persistent DB |
||||||
|
restart: unless-stopped |
||||||
|
|
||||||
|
scheduler: |
||||||
|
image: registry/stematlas:latest |
||||||
|
command: python scheduler.py |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data ← same DB, write access |
||||||
|
restart: unless-stopped |
||||||
|
``` |
||||||
|
|
||||||
|
**Scheduler as a sidecar**: runs in the same image but different container, keeps DB updated nightly. Streamlit container never writes to DB (except user sessions in the quiz). |
||||||
|
|
||||||
|
### 3. Nginx Vhost |
||||||
|
|
||||||
|
New server block on the VPS: |
||||||
|
|
||||||
|
``` |
||||||
|
stematlas.sgeboers.nl → proxy_pass http://127.0.0.1:8501 |
||||||
|
``` |
||||||
|
|
||||||
|
Standard Streamlit proxy requirements: `proxy_http_version 1.1`, WebSocket upgrade headers for `/_stcore/stream`. Let's Encrypt cert via Certbot (standard pattern). |
||||||
|
|
||||||
|
### 4. Drone CI Pipeline Update |
||||||
|
|
||||||
|
Existing `.drone.yml` steps remain identical — build, push, SSH deploy. The only change: `docker-compose.yml` in the repo now references both the `stematlas` and `scheduler` services, so `docker-compose up -d` picks them both up. |
||||||
|
|
||||||
|
No new Drone secrets needed if `DOCKER_REGISTRY`, `DEPLOY_HOST` etc. are already set. |
||||||
|
|
||||||
|
### 5. Blog Post (Raw HTML page on sgeboers.nl) |
||||||
|
|
||||||
|
The blog post is a new `blog/stematlas.html` file added to the sgeboers.nl repo on git.sgeboers.nl. The Drone pipeline for that repo deploys it like any other static file — push to git, Drone copies to webroot, Nginx serves it. |
||||||
|
|
||||||
|
**Chart embedding strategy — inline Plotly divs:** |
||||||
|
|
||||||
|
Rather than iframes, we extract just the chart `<div>` + `<script>` from `generate_compass.py`'s output (using `fig.to_html(include_plotlyjs='cdn', full_html=False)`) and paste them directly into the blog post HTML. This is cleaner than iframes — no border, no scroll issues, full-width, loads with the page. |
||||||
|
|
||||||
|
Plotly CDN script included once in the `<head>`. Each chart is just a `<div id="chart-N">` + a `<script>` block below it. |
||||||
|
|
||||||
|
**Linking to the subdomain:** |
||||||
|
|
||||||
|
The blog post is the *article* — it tells the story with static charts. The subdomain is the *playground*. The post links to `stematlas.sgeboers.nl` at two natural moments: |
||||||
|
- After the political compass chart: *"Explore every window interactively →"* |
||||||
|
- At the end: *"Take the quiz yourself →"* |
||||||
|
|
||||||
|
This is the right split: blog post brings readers in via search/sharing, subdomain gives them something to do. |
||||||
|
|
||||||
|
**Chart generation workflow:** |
||||||
|
|
||||||
|
``` |
||||||
|
scripts/generate_compass.py → outputs/ |
||||||
|
├── compass_2025.html ← main compass (latest window) |
||||||
|
├── trajectories_2019_2025.html ← party drift over time |
||||||
|
└── compass_2024-Q4.html ← quarterly detail |
||||||
|
``` |
||||||
|
|
||||||
|
Run `fig.to_html(include_plotlyjs='cdn', full_html=False)` to extract embeddable snippets, paste into `blog/stematlas.html` in the sgeboers.nl repo. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Blog Post Charts — What to Include |
||||||
|
|
||||||
|
The blog post narrates three acts. Each gets a supporting chart: |
||||||
|
|
||||||
|
### Act 1: The Method |
||||||
|
**No chart needed** — the SVD explanation is conceptual. Use a simple HTML table for the vote matrix illustration. |
||||||
|
|
||||||
|
### Act 2: The Political Compass |
||||||
|
**Chart: `compass_latest_annual.html`** |
||||||
|
|
||||||
|
- 2D scatter of all parties for the most recent full annual window (2024 or 2025) |
||||||
|
- Axes: PC1 (left-right) × PC2 (residual, typically progressive-traditionalist) |
||||||
|
- Points coloured and labelled by party |
||||||
|
- Interactive: hover shows party name + coordinates |
||||||
|
- Caption: "Each party's position computed purely from voting patterns — no labels applied by us" |
||||||
|
|
||||||
|
**Chart: `trajectories_all_parties.html`** |
||||||
|
|
||||||
|
- Line chart of party positions across all annual windows (2016–2025) |
||||||
|
- One line per party, coloured consistently |
||||||
|
- Key narrative moments annotated: BBB arrival (2022), coalition formation (2022), Rutte → Schoof (2024) |
||||||
|
- Interactive: toggle parties on/off via legend |
||||||
|
|
||||||
|
### Act 3: Motion Similarity |
||||||
|
**Chart: `compass_motions_sample.html`** (optional, depends on data quality) |
||||||
|
|
||||||
|
- 2D UMAP scatter of ~500 sampled motions, coloured by policy area |
||||||
|
- Shows clustering: climate motions cluster together, budget motions cluster together, etc. |
||||||
|
- If UMAP results aren't clean enough to tell a clear story, skip this one |
||||||
|
|
||||||
|
**Static table: Motion counts by year** |
||||||
|
Just a simple HTML table in the blog post (the site is raw HTML, not markdown) — no chart needed. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Data Flow |
||||||
|
|
||||||
|
``` |
||||||
|
scheduler.py (nightly) |
||||||
|
└── api_client → downloads new motions → DuckDB |
||||||
|
|
||||||
|
On demand (manual or cron): |
||||||
|
└── run_pipeline.py → SVD + embeddings + fusion + similarity cache → DuckDB |
||||||
|
└── generate_compass.py → static HTML charts → sgeboers.nl repo (blog/stematlas.html) |
||||||
|
|
||||||
|
Streamlit (reads only): |
||||||
|
└── duckdb.connect(read_only=True) → all analysis queries |
||||||
|
``` |
||||||
|
|
||||||
|
The DB is the source of truth. Charts are regenerated and re-copied to the sgeboers.nl repo whenever the pipeline produces new data — probably monthly. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Error Handling Strategy |
||||||
|
|
||||||
|
- **Streamlit crash**: Docker `restart: unless-stopped` brings it back automatically |
||||||
|
- **Scheduler crash**: Same restart policy; DuckDB's WAL handles partial writes |
||||||
|
- **DB file corruption**: Not handled beyond OS-level backup. Mitigate by adding a weekly `cp data/motions.db data/motions.db.bak` to the scheduler or as a cron job on the VPS |
||||||
|
- **Blog charts stale**: Acceptable — charts are labelled with their window date; stale by 30 days is fine for a blog post |
||||||
|
- **Streamlit + scheduler write conflict**: Scheduler is the only writer. Streamlit and quiz sessions both use separate connections; DuckDB handles concurrent reads fine. The quiz writes `user_sessions` rows — low frequency, no conflict risk with scheduler |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Testing Strategy |
||||||
|
|
||||||
|
- Import smoke test for `explorer.py` already exists (`tests/test_explorer_import.py`) |
||||||
|
- `Home.py` and `pages/` restructure needs a corresponding smoke test |
||||||
|
- Drone build will catch import errors before deploy |
||||||
|
- Manual verification: `docker-compose up` locally against a copy of `data/motions.db`, check all four Streamlit tabs render without error |
||||||
|
- Blog post charts: visual review after `generate_compass.py` run — no automated test needed |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Open Questions |
||||||
|
|
||||||
|
1. **Multi-page restructure scope**: Does the quiz (`app.py`) need any changes beyond being wrapped in a `pages/` file, or can it be imported as-is? The `if __name__ == "__main__"` guard in `app.py` needs reviewing. |
||||||
|
2. **Streamlit base path**: Subdomain approach (`stematlas.sgeboers.nl`) means no subpath complexity — Streamlit runs at `/`. Clean. |
||||||
|
3. **Chart update cadence**: Manual (run `generate_compass.py`, extract snippets, paste into blog post HTML, push to sgeboers.nl repo). Fine initially — charts are labelled with window date. |
||||||
|
4. **sgeboers.nl nav structure**: No blog directory exists yet. Need to add `blog/` dir, a `blog/stematlas.html` file, and a nav link on the main site. Structure TBD after inspecting the existing HTML/CSS site. |
||||||
|
5. **Nginx already running**: Need to confirm Certbot/Let's Encrypt workflow matches what's already set up on the VPS for other subdomains. |
||||||
@ -0,0 +1,530 @@ |
|||||||
|
# Motion Explorer Implementation Plan |
||||||
|
|
||||||
|
**Goal:** Regenerate analyses (compass + similarity cache), add an interactive Streamlit explorer (explorer.py) exposing political compass, party trajectories, motion search and browser, and update the blog post with real counts and vector-dimension facts. |
||||||
|
|
||||||
|
**Design doc:** thoughts/shared/designs/2026-03-22-motion-explorer-design.md |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Summary / Architecture |
||||||
|
|
||||||
|
We'll perform three high-level workstreams in dependency order: |
||||||
|
1. Analysis rerun: after the running pipeline releases the DB lock, run the minimal pipeline steps to (re)compute fused vectors and then recompute the similarity cache for all quarterly windows 2019-Q1 → 2024-Q4. Also run the static compass generator for verification. |
||||||
|
2. explorer.py: single-file Streamlit app placed at project root. It will use the existing analysis.* modules for heavy computations (cached via @st.cache_data) and duckdb read-only connections for all DB reads. Figures are produced with plotly and rendered inline in Streamlit. |
||||||
|
3. Blog post update: update thoughts/blog-post-political-compass.md with real DB numbers, updated similarity cache counts and correct fused vector dimensions. |
||||||
|
|
||||||
|
Key implementation decisions (gap-filling): |
||||||
|
- Explorer is a single import-safe module: top-level definitions only, no expensive work on import. Running the UI triggers computations. |
||||||
|
- Use @st.cache_data for expensive functions: load_positions (compute_2d_axes), load_party_map, load_motions_df. |
||||||
|
- All DuckDB access in explorer.py will use duckdb.connect(database=..., read_only=True). |
||||||
|
- For similarity lookups we'll query similarity_cache directly via read-only DuckDB rather than calling MotionDatabase (which opens non-read-only connections), to respect the "DB may be running" constraint. |
||||||
|
- The UI will filter out motions with title exactly "Verworpen." by default; a sidebar toggle allows showing them. |
||||||
|
- Tests: explorer is a UI script so no behavioural TDD possible. We'll add a minimal import/sanity test ensuring the module is import-safe and key functions exist. Blog-post updates are manual but the plan includes a small helper script to compute exact counts to paste into the markdown. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Dependency Graph |
||||||
|
|
||||||
|
``` |
||||||
|
Batch 1 (parallel): 1.1 [analysis-rerun - single operator task] (depends: none) |
||||||
|
Batch 2 (parallel): 2.1, 2.2 [explorer implementation + test] (depends: 1.1 for verification, but code can be implemented earlier) |
||||||
|
Batch 3 (serial): 3.1 [blog post update] (depends: 1.1) |
||||||
|
``` |
||||||
|
|
||||||
|
NOTE: The actual critical dependency is that the DB lock must be released before running the analysis rerun (Batch 1). The explorer code (Batch 2) can be implemented while the pipeline is running — it will only attempt DB reads at runtime and uses read-only connections. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 1: Analysis rerun (operator tasks — no repo files changed) |
||||||
|
|
||||||
|
These are operational steps to run after the pipeline finishes and the DB lock is released. Run from the repository root. |
||||||
|
|
||||||
|
Task 1.1: Regenerate compass outputs and fused vectors |
||||||
|
**What:** Run generate_compass.py and run the pipeline to (re)fuse vectors for quarterly windows covering 2019-Q1 → 2024-Q4. We will not re-run expensive fetch/extract/SVD/text steps if they are already up-to-date; only fusion (phase 5) must run so fused_embeddings exists for all windows. |
||||||
|
**Commands (run after pipeline finishes and DB unlocked):** |
||||||
|
|
||||||
|
- Verify DB file exists: |
||||||
|
.venv/bin/python -c "import os,sys; p='data/motions.db'; print('exists' if os.path.exists(p) else 'MISSING'); sys.exit(0)" |
||||||
|
|
||||||
|
- Run static compass for quick visual check (produces HTML output): |
||||||
|
.venv/bin/python scripts/generate_compass.py --db data/motions.db --out outputs --method pca --pca-residual |
||||||
|
|
||||||
|
- Run the pipeline orchestrator so Phase 5 (fusion) runs for quarterly windows 2019-01-01 → 2025-01-01. |
||||||
|
We explicitly skip metadata/extract/svd/text since those may already be present; this minimizes rework and avoids mixing read/write connections in the current process. |
||||||
|
|
||||||
|
.venv/bin/python -m pipeline.run_pipeline \ |
||||||
|
--db-path data/motions.db \ |
||||||
|
--start-date 2019-01-01 --end-date 2025-01-01 \ |
||||||
|
--window-size quarterly \ |
||||||
|
--skip-metadata --skip-extract --skip-svd --skip-text |
||||||
|
|
||||||
|
**Notes:** run_pipeline.py includes a --skip-fusion flag; we MUST NOT pass --skip-fusion here because we want fusion to execute. The script supports exactly the flags shown. |
||||||
|
|
||||||
|
**Verify:** |
||||||
|
- After run_pipeline completes, verify fused_embeddings rows exist for expected windows: |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
print(conn.execute("SELECT window_id, COUNT(*) FROM fused_embeddings GROUP BY window_id ORDER BY window_id DESC").fetchall()) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
Task 1.2: Recompute similarity cache for all quarterly windows 2019-Q1 → 2024-Q4 |
||||||
|
**What:** Compute top-20 similarities per motion per window for the fused vectors and insert rows into similarity_cache. We will run similarity.compute.compute_similarities per window. The repository's similarity/compute.py exposes compute_similarities(vector_type='fused', window_id=..., top_k=20). |
||||||
|
|
||||||
|
**Command (one-liner loop):** |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
from similarity.compute import compute_similarities |
||||||
|
windows = [] |
||||||
|
years = range(2019, 2025) # 2019..2024 |
||||||
|
for y in years: |
||||||
|
for q in (1,2,3,4): |
||||||
|
windows.append(f"{y}-Q{q}") |
||||||
|
total = 0 |
||||||
|
for wid in windows: |
||||||
|
inserted = compute_similarities(vector_type='fused', window_id=wid, top_k=20, db_path='data/motions.db') |
||||||
|
print(f"window={wid} inserted={inserted}") |
||||||
|
total += inserted |
||||||
|
print('DONE total_inserted=', total) |
||||||
|
PY |
||||||
|
|
||||||
|
**Notes & decisions:** |
||||||
|
- The compute_similarities function already clears existing rows for (vector_type, window_id) before inserting new ones, so this is safe to re-run. |
||||||
|
- If compute_similarities raises memory pressure for large windows, run on subsets (split windows further) — but try the simple loop first. |
||||||
|
|
||||||
|
**Verify:** |
||||||
|
- Basic counts per window: |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
print(conn.execute("SELECT window_id, COUNT(*) FROM similarity_cache WHERE vector_type = 'fused' GROUP BY window_id ORDER BY window_id").fetchall()) |
||||||
|
print('total', conn.execute("SELECT COUNT(*) FROM similarity_cache WHERE vector_type = 'fused'").fetchone()) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
- Spot-check top neighbors for a known motion id (replace 123 with a real id observed from motions table): |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
print(conn.execute("SELECT id FROM motions ORDER BY id LIMIT 1").fetchall()) |
||||||
|
src = conn.execute("SELECT id FROM motions ORDER BY id LIMIT 1").fetchone()[0] |
||||||
|
print('example source id=', src) |
||||||
|
print(conn.execute("SELECT target_motion_id, score FROM similarity_cache WHERE source_motion_id = ? AND vector_type = 'fused' ORDER BY score DESC LIMIT 10", (src,)).fetchall()) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 2: Explorer implementation (code + test) — parallel implementers |
||||||
|
|
||||||
|
All tasks in this batch are independent and can be worked on in parallel. The single file to add is explorer.py at the project root. A small unit test ensures import-safety. |
||||||
|
|
||||||
|
Decision: explorer.py will be placed at project root (same level as app.py) as requested by design. It will avoid performing DB work at import time so tests and other scripts can import it safely. |
||||||
|
|
||||||
|
### Task 2.1: explorer.py |
||||||
|
**File:** explorer.py |
||||||
|
**Test:** tests/test_explorer_import.py |
||||||
|
**Depends:** none (safe to implement while pipeline runs) |
||||||
|
|
||||||
|
Implementation (copy-paste-ready). This is a minimal, well-documented, and import-safe Streamlit app that follows the design requirements. It uses @st.cache_data on heavy functions, opens DuckDB with read_only=True for all reads, and uses existing analysis modules for computing 2D axes. |
||||||
|
|
||||||
|
```python |
||||||
|
# explorer.py |
||||||
|
"""Streamlit motion explorer. |
||||||
|
|
||||||
|
Import-safe: heavy computations are behind functions guarded by @st.cache_data |
||||||
|
and only run when the user opens the app (streamlit run explorer.py). |
||||||
|
""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
|
||||||
|
import logging |
||||||
|
from typing import Dict, List, Optional, Tuple |
||||||
|
|
||||||
|
import duckdb |
||||||
|
import pandas as pd |
||||||
|
import plotly.express as px |
||||||
|
import streamlit as st |
||||||
|
|
||||||
|
# keep a module-level logger |
||||||
|
logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
|
||||||
|
# ---------- Cached data loaders ---------- |
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data |
||||||
|
def load_positions(db_path: str = "data/motions.db", window_size: str = "annual") -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Optional[Dict]]: |
||||||
|
"""Load positions_by_window and axis_def using existing analysis.political_axis.compute_2d_axes. |
||||||
|
|
||||||
|
This delegates heavy computation to the analysis module and caches the result in Streamlit. |
||||||
|
The function intentionally accepts db_path so callers (tests) can pass a different path. |
||||||
|
""" |
||||||
|
try: |
||||||
|
from analysis.political_axis import compute_2d_axes |
||||||
|
except Exception as e: |
||||||
|
logger.exception("analysis.political_axis not available: %s", e) |
||||||
|
return {}, None |
||||||
|
|
||||||
|
# compute_2d_axes may be expensive; we let the analysis module handle internals |
||||||
|
positions_by_window, axis_def = compute_2d_axes( |
||||||
|
db_path, method="pca", pca_residual=True, normalize_vectors=True |
||||||
|
) |
||||||
|
return positions_by_window, axis_def |
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data |
||||||
|
def load_party_map(db_path: str = "data/motions.db") -> Dict[str, str]: |
||||||
|
"""Return mp_name -> party mapping. |
||||||
|
|
||||||
|
Uses the helper in analysis.visualize which already knows heuristics. |
||||||
|
""" |
||||||
|
try: |
||||||
|
from analysis.visualize import _load_party_map |
||||||
|
|
||||||
|
return _load_party_map(db_path) |
||||||
|
except Exception: |
||||||
|
logger.exception("Failed to load party map") |
||||||
|
return {} |
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data |
||||||
|
def load_motions_df(db_path: str = "data/motions.db") -> pd.DataFrame: |
||||||
|
"""Load motions table into a cached pandas DataFrame (read-only connection). |
||||||
|
|
||||||
|
Columns returned: id, title, description, date, policy_area, voting_results, layman_explanation, winning_margin, controversy_score |
||||||
|
""" |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(database=db_path, read_only=True) |
||||||
|
df = conn.execute( |
||||||
|
"SELECT id, title, description, date, policy_area, voting_results, layman_explanation, winning_margin, controversy_score FROM motions" |
||||||
|
).fetchdf() |
||||||
|
return df |
||||||
|
finally: |
||||||
|
if conn is not None: |
||||||
|
try: |
||||||
|
conn.close() |
||||||
|
except Exception: |
||||||
|
pass |
||||||
|
|
||||||
|
|
||||||
|
def query_similar_from_cache(db_path: str, source_motion_id: int, vector_type: str = "fused", window_id: Optional[str] = None, top_k: int = 10) -> List[Dict]: |
||||||
|
"""Query similarity_cache table using a read-only connection. |
||||||
|
|
||||||
|
Returns list of dicts with keys target_motion_id, score, id. |
||||||
|
""" |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(database=db_path, read_only=True) |
||||||
|
params = [source_motion_id, vector_type] |
||||||
|
query = "SELECT target_motion_id, score, id, window_id FROM similarity_cache WHERE source_motion_id = ? AND vector_type = ?" |
||||||
|
if window_id is not None: |
||||||
|
query += " AND window_id = ?" |
||||||
|
params.append(window_id) |
||||||
|
query += " ORDER BY score DESC LIMIT ?" |
||||||
|
params.append(top_k) |
||||||
|
rows = conn.execute(query, params).fetchall() |
||||||
|
cols = [c[0] for c in conn.description] |
||||||
|
return [dict(zip(cols, r)) for r in rows] |
||||||
|
finally: |
||||||
|
if conn is not None: |
||||||
|
try: |
||||||
|
conn.close() |
||||||
|
except Exception: |
||||||
|
pass |
||||||
|
|
||||||
|
|
||||||
|
# ---------- UI builders ---------- |
||||||
|
|
||||||
|
|
||||||
|
def build_compass_tab(db_path: str, window_size: str, show_rejected: bool): |
||||||
|
positions_by_window, axis_def = load_positions(db_path, window_size) |
||||||
|
party_map = load_party_map(db_path) |
||||||
|
|
||||||
|
if not positions_by_window: |
||||||
|
st.error("No position data available. Run the pipeline or check data/motions.db") |
||||||
|
return |
||||||
|
|
||||||
|
windows = sorted(positions_by_window.keys()) |
||||||
|
# default: latest window |
||||||
|
default_index = max(0, len(windows) - 1) |
||||||
|
idx = st.slider("Window", 0, len(windows) - 1, default_index) |
||||||
|
window_id = windows[idx] |
||||||
|
|
||||||
|
pos = positions_by_window.get(window_id, {}) |
||||||
|
names = list(pos.keys()) |
||||||
|
xs = [p[0] for p in pos.values()] |
||||||
|
ys = [p[1] for p in pos.values()] |
||||||
|
parties = [party_map.get(n, "Unknown") for n in names] |
||||||
|
|
||||||
|
fig = px.scatter(x=xs, y=ys, color=parties, hover_name=names, title=f"Political Compass ({window_id})") |
||||||
|
st.plotly_chart(fig, use_container_width=True) |
||||||
|
|
||||||
|
|
||||||
|
def build_trajectories_tab(db_path: str, window_size: str): |
||||||
|
positions_by_window, _ = load_positions(db_path, window_size) |
||||||
|
if not positions_by_window: |
||||||
|
st.error("No trajectories available") |
||||||
|
return |
||||||
|
|
||||||
|
window_ids = sorted(positions_by_window.keys()) |
||||||
|
# Build per-party centroids per window |
||||||
|
import numpy as _np |
||||||
|
|
||||||
|
party_map = load_party_map(db_path) |
||||||
|
# user control |
||||||
|
show_mps = st.checkbox("Show MPs (individual trajectories)", value=False) |
||||||
|
selected_parties = st.multiselect("Parties (select to restrict)", options=sorted(set(party_map.values())), default=None) |
||||||
|
|
||||||
|
fig = None |
||||||
|
if show_mps: |
||||||
|
# plot a small subset by default to avoid clutter |
||||||
|
mp_limit = 200 |
||||||
|
traces = [] |
||||||
|
# build mp_coords |
||||||
|
mp_coords = {} |
||||||
|
for wid in window_ids: |
||||||
|
for mp, coord in positions_by_window.get(wid, {}).items(): |
||||||
|
mp_coords.setdefault(mp, []).append((wid, coord)) |
||||||
|
|
||||||
|
# optionally filter by party map |
||||||
|
mps = [m for m in mp_coords.keys() if (not selected_parties) or (party_map.get(m) in selected_parties)] |
||||||
|
mps = sorted(mps)[:mp_limit] |
||||||
|
|
||||||
|
fig = px.line() |
||||||
|
for mp in mps: |
||||||
|
items = sorted(mp_coords[mp], key=lambda it: window_ids.index(it[0])) |
||||||
|
xs = [c[1][0] for c in items] |
||||||
|
ys = [c[1][1] for c in items] |
||||||
|
fig.add_scatter(x=xs, y=ys, mode='lines+markers', name=mp) |
||||||
|
else: |
||||||
|
# party centroids |
||||||
|
party_centroids = {} |
||||||
|
for wid in window_ids: |
||||||
|
coords_by_party = {} |
||||||
|
for mp, coord in positions_by_window.get(wid, {}).items(): |
||||||
|
party = party_map.get(mp) |
||||||
|
if party is None: |
||||||
|
continue |
||||||
|
|
||||||
|
|
||||||
|
coords_by_party.setdefault(party, []).append(coord) |
||||||
|
for party, coords in coords_by_party.items(): |
||||||
|
xs = [c[0] for c in coords] |
||||||
|
ys = [c[1] for c in coords] |
||||||
|
centroid = (_np.mean(xs), _np.mean(ys)) |
||||||
|
party_centroids.setdefault(party, {'windows': [], 'coords': []}) |
||||||
|
party_centroids[party]['windows'].append(wid) |
||||||
|
party_centroids[party]['coords'].append(centroid) |
||||||
|
|
||||||
|
fig = px.line() |
||||||
|
for party, data in party_centroids.items(): |
||||||
|
if selected_parties and party not in selected_parties: |
||||||
|
continue |
||||||
|
|
||||||
|
xs = [c[0] for c in data['coords']] |
||||||
|
ys = [c[1] for c in data['coords']] |
||||||
|
fig.add_scatter(x=xs, y=ys, mode='lines+markers', name=party) |
||||||
|
|
||||||
|
if fig is not None: |
||||||
|
st.plotly_chart(fig, use_container_width=True) |
||||||
|
|
||||||
|
|
||||||
|
def build_search_tab(db_path: str, show_rejected: bool): |
||||||
|
df = load_motions_df(db_path) |
||||||
|
if df is None or df.empty: |
||||||
|
st.info("No motions table available") |
||||||
|
return |
||||||
|
|
||||||
|
# filters |
||||||
|
years = sorted(pd.to_datetime(df['date']).dt.year.dropna().unique().tolist()) |
||||||
|
if years: |
||||||
|
start_year, end_year = min(years), max(years) |
||||||
|
else: |
||||||
|
start_year, end_year = 2019, 2024 |
||||||
|
|
||||||
|
year_range = st.slider("Year range", int(start_year), int(end_year), (int(start_year), int(end_year))) |
||||||
|
policy_areas = sorted(df['policy_area'].dropna().unique().tolist()) |
||||||
|
policy_filter = st.multiselect("Policy areas", options=policy_areas, default=None) |
||||||
|
query = st.text_input("Search text (title / layman_explanation)") |
||||||
|
|
||||||
|
# in-memory filter |
||||||
|
working = df.copy() |
||||||
|
# filter rejected default |
||||||
|
if not show_rejected: |
||||||
|
working = working[working['title'].str.strip() != 'Verworpen.'] |
||||||
|
|
||||||
|
working['y'] = pd.to_datetime(working['date']).dt.year |
||||||
|
working = working[(working['y'] >= year_range[0]) & (working['y'] <= year_range[1])] |
||||||
|
if policy_filter: |
||||||
|
working = working[working['policy_area'].isin(policy_filter)] |
||||||
|
if query: |
||||||
|
q = query.lower() |
||||||
|
mask = working['title'].fillna('').str.lower().str.contains(q) | working['layman_explanation'].fillna('').str.lower().str.contains(q) |
||||||
|
working = working[mask] |
||||||
|
|
||||||
|
st.write(f"{len(working)} results") |
||||||
|
for _, row in working.sort_values(by='controversy_score', ascending=False).head(50).iterrows(): |
||||||
|
with st.expander(f"{row['title']} — {row['date']}"): |
||||||
|
st.write(row.get('layman_explanation') or row.get('description') or '') |
||||||
|
st.write('Policy area:', row.get('policy_area')) |
||||||
|
st.write('Controversy score:', row.get('controversy_score')) |
||||||
|
# similar |
||||||
|
similar = query_similar_from_cache(db_path, int(row['id']), vector_type='fused', top_k=10) |
||||||
|
if similar: |
||||||
|
st.write('Vergelijkbare moties:') |
||||||
|
for s in similar: |
||||||
|
st.write(f"- id={s['target_motion_id']} score={s['score']:.3f} window={s.get('window_id')}") |
||||||
|
else: |
||||||
|
st.info('Nog geen vergelijkbare moties beschikbaar') |
||||||
|
|
||||||
|
|
||||||
|
def build_browser_tab(db_path: str, show_rejected: bool):
    """Render the motion browser tab: a sortable table plus a row-detail view.

    Args:
        db_path: Path to the DuckDB database file.
        show_rejected: When False, motions titled 'Verworpen.' are hidden.
    """
    df = load_motions_df(db_path)

    if df is None or df.empty:
        st.info("No motions table available")
        return

    if not show_rejected:
        df = df[df['title'].str.strip() != 'Verworpen.']

    df_display = df[['id', 'title', 'date', 'policy_area', 'controversy_score', 'winning_margin']].copy()
    df_display = df_display.sort_values(by=['date'], ascending=False)

    # Guard: filtering out rejected motions may leave nothing; the
    # number_input/iloc lookups below would raise on an empty frame.
    if df_display.empty:
        st.info("No motions match the current filters")
        return

    # NOTE(review): st.experimental_data_editor was renamed st.data_editor in
    # newer Streamlit releases — confirm the pinned version before upgrading.
    # The editor's return value is intentionally unused; the table is shown
    # for browsing only.
    st.experimental_data_editor(df_display, num_rows='dynamic')

    # The data editor does not report a selected row, so selection is done via
    # an explicit 0-based index input plus a button.
    st.write('Select a row and click "Show details"')

    sel_row_idx = st.number_input('Select row index (0-based)', min_value=0, max_value=max(0, len(df_display)-1), value=0)

    if st.button('Show details'):
        row = df_display.iloc[int(sel_row_idx)]
        st.subheader(row['title'])
        st.write(df.loc[df['id'] == row['id']].iloc[0].get('description') or '')

        similar = query_similar_from_cache(db_path, int(row['id']), vector_type='fused', top_k=10)
        if similar:
            st.write('Top similar:')
            for s in similar:
                st.write(f"- id={s['target_motion_id']} score={s['score']:.3f} window={s.get('window_id')}")
        else:
            st.info('Nog geen vergelijkbare moties beschikbaar')
||||||
|
|
||||||
|
|
||||||
|
def run_app():
    """Entry point: configure the Streamlit page, read sidebar settings, and
    render the four explorer tabs."""
    st.set_page_config(layout='wide', page_title='Parlement Explorer')

    # Sidebar controls shared by every tab.
    st.sidebar.title('Explorer settings')
    db_path = st.sidebar.text_input('DuckDB path', value='data/motions.db')
    window_granularity = st.sidebar.selectbox('Window granularity', ['annual', 'quarterly'], index=0)
    show_rejected = st.sidebar.checkbox('Toon verworpen', value=False)

    kompas_tab, trajectories_tab, search_tab, browser_tab = st.tabs(
        ['Politiek Kompas', 'Partij Trajectories', 'Motie Zoeken', 'Motie Browser']
    )

    with kompas_tab:
        build_compass_tab(db_path, window_granularity, show_rejected)
    with trajectories_tab:
        build_trajectories_tab(db_path, window_granularity)
    with search_tab:
        build_search_tab(db_path, show_rejected)
    with browser_tab:
        build_browser_tab(db_path, show_rejected)
||||||
|
|
||||||
|
|
||||||
|
# Allow running the explorer directly (e.g. `streamlit run explorer.py`).
if __name__ == '__main__':
    run_app()
||||||
|
``` |
||||||
|
|
||||||
|
**Verify (local/dev):** |
||||||
|
- Run the app once the DB is available: streamlit run explorer.py |
||||||
|
- Verify that Tab 1 loads and you can slide windows, plot renders inline |
||||||
|
- Verify Tab 3 search returns results and shows similar motions |
||||||
|
- Verify all long-running operations are cached (first call slow, subsequent fast) |
||||||
|
|
||||||
|
### Task 2.2: Test for explorer import-safety |
||||||
|
**File:** tests/test_explorer_import.py |
||||||
|
**Depends:** none |
||||||
|
|
||||||
|
Minimal pytest to ensure the module can be imported without triggering heavy work and that run_app and key functions exist. |
||||||
|
|
||||||
|
```python |
||||||
|
# tests/test_explorer_import.py |
||||||
|
import importlib |
||||||
|
|
||||||
|
|
||||||
|
def test_explorer_importable(): |
||||||
|
mod = importlib.import_module('explorer') |
||||||
|
assert hasattr(mod, 'run_app') |
||||||
|
assert callable(mod.run_app) |
||||||
|
# key helpers |
||||||
|
assert hasattr(mod, 'load_positions') |
||||||
|
assert hasattr(mod, 'load_motions_df') |
||||||
|
``` |
||||||
|
|
||||||
|
**Verify:** |
||||||
|
- Run tests (no DB required for import test): |
||||||
|
.venv/bin/python -m pytest tests/test_explorer_import.py -q |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 3: Blog post update (manual / single-file edit) |
||||||
|
|
||||||
|
The blog post at thoughts/blog-post-political-compass.md contains placeholder numbers for motion counts, similarity cache totals and fused vector dimension claim. After analysis rerun completes, update the markdown with exact numbers. |
||||||
|
|
||||||
|
### Task 3.1: Update blog post with real numbers |
||||||
|
**File to modify:** thoughts/blog-post-political-compass.md |
||||||
|
**Depends:** 1.1, 1.2 (analysis rerun and similarity cache recompute must finish first) |
||||||
|
|
||||||
|
Steps to compute authoritative numbers (run after Batch 1 completes): |
||||||
|
1. Motion counts per year (SQL): |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
rows = conn.execute("SELECT EXTRACT(year FROM date) AS y, COUNT(*) FROM motions GROUP BY y ORDER BY y").fetchall() |
||||||
|
print(rows) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
2. Similarity cache total count (fused vectors): |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
total = conn.execute("SELECT COUNT(*) FROM similarity_cache WHERE vector_type = 'fused'").fetchone()[0] |
||||||
|
print('similarity_cache_fused_total=', total) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
3. Verify fused vector dimensions claim (inspect fused_embeddings.vector JSON lengths) — the fused field is stored as JSON array; compute distinct lengths: |
||||||
|
.venv/bin/python - <<'PY' |
||||||
|
import duckdb, json |
||||||
|
conn = duckdb.connect(database='data/motions.db', read_only=True) |
||||||
|
lens = conn.execute("SELECT DISTINCT CARDINALITY(vector) FROM fused_embeddings ORDER BY 1 DESC").fetchall() |
||||||
|
print('distinct_fused_lengths=', lens) |
||||||
|
conn.close() |
||||||
|
PY |
||||||
|
|
||||||
|
Replace the placeholder table and counts in thoughts/blog-post-political-compass.md with the outputs above. Also correct the fused dimensions claim (line that currently reads "fused = [svd_dims (10)] + [text_dims (2560)] = 2570") by pasting the real dimensions found. |
||||||
|
|
||||||
|
Verification: After editing, spell-check and run a quick search to ensure the old placeholder numbers are gone: |
||||||
|
grep -n "212,206\|2570\|~450 (newly backfilled)" thoughts/blog-post-political-compass.md || echo "No placeholders remain" |
||||||
|
|
||||||
|
Commit message suggestions (to use when committing these changes): |
||||||
|
- feat(explorer): add initial Streamlit explorer (explorer.py) + import test |
||||||
|
- chore(analysis): recompute fused embeddings + similarity cache for 2019-Q1..2024-Q4 (instructions) |
||||||
|
- docs(blog): update political compass blog post with real counts and vector dims |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Rollout / verification checklist (final acceptance) |
||||||
|
- [ ] Analysis rerun finished without errors; fused_embeddings rows present for 2019-Q1..2024-Q4 |
||||||
|
- [ ] similarity_cache contains top-k neighbors for each window (spot-check 3 windows) |
||||||
|
- [ ] explorer.py runs: streamlit run explorer.py renders tabs and figures inline |
||||||
|
- [ ] explorer uses read-only DuckDB connections (manual code review + spot-check) |
||||||
|
- [ ] thoughts/blog-post-political-compass.md updated with real numbers and vector dims |
||||||
|
- [ ] All tests still pass: .venv/bin/python -m pytest -q |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Appendix: reasoning & decisions |
||||||
|
- Design requires read-only DB access: MotionDatabase methods often open connections without read_only flag. To guarantee read-only behaviour while the pipeline runs, explorer.py queries DuckDB directly with read_only=True for all SELECTs. This avoids accidentally holding write locks. |
||||||
|
- The design required using existing analysis.* modules. compute_2d_axes is used as-is and wrapped by @st.cache_data; we rely on it to perform heavy PCA/SVD logic. |
||||||
|
- The similarity recompute step uses similarity.compute.compute_similarities per-window. The design referenced recompute_all_windows which did not exist in the repo; we use a small loop (shown above) to call compute_similarities per window. |
||||||
|
|
||||||
|
*** End Plan |
||||||
@ -0,0 +1,286 @@ |
|||||||
|
# StemAtlas Deployment — Implementation Plan |
||||||
|
|
||||||
|
**Design:** `thoughts/shared/designs/2026-03-22-stematlas-deployment-design.md` |
||||||
|
**Date:** 2026-03-22 |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Overview |
||||||
|
|
||||||
|
Four independent batches. Batches A and B can run in parallel. Batch C requires the pipeline to finish first. Batch D is VPS infrastructure (manual steps, done once). |
||||||
|
|
||||||
|
``` |
||||||
|
Batch A: stemwijzer repo — Streamlit multi-page + Docker |
||||||
|
Batch B: sgeboers.nl repo — blog/, nav, blog post HTML skeleton |
||||||
|
Batch C: Charts — generate + embed (after pipeline finishes) |
||||||
|
Batch D: VPS infrastructure — Nginx vhost + Certbot + /srv/stematlas/ |
||||||
|
``` |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch A — stemwijzer repo: Streamlit multi-page + Docker |
||||||
|
|
||||||
|
### A1. Check Dockerfile |
||||||
|
Read existing `Dockerfile` — verify it installs all deps from `pyproject.toml` and sets `CMD` to start the app. Note current entrypoint (probably `streamlit run app.py`). |
||||||
|
|
||||||
|
### A2. Create `Home.py` |
||||||
|
New file at project root. Streamlit landing/about page: |
||||||
|
- Title: "StemAtlas" |
||||||
|
- Brief description of the two pages (quiz + explorer) |
||||||
|
- Links (Streamlit sidebar nav handles the rest automatically) |
||||||
|
- `st.page_link()` cards pointing to the two pages |
||||||
|
|
||||||
|
### A3. Create `pages/1_Stemwijzer.py` |
||||||
|
Thin wrapper that imports and calls `app.main()`: |
||||||
|
- Import `from app import main` |
||||||
|
- Remove the `if __name__ == "__main__": main()` guard from `app.py` (or keep it — Streamlit ignores it when the file is imported) |
||||||
|
- The page title shown in Streamlit nav comes from the filename: `1_Stemwijzer` → "Stemwijzer" |
||||||
|
|
||||||
|
### A4. Create `pages/2_Explorer.py` |
||||||
|
Same pattern: |
||||||
|
- Import `from explorer import run_app` |
||||||
|
- Call `run_app()` |
||||||
|
- Filename → nav label: "Explorer" |
||||||
|
|
||||||
|
### A5. Update Dockerfile CMD |
||||||
|
Change entrypoint from `streamlit run app.py` to `streamlit run Home.py --server.port 8501 --server.address 0.0.0.0`. |
||||||
|
|
||||||
|
### A6. Create `docker-compose.yml` |
||||||
|
Two services in the stemwijzer repo: |
||||||
|
|
||||||
|
```yaml |
||||||
|
version: "3.9" |
||||||
|
services: |
||||||
|
stematlas: |
||||||
|
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
|
ports: |
||||||
|
- "127.0.0.1:8501:8501" |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data |
||||||
|
restart: unless-stopped |
||||||
|
environment: |
||||||
|
- DB_PATH=/app/data/motions.db |
||||||
|
|
||||||
|
scheduler: |
||||||
|
image: ${DOCKER_REGISTRY}/sgeboers/stemwijzer:latest |
||||||
|
command: python scheduler.py |
||||||
|
volumes: |
||||||
|
- /srv/stematlas/data:/app/data |
||||||
|
restart: unless-stopped |
||||||
|
environment: |
||||||
|
- DB_PATH=/app/data/motions.db |
||||||
|
``` |
||||||
|
|
||||||
|
`127.0.0.1:8501` — only accessible from localhost, Nginx proxies externally. |
||||||
|
|
||||||
|
### A7. Smoke test for `Home.py` |
||||||
|
Add `tests/test_home_import.py` — same pattern as `test_explorer_import.py`. Verify `Home` module is importable, `run_app` or equivalent callable exists. |
||||||
|
|
||||||
|
### A8. Run tests |
||||||
|
`.venv/bin/python -m pytest -q` — all existing + new smoke tests must pass. |
||||||
|
|
||||||
|
### Verification |
||||||
|
`docker build -t stematlas-local .` locally to confirm image builds without errors. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch B — sgeboers.nl repo: blog/ + nav |
||||||
|
|
||||||
|
> This batch requires access to the sgeboers.nl repo on git.sgeboers.nl. |
||||||
|
> Steps below assume the repo is cloned locally. |
||||||
|
|
||||||
|
### B1. Inspect existing site structure |
||||||
|
Read `index.html` and any existing CSS files to understand: |
||||||
|
- Current nav structure (header? sidebar? footer?) |
||||||
|
- CSS class conventions for links/sections |
||||||
|
- Any existing page patterns to copy for the blog post |
||||||
|
|
||||||
|
### B2. Create `blog/` directory |
||||||
|
Add `blog/index.html` — a minimal blog listing page: |
||||||
|
- Title: "Blog" |
||||||
|
- One entry: "StemAtlas — Mapping Dutch Democracy" → `blog/stematlas.html` |
||||||
|
- Matches existing site style |
||||||
|
|
||||||
|
### B3. Add nav link to main site |
||||||
|
Update `index.html` (or whichever file contains the nav) to add a "Blog" link pointing to `/blog/`. |
||||||
|
|
||||||
|
### B4. Create `blog/stematlas.html` skeleton |
||||||
|
Full blog post HTML based on `thoughts/blog-post-political-compass.md`: |
||||||
|
- Convert markdown to HTML (headings, paragraphs, code blocks, tables) |
||||||
|
- Add Plotly CDN `<script>` in `<head>` |
||||||
|
- **Chart placeholders**: `<!-- CHART: compass_latest -->`, `<!-- CHART: trajectories -->` — to be filled in Batch C |
||||||
|
- Add two CTAs linking to `stematlas.sgeboers.nl`: |
||||||
|
- After compass chart: *"Explore every window interactively →"* |
||||||
|
- At bottom: *"Try the Stemwijzer quiz →"* |
||||||
|
- Match existing site CSS (link the same stylesheet) |
||||||
|
|
||||||
|
### B5. Update Drone pipeline (sgeboers.nl repo) |
||||||
|
Confirm the existing `.drone.yml` in sgeboers.nl picks up new files under `blog/` automatically (it should, if it deploys the whole repo root). No changes needed if it's already a `rsync` or `cp -r` deploy. |
||||||
|
|
||||||
|
### Verification |
||||||
|
Open `blog/stematlas.html` locally in browser — post renders correctly with placeholder chart divs, nav works. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch C — Charts: generate + embed (after pipeline finishes ~21:40) |
||||||
|
|
||||||
|
> Requires `data/motions.db` to be unlocked (pipeline complete). |
||||||
|
|
||||||
|
### C1. Run tests |
||||||
|
`.venv/bin/python -m pytest -q` — confirm all pass now that DB is free. |
||||||
|
|
||||||
|
### C2. Run similarity cache recompute |
||||||
|
``` |
||||||
|
.venv/bin/python -m pipeline.run_pipeline \ |
||||||
|
--db-path data/motions.db \ |
||||||
|
--start-date 2019-01-01 --end-date 2025-01-01 \ |
||||||
|
--window-size quarterly \ |
||||||
|
--skip-metadata --skip-extract --skip-svd --skip-text |
||||||
|
``` |
||||||
|
Fusion only — fills `fused_embeddings` for new 2019–2021 and 2024 windows. |
||||||
|
|
||||||
|
### C3. Recompute similarity cache |
||||||
|
``` |
||||||
|
.venv/bin/python -c " |
||||||
|
from similarity.compute import compute_similarities |
||||||
|
import duckdb |
||||||
|
conn = duckdb.connect('data/motions.db', read_only=True) |
||||||
|
windows = [r[0] for r in conn.execute(\"SELECT DISTINCT window_id FROM fused_embeddings ORDER BY 1\").fetchall()] |
||||||
|
conn.close() |
||||||
|
for w in windows: |
||||||
|
print(f'Computing {w}...') |
||||||
|
compute_similarities('data/motions.db', w, top_k=20) |
||||||
|
" |
||||||
|
``` |
||||||
|
|
||||||
|
### C4. Generate compass HTML files |
||||||
|
``` |
||||||
|
.venv/bin/python scripts/generate_compass.py \ |
||||||
|
--db data/motions.db \ |
||||||
|
--out outputs/blog-charts \ |
||||||
|
--method pca --pca-residual |
||||||
|
``` |
||||||
|
|
||||||
|
This produces `outputs/blog-charts/compass_*.html` and `outputs/blog-charts/trajectories_*.html`. |
||||||
|
|
||||||
|
### C5. Extract Plotly snippets |
||||||
|
For each chart file, extract the embeddable snippet: |
||||||
|
```python |
||||||
|
# Run once per chart to get embeddable HTML |
||||||
|
import plotly.io as pio |
||||||
|
# OR: just strip everything outside <div id="..."> and its <script> |
||||||
|
# The generate_compass.py output is self-contained — use BeautifulSoup or |
||||||
|
# manual extraction to get just the div+script block |
||||||
|
``` |
||||||
|
|
||||||
|
Simpler: modify `generate_compass.py` to add a `--partial` flag that calls `fig.to_html(include_plotlyjs=False, full_html=False)` and writes `.partial.html` files alongside the full ones. |
||||||
|
|
||||||
|
### C6. Fill chart placeholders in blog post |
||||||
|
Replace `<!-- CHART: compass_latest -->` and `<!-- CHART: trajectories -->` in `blog/stematlas.html` with the extracted Plotly div+script blocks. |
||||||
|
|
||||||
|
### C7. Update motion count table in blog post |
||||||
|
Run SQL to get authoritative counts: |
||||||
|
```sql |
||||||
|
SELECT strftime(date, '%Y') AS year, COUNT(*) AS motions |
||||||
|
FROM motions |
||||||
|
GROUP BY year ORDER BY year; |
||||||
|
``` |
||||||
|
Replace placeholder numbers in `blog/stematlas.html` table. |
||||||
|
|
||||||
|
### C8. Push sgeboers.nl repo |
||||||
|
Commit and push `blog/stematlas.html` + `blog/index.html` + nav changes to git.sgeboers.nl → Drone deploys. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch D — VPS infrastructure (manual, one-time) |
||||||
|
|
||||||
|
> SSH into the VPS. Steps are sequential. |
||||||
|
|
||||||
|
### D1. Create data directory |
||||||
|
```bash |
||||||
|
sudo mkdir -p /srv/stematlas/data |
||||||
|
sudo chown $USER:$USER /srv/stematlas/data |
||||||
|
``` |
||||||
|
|
||||||
|
### D2. Copy `motions.db` to VPS |
||||||
|
From local machine: |
||||||
|
```bash |
||||||
|
rsync -avz --progress data/motions.db user@vps:/srv/stematlas/data/motions.db |
||||||
|
``` |
||||||
|
~3.6GB transfer — takes a few minutes. |
||||||
|
|
||||||
|
### D3. Add Nginx vhost |
||||||
|
New file `/etc/nginx/sites-available/stematlas`: |
||||||
|
```nginx |
||||||
|
server { |
||||||
|
listen 80; |
||||||
|
server_name stematlas.sgeboers.nl; |
||||||
|
return 301 https://$host$request_uri; |
||||||
|
} |
||||||
|
|
||||||
|
server { |
||||||
|
listen 443 ssl; |
||||||
|
server_name stematlas.sgeboers.nl; |
||||||
|
|
||||||
|
# Let's Encrypt certs (Certbot fills these in) |
||||||
|
ssl_certificate /etc/letsencrypt/live/stematlas.sgeboers.nl/fullchain.pem; |
||||||
|
ssl_certificate_key /etc/letsencrypt/live/stematlas.sgeboers.nl/privkey.pem; |
||||||
|
|
||||||
|
location / { |
||||||
|
proxy_pass http://127.0.0.1:8501; |
||||||
|
proxy_http_version 1.1; |
||||||
|
proxy_set_header Upgrade $http_upgrade; |
||||||
|
proxy_set_header Connection "upgrade"; |
||||||
|
proxy_set_header Host $host; |
||||||
|
proxy_set_header X-Real-IP $remote_addr; |
||||||
|
proxy_read_timeout 86400; |
||||||
|
} |
||||||
|
} |
||||||
|
``` |
||||||
|
|
||||||
|
Enable: `sudo ln -s /etc/nginx/sites-available/stematlas /etc/nginx/sites-enabled/` |
||||||
|
|
||||||
|
### D4. Get Let's Encrypt cert |
||||||
|
```bash |
||||||
|
sudo certbot --nginx -d stematlas.sgeboers.nl |
||||||
|
``` |
||||||
|
(Assumes Certbot is already installed and working for other subdomains on this VPS.) |
||||||
|
|
||||||
|
### D5. First deploy |
||||||
|
The Drone pipeline for the stemwijzer repo will handle future deploys. For the first deploy, either: |
||||||
|
- Push a commit to trigger Drone, OR |
||||||
|
- Manually on VPS: `cd /srv/stematlas && docker-compose pull && docker-compose up -d` |
||||||
|
|
||||||
|
### D6. Verify |
||||||
|
- `https://stematlas.sgeboers.nl` → Streamlit loads, shows Home.py |
||||||
|
- Both pages accessible from Streamlit nav |
||||||
|
- `docker-compose logs stematlas` — no errors |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Dependencies Between Batches |
||||||
|
|
||||||
|
``` |
||||||
|
A (stemwijzer repo) ──► D5 (first deploy) ──► D6 (verify) |
||||||
|
B (sgeboers.nl repo) ──► C8 (push blog) |
||||||
|
C (charts) ──► C8 (push blog) |
||||||
|
D1-D4 (VPS infra) ──► D5 (first deploy) |
||||||
|
|
||||||
|
Pipeline finish (~21:40) ──► C1 (tests) ──► C2-C7 (charts) |
||||||
|
``` |
||||||
|
|
||||||
|
Batches A and B are fully independent — can start now. |
||||||
|
Batch C waits only for the pipeline to finish. |
||||||
|
Batch D is VPS-side and independent of code changes. |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Estimated Effort |
||||||
|
|
||||||
|
| Batch | Tasks | Est. Time | |
||||||
|
|-------|-------|-----------| |
||||||
|
| A | Multi-page Streamlit + docker-compose | 45 min | |
||||||
|
| B | Blog HTML + nav (after inspecting site) | 60 min | |
||||||
|
| C | Charts + embed (after pipeline) | 30 min | |
||||||
|
| D | VPS infra (manual SSH) | 30 min | |
||||||
|
| **Total** | | **~2.5 hours** | |
||||||
Loading…
Reference in new issue