"""Parlement Explorer — Streamlit data analysis app. Four tabs: 1. Politiek Kompas — 2D scatter of MPs/parties, window slider 2. Partij Trajectories — party centroid lines over time 3. Motie Zoeken — text search + similarity lookup 4. Motie Browser — sortable table + detail panel Run with: streamlit run explorer.py Import-safe: heavy computation is behind @st.cache_data and only runs at UI time. All DuckDB connections are read_only=True so the app can run alongside the pipeline. """ from __future__ import annotations import json import logging import os from typing import Dict, List, Optional, Tuple import duckdb import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go import streamlit as st logger = logging.getLogger(__name__) # Party colour palette (consistent across tabs) PARTY_COLOURS: Dict[str, str] = { "VVD": "#1E73BE", "PVV": "#002366", "D66": "#00A36C", "CDA": "#4CAF50", "SP": "#E53935", "PvdA": "#D32F2F", "GroenLinks": "#388E3C", "GroenLinks-PvdA": "#2E7D32", "CU": "#0288D1", "SGP": "#F4511E", "PvdD": "#43A047", "FVD": "#6A1B9A", "JA21": "#7B1FA2", "BBB": "#8D6E63", "NSC": "#FF8F00", "Nieuw Sociaal Contract": "#FF8F00", # alias used in mp_metadata "DENK": "#00897B", "50PLUS": "#7E57C2", "Volt": "#572AB7", "Unknown": "#9E9E9E", } # Ordered list of well-known parties for trajectory default selection. # Keeps the chart readable without overwhelming users with all parties. 
KNOWN_MAJOR_PARTIES = [
    "VVD",
    "PVV",
    "D66",
    "GroenLinks-PvdA",
    "GroenLinks",
    "PvdA",
    "CDA",
    "SP",
    "NSC",
    "Nieuw Sociaal Contract",
    "CU",
    "BBB",
]


# ---------------------------------------------------------------------------
# Cached loaders
# ---------------------------------------------------------------------------
#
# NOTE(review): the loaders below log and return an empty value when a query
# fails. Because they are wrapped in st.cache_data, that empty fallback is
# cached like any other return value until the cache is cleared.


@st.cache_data(show_spinner="Beschikbare tijdsvensters laden…")
def get_available_windows(db_path: str) -> List[str]:
    """Return the sorted list of distinct window_ids stored in svd_vectors.

    Args:
        db_path: Path of the DuckDB database; opened read-only.

    Returns:
        Window ids in ascending order, or an empty list when the query fails
        (the failure is logged).
    """
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        rows = con.execute(
            "SELECT DISTINCT window_id FROM svd_vectors ORDER BY window_id"
        ).fetchall()
        return [r[0] for r in rows]
    except Exception:
        logger.exception("Failed to query available windows")
        return []
    finally:
        con.close()


@st.cache_data(show_spinner=False)
def get_uniform_dim_windows(db_path: str) -> List[str]:
    """Return only windows whose vector dimension equals the most common one.

    np.vstack requires all vectors to have the same shape. Early or small
    windows have lower SVD rank (dim < 50). This helper filters to only
    windows at the dominant (max-count) dimension so compute_2d_axes never
    sees mixed shapes.

    The dimension of a window is read from a single 'mp' row of that window,
    and ties between equally common dimensions go to the larger dimension.

    Args:
        db_path: Path of the DuckDB database; opened read-only.

    Returns:
        Window ids in ascending order, or an empty list when the query fails
        (the failure is logged).
    """
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        rows = con.execute(
            """
            WITH window_dims AS (
                SELECT DISTINCT ON (window_id)
                    window_id,
                    json_array_length(vector) AS dim
                FROM svd_vectors
                WHERE entity_type = 'mp'
                ORDER BY window_id
            ),
            dim_counts AS (
                SELECT dim, COUNT(*) AS cnt
                FROM window_dims
                GROUP BY dim
            ),
            dominant AS (
                SELECT dim
                FROM dim_counts
                ORDER BY cnt DESC, dim DESC
                LIMIT 1
            )
            SELECT wd.window_id
            FROM window_dims wd
            JOIN dominant d ON wd.dim = d.dim
            ORDER BY wd.window_id
            """
        ).fetchall()
        return [r[0] for r in rows]
    except Exception:
        logger.exception("Failed to query uniform-dim windows")
        return []
    finally:
        con.close()


@st.cache_data(show_spinner="2D posities berekenen (kan even duren)…")
def load_positions(
    db_path: str, window_size: str = "quarterly"
) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict]:
    """Compute 2D positions per window using PCA on aligned SVD vectors.

    Args:
        db_path: Path of the DuckDB database.
        window_size: "annual" keeps only the windows whose id ends in "-Q4";
            any other value keeps every uniform-dimension window.

    Returns:
        positions_by_window: {window_id: {entity_name: (x, y)}}
        axis_def: dict with x_axis, y_axis, method keys

        Both are empty dicts when no window qualifies.
    """
    # Imported lazily so that importing this module stays cheap.
    from analysis.political_axis import compute_2d_axes

    # Only use windows where all vectors share the same dimension.
    # Mixed-dim windows cause np.vstack to fail in compute_2d_axes.
    available = get_uniform_dim_windows(db_path)

    if window_size == "annual":
        # Keep only Q4 windows (one representative window per year)
        available = [w for w in available if w.endswith("-Q4")]

    if not available:
        return {}, {}

    positions_by_window, axis_def = compute_2d_axes(
        db_path,
        window_ids=available,
        method="pca",
        pca_residual=True,
        normalize_vectors=True,
    )
    return positions_by_window, axis_def


@st.cache_data(show_spinner="Partijkaart laden…")
def load_party_map(db_path: str) -> Dict[str, str]:
    """Return the {mp_name: party} mapping provided by analysis.visualize.

    Returns an empty dict (and logs the error) when the lookup fails.
    """
    from analysis.visualize import _load_party_map

    try:
        return _load_party_map(db_path)
    except Exception:
        logger.exception("Failed to load party map")
        return {}


@st.cache_data(show_spinner="Moties laden…")
def load_motions_df(db_path: str) -> pd.DataFrame:
    """Load the motions table as a pandas DataFrame (read-only).

    The ``date`` column is parsed to datetimes (unparseable values become
    NaT) and a derived ``year`` column is added.

    Returns:
        The motions DataFrame, or an empty DataFrame when loading fails
        (the failure is logged).
    """
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        df = con.execute(
            """
            SELECT id, title, description, date, policy_area,
                   voting_results, layman_explanation,
                   winning_margin, controversy_score, url
            FROM motions
            """
        ).fetchdf()
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        df["year"] = df["date"].dt.year
        return df
    except Exception:
        logger.exception("Failed to load motions")
        return pd.DataFrame()
    finally:
        con.close()


def query_similar(
    db_path: str,
    source_motion_id: int,
    vector_type: str = "fused",
    top_k: int = 10,
) -> pd.DataFrame:
    """Return the top-k most similar motions from similarity_cache (read-only).

    Args:
        db_path: Path of the DuckDB database.
        source_motion_id: Id of the motion to find neighbours for.
        vector_type: Value matched against similarity_cache.vector_type.
        top_k: Maximum number of rows returned.

    Returns:
        DataFrame with target_motion_id, score, window_id, title, date and
        policy_area, ordered by descending score. Empty when the query fails
        (the failure is logged).
    """
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        rows = con.execute(
            """
            SELECT sc.target_motion_id, sc.score, sc.window_id,
                   m.title, m.date, m.policy_area
            FROM similarity_cache sc
            JOIN motions m ON m.id = sc.target_motion_id
            WHERE sc.source_motion_id = ?
              AND sc.vector_type = ?
            ORDER BY sc.score DESC
            LIMIT ?
            """,
            [source_motion_id, vector_type, top_k],
        ).fetchdf()
        return rows
    except Exception:
        logger.exception(
            "Failed to query similarity cache for motion %s", source_motion_id
        )
        return pd.DataFrame()
    finally:
        con.close()


# ---------------------------------------------------------------------------
# Shared rendering helpers
# ---------------------------------------------------------------------------


def _score_or_zero(value) -> float:
    """Coerce a possibly missing numeric cell to a float.

    None, NaN and values that cannot be converted all map to 0.0, so the
    result is always safe to format with ``:.2f``.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    return 0.0 if np.isnan(number) else number


def _render_voting_results(voting_results_json) -> None:
    """Render a voting_results JSON blob as a grouped voor/tegen/onthouden list.

    The JSON is stored as {party_or_mp: vote} where vote is one of
    'voor', 'tegen', 'onthouden', 'afwezig'. We group by vote for readability.

    Nothing is rendered when the value is empty, is not a JSON object, or
    cannot be parsed (a parse failure is logged).

    Args:
        voting_results_json: A JSON string or an already decoded object.
    """
    if not voting_results_json:
        return

    if isinstance(voting_results_json, str):
        try:
            vdata = json.loads(voting_results_json)
        except ValueError:
            # json.JSONDecodeError is a ValueError; rendering stays best-effort
            # but the bad payload is no longer hidden.
            logger.warning("Could not parse voting_results JSON", exc_info=True)
            return
    else:
        vdata = voting_results_json

    if not isinstance(vdata, dict) or not vdata:
        return

    # Group {vote: [actor, ...]}
    by_vote: Dict[str, List[str]] = {}
    for actor, vote in vdata.items():
        vote_str = str(vote).lower().strip()
        by_vote.setdefault(vote_str, []).append(str(actor))

    # Render the known votes in a fixed order, followed by any other values.
    vote_order = ["voor", "tegen", "onthouden", "afwezig"]
    vote_emoji = {"voor": "✅", "tegen": "❌", "onthouden": "🟡", "afwezig": "⬜"}
    for v in vote_order + [k for k in by_vote if k not in vote_order]:
        actors = by_vote.get(v)
        if not actors:
            continue
        emoji = vote_emoji.get(v, "▪️")
        st.markdown(
            f"**{emoji} {v.capitalize()}** ({len(actors)}): {', '.join(sorted(actors))}"
        )


# ---------------------------------------------------------------------------
# Tab 1: Politiek Kompas
# ---------------------------------------------------------------------------


def build_compass_tab(db_path: str, window_size: str) -> None:
    """Render the compass tab: a 2D scatter of MP positions for one window.

    Args:
        db_path: Path of the DuckDB database.
        window_size: Passed through to load_positions.
    """
    st.subheader("Politiek Kompas")
    st.markdown(
        "2D projectie van Kamerlid posities op basis van stemgedrag (PCA op SVD-vectoren)."
    )

    positions_by_window, axis_def = load_positions(db_path, window_size)
    if not positions_by_window:
        st.warning(
            "Geen positiedata beschikbaar. Controleer of de pipeline is gedraaid."
        )
        return

    party_map = load_party_map(db_path)
    windows = sorted(positions_by_window.keys())

    col1, col2 = st.columns([3, 1])
    with col2:
        window_idx = st.select_slider(
            "Tijdsvenster", options=windows, value=windows[-1]
        )
        show_names = st.checkbox("Toon namen", value=False)
        min_size = st.slider("Min. MPs per partij", 0, 20, 3)

    pos = positions_by_window.get(window_idx, {})
    if not pos:
        st.info(f"Geen data voor venster {window_idx}")
        return

    rows = [
        {"name": name, "x": x, "y": y, "party": party_map.get(name, "Unknown")}
        for name, (x, y) in pos.items()
    ]
    df_pos = pd.DataFrame(rows)

    # Filter to parties with enough MPs
    party_counts = df_pos["party"].value_counts()
    valid_parties = party_counts[party_counts >= min_size].index
    df_pos = df_pos[df_pos["party"].isin(valid_parties)]

    colour_map = {p: PARTY_COLOURS.get(p, "#9E9E9E") for p in df_pos["party"].unique()}

    fig = px.scatter(
        df_pos,
        x="x",
        y="y",
        color="party",
        hover_name="name",
        hover_data={"party": True, "x": ":.3f", "y": ":.3f"},
        color_discrete_map=colour_map,
        title=f"Politiek Kompas — {window_idx}",
        labels={"x": "Links ← → Rechts", "y": "Progressief ↑ / Conservatief ↓"},
        # Labels must be bound per row: colouring by party yields one trace
        # per party, so assigning the full name column to every trace
        # afterwards would attach the wrong names to the points.
        text="name" if show_names else None,
    )
    if show_names:
        fig.update_traces(textposition="top center")
    fig.update_layout(height=600, legend_title_text="Partij")

    with col1:
        st.plotly_chart(fig, use_container_width=True)

    # Axis info
    if axis_def:
        evr = axis_def.get("explained_variance_ratio", [])
        # Both components are needed for the caption below.
        if len(evr) >= 2:
            st.caption(
                f"PCA verklaarde variantie: as 1 = {evr[0] * 100:.1f}%, as 2 = {evr[1] * 100:.1f}%"
            )


# ---------------------------------------------------------------------------
# Tab 2: Partij Trajectories
# ---------------------------------------------------------------------------


def build_trajectories_tab(db_path: str, window_size: str) -> None:
    """Render the trajectories tab: party centroid lines across windows.

    A party centroid is the mean (x, y) of that party's MPs in a window;
    MPs without a known party are left out.

    Args:
        db_path: Path of the DuckDB database.
        window_size: Passed through to load_positions.
    """
    st.subheader("Partij Trajectories")
    st.markdown("Hoe bewegen partijen over de tijdsvensters heen?")

    positions_by_window, _ = load_positions(db_path, window_size)
    if not positions_by_window:
        st.warning("Geen positiedata beschikbaar.")
        return

    party_map = load_party_map(db_path)
    windows = sorted(positions_by_window.keys())

    # Compute party centroids per window
    centroids: Dict[str, Dict[str, Tuple[float, float]]] = {}
    all_parties: set = set()

    for wid in windows:
        pos = positions_by_window.get(wid, {})
        per_party: Dict[str, List[Tuple[float, float]]] = {}
        for mp_name, (x, y) in pos.items():
            party = party_map.get(mp_name, "Unknown")
            if party == "Unknown":
                continue
            per_party.setdefault(party, []).append((x, y))

        for party, coords in per_party.items():
            all_parties.add(party)
            xs = [c[0] for c in coords]
            ys = [c[1] for c in coords]
            centroids.setdefault(party, {})[wid] = (
                float(np.mean(xs)),
                float(np.mean(ys)),
            )

    all_parties_sorted = sorted(all_parties)

    # Default: prefer known major parties over the automatic "appeared in most windows"
    # heuristic, which would exclude newer parties like NSC that only have 4 windows.
    default_parties = [p for p in KNOWN_MAJOR_PARTIES if p in all_parties]
    if not default_parties:
        default_parties = all_parties_sorted[:6]

    selected_parties = st.multiselect(
        "Selecteer partijen",
        options=all_parties_sorted,
        default=default_parties,
    )

    # Note about partial data years
    if "2023-Q1" in windows and not any(
        w.startswith("2023-Q") and w != "2023-Q1" for w in windows
    ):
        st.caption(
            "ℹ️ 2023 heeft alleen data voor Q1 — pipeline draaide niet door in dat jaar."
        )

    fig = go.Figure()
    for party in selected_parties:
        if party not in centroids:
            continue
        wids_sorted = sorted(centroids[party].keys())
        xs = [centroids[party][w][0] for w in wids_sorted]
        ys = [centroids[party][w][1] for w in wids_sorted]
        colour = PARTY_COLOURS.get(party, "#9E9E9E")

        fig.add_trace(
            go.Scatter(
                x=xs,
                y=ys,
                mode="lines+markers+text",
                name=party,
                text=[w.replace("-Q4", "") for w in wids_sorted],
                textposition="top center",
                line=dict(color=colour),
                marker=dict(color=colour, size=8),
                # Plotly hover text breaks lines on <br>, not on newlines.
                hovertemplate=(
                    f"{party}<br>"
                    "venster: %{text}<br>"
                    "x: %{x:.3f}<br>y: %{y:.3f}"
                ),
            )
        )

    fig.update_layout(
        title="Partij trajectories",
        xaxis_title="Links ← → Rechts",
        yaxis_title="Progressief ↑ / Conservatief ↓",
        height=600,
        legend_title_text="Partij",
    )
    st.plotly_chart(fig, use_container_width=True)


# ---------------------------------------------------------------------------
# Tab 3: Motie Zoeken
# ---------------------------------------------------------------------------


def build_search_tab(db_path: str, show_rejected: bool) -> None:
    """Render the search tab: title search with year and controversy filters.

    The 50 most controversial matches are listed as expanders showing the
    vote breakdown, a source link and up to five similar motions.

    Args:
        db_path: Path of the DuckDB database.
        show_rejected: When False, motions titled "Verworpen." are hidden.
    """
    st.subheader("Motie Zoeken")

    df = load_motions_df(db_path)
    if df.empty:
        st.warning("Geen moties beschikbaar.")
        return

    if not show_rejected:
        df = df[df["title"].fillna("").str.strip() != "Verworpen."]

    # Controls
    col1, col2, col3 = st.columns([2, 1, 1])
    with col1:
        query = st.text_input(
            "Zoek op titel", placeholder="bijv. stikstof, klimaat, wonen"
        )
    with col2:
        years = sorted(df["year"].dropna().astype(int).unique().tolist())
        if years:
            year_range = st.select_slider(
                "Jaar", options=years, value=(years[0], years[-1])
            )
        else:
            year_range = (2019, 2024)
    with col3:
        min_controversy = st.slider(
            "Min. controverse", min_value=0.0, max_value=1.0, value=0.0, step=0.05
        )

    # Apply filters in-memory
    working = df.copy()
    working = working[
        (working["year"] >= year_range[0]) & (working["year"] <= year_range[1])
    ]
    if min_controversy > 0:
        working = working[working["controversy_score"] >= min_controversy]
    if query:
        q = query.lower()
        mask = working["title"].fillna("").str.lower().str.contains(q, regex=False)
        working = working[mask]

    working = working.sort_values(by="controversy_score", ascending=False)

    st.caption(f"{len(working)} resultaten (top 50 getoond)")

    # NOTE(review): Streamlit runs the body of an expander even while it is
    # collapsed, so the loop below issues one similarity query per listed
    # motion on every rerun.
    for _, row in working.head(50).iterrows():
        title = row.get("title") or f"Motie #{row['id']}"
        date_str = row["date"].strftime("%d %b %Y") if pd.notna(row["date"]) else "?"
        # Missing scores (None/NaN) are shown as 0.00 instead of failing or
        # printing "nan".
        controversy = _score_or_zero(row.get("controversy_score"))
        margin = _score_or_zero(row.get("winning_margin"))

        with st.expander(f"**{title}** — {date_str} — 🔥 {controversy:.2f}"):
            cols = st.columns(3)
            cols[0].metric("Controverse", f"{controversy:.2f}")
            cols[1].metric("Marge", f"{margin:.2f}")
            cols[2].metric("Jaar", int(row["year"]) if pd.notna(row["year"]) else "?")

            # Voting breakdown
            _render_voting_results(row.get("voting_results"))

            # Link to original motion
            url = row.get("url")
            if url and str(url).startswith("http"):
                st.markdown(f"[🔗 Bekijk op Tweede Kamer]({url})")

            # Similar motions
            sim = query_similar(db_path, int(row["id"]), top_k=5)
            if not sim.empty:
                st.markdown("**Vergelijkbare moties:**")
                for _, s in sim.iterrows():
                    s_date = (
                        pd.to_datetime(s["date"]).strftime("%Y")
                        if pd.notna(s.get("date"))
                        else ""
                    )
                    st.markdown(
                        f"- {s.get('title', 'Onbekend')} *(score: {s['score']:.3f}, {s_date})*"
                    )
            else:
                st.caption("_Nog geen vergelijkbare moties beschikbaar_")


# ---------------------------------------------------------------------------
# Tab 4: Motie Browser
# ---------------------------------------------------------------------------


def build_browser_tab(db_path: str, show_rejected: bool) -> None:
    """Render the browser tab: a filterable, sortable table plus a detail panel.

    The detail panel looks up the entered motion id in the full motion set
    (after the "Verworpen." filter), not only in the filtered table.

    Args:
        db_path: Path of the DuckDB database.
        show_rejected: When False, motions titled "Verworpen." are hidden.
    """
    st.subheader("Motie Browser")

    df = load_motions_df(db_path)
    if df.empty:
        st.warning("Geen moties beschikbaar.")
        return

    if not show_rejected:
        df = df[df["title"].fillna("").str.strip() != "Verworpen."]

    # Controls
    col1, col2, col3 = st.columns(3)
    with col1:
        years = sorted(df["year"].dropna().astype(int).unique().tolist())
        year_filter = st.selectbox("Jaar", ["(Alle)"] + [str(y) for y in years])
    with col2:
        min_controversy_b = st.slider(
            "Min. controverse",
            min_value=0.0,
            max_value=1.0,
            value=0.0,
            step=0.05,
            # Explicit key: the search tab has a slider with the same label.
            key="browser_controversy",
        )
    with col3:
        sort_by = st.selectbox("Sorteren op", ["Datum (nieuw)", "Controverse", "Marge"])

    # Filter
    working = df.copy()
    if year_filter != "(Alle)":
        working = working[working["year"] == int(year_filter)]
    if min_controversy_b > 0:
        working = working[working["controversy_score"] >= min_controversy_b]

    # Selected label -> (column, ascending)
    sort_map = {
        "Datum (nieuw)": ("date", False),
        "Controverse": ("controversy_score", False),
        "Marge": ("winning_margin", True),
    }
    sort_col, sort_asc = sort_map[sort_by]
    working = working.sort_values(by=sort_col, ascending=sort_asc)

    # Display table
    display_cols = ["id", "title", "date", "controversy_score", "winning_margin"]
    available_display = [c for c in display_cols if c in working.columns]
    st.dataframe(
        working[available_display].reset_index(drop=True),
        use_container_width=True,
        height=350,
    )

    st.divider()

    # Detail panel
    st.markdown("**Detail weergave** — vul een motie-ID in:")
    sel_id = st.number_input(
        "Motie ID",
        min_value=int(working["id"].min()) if not working.empty else 1,
        max_value=int(working["id"].max()) if not working.empty else 99999,
        value=int(working["id"].iloc[0]) if not working.empty else 1,
        step=1,
    )

    motion_row = df[df["id"] == sel_id]
    if not motion_row.empty:
        row = motion_row.iloc[0]
        st.markdown(f"### {row.get('title') or 'Onbekend'}")
        date_str = row["date"].strftime("%d %b %Y") if pd.notna(row["date"]) else "?"
        # A missing score (None/NaN) is shown as 0.00 instead of failing.
        controversy = _score_or_zero(row.get("controversy_score"))
        st.caption(f"📅 {date_str} | 🔥 Controverse: {controversy:.2f}")

        # Link to original source
        url = row.get("url")
        if url and str(url).startswith("http"):
            st.markdown(f"[🔗 Bekijk op Tweede Kamer]({url})")

        # Voting breakdown
        st.markdown("**Stemuitslag:**")
        _render_voting_results(row.get("voting_results"))

        # Similar motions
        sim = query_similar(db_path, int(sel_id), top_k=10)
        if not sim.empty:
            st.markdown("**Vergelijkbare moties:**")
            st.dataframe(
                sim[["title", "score", "date", "policy_area"]],
                use_container_width=True,
            )
        else:
            st.caption("_Nog geen vergelijkbare moties beschikbaar voor deze motie_")


# ---------------------------------------------------------------------------
# App entry
# ---------------------------------------------------------------------------


def run_app() -> None:
    """Configure the page, draw the sidebar and render the four tabs."""
    st.set_page_config(
        layout="wide",
        page_title="Parlement Explorer",
        page_icon="🏛️",
    )
    st.title("🏛️ Parlement Explorer")

    # Sidebar
    st.sidebar.title("Instellingen")
    db_path = st.sidebar.text_input("DuckDB pad", value="data/motions.db")
    window_size = st.sidebar.radio("Venstergrootte", ["quarterly", "annual"], index=0)
    show_rejected = st.sidebar.checkbox("Toon verworpen moties", value=False)

    # About section
    with st.sidebar.expander("ℹ️ Over", expanded=False):
        try:
            con = duckdb.connect(database=db_path, read_only=True)
            try:
                n_motions = con.execute("SELECT COUNT(*) FROM motions").fetchone()[0]
                n_fused = con.execute(
                    "SELECT COUNT(*) FROM fused_embeddings"
                ).fetchone()[0]
                n_sim = con.execute(
                    "SELECT COUNT(*) FROM similarity_cache"
                ).fetchone()[0]
            finally:
                # Release the connection even when one of the counts fails.
                con.close()
            # Two trailing spaces before the newline give a Markdown hard
            # line break, so each count is shown on its own line.
            st.markdown(
                f"**Moties:** {n_motions:,}  \n"
                f"**Fused embeddings:** {n_fused:,}  \n"
                f"**Similarity cache:** {n_sim:,}"
            )
        except Exception as e:
            st.warning(f"DB niet bereikbaar: {e}")

    # Main tabs
    tab1, tab2, tab3, tab4 = st.tabs(
        ["🧭 Politiek Kompas", "📈 Trajectories", "🔍 Motie Zoeken", "📋 Motie Browser"]
    )

    with tab1:
        build_compass_tab(db_path, window_size)
    with tab2:
        build_trajectories_tab(db_path, window_size)
    with tab3:
        build_search_tab(db_path, show_rejected)
    with tab4:
        build_browser_tab(db_path, show_rejected)


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s"
    )
    run_app()