You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
motief/explorer.py

586 lines
20 KiB

"""Parlement Explorer — Streamlit data analysis app.
Four tabs:
1. Politiek Kompas — 2D scatter of MPs/parties, window slider
2. Partij Trajectories — party centroid lines over time
3. Motie Zoeken — text search + similarity lookup
4. Motie Browser — sortable table + detail panel
Run with: streamlit run explorer.py
Import-safe: heavy computation is behind @st.cache_data and only runs at UI time.
All DuckDB connections are read_only=True so the app can run alongside the pipeline.
"""
from __future__ import annotations
import json
import logging
import os
from typing import Dict, List, Optional, Tuple
import duckdb
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
logger = logging.getLogger(__name__)
# Fixed hex colour per party label, shared by every tab so a party is always
# drawn in the same colour. "Unknown" is the grey entry for unmapped entities.
PARTY_COLOURS: Dict[str, str] = {
    "VVD": "#1E73BE",
    "PVV": "#002366",
    "D66": "#00A36C",
    "CDA": "#4CAF50",
    "SP": "#E53935",
    "PvdA": "#D32F2F",
    "GroenLinks": "#388E3C",
    "GroenLinks-PvdA": "#2E7D32",
    "CU": "#0288D1",
    "SGP": "#F4511E",
    "PvdD": "#43A047",
    "FVD": "#6A1B9A",
    "JA21": "#7B1FA2",
    "BBB": "#8D6E63",
    "NSC": "#FF8F00",
    "DENK": "#00897B",
    "50PLUS": "#7E57C2",
    "Unknown": "#9E9E9E",
}
# ---------------------------------------------------------------------------
# Cached loaders
# ---------------------------------------------------------------------------
@st.cache_data(show_spinner="Beschikbare tijdsvensters laden…")
def get_available_windows(db_path: str) -> List[str]:
    """Return the distinct ``window_id`` values of ``svd_vectors`` in ascending order.

    The database is opened read-only and always closed again. A failing query is
    logged and reported as an empty list; a failure to open the database is not
    caught here and propagates to the caller.

    Args:
        db_path: Path to the DuckDB database file.

    Returns:
        The window ids, or ``[]`` when the query fails.
    """
    query = "SELECT DISTINCT window_id FROM svd_vectors ORDER BY window_id"
    connection = duckdb.connect(database=db_path, read_only=True)
    try:
        window_ids = [record[0] for record in connection.execute(query).fetchall()]
    except Exception:
        logger.exception("Failed to query available windows")
        window_ids = []
    finally:
        connection.close()
    return window_ids
@st.cache_data(show_spinner="2D posities berekenen (kan even duren)…")
def load_positions(
    db_path: str, window_size: str = "quarterly"
) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict]:
    """Load 2D positions for every available window via ``compute_2d_axes``.

    The window ids come from ``get_available_windows``. With
    ``window_size == "annual"`` only ids ending in ``-Q4`` are kept, i.e. one
    representative window per year. The actual projection is delegated to
    ``analysis.political_axis.compute_2d_axes`` with ``method="pca"``.

    Args:
        db_path: Path to the DuckDB database file.
        window_size: ``"annual"`` restricts to Q4 windows; any other value
            uses all windows.

    Returns:
        A ``(positions_by_window, axis_def)`` pair as produced by
        ``compute_2d_axes`` (annotated as ``{window_id: {entity_name: (x, y)}}``
        plus an axis-definition dict), or ``({}, {})`` when no window remains.
    """
    # Imported lazily so that importing this module stays cheap.
    from analysis.political_axis import compute_2d_axes

    window_ids = get_available_windows(db_path)
    if window_size == "annual":
        window_ids = [wid for wid in window_ids if wid.endswith("-Q4")]
    if not window_ids:
        return {}, {}

    positions, axes = compute_2d_axes(
        db_path,
        window_ids=window_ids,
        method="pca",
        pca_residual=True,
        normalize_vectors=True,
    )
    return positions, axes
@st.cache_data(show_spinner="Partijkaart laden…")
def load_party_map(db_path: str) -> Dict[str, str]:
    """Return the MP-name to party mapping produced by ``_load_party_map``.

    Args:
        db_path: Path to the DuckDB database file.

    Returns:
        The mapping from ``analysis.visualize._load_party_map``, or ``{}`` when
        that helper raises (the error is logged).
    """
    # The import sits outside the ``try``: a missing module is not swallowed.
    from analysis.visualize import _load_party_map

    try:
        mapping = _load_party_map(db_path)
    except Exception:
        logger.exception("Failed to load party map")
        mapping = {}
    return mapping
@st.cache_data(show_spinner="Moties laden…")
def load_motions_df(db_path: str) -> pd.DataFrame:
    """Read the ``motions`` table into a DataFrame over a read-only connection.

    Besides the selected columns, the frame gets ``date`` parsed to datetimes
    (unparseable values become ``NaT``) and a derived ``year`` column.

    Args:
        db_path: Path to the DuckDB database file.

    Returns:
        The motions DataFrame, or an empty DataFrame when loading fails (the
        error is logged). A failure to open the database propagates.
    """
    sql = (
        "\n"
        "SELECT id, title, description, date, policy_area,\n"
        "voting_results, layman_explanation,\n"
        "winning_margin, controversy_score\n"
        "FROM motions\n"
    )
    connection = duckdb.connect(database=db_path, read_only=True)
    try:
        motions = connection.execute(sql).fetchdf()
        motions["date"] = pd.to_datetime(motions["date"], errors="coerce")
        motions["year"] = motions["date"].dt.year
    except Exception:
        logger.exception("Failed to load motions")
        motions = pd.DataFrame()
    finally:
        connection.close()
    return motions
def query_similar(
    db_path: str,
    source_motion_id: int,
    vector_type: str = "fused",
    top_k: int = 10,
) -> pd.DataFrame:
    """Look up the highest-scoring similar motions in ``similarity_cache``.

    Args:
        db_path: Path to the DuckDB database file (opened read-only).
        source_motion_id: Id of the motion to find neighbours for.
        vector_type: Value matched against ``similarity_cache.vector_type``.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with ``target_motion_id``, ``score``, ``window_id``,
        ``title``, ``date`` and ``policy_area``, ordered by descending score.
        An empty DataFrame is returned when the query fails (the error is
        logged). A failure to open the database propagates.
    """
    sql = (
        "\n"
        "SELECT sc.target_motion_id, sc.score, sc.window_id,\n"
        "m.title, m.date, m.policy_area\n"
        "FROM similarity_cache sc\n"
        "JOIN motions m ON m.id = sc.target_motion_id\n"
        "WHERE sc.source_motion_id = ?\n"
        "AND sc.vector_type = ?\n"
        "ORDER BY sc.score DESC\n"
        "LIMIT ?\n"
    )
    params = [source_motion_id, vector_type, top_k]
    connection = duckdb.connect(database=db_path, read_only=True)
    try:
        matches = connection.execute(sql, params).fetchdf()
    except Exception:
        logger.exception(
            "Failed to query similarity cache for motion %s", source_motion_id
        )
        matches = pd.DataFrame()
    finally:
        connection.close()
    return matches
# ---------------------------------------------------------------------------
# Tab 1: Politiek Kompas
# ---------------------------------------------------------------------------
def build_compass_tab(db_path: str, window_size: str) -> None:
    """Render the "Politiek Kompas" tab: a 2D scatter of entities for one window.

    Positions come from ``load_positions`` and are coloured by the party found
    in ``load_party_map`` ("Unknown" when an entity has no entry). The user picks
    the window, whether to draw name labels, and the minimum number of members
    a party needs in order to be shown.

    Args:
        db_path: Path to the DuckDB database file.
        window_size: Passed through to ``load_positions``.
    """
    st.subheader("Politiek Kompas")
    st.markdown(
        "2D projectie van Kamerlid posities op basis van stemgedrag (PCA op SVD-vectoren)."
    )
    positions_by_window, axis_def = load_positions(db_path, window_size)
    if not positions_by_window:
        st.warning(
            "Geen positiedata beschikbaar. Controleer of de pipeline is gedraaid."
        )
        return

    party_map = load_party_map(db_path)
    windows = sorted(positions_by_window.keys())
    col1, col2 = st.columns([3, 1])
    with col2:
        window_id = st.select_slider(
            "Tijdsvenster", options=windows, value=windows[-1]
        )
        show_names = st.checkbox("Toon namen", value=False)
        min_size = st.slider("Min. MPs per partij", 0, 20, 3)

    pos = positions_by_window.get(window_id, {})
    if not pos:
        st.info(f"Geen data voor venster {window_id}")
        return

    df_pos = pd.DataFrame(
        [
            {"name": name, "x": x, "y": y, "party": party_map.get(name, "Unknown")}
            for name, (x, y) in pos.items()
        ]
    )
    # Keep only parties that have at least ``min_size`` members in this window.
    party_counts = df_pos["party"].value_counts()
    valid_parties = party_counts[party_counts >= min_size].index
    df_pos = df_pos[df_pos["party"].isin(valid_parties)]

    colour_map = {p: PARTY_COLOURS.get(p, "#9E9E9E") for p in df_pos["party"].unique()}
    fig = px.scatter(
        df_pos,
        x="x",
        y="y",
        color="party",
        hover_name="name",
        hover_data={"party": True, "x": ":.3f", "y": ":.3f"},
        # Labels must be mapped through Plotly Express: colouring by party
        # creates one trace per party, so assigning the full name column to
        # every trace afterwards would attach the wrong names to the points
        # and would not switch the traces into a text-showing mode.
        text="name" if show_names else None,
        color_discrete_map=colour_map,
        title=f"Politiek Kompas — {window_id}",
        labels={"x": "Links ← → Rechts", "y": "Progressief ↑ / Conservatief ↓"},
    )
    if show_names:
        fig.update_traces(textposition="top center")
    fig.update_layout(height=600, legend_title_text="Partij")
    with col1:
        st.plotly_chart(fig, use_container_width=True)

    # Axis info: only shown when both components are reported. ``len`` is used
    # instead of truthiness so array-like values do not raise.
    if axis_def:
        evr = axis_def.get("explained_variance_ratio")
        if evr is not None and len(evr) >= 2:
            st.caption(
                f"PCA verklaarde variantie: as 1 = {evr[0] * 100:.1f}%, as 2 = {evr[1] * 100:.1f}%"
            )
# ---------------------------------------------------------------------------
# Tab 2: Partij Trajectories
# ---------------------------------------------------------------------------
def build_trajectories_tab(db_path: str, window_size: str) -> None:
    """Render the "Partij Trajectories" tab: party centroids connected over time.

    For every window the positions of all entities with a known party are
    averaged into one centroid per party. The selected parties are then drawn
    as one line each, with a labelled marker per window.

    Args:
        db_path: Path to the DuckDB database file.
        window_size: Passed through to ``load_positions``.
    """
    st.subheader("Partij Trajectories")
    st.markdown("Hoe bewegen partijen over de tijdsvensters heen?")
    positions_by_window, _ = load_positions(db_path, window_size)
    if not positions_by_window:
        st.warning("Geen positiedata beschikbaar.")
        return

    party_map = load_party_map(db_path)
    windows = sorted(positions_by_window.keys())

    # centroids[party][window_id] -> mean (x, y) of that party's members.
    centroids: Dict[str, Dict[str, Tuple[float, float]]] = {}
    for window_id in windows:
        members: Dict[str, List[Tuple[float, float]]] = {}
        for mp_name, (x, y) in positions_by_window.get(window_id, {}).items():
            party = party_map.get(mp_name, "Unknown")
            if party != "Unknown":
                members.setdefault(party, []).append((x, y))
        for party, points in members.items():
            mean_x = float(np.mean([point[0] for point in points]))
            mean_y = float(np.mean([point[1] for point in points]))
            centroids.setdefault(party, {})[window_id] = (mean_x, mean_y)

    # Every party that produced a centroid is selectable.
    all_parties_sorted = sorted(centroids)
    # Pre-select parties present in at least half of the windows (minimum 2).
    min_windows = max(2, len(windows) // 2)
    major_parties = [
        party for party in all_parties_sorted if len(centroids[party]) >= min_windows
    ]
    if major_parties:
        default_selection = major_parties[:12]
    else:
        default_selection = all_parties_sorted[:8]
    selected_parties = st.multiselect(
        "Selecteer partijen",
        options=all_parties_sorted,
        default=default_selection,
    )

    fig = go.Figure()
    for party in selected_parties:
        track = centroids.get(party)
        if track is None:
            continue
        ordered_ids = sorted(track.keys())
        colour = PARTY_COLOURS.get(party, "#9E9E9E")
        trace = go.Scatter(
            x=[track[wid][0] for wid in ordered_ids],
            y=[track[wid][1] for wid in ordered_ids],
            mode="lines+markers+text",
            name=party,
            # Dropping the "-Q4" suffix leaves just the year part of the id.
            text=[wid.replace("-Q4", "") for wid in ordered_ids],
            textposition="top center",
            line=dict(color=colour),
            marker=dict(color=colour, size=8),
            hovertemplate=(
                f"<b>{party}</b><br>"
                "venster: %{text}<br>"
                "x: %{x:.3f}<br>y: %{y:.3f}<extra></extra>"
            ),
        )
        fig.add_trace(trace)

    fig.update_layout(
        title="Partij trajectories",
        xaxis_title="Links ← → Rechts",
        yaxis_title="Progressief ↑ / Conservatief ↓",
        height=600,
        legend_title_text="Partij",
    )
    st.plotly_chart(fig, use_container_width=True)
# ---------------------------------------------------------------------------
# Tab 3: Motie Zoeken
# ---------------------------------------------------------------------------
def _format_score(value: object) -> str:
    """Format a numeric score with two decimals, or "?" when it is missing."""
    if value is None or pd.isna(value):
        return "?"
    return f"{float(value):.2f}"


def build_search_tab(db_path: str, show_rejected: bool) -> None:
    """Render the "Motie Zoeken" tab: filtered text search over motions.

    Motions are filtered by year range, policy area and a case-insensitive
    substring match on title or layman explanation. They are then sorted by
    descending controversy score and the first 50 are shown as expanders
    containing a summary, key metrics and up to five similar motions.

    Args:
        db_path: Path to the DuckDB database file.
        show_rejected: When false, motions whose title is exactly
            "Verworpen." are hidden.
    """
    st.subheader("Motie Zoeken")
    df = load_motions_df(db_path)
    if df.empty:
        st.warning("Geen moties beschikbaar.")
        return
    if not show_rejected:
        df = df[df["title"].fillna("").str.strip() != "Verworpen."]

    # Filter controls, laid out in the main area.
    col1, col2, col3 = st.columns([2, 1, 1])
    with col1:
        query = st.text_input(
            "Zoek op titel of uitleg", placeholder="bijv. stikstof, klimaat, wonen"
        )
    with col2:
        years = sorted(df["year"].dropna().astype(int).unique().tolist())
        if years:
            year_range = st.select_slider(
                "Jaar", options=years, value=(years[0], years[-1])
            )
        else:
            year_range = (2019, 2024)
    with col3:
        policy_areas = ["(Alle)"] + sorted(df["policy_area"].dropna().unique().tolist())
        policy_filter = st.selectbox("Beleidsterrein", options=policy_areas)

    # Apply filters in-memory. Boolean indexing already yields a new frame, and
    # rows without a year never satisfy the range comparison.
    working = df[(df["year"] >= year_range[0]) & (df["year"] <= year_range[1])]
    if policy_filter != "(Alle)":
        working = working[working["policy_area"] == policy_filter]
    if query:
        needle = query.lower()
        in_title = (
            working["title"].fillna("").str.lower().str.contains(needle, regex=False)
        )
        in_explanation = (
            working["layman_explanation"]
            .fillna("")
            .str.lower()
            .str.contains(needle, regex=False)
        )
        working = working[in_title | in_explanation]
    working = working.sort_values(by="controversy_score", ascending=False)

    st.caption(f"{len(working)} resultaten (top 50 getoond)")
    for _, row in working.head(50).iterrows():
        title = row.get("title") or f"Motie #{row['id']}"
        date_str = row["date"].strftime("%d %b %Y") if pd.notna(row["date"]) else "?"
        # Separate date and policy area explicitly; previously they were glued
        # together without any delimiter.
        label = f"**{title}** — {date_str}"
        policy_area = row.get("policy_area")
        if policy_area is not None and pd.notna(policy_area) and str(policy_area).strip():
            label += f" · {policy_area}"

        with st.expander(label):
            explanation = row.get("layman_explanation")
            description = row.get("description")
            if explanation and str(explanation).strip():
                st.markdown(explanation)
            elif description and str(description).strip():
                text = str(description)
                # Mark truncated descriptions so the cut-off is visible.
                st.markdown(text[:600] + ("…" if len(text) > 600 else ""))
            else:
                st.caption("_Geen samenvatting beschikbaar_")

            cols = st.columns(3)
            # Missing scores render as "?" instead of failing on ``None``.
            cols[0].metric("Controverse", _format_score(row.get("controversy_score", 0)))
            cols[1].metric("Marge", _format_score(row.get("winning_margin", 0)))
            cols[2].metric("Jaar", int(row["year"]) if pd.notna(row["year"]) else "?")

            # Similar motions
            sim = query_similar(db_path, int(row["id"]), top_k=5)
            if sim.empty:
                st.caption("_Nog geen vergelijkbare moties beschikbaar_")
                continue
            st.markdown("**Vergelijkbare moties:**")
            for _, s in sim.iterrows():
                sim_title = s.get("title") or "Onbekend"
                details = f"score: {s['score']:.3f}"
                if pd.notna(s.get("date")):
                    details += f", {pd.to_datetime(s['date']).strftime('%Y')}"
                st.markdown(f"- {sim_title} *({details})*")
# ---------------------------------------------------------------------------
# Tab 4: Motie Browser
# ---------------------------------------------------------------------------
def _parse_voting_results(raw: object) -> Optional[dict]:
    """Decode a ``voting_results`` cell into a dict, or ``None`` when unusable."""
    if raw is None:
        return None
    if isinstance(raw, str):
        if raw.strip() in ("", "null", "None"):
            return None
        try:
            raw = json.loads(raw)
        except ValueError:
            # json.JSONDecodeError is a ValueError; log it instead of hiding it.
            logger.warning("Could not parse voting_results JSON", exc_info=True)
            return None
    return raw if isinstance(raw, dict) else None


def build_browser_tab(db_path: str, show_rejected: bool) -> None:
    """Render the "Motie Browser" tab: a sortable table plus a detail panel.

    The table shows motions filtered by year and policy area and sorted by the
    chosen criterion. Below it, a motion id can be entered to display its
    summary, voting results and up to ten similar motions.

    Args:
        db_path: Path to the DuckDB database file.
        show_rejected: When false, motions whose title is exactly
            "Verworpen." are hidden.
    """
    st.subheader("Motie Browser")
    df = load_motions_df(db_path)
    if df.empty:
        st.warning("Geen moties beschikbaar.")
        return
    if not show_rejected:
        df = df[df["title"].fillna("").str.strip() != "Verworpen."]

    # Controls
    col1, col2, col3 = st.columns(3)
    with col1:
        years = sorted(df["year"].dropna().astype(int).unique().tolist())
        year_filter = st.selectbox("Jaar", ["(Alle)"] + [str(y) for y in years])
    with col2:
        policy_areas = ["(Alle)"] + sorted(df["policy_area"].dropna().unique().tolist())
        pa_filter = st.selectbox(
            "Beleidsterrein", options=policy_areas, key="browser_pa"
        )
    with col3:
        sort_by = st.selectbox("Sorteren op", ["Datum (nieuw)", "Controverse", "Marge"])

    # Filter
    working = df
    if year_filter != "(Alle)":
        working = working[working["year"] == int(year_filter)]
    if pa_filter != "(Alle)":
        working = working[working["policy_area"] == pa_filter]

    # Sort label -> (column, ascending).
    sort_map = {
        "Datum (nieuw)": ("date", False),
        "Controverse": ("controversy_score", False),
        "Marge": ("winning_margin", True),
    }
    sort_col, sort_asc = sort_map[sort_by]
    working = working.sort_values(by=sort_col, ascending=sort_asc)

    # Display table
    display_cols = [
        "id",
        "title",
        "date",
        "policy_area",
        "controversy_score",
        "winning_margin",
    ]
    available_display = [c for c in display_cols if c in working.columns]
    st.dataframe(
        working[available_display].reset_index(drop=True),
        use_container_width=True,
        height=350,
    )
    st.divider()

    # Detail panel: the id input is bounded by the ids in the filtered table
    # and defaults to its first row.
    st.markdown("**Detail weergave** — vul een motie-ID in:")
    has_rows = not working.empty
    sel_id = st.number_input(
        "Motie ID",
        min_value=int(working["id"].min()) if has_rows else 1,
        max_value=int(working["id"].max()) if has_rows else 99999,
        value=int(working["id"].iloc[0]) if has_rows else 1,
        step=1,
    )
    # The lookup runs against the unfiltered-by-year/area frame, as before.
    motion_row = df[df["id"] == sel_id]
    if not motion_row.empty:
        row = motion_row.iloc[0]
        st.markdown(f"### {row.get('title') or 'Onbekend'}")
        score = row.get("controversy_score", 0)
        # A missing score renders as "?" instead of failing on ``None``.
        score_text = "?" if score is None or pd.isna(score) else f"{float(score):.2f}"
        st.caption(
            f"📅 {row['date'].strftime('%d %b %Y') if pd.notna(row['date']) else '?'} "
            f"| 🏷 {row.get('policy_area') or ''} "
            f"| 🔥 Controverse: {score_text}"
        )
        if row.get("layman_explanation") and str(row["layman_explanation"]).strip():
            st.markdown(row["layman_explanation"])
        elif row.get("description") and str(row["description"]).strip():
            st.markdown(str(row["description"]))

        # Voting results: parsing problems are logged by the helper rather
        # than silently discarded.
        vdata = _parse_voting_results(row.get("voting_results"))
        if vdata:
            st.markdown("**Stemuitslag:**")
            for category, actors in vdata.items():
                if not actors:
                    continue
                if isinstance(actors, str) or not hasattr(actors, "__iter__"):
                    # A scalar value is shown as-is; joining a string would
                    # split it into single characters.
                    actors_text = str(actors)
                else:
                    actors_text = ", ".join(str(a) for a in actors)
                st.markdown(f"- **{category}**: {actors_text}")

    # Similar motions
    sim = query_similar(db_path, int(sel_id), top_k=10)
    if not sim.empty:
        st.markdown("**Vergelijkbare moties:**")
        st.dataframe(
            sim[["title", "score", "date", "policy_area"]],
            use_container_width=True,
        )
    else:
        st.caption("_Nog geen vergelijkbare moties beschikbaar voor deze motie_")
# ---------------------------------------------------------------------------
# App entry
# ---------------------------------------------------------------------------
def run_app() -> None:
    """Configure the page, draw the sidebar and render the four tabs.

    The sidebar collects the database path, the window size and whether
    rejected motions are shown, and offers an "Over" expander with row counts
    from the database. A database that cannot be opened or queried only
    produces a warning in that expander.
    """
    st.set_page_config(
        layout="wide",
        page_title="Parlement Explorer",
        page_icon="🏛",
    )
    st.title("🏛 Parlement Explorer")

    # Sidebar
    st.sidebar.title("Instellingen")
    db_path = st.sidebar.text_input("DuckDB pad", value="data/motions.db")
    window_size = st.sidebar.radio("Venstergrootte", ["quarterly", "annual"], index=0)
    show_rejected = st.sidebar.checkbox("Toon verworpen moties", value=False)

    # About section
    with st.sidebar.expander(" Over", expanded=False):
        try:
            con = duckdb.connect(database=db_path, read_only=True)
            try:
                n_motions = con.execute("SELECT COUNT(*) FROM motions").fetchone()[0]
                n_fused = con.execute(
                    "SELECT COUNT(*) FROM fused_embeddings"
                ).fetchone()[0]
                n_sim = con.execute(
                    "SELECT COUNT(*) FROM similarity_cache"
                ).fetchone()[0]
            finally:
                # Close even when one of the count queries fails (for example
                # a table that does not exist yet), so no connection leaks.
                con.close()
            # Two trailing spaces before the newline make Markdown break the
            # line, so each counter is rendered on its own row.
            st.markdown(
                f"**Moties:** {n_motions:,}  \n"
                f"**Fused embeddings:** {n_fused:,}  \n"
                f"**Similarity cache:** {n_sim:,}"
            )
        except Exception as e:
            st.warning(f"DB niet bereikbaar: {e}")

    # Main tabs
    tab1, tab2, tab3, tab4 = st.tabs(
        ["🧭 Politiek Kompas", "📈 Trajectories", "🔍 Motie Zoeken", "📋 Motie Browser"]
    )
    with tab1:
        build_compass_tab(db_path, window_size)
    with tab2:
        build_trajectories_tab(db_path, window_size)
    with tab3:
        build_search_tab(db_path, show_rejected)
    with tab4:
        build_browser_tab(db_path, show_rejected)
if __name__ == "__main__":
    # Script entry point: set up INFO-level logging, then build the app.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        level=logging.INFO,
    )
    run_app()