diff --git a/explorer.py b/explorer.py index d0cbdcd..1bb973b 100644 --- a/explorer.py +++ b/explorer.py @@ -49,6 +49,7 @@ PARTY_COLOURS: Dict[str, str] = { "DENK": "#00897B", "50PLUS": "#7E57C2", "Volt": "#572AB7", + "ChristenUnie": "#0288D1", "Unknown": "#9E9E9E", } @@ -69,23 +70,27 @@ KNOWN_MAJOR_PARTIES = [ ] -# Current parliament parties (used for party-level SVD lookups) -# Keep both common abbreviations and full names that may appear in the DB -CURRENT_PARLIAMENT_PARTIES = frozenset( - [ - "VVD", +# Parties currently seated in the Tweede Kamer (2023 election cycle). +# Deze zijn de entity_ids zoals opgeslagen in svd_vectors voor window='2025'. +CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset( + { "PVV", + "VVD", + "NSC", + "BBB", "D66", "GroenLinks-PvdA", - "GroenLinks", - "PvdA", "CDA", "SP", - "NSC", - "CU", "ChristenUnie", - "BBB", - ] + "CU", # alias for ChristenUnie + "SGP", + "Volt", + "DENK", + "PvdD", + "JA21", + "FVD", + } ) @@ -207,16 +212,12 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]: AND entity_id is a known current-parliament party. Returns: - {party_name: [float * k]} — k = 50 for the canonical 2025 window + {party_name: [float * k]} — k = 50 for the canonical 2025 window. + Duplicate rows for the same party are de-duplicated (last row wins). """ - con = None try: - # Use a deterministic, ordered list for parameter binding - party_list = sorted(CURRENT_PARLIAMENT_PARTIES) - if not party_list: - return {} - con = duckdb.connect(database=db_path, read_only=True) + party_list = sorted(CURRENT_PARLIAMENT_PARTIES) placeholders = ", ".join("?" for _ in party_list) rows = con.execute( f"SELECT entity_id, vector FROM svd_vectors " @@ -224,70 +225,29 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]: f"AND entity_id IN ({placeholders})", party_list, ).fetchall() - - out: Dict[str, List[float]] = {} - for row in rows: - party = row[0] - vec_field = row[1] - try: - if vec_field is None: - # skip missing vectors + result: Dict[str, List[float]] = {} + for entity_id, raw_vec in rows: + if isinstance(raw_vec, str): + vec = json.loads(raw_vec) + elif isinstance(raw_vec, (bytes, bytearray)): + vec = json.loads(raw_vec.decode()) + elif isinstance(raw_vec, list): + vec = raw_vec + else: + try: + vec = list(raw_vec) + except Exception: continue - # string-encoded JSON vector - if isinstance(vec_field, str): - vec = json.loads(vec_field) - # bytes (some DB drivers may return bytes) - elif isinstance(vec_field, (bytes, bytearray)): - try: - vec = json.loads(vec_field.decode("utf-8")) - except Exception: - # fallback: attempt to eval as list-like - vec = list(vec_field) - # already a list/tuple/np.ndarray-like - elif isinstance(vec_field, (list, tuple, np.ndarray)): - vec = list(vec_field) - else: - # unknown type: attempt best-effort conversion - vec = list(vec_field) - - # ensure all entries are floats - vec_floats = [float(x) for x in vec] - out[party] = vec_floats - except Exception: - # skip malformed rows but keep processing others - logger.debug("Skipping malformed vector for party %s", party) - continue - - return out + result[entity_id] = [float(v) if v is not None else 0.0 for v in vec] + return result except Exception: logger.exception("Failed to load party axis scores") return {} finally: - if con is not None: + try: con.close() - - -@st.cache_data(show_spinner="Moties laden…") -def load_motions_df(db_path: str) -> pd.DataFrame: - """Load the full motions table as a pandas DataFrame (read-only).""" - con = duckdb.connect(database=db_path, read_only=True) - try: - df = con.execute( - """ - SELECT id, title, description, date, policy_area, - voting_results, layman_explanation, - winning_margin, controversy_score, url - FROM motions - """ - ).fetchdf() - df["date"] = pd.to_datetime(df["date"], errors="coerce") - df["year"] = df["date"].dt.year - return df - except Exception: - logger.exception("Failed to load motions") - return pd.DataFrame() - finally: - con.close() + except Exception: + pass def _render_party_axis_chart( @@ -295,114 +255,91 @@ def _render_party_axis_chart( ) -> None: """Render a 1D horizontal Plotly scatter of party positions on SVD axis `comp_sel`. - party_scores: mapping party -> list-like vector (50-dim) - comp_sel: 1-based component index + Each party is plotted at its score on a single horizontal axis (y=0). """ - # Validate component selection - if not isinstance(comp_sel, int) or comp_sel < 1: - st.caption("Ongeldige SVD-as geselecteerd.") - return - if not party_scores: - st.caption("Partijdata zijn niet beschikbaar.") + st.caption("_Partijdata niet beschikbaar voor deze as._") return - axis_idx = comp_sel - 1 - - # Determine maximum available vector dimension to validate selection - max_dim = 0 - for v in party_scores.values(): - try: - if isinstance(v, (list, tuple, np.ndarray)): - max_dim = max(max_dim, len(v)) - except Exception: - continue - - if axis_idx >= max_dim: - st.caption( - f"Geselecteerde component ({comp_sel}) valt buiten het bereik van de beschikbare vectoren ({max_dim} dimensies)." - ) - return - - parties: List[str] = [] - xs: List[float] = [] - + axis_idx = comp_sel - 1 # 0-based index into the 50-dim vector + data: list[dict] = [] for party, vec in party_scores.items(): - # Ensure vec is indexable/sequence-like - if not isinstance(vec, (list, tuple, np.ndarray)): - continue - # safe indexing - if axis_idx >= len(vec): - continue - try: - raw = vec[axis_idx] - val = float(raw) - # filter non-finite values - if not np.isfinite(val): - continue - except Exception: - continue - parties.append(party) - xs.append(val) + if axis_idx < len(vec): + data.append({"party": party, "score": vec[axis_idx]}) - if not xs: - st.caption("Geen bruikbare partijposities gevonden voor de gekozen SVD-as.") + if not data: + st.caption("_Geen partijscores voor deze as._") return - try: - x_min = float(min(xs)) - x_max = float(max(xs)) - except Exception: - st.caption("Onvoldoende gegevens om het asbereik te berekenen.") - return - - # Symmetric padding around the midpoint for balanced visualisation - if x_min == x_max: - padding = 0.5 if x_min == 0 else abs(x_min) * 0.1 - if padding <= 0: - padding = 0.5 - center = x_min - half = padding - else: - center = (x_min + x_max) / 2.0 - half = max(abs(x_max - center), abs(center - x_min)) - # add slight visual padding - half = half * 1.15 - - x_min = center - half - x_max = center + half - - # Build horizontal scatter: y is constant (0) but offset for label placement - ys = [0 for _ in xs] + scores = [d["score"] for d in data] + parties = [d["party"] for d in data] + colours = [PARTY_COLOURS.get(p, "#9E9E9E") for p in parties] + hover = [f"{p}: {s:.3f}" for p, s in zip(parties, scores)] fig = go.Figure() + # Baseline + x_min, x_max = min(scores) * 1.15, max(scores) * 1.15 + fig.add_trace( + go.Scatter( + x=[x_min, x_max], + y=[0, 0], + mode="lines", + line={"color": "#cccccc", "width": 1}, + hoverinfo="skip", + showlegend=False, + ) + ) + # Party markers fig.add_trace( go.Scatter( - x=xs, - y=ys, + x=scores, + y=[0] * len(scores), mode="markers+text", text=parties, textposition="top center", - marker=dict( - size=10, color=[PARTY_COLOURS.get(p, "#9E9E9E") for p in parties] - ), - hovertemplate="%{text}
x: %{x:.3f}", + marker={"size": 12, "color": colours}, + hovertext=hover, + hoverinfo="text", + showlegend=False, ) ) - fig.update_layout( - title=f"Partijposities op SVD-as {comp_sel}", - xaxis_title="Negatief ← — → Positief", - yaxis=dict(visible=False), - xaxis=dict(range=[x_min, x_max]), - height=300, - margin=dict(t=40, b=40, l=40, r=40), - showlegend=False, + height=160, + margin={"l": 10, "r": 10, "t": 10, "b": 30}, + xaxis={ + "title": "← Negatieve pool | Positieve pool →", + "zeroline": True, + "zerolinecolor": "#aaaaaa", + }, + yaxis={"visible": False, "range": [-1, 2]}, + plot_bgcolor="white", ) - st.plotly_chart(fig, use_container_width=True) +@st.cache_data(show_spinner="Moties laden…") +def load_motions_df(db_path: str) -> pd.DataFrame: + """Load the full motions table as a pandas DataFrame (read-only).""" + con = duckdb.connect(database=db_path, read_only=True) + try: + df = con.execute( + """ + SELECT id, title, description, date, policy_area, + voting_results, layman_explanation, + winning_margin, controversy_score, url + FROM motions + """ + ).fetchdf() + df["date"] = pd.to_datetime(df["date"], errors="coerce") + df["year"] = df["date"].dt.year + return df + except Exception: + logger.exception("Failed to load motions") + return pd.DataFrame() + finally: + con.close() + + def query_similar( db_path: str, source_motion_id: int, @@ -1039,62 +976,103 @@ def build_svd_components_tab(db_path: str) -> None: ) comp_sel = comp_options[comp_sel_idx] - # Show theme explanation + poles + # Show theme explanation theme = SVD_THEMES.get(comp_sel, {}) if theme: st.info(f"**{theme['label']}** — {theme['explanation']}") - pos = theme.get("positive_pole", "") - neg = theme.get("negative_pole", "") - if pos or neg: - pcol, ncol = st.columns(2) - with pcol: - st.success(f"▲ **Positieve pool:** {pos}") - with ncol: - st.error(f"▼ **Negatieve pool:** {neg}") motions = comp_map.get(comp_sel, []) - col1, col2 = st.columns([1, 2]) - with col1: - st.markdown("**Top-moties (titels)**") - for m in motions: - mid = m.get("motion_id") - score = m.get("score", 0.0) - title = m.get("title") or f"Motie #{mid}" - sign = "▲" if score >= 0 else "▼" - if st.button(f"{sign} {mid}: {title[:72]}", key=f"btn_{comp_sel}_{mid}"): - st.session_state["svd_selected_mid"] = mid - - with col2: - sel_mid = st.session_state.get("svd_selected_mid") - if not sel_mid and motions: - sel_mid = motions[0].get("motion_id") - if sel_mid: - # fetch motion metadata from DB for completeness - try: - con = duckdb.connect(database=db_path, read_only=True) - row = con.execute( - "SELECT id, title, date, policy_area, url, body_text FROM motions WHERE id=?", - [int(sel_mid)], - ).fetchone() - con.close() - except Exception: - row = None + # Party axis chart + party_scores = load_party_axis_scores(db_path) + _render_party_axis_chart(party_scores, comp_sel) + + # Batch-fetch motion details (title, date, policy_area, url, body_text, voting_results) + motion_ids = [m.get("motion_id") for m in motions if m.get("motion_id") is not None] + motion_details: Dict[int, tuple] = {} + if motion_ids: + # Defensively convert motion_ids to integers, skipping invalid values + ids_int: List[int] = [] + for mid in motion_ids: + try: + ids_int.append(int(mid)) + except Exception: + logger.warning("Skipping invalid motion id in SVD batch fetch: %r", mid) - if row: - st.markdown(f"### {row[1] or f'Motie #{row[0]}'}") + # If no valid ids remain, skip the DB query + if ids_int: + con = None try: - date_str = str(row[2])[:10] + placeholders = ", ".join("?" for _ in ids_int) + con = duckdb.connect(database=db_path, read_only=True) + db_rows = con.execute( + f"SELECT id, title, date, policy_area, url, body_text, voting_results " + f"FROM motions WHERE id IN ({placeholders})", + ids_int, + ).fetchall() + motion_details = {r[0]: r for r in db_rows} except Exception: - date_str = "?" - st.caption(f"📅 {date_str} | {row[3]}") - if row[4] and str(row[4]).startswith("http"): - st.markdown(f"[🔗 Bekijk op Tweede Kamer]({row[4]})") - if row[5]: - with st.expander("Toon volledige tekst"): - st.write(row[5]) - else: - st.info(f"Metadata not found in DB for motion {sel_mid}") + logger.exception("Failed to batch-fetch motion details") + finally: + if con: + con.close() + + # Split motions by pole sign + pos_motions = [m for m in motions if float(m.get("score", 0.0)) >= 0] + neg_motions = [m for m in motions if float(m.get("score", 0.0)) < 0] + + pos_pole = ( + theme.get("positive_pole", "Positieve pool") if theme else "Positieve pool" + ) + neg_pole = ( + theme.get("negative_pole", "Negatieve pool") if theme else "Negatieve pool" + ) + + pcol, ncol = st.columns(2) + + with pcol: + st.success(f"▲ **Positieve pool:** {pos_pole}") + for m in pos_motions: + mid = m.get("motion_id") + raw_title = m.get("title") or f"Motie #{mid}" + with st.expander(f"▲ {raw_title[:80]}"): + row = motion_details.get(int(mid)) if mid is not None else None + if row: + try: + date_str = str(row[2])[:10] + except Exception: + date_str = "?" + st.caption(f"📅 {date_str} | {row[3] or '—'}") + if row[4] and str(row[4]).startswith("http"): + st.markdown(f"[🔗 Bekijk op Tweede Kamer]({row[4]})") + if row[5]: + with st.expander("Toon volledige tekst"): + st.write(row[5]) + _render_voting_results(row[6]) + else: + st.caption("_Geen metadata beschikbaar_") + + with ncol: + st.error(f"▼ **Negatieve pool:** {neg_pole}") + for m in neg_motions: + mid = m.get("motion_id") + raw_title = m.get("title") or f"Motie #{mid}" + with st.expander(f"▼ {raw_title[:80]}"): + row = motion_details.get(int(mid)) if mid is not None else None + if row: + try: + date_str = str(row[2])[:10] + except Exception: + date_str = "?" + st.caption(f"📅 {date_str} | {row[3] or '—'}") + if row[4] and str(row[4]).startswith("http"): + st.markdown(f"[🔗 Bekijk op Tweede Kamer]({row[4]})") + if row[5]: + with st.expander("Toon volledige tekst"): + st.write(row[5]) + _render_voting_results(row[6]) + else: + st.caption("_Geen metadata beschikbaar_") def build_mp_quiz_tab(db_path: str) -> None: