- Extract 6 tab functions from explorer.py (3097 → 543 lines)
- Create analysis/tabs/_rendering.py with shared plotly helpers
- Move data logic to analysis/explorer_data.py
- Add lazy-import wrappers in explorer.py for backward compat
- Add scheduler.py with PipelineScheduler for daily pipeline runs
- Add test_explorer_decomposition.py (5 tests, all pass)
- Add test_scheduler.py (13 tests, all pass)
- Full test suite: 222 passed, 2 skipped
main
parent
203ae178ca
commit
3bdb43f162
@ -1,18 +1,95 @@ |
||||
"""Browser tab for the parliamentary explorer. |
||||
|
||||
This module will contain the browser tab implementation. |
||||
Currently: Tab logic remains in explorer.py pending Streamlit decoupling. |
||||
""" |
||||
"""Browser tab for the parliamentary explorer.""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import pandas as pd |
||||
|
||||
import analysis.explorer_data as explorer_data |
||||
from analysis.tabs._rendering import _render_voting_results, st |
||||
|
||||
|
||||
def build_browser_tab(db_path: str, show_rejected: bool) -> None:
    """Build the Motie Browser tab.

    Shows a filterable, sortable table of motions and, below it, a detail
    pane for a motion chosen by ID (title, date, controversy, link, voting
    results and similar motions).

    Args:
        db_path: Database path handed to the ``explorer_data`` loaders.
        show_rejected: When False, rows whose stripped title equals
            ``"Verworpen."`` are removed before anything is displayed.
    """
    st.subheader("Motie Browser")

    df = explorer_data.load_motions_df(db_path)
    if df.empty:
        st.warning("Geen moties beschikbaar.")
        return

    if not show_rejected:
        df = df[df["title"].fillna("").str.strip() != "Verworpen."]

    col1, col2, col3 = st.columns(3)
    with col1:
        years = sorted(df["year"].dropna().astype(int).unique().tolist())
        year_filter = st.selectbox("Jaar", ["(Alle)"] + [str(y) for y in years])
    with col2:
        min_controversy_b = st.slider(
            "Min. controverse",
            min_value=0.0,
            max_value=1.0,
            value=0.0,
            step=0.05,
            key="browser_controversy",
        )
    with col3:
        sort_by = st.selectbox("Sorteren op", ["Datum (nieuw)", "Controverse", "Marge"])

    working = df.copy()
    if year_filter != "(Alle)":
        working = working[working["year"] == int(year_filter)]
    if min_controversy_b > 0:
        working = working[working["controversy_score"] >= min_controversy_b]

    # Sort label -> (column, ascending).
    sort_map = {
        "Datum (nieuw)": ("date", False),
        "Controverse": ("controversy_score", False),
        "Marge": ("winning_margin", True),
    }
    sort_col, sort_asc = sort_map[sort_by]
    working = working.sort_values(by=sort_col, ascending=sort_asc)

    display_cols = ["id", "title", "date", "controversy_score", "winning_margin"]
    available_display = [c for c in display_cols if c in working.columns]
    st.dataframe(
        working[available_display].reset_index(drop=True),
        use_container_width=True,
        height=350,
    )

    st.divider()

    st.markdown("**Detail weergave** — vul een motie-ID in:")
    has_rows = not working.empty
    sel_id = st.number_input(
        "Motie ID",
        min_value=int(working["id"].min()) if has_rows else 1,
        max_value=int(working["id"].max()) if has_rows else 99999,
        value=int(working["id"].iloc[0]) if has_rows else 1,
        step=1,
    )

    # Look the ID up in the unfiltered-by-year frame so any ID within the
    # allowed range resolves, not only rows that survived the filters.
    motion_row = df[df["id"] == sel_id]
    if motion_row.empty:
        return

    row = motion_row.iloc[0]
    st.markdown(f"### {row.get('title') or 'Onbekend'}")
    date_str = row["date"].strftime("%d %b %Y") if pd.notna(row["date"]) else "?"
    # ``or 0`` guards against a None score, which cannot be formatted with
    # ``:.2f``; this matches the handling used by the search tab.
    controversy = row.get("controversy_score") or 0
    st.caption(f"📅 {date_str} | 🔥 Controverse: {controversy:.2f}")

    url = row.get("url")
    if url and str(url).startswith("http"):
        st.markdown(f"[🔗 Bekijk op Tweede Kamer]({url})")

    st.markdown("**Stemuitslag:**")
    _render_voting_results(row.get("voting_results"))

    sim = explorer_data.query_similar(db_path, int(sel_id), top_k=10)
    if not sim.empty:
        st.markdown("**Vergelijkbare moties:**")
        st.dataframe(
            sim[["title", "score", "date", "policy_area"]],
            use_container_width=True,
        )
    else:
        st.caption("_Nog geen vergelijkbare moties beschikbaar voor deze motie_")
||||
|
||||
@ -1,18 +1,374 @@ |
||||
"""SVD Components tab for the parliamentary explorer. |
||||
|
||||
This module will contain the SVD components tab implementation. |
||||
Currently: Tab logic remains in explorer.py pending Streamlit decoupling. |
||||
""" |
||||
"""SVD Components tab for the parliamentary explorer.""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import datetime as _dt |
||||
import logging |
||||
import os |
||||
from typing import Dict, List, Tuple |
||||
|
||||
import numpy as np |
||||
|
||||
from analysis import config |
||||
import analysis.explorer_data as explorer_data |
||||
from analysis.tabs._rendering import ( |
||||
_render_party_axis_chart_1d, |
||||
_render_scree_plot, |
||||
_render_svd_time_trajectory, |
||||
_render_voting_results, |
||||
st, |
||||
) |
||||
|
||||
try: |
||||
import duckdb |
||||
except Exception: |
||||
duckdb = None # type: ignore |
||||
|
||||
SVD_THEMES = config.SVD_THEMES |
||||
KNOWN_MAJOR_PARTIES = config.KNOWN_MAJOR_PARTIES |
||||
|
||||
logger = logging.getLogger(__name__) |
||||
|
||||
|
||||
def build_svd_components_tab(db_path: str) -> None:
    """Build the SVD components tab: axis selector, party chart, top motions.

    Reads ``thoughts/explorer/top_svd_top_motions.json`` and offers a selector
    over the components found there (labelled from ``SVD_THEMES``). For the
    chosen component it draws either a 1-D party chart for one window or a
    time trajectory across windows, followed by the top motions split into
    the two poles of the axis, each with an expandable detail pane.

    Party positions come from ``explorer_data.get_aligned_party_scores``.
    NOTE(review): an earlier docstring said components 1-2 use aligned PCA
    positions and 3-10 raw SVD scores; that is not visible here — confirm
    against ``explorer_data``.

    Args:
        db_path: Database path handed to ``explorer_data`` and DuckDB.
    """
    import json  # only this tab needs it; keeps module import list unchanged

    st.subheader("🔬 SVD Assen — politieke polarisatiethema's")
    st.markdown(
        "Elke SVD-as representeert een latente politieke dimensie afgeleid uit stempatronen "
        "van alle Kamerleden. De top-10 moties per as zijn uniek (geen overlap) en illustreren "
        "het spanningsveld dat de as beschrijft."
    )

    scree_importances = explorer_data.load_scree_data(db_path)
    if scree_importances:
        st.markdown(
            "**Scree-plot** — het relatieve gewicht van elke SVD-as. "
            "De eerste assen verklaren het meeste van de stemverschillen in de Kamer; "
            "latere assen (7+) zijn fragiel en mogelijk niet boven ruisniveau."
        )
        _render_scree_plot(scree_importances)

    json_path = os.path.join("thoughts", "explorer", "top_svd_top_motions.json")
    if not os.path.exists(json_path):
        st.warning(
            f"Top-SVD data not found at {json_path}. Run the importance job to generate it."
        )
        return

    try:
        with open(json_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except (OSError, ValueError) as e:  # JSON/Unicode decode errors are ValueErrors
        st.error(f"Failed to load SVD importance JSON: {e}")
        return

    window = payload.get("window")
    rows = payload.get("rows", [])
    if not rows:
        st.info("Geen top-moties in dataset")
        return

    st.caption(f"Top SVD-bijdragers berekend voor venster: **{window}**")

    # Group rows per component, keeping only the first row per motion id.
    comp_map: dict = {}
    seen_ids: dict = {}
    for r in rows:
        comp = int(r.get("component", 0))
        motion_id = r.get("motion_id")
        seen = seen_ids.setdefault(comp, set())
        if motion_id not in seen:
            seen.add(motion_id)
            comp_map.setdefault(comp, []).append(r)

    comp_options = sorted(comp_map.keys())

    def _comp_label(c: int) -> str:
        lbl = SVD_THEMES.get(c, {}).get("label", "")
        return f"As {c} — {lbl}" if lbl else f"As {c}"

    comp_display = [_comp_label(c) for c in comp_options]

    party_scores_default = explorer_data.load_party_axis_scores(db_path)
    party_mp_vectors = explorer_data.load_party_mp_vectors(db_path)

    # NOTE(review): the source's indentation was lost; only the widgets are
    # assumed to live inside the column contexts below — confirm the layout.
    col1, col2 = st.columns([2, 1])

    view_mode = "Enkel venster"
    selected_parties_for_trajectory: list = []

    with col2:
        comp_sel_idx = st.selectbox(
            "Selecteer SVD-as",
            options=list(range(len(comp_options))),
            format_func=lambda i: comp_display[i],
            index=0,
        )
        comp_sel = comp_options[comp_sel_idx]

        min_mps = st.number_input(
            "Min. Kamerleden per partij",
            min_value=1,
            max_value=20,
            value=1,
            step=1,
            help="Partijen met minder dan dit aantal Kamerleden worden niet weergegeven.",
        )

        view_mode = st.radio(
            "Weergave",
            options=["Enkel venster", "Tijdtraject"],
            index=0,
            help="Enkel venster: toont posities voor één tijdsvenster. Tijdtraject: toont hoe partijen over tijd bewegen op deze as.",
        )

        if view_mode == "Tijdtraject":
            all_parties = (
                sorted(party_scores_default.keys()) if party_scores_default else []
            )
            default_parties = [p for p in KNOWN_MAJOR_PARTIES if p in all_parties][:8]
            selected_parties_for_trajectory = st.multiselect(
                "Partijen om te tonen",
                options=all_parties,
                default=default_parties,
                help="Selecteer de partijen die je wilt zien in het tijdtraject.",
            )

    theme = SVD_THEMES.get(comp_sel, {})
    if theme:
        st.info(f"**{theme['label']}** — {theme['explanation']}")

    motions = comp_map.get(comp_sel, [])

    # Selectable windows: all year windows except the running calendar year,
    # with "current_parliament" (when present) appended last.
    current_year = str(_dt.date.today().year)
    available_windows = explorer_data.get_uniform_dim_windows(db_path)
    year_windows = sorted(
        w for w in available_windows if w != "current_parliament" and w != current_year
    )
    svd_windows = year_windows + (
        ["current_parliament"] if "current_parliament" in available_windows else []
    )

    def _svd_window_label(w: str) -> str:
        return "Huidig parliament" if w == "current_parliament" else w

    with col1:
        svd_window = st.selectbox(
            "Jaar",
            options=svd_windows,
            # Clamp so an empty window list does not yield index -1.
            index=max(len(svd_windows) - 1, 0),
            format_func=_svd_window_label,
            key=f"svd_window_{comp_sel}",
        )

    party_mp_counts = (
        {p: len(v) for p, v in party_mp_vectors.items()} if party_mp_vectors else {}
    )

    active_mps = (
        explorer_data.load_active_mps(db_path)
        if svd_window == "current_parliament"
        else None
    )
    aligned_all_scores = explorer_data.get_aligned_party_scores(
        db_path, svd_window, active_mps
    )

    # Components are 1-based; require a real index so component 0 (the
    # default for rows without a "component" key) cannot wrap around to -1.
    score_idx = comp_sel - 1
    party_1d_coords: dict = {
        party: (float(all_scores[score_idx]),)
        for party, all_scores in aligned_all_scores.items()
        if 0 <= score_idx < len(all_scores)
    }

    # Decide per component whether the axis must be mirrored so canonical
    # right-wing parties end up on the higher side than canonical left ones.
    computed_flips: Dict[int, bool] = {}
    try:
        from analysis.config import CANONICAL_LEFT, CANONICAL_RIGHT

        for comp_idx in range(10):
            right_scores = []
            left_scores = []
            for party, scores in aligned_all_scores.items():
                if comp_idx >= len(scores):
                    continue  # party has no score for this component
                if party in CANONICAL_RIGHT:
                    right_scores.append(scores[comp_idx])
                elif party in CANONICAL_LEFT:
                    left_scores.append(scores[comp_idx])

            computed_flips[comp_idx + 1] = bool(
                right_scores
                and left_scores
                and np.mean(right_scores) < np.mean(left_scores)
            )
    except Exception:
        # Best effort: fall back to the theme's static "flip" below.
        logger.debug("Could not compute SVD axis flips", exc_info=True)

    theme_with_flip = {
        **theme,
        "flip": computed_flips.get(comp_sel, theme.get("flip", False)),
    }

    if min_mps > 1 and party_mp_counts:
        valid_parties = {p for p, count in party_mp_counts.items() if count >= min_mps}
        party_1d_coords = {
            p: coords for p, coords in party_1d_coords.items() if p in valid_parties
        }

    if view_mode == "Tijdtraject" and selected_parties_for_trajectory:
        party_scores_by_window = explorer_data._get_aligned_trajectory_scores(
            db_path, svd_windows
        )
        _render_svd_time_trajectory(
            party_scores_by_window,
            comp_sel,
            theme_with_flip,
            selected_parties_for_trajectory,
        )
    else:
        _render_party_axis_chart_1d(party_1d_coords, comp_sel, theme_with_flip)

    motion_details = _fetch_svd_motion_details(db_path, motions)

    # ``or 0.0`` keeps a null score from crashing float(); it sorts as positive.
    pos_motions = [m for m in motions if float(m.get("score") or 0.0) >= 0]
    neg_motions = [m for m in motions if float(m.get("score") or 0.0) < 0]

    pos_pole = theme_with_flip.get("positive_pole", "")
    neg_pole = theme_with_flip.get("negative_pole", "")

    if theme_with_flip.get("flip", False):
        left_pole, right_pole = pos_pole, neg_pole
        left_motions, right_motions = pos_motions, neg_motions
        left_arrow, right_arrow = "▲", "▼"
    else:
        left_pole, right_pole = neg_pole, pos_pole
        left_motions, right_motions = neg_motions, pos_motions
        left_arrow, right_arrow = "▼", "▲"

    lcol, rcol = st.columns(2)
    with lcol:
        st.markdown(f"**← {left_pole}**")
        _render_svd_motion_column(left_motions, left_arrow, motion_details)
    with rcol:
        st.markdown(f"**{right_pole} →**")
        _render_svd_motion_column(right_motions, right_arrow, motion_details)


def _svd_motion_id(value):
    """Return *value* as an int, or None when it cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def _fetch_svd_motion_details(db_path: str, motions: list) -> Dict[int, tuple]:
    """Batch-fetch motion rows from DuckDB, keyed by motion id.

    Each value is the tuple ``(id, title, date, policy_area, url, body_text,
    voting_results)``. Returns an empty dict when there are no usable ids,
    DuckDB is not importable, or the query fails (the failure is logged).
    """
    ids_int: List[int] = []
    for m in motions:
        raw_id = m.get("motion_id")
        if raw_id is None:
            continue
        motion_id = _svd_motion_id(raw_id)
        if motion_id is None:
            logger.warning("Skipping invalid motion id in SVD batch fetch: %r", raw_id)
        else:
            ids_int.append(motion_id)

    if not ids_int or duckdb is None:
        return {}

    con = None
    try:
        placeholders = ", ".join("?" for _ in ids_int)
        con = duckdb.connect(database=db_path, read_only=True)
        db_rows = con.execute(
            f"SELECT id, title, date, policy_area, url, body_text, voting_results "
            f"FROM motions WHERE id IN ({placeholders})",
            ids_int,
        ).fetchall()
        return {r[0]: r for r in db_rows}
    except Exception:
        logger.exception("Failed to batch-fetch motion details")
        return {}
    finally:
        if con is not None:
            con.close()


def _render_svd_motion_column(motions: list, arrow: str, motion_details: dict) -> None:
    """Render one expander per motion into the current Streamlit container."""
    for m in motions:
        mid = m.get("motion_id")
        raw_title = m.get("title") or f"Motie #{mid}"
        with st.expander(f"{arrow} {raw_title}"):
            # Ids that cannot be converted were skipped by the fetch, so they
            # simply have no metadata instead of raising here.
            motion_id = _svd_motion_id(mid)
            row = motion_details.get(motion_id) if motion_id is not None else None
            if not row:
                st.caption("_Geen metadata beschikbaar_")
                continue

            date_str = str(row[2])[:10] if row[2] is not None else "?"
            st.caption(f"📅 {date_str} | {row[3] or '—'}")
            if row[4] and str(row[4]).startswith("http"):
                st.markdown(f"[🔗 Bekijk op Tweede Kamer]({row[4]})")
            if row[5]:
                with st.expander("Toon volledige tekst"):
                    st.write(row[5])
            _render_voting_results(row[6])
||||
|
||||
@ -1,18 +1,84 @@ |
||||
"""Search tab for the parliamentary explorer. |
||||
|
||||
This module will contain the search tab implementation. |
||||
Currently: Tab logic remains in explorer.py pending Streamlit decoupling. |
||||
""" |
||||
"""Search tab for the parliamentary explorer.""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import pandas as pd |
||||
|
||||
import analysis.explorer_data as explorer_data |
||||
from analysis.tabs._rendering import _render_voting_results, st |
||||
|
||||
|
||||
def build_search_tab(db_path: str, show_rejected: bool) -> None:
    """Build the Motie Zoeken tab.

    Filters motions by title substring, year range and minimum controversy,
    sorts them by controversy (descending) and renders the first 50 as
    expanders with metrics, voting results, a link and similar motions.

    Args:
        db_path: Database path handed to the ``explorer_data`` loaders.
        show_rejected: When False, rows whose stripped title equals
            ``"Verworpen."`` are removed before filtering.
    """
    st.subheader("Motie Zoeken")

    df = explorer_data.load_motions_df(db_path)
    if df.empty:
        st.warning("Geen moties beschikbaar.")
        return

    if not show_rejected:
        df = df[df["title"].fillna("").str.strip() != "Verworpen."]

    col1, col2, col3 = st.columns([2, 1, 1])
    with col1:
        query = st.text_input(
            "Zoek op titel", placeholder="bijv. stikstof, klimaat, wonen"
        )
    with col2:
        years = sorted(df["year"].dropna().astype(int).unique().tolist())
        if years:
            year_range = st.select_slider(
                "Jaar", options=years, value=(years[0], years[-1])
            )
        else:
            year_range = (2019, 2024)
    with col3:
        min_controversy = st.slider(
            "Min. controverse", min_value=0.0, max_value=1.0, value=0.0, step=0.05
        )

    working = df.copy()
    working = working[
        (working["year"] >= year_range[0]) & (working["year"] <= year_range[1])
    ]
    if min_controversy > 0:
        working = working[working["controversy_score"] >= min_controversy]
    if query:
        # Plain case-insensitive substring match (no regex interpretation).
        mask = (
            working["title"].fillna("").str.lower().str.contains(query.lower(), regex=False)
        )
        working = working[mask]

    working = working.sort_values(by="controversy_score", ascending=False)
    st.caption(f"{len(working)} resultaten (top 50 getoond)")

    for _, row in working.head(50).iterrows():
        title = row.get("title") or f"Motie #{row['id']}"
        date_str = row["date"].strftime("%d %b %Y") if pd.notna(row["date"]) else "?"
        # ``or 0`` guards against None, which cannot be formatted with ``:.2f``.
        controversy = row.get("controversy_score") or 0
        margin = row.get("winning_margin") or 0
        with st.expander(f"**{title}** — {date_str} — 🔥 {controversy:.2f}"):
            cols = st.columns(3)
            cols[0].metric("Controverse", f"{controversy:.2f}")
            cols[1].metric("Marge", f"{margin:.2f}")
            cols[2].metric("Jaar", int(row["year"]) if pd.notna(row["year"]) else "?")

            _render_voting_results(row.get("voting_results"))

            url = row.get("url")
            if url and str(url).startswith("http"):
                st.markdown(f"[🔗 Bekijk op Tweede Kamer]({url})")

            # NOTE(review): this lookup runs once per listed motion (up to 50
            # per rerun) — confirm query_similar is cheap or cached.
            sim = explorer_data.query_similar(db_path, int(row["id"]), top_k=5)
            if sim.empty:
                st.caption("_Nog geen vergelijkbare moties beschikbaar_")
                continue

            st.markdown("**Vergelijkbare moties:**")
            for _, s in sim.iterrows():
                s_date = (
                    pd.to_datetime(s["date"]).strftime("%Y")
                    if pd.notna(s.get("date"))
                    else ""
                )
                st.markdown(
                    f"- {s.get('title', 'Onbekend')} *(score: {s['score']:.3f}, {s_date})*"
                )
||||
|
||||
@ -1,20 +1,774 @@ |
||||
"""Trajectories tab for the parliamentary explorer. |
||||
"""Trajectories tab for the parliamentary explorer.""" |
||||
|
||||
This module will contain the trajectories tab implementation. |
||||
Currently: Tab logic remains in explorer.py pending Streamlit decoupling. |
||||
from __future__ import annotations |
||||
|
||||
import json |
||||
import logging |
||||
import os |
||||
import re |
||||
import traceback |
||||
from datetime import datetime |
||||
from typing import Dict, List, Optional, Tuple |
||||
|
||||
import numpy as np |
||||
|
||||
from analysis import config |
||||
import analysis.explorer_data as explorer_data |
||||
from analysis import trajectory |
||||
from analysis.tabs._rendering import ( |
||||
PARTY_COLOURS, |
||||
_add_y_direction_annotations, |
||||
go, |
||||
st, |
||||
) |
||||
from explorer_helpers import compute_party_centroids, inspect_positions_for_issues |
||||
|
||||
KNOWN_MAJOR_PARTIES = config.KNOWN_MAJOR_PARTIES |
||||
|
||||
logger = logging.getLogger(__name__) |
||||
|
||||
_last_trajectories_diagnostics: dict = {} |
||||
_last_diagnostics = _last_trajectories_diagnostics |
||||
|
||||
|
||||
def get_debug_trajectories_enabled() -> bool:
    """Return True when EXPLORER_DEBUG_TRAJECTORIES enables debug mode.

    The variable counts as enabled when its value is ``"1"`` or ``"true"``,
    compared case-insensitively and ignoring surrounding whitespace. An unset
    variable or any other value yields False.
    """
    raw = os.getenv("EXPLORER_DEBUG_TRAJECTORIES", "")
    return raw.strip().lower() in ("1", "true")
||||
|
||||
|
||||
def select_trajectory_plot_data(
    positions_by_window: Dict[str, Dict[str, Tuple[float, float]]],
    party_map: Dict[str, str],
    windows: List[str],
    selected_parties: List[str],
    smooth_alpha: float = 0.35,
    mp_fallback_count: Optional[int] = None,
) -> Tuple[go.Figure, int, Optional[str]]:
    """Build the trajectory figure and return ``(fig, trace_count, banner_text)``.

    Helper used by build_trajectories_tab. Does not call Streamlit.

    One trace is drawn per party from the centroids returned by
    ``compute_party_centroids``. When no party has a usable centroid, the
    function falls back to one grey trace per MP for the MPs that appear in
    the most windows, and ``banner_text`` explains the fallback.

    Args:
        positions_by_window: Window id -> {MP name -> (x, y)}.
        party_map: MP name -> party, passed through to the centroid helper.
        windows: Window ids to plot, in display order.
        selected_parties: Parties to draw; if none of them is plottable,
            every plottable party is drawn instead.
        smooth_alpha: EMA weight of the newest value; values >= 1.0 disable
            smoothing.
        mp_fallback_count: Maximum number of MP traces in the fallback. When
            None it is read from ``EXPLORER_MP_FALLBACK_COUNT`` (default 20).

    Returns:
        The figure, the number of traces added, and the banner text (None
        unless the MP fallback was used).
    """
    if mp_fallback_count is None:
        try:
            mp_fallback_count = int(os.getenv("EXPLORER_MP_FALLBACK_COUNT", "20"))
        except ValueError:
            mp_fallback_count = 20

    party_centroids, meta = compute_party_centroids(
        positions_by_window, party_map, windows
    )

    try:
        inspector_summary = inspect_positions_for_issues(positions_by_window, party_map)
    except Exception:
        # The inspector is diagnostic only: record the failure in both
        # diagnostics holders and carry on with plotting.
        tb = traceback.format_exc()
        inspector_summary = {}
        select_trajectory_plot_data._last_diagnostics = {
            "stage": "inspector_exception",
            "exception": tb,
        }
        _last_trajectories_diagnostics.update(
            {"stage": "inspector_exception", "exception": tb}
        )
    logger.debug("select_trajectory_plot_data inspector summary: %s", inspector_summary)

    # A party is plottable when at least one centroid has a non-NaN coordinate.
    plottable_parties = [
        p
        for p, vals in party_centroids.items()
        if any(not (np.isnan(x) and np.isnan(y)) for x, y in vals)
    ]

    logger.debug(
        "[TRAJ DEBUG] plottable_parties: %d parties, sample=%s",
        len(plottable_parties),
        (plottable_parties[:5] if plottable_parties else "empty"),
    )
    logger.debug(
        "[TRAJ DEBUG] party_centroids keys: %s",
        list(party_centroids.keys())[:10],
    )
    if party_centroids:
        sample_party = next(iter(party_centroids))
        logger.debug(
            "[TRAJ DEBUG] Sample party '%s' centroids: %s...",
            sample_party,
            party_centroids[sample_party][:3],
        )

    fig = go.Figure()
    trace_count = 0

    def _ema_smooth(values: List[float], alpha: float) -> List[float]:
        """Exponentially smooth *values*; None/NaN entries stay NaN gaps."""
        if not values or alpha >= 1.0:
            return values
        smoothed: List[float] = []
        prev = None
        for v in values:
            if v is None or (isinstance(v, float) and np.isnan(v)):
                smoothed.append(float(np.nan))
                continue
            v = float(v)
            prev = v if prev is None else alpha * v + (1 - alpha) * prev
            smoothed.append(float(prev))
        return smoothed

    if not plottable_parties:
        # Fallback: collect per-MP positions across the requested windows.
        mp_positions: Dict[str, Dict[str, Tuple[float, float]]] = {}
        for wid in windows:
            for mp_name, xy in positions_by_window.get(wid, {}).items():
                try:
                    x, y = float(xy[0]), float(xy[1])
                except Exception:
                    continue  # skip malformed coordinates
                mp_positions.setdefault(mp_name, {})[wid] = (x, y)

        # Most active MPs first (stable sort keeps insertion order on ties).
        mp_activity = sorted(
            [(mp, len(wdict)) for mp, wdict in mp_positions.items()],
            key=lambda t: t[1],
            reverse=True,
        )
        top_mps = [mp for mp, _ in mp_activity[:mp_fallback_count]]

        for mp in top_mps:
            wids_sorted = sorted(mp_positions.get(mp, {}).keys())
            if not wids_sorted:
                continue
            xs_raw = [mp_positions[mp][w][0] for w in wids_sorted]
            ys_raw = [mp_positions[mp][w][1] for w in wids_sorted]
            xs = _ema_smooth(xs_raw, smooth_alpha)
            ys = _ema_smooth(ys_raw, smooth_alpha)
            # Unsmoothed coordinates travel along as customdata.
            custom_raw = [
                (
                    float(rx) if rx is not None else float(np.nan),
                    float(ry) if ry is not None else float(np.nan),
                )
                for rx, ry in zip(xs_raw, ys_raw)
            ]
            fig.add_trace(
                go.Scatter(
                    x=xs,
                    y=ys,
                    mode="lines+markers",
                    name=mp,
                    text=wids_sorted,
                    customdata=custom_raw,
                    line=dict(color="#888888", shape="spline", smoothing=1.3),
                    marker=dict(color="#888888", size=6),
                )
            )
            trace_count += 1

        banner_text = "Partijcentroiden niet beschikbaar — tonen individuele MP-trajecten als fallback."
        logger.debug(
            "[TRAJ DEBUG] Fallback to MP trajectories: trace_count=%d, top_mps=%d",
            trace_count,
            len(top_mps),
        )
        return fig, trace_count, banner_text

    to_plot = [p for p in selected_parties if p in plottable_parties]
    if not to_plot:
        to_plot = plottable_parties

    for party in to_plot:
        vals = party_centroids.get(party, [])
        if not vals:
            continue
        xs_raw = [v[0] for v in vals]
        ys_raw = [v[1] for v in vals]
        xs = _ema_smooth(xs_raw, smooth_alpha)
        ys = _ema_smooth(ys_raw, smooth_alpha)
        custom_raw = [
            (
                float(x) if (x is not None and not np.isnan(x)) else float(np.nan),
                float(y) if (y is not None and not np.isnan(y)) else float(np.nan),
            )
            for x, y in zip(xs_raw, ys_raw)
        ]
        colour = PARTY_COLOURS.get(party, "#9E9E9E")
        fig.add_trace(
            go.Scatter(
                x=xs,
                y=ys,
                mode="lines+markers",
                name=party,
                text=windows,
                customdata=custom_raw,
                line=dict(color=colour, shape="spline", smoothing=1.3),
                marker=dict(color=colour, size=8),
            )
        )
        trace_count += 1

    logger.debug(
        "[TRAJ DEBUG] Final trace_count=%d, plottable_parties=%d, to_plot=%s",
        trace_count,
        len(plottable_parties),
        len(to_plot),
    )
    return fig, trace_count, None
||||
|
||||
|
||||
def build_trajectories_tab(db_path: str, window_size: str) -> None: |
||||
"""Build the Partij Trajectories tab. |
||||
"""Build the Partij Trajectories tab.""" |
||||
logging.getLogger(__name__).debug( |
||||
"[TRAJ DEBUG] build_trajectories_tab called — db_path=%s, window_size=%s", |
||||
db_path, |
||||
window_size, |
||||
) |
||||
st.subheader("Partij Trajectories") |
||||
st.markdown("Hoe bewegen partijen over de tijdsvensters heen?") |
||||
|
||||
Currently delegates to explorer.py implementation. |
||||
Will be extracted when rendering logic is decoupled from Streamlit. |
||||
""" |
||||
import explorer |
||||
positions_by_window, axis_def = explorer_data.load_positions(db_path, window_size) |
||||
logging.getLogger(__name__).debug( |
||||
"[TRAJ DEBUG] load_positions → %d windows, total MPs=%d", |
||||
len(positions_by_window), |
||||
sum(len(v) for v in positions_by_window.values()), |
||||
) |
||||
if axis_def is None: |
||||
axis_def = {} |
||||
if not positions_by_window: |
||||
try: |
||||
_last_trajectories_diagnostics.update( |
||||
{ |
||||
"stage": "load_positions_empty", |
||||
"positions_by_window_len": len(positions_by_window), |
||||
} |
||||
) |
||||
except Exception: |
||||
pass |
||||
try: |
||||
st.warning("Geen positiedata beschikbaar.") |
||||
except Exception: |
||||
pass |
||||
try: |
||||
if get_debug_trajectories_enabled(): |
||||
try: |
||||
st.text_area( |
||||
"Trajectories diagnostics", |
||||
json.dumps(_last_trajectories_diagnostics, default=str), |
||||
height=160, |
||||
) |
||||
except Exception: |
||||
pass |
||||
except Exception: |
||||
pass |
||||
return |
||||
|
||||
party_map = explorer_data.load_party_map(db_path) |
||||
logging.getLogger(__name__).debug( |
||||
"[TRAJ DEBUG] load_party_map → %d entries, sample=%s", |
||||
len(party_map), |
||||
list(party_map.items())[:3], |
||||
) |
||||
|
||||
def normalize_mp_name(name): |
||||
"""Normalize MP name for better matching between data sources.""" |
||||
if not name: |
||||
return "" |
||||
name = name.strip() |
||||
if "," in name and ", " not in name: |
||||
name = name.replace(",", ", ") |
||||
return name |
||||
|
||||
party_map = {normalize_mp_name(k): v for k, v in party_map.items()} |
||||
|
||||
normalized_positions = {} |
||||
for window, positions in positions_by_window.items(): |
||||
normalized_positions[window] = { |
||||
normalize_mp_name(k): v for k, v in positions.items() |
||||
} |
||||
positions_by_window = normalized_positions |
||||
|
||||
all_mp_names = set() |
||||
for positions in positions_by_window.values(): |
||||
all_mp_names.update(positions.keys()) |
||||
|
||||
matched_names = sum(1 for mp in all_mp_names if mp in party_map) |
||||
if all_mp_names: |
||||
logger.info( |
||||
f"MP name matching: {matched_names}/{len(all_mp_names)} matched ({100 * matched_names / len(all_mp_names):.1f}%)" |
||||
) |
||||
else: |
||||
logger.info("MP name matching: no MPs found in positions data") |
||||
|
||||
if matched_names == 0 and len(all_mp_names) > 0: |
||||
logger.warning("No MP names matched between positions and party_map!") |
||||
logger.warning(f"Sample positions names: {list(all_mp_names)[:5]}") |
||||
logger.warning(f"Sample party_map names: {list(party_map.keys())[:5]}") |
||||
|
||||
windows = sorted(positions_by_window.keys()) |
||||
|
||||
centroids: Dict[str, Dict[str, Tuple[float, float]]] = {} |
||||
all_parties: set = set() |
||||
|
||||
def _strip_paren(name: str) -> str: |
||||
return re.sub(r"\s*\([^)]*\)", "", name).strip() |
||||
|
||||
for wid in windows: |
||||
pos = positions_by_window.get(wid, {}) |
||||
per_party: Dict[str, List[Tuple[float, float]]] = {} |
||||
for mp_name, (x, y) in pos.items(): |
||||
party = party_map.get(mp_name) or party_map.get( |
||||
_strip_paren(mp_name), "Unknown" |
||||
) |
||||
if party == "Unknown": |
||||
continue |
||||
per_party.setdefault(party, []).append((x, y)) |
||||
for party, coords in per_party.items(): |
||||
all_parties.add(party) |
||||
xs = [c[0] for c in coords] |
||||
ys = [c[1] for c in coords] |
||||
centroids.setdefault(party, {})[wid] = ( |
||||
float(np.mean(xs)), |
||||
float(np.mean(ys)), |
||||
) |
||||
|
||||
all_parties = sorted( |
||||
set(party_map.get(mp) for MPs in positions_by_window.values() for mp in MPs) |
||||
- {None, "Unknown"} |
||||
) |
||||
logging.getLogger(__name__).debug( |
||||
"[TRAJ DEBUG] all_parties (raw from party_map) → %d parties: %s", |
||||
len(all_parties), |
||||
all_parties[:10], |
||||
) |
||||
all_parties_sorted = sorted(all_parties) |
||||
|
||||
if not all_parties_sorted: |
||||
st.info( |
||||
"Geen partijen beschikbaar om trajecten te tekenen. Controleer of de party mapping is geladen (mp_metadata) en of de minimum Kamerleden-instelling te hoog staat." |
||||
) |
||||
try: |
||||
st.caption(f"Bekende partijen in party_map: {len(party_map)}") |
||||
except Exception: |
||||
pass |
||||
|
||||
default_parties = [p for p in ["CDA", "D66", "VVD"] if p in all_parties] |
||||
if not default_parties: |
||||
default_parties = [p for p in KNOWN_MAJOR_PARTIES if p in all_parties] |
||||
if not default_parties: |
||||
default_parties = all_parties_sorted[:6] |
||||
|
||||
selected_parties = st.multiselect( |
||||
"Selecteer partijen", |
||||
options=all_parties_sorted, |
||||
default=default_parties, |
||||
) |
||||
|
||||
def _ema_smooth(values: List[float], alpha: float) -> List[float]: |
||||
if not values or alpha >= 1.0: |
||||
return values |
||||
smoothed = [values[0]] |
||||
for v in values[1:]: |
||||
smoothed.append(alpha * v + (1 - alpha) * smoothed[-1]) |
||||
return smoothed |
||||
|
||||
smooth_alpha = 0.35 |
||||
|
||||
if not centroids: |
||||
st.info( |
||||
"Partijcentroiden niet beschikbaar — tonen individuele MP-trajecten als fallback." |
||||
) |
||||
|
||||
mp_positions: Dict[str, Dict[str, Tuple[float, float]]] = {} |
||||
for wid in windows: |
||||
pos = positions_by_window.get(wid, {}) |
||||
for mp_name, xy in pos.items(): |
||||
try: |
||||
x, y = float(xy[0]), float(xy[1]) |
||||
except Exception: |
||||
continue |
||||
mp_positions.setdefault(mp_name, {})[wid] = (x, y) |
||||
|
||||
mp_positions = { |
||||
mp: pos |
||||
for mp, pos in mp_positions.items() |
||||
if len(pos) >= 2 |
||||
and not all(np.isnan(x) and np.isnan(y) for x, y in pos.values()) |
||||
} |
||||
|
||||
if not mp_positions: |
||||
st.warning("Geen positiedata beschikbaar voor trajectplotten.") |
||||
_last_trajectories_diagnostics.update( |
||||
{ |
||||
"stage": "no_mp_positions", |
||||
"mp_positions_count": 0, |
||||
} |
||||
) |
||||
try: |
||||
if get_debug_trajectories_enabled(): |
||||
try: |
||||
st.text_area( |
||||
"Trajectories diagnostics", |
||||
json.dumps(_last_trajectories_diagnostics, default=str), |
||||
height=160, |
||||
) |
||||
except Exception: |
||||
pass |
||||
except Exception: |
||||
pass |
||||
return |
||||
|
||||
st.session_state["_trajectory_mp_positions"] = mp_positions |
||||
|
||||
mp_list = sorted(mp_positions.keys()) |
||||
default_mps = mp_list[:6] |
||||
selected_mps = st.multiselect( |
||||
"Selecteer Kamerleden (fallback)", options=mp_list, default=default_mps |
||||
) |
||||
|
||||
fig = go.Figure() |
||||
trace_count = 0 |
||||
for mp in selected_mps: |
||||
wids_sorted = sorted(mp_positions[mp].keys()) |
||||
xs_raw = [mp_positions[mp][w][0] for w in wids_sorted] |
||||
ys_raw = [mp_positions[mp][w][1] for w in wids_sorted] |
||||
xs = _ema_smooth(xs_raw, smooth_alpha) |
||||
ys = _ema_smooth(ys_raw, smooth_alpha) |
||||
custom_raw = [(float(rx), float(ry)) for rx, ry in zip(xs_raw, ys_raw)] |
||||
fig.add_trace( |
||||
go.Scatter( |
||||
x=xs, |
||||
y=ys, |
||||
mode="lines+markers", |
||||
name=mp, |
||||
text=wids_sorted, |
||||
customdata=custom_raw, |
||||
line=dict(color="#888888", shape="spline", smoothing=1.3), |
||||
marker=dict(color="#888888", size=6), |
||||
hovertemplate=( |
||||
f"<b>{mp}</b><br>" |
||||
"venster: %{text}<br>" |
||||
"x (smoothed): %{x:.3f}<br>" |
||||
"x (raw): %{customdata[0]:.3f}<br>" |
||||
"y (smoothed): %{y:.3f}<br>" |
||||
"y (raw): %{customdata[1]:.3f}<extra></extra>" |
||||
), |
||||
) |
||||
) |
||||
trace_count += 1 |
||||
|
||||
_add_y_direction_annotations(fig) |
||||
if trace_count == 0: |
||||
st.info( |
||||
"Geen trajecten getekend: geen geselecteerde Kamerleden met voldoende data." |
||||
) |
||||
else: |
||||
st.plotly_chart(fig, use_container_width=True) |
||||
return |
||||
|
||||
if os.getenv("EXPLORER_FORCE_SHOW_TRAJECTORIES") in ("1", "true", "True"): |
||||
mp_positions: Dict[str, Dict[str, Tuple[float, float]]] = {} |
||||
for wid in windows: |
||||
pos = positions_by_window.get(wid, {}) |
||||
for mp_name, (x, y) in pos.items(): |
||||
mp_positions.setdefault(mp_name, {})[wid] = (float(x), float(y)) |
||||
|
||||
mp_list = sorted(mp_positions.keys()) |
||||
if not mp_list: |
||||
st.info("Geen MP-positiegegevens beschikbaar om te tonen.") |
||||
return |
||||
|
||||
sample_mps = mp_list[:6] |
||||
fig = go.Figure() |
||||
for mp in sample_mps: |
||||
wids_sorted = sorted(mp_positions[mp].keys()) |
||||
xs_raw = [mp_positions[mp][w][0] for w in wids_sorted] |
||||
ys_raw = [mp_positions[mp][w][1] for w in wids_sorted] |
||||
xs = _ema_smooth(xs_raw, 0.35) |
||||
ys = _ema_smooth(ys_raw, 0.35) |
||||
custom_raw = [(float(rx), float(ry)) for rx, ry in zip(xs_raw, ys_raw)] |
||||
fig.add_trace( |
||||
go.Scatter( |
||||
x=xs, |
||||
y=ys, |
||||
mode="lines+markers", |
||||
name=mp, |
||||
text=wids_sorted, |
||||
customdata=custom_raw, |
||||
line=dict(color="#444444", shape="spline", smoothing=1.3), |
||||
marker=dict(color="#444444", size=6), |
||||
hovertemplate=( |
||||
f"<b>{mp}</b><br>" |
||||
"venster: %{text}<br>" |
||||
"x (smoothed): %{x:.3f}<br>" |
||||
"x (raw): %{customdata[0]:.3f}<br>" |
||||
"y (smoothed): %{y:.3f}<br>" |
||||
"y (raw): %{customdata[1]:.3f}<extra></extra>" |
||||
), |
||||
) |
||||
) |
||||
_add_y_direction_annotations(fig) |
||||
st.plotly_chart(fig, use_container_width=True) |
||||
return |
||||
|
||||
try: |
||||
debug_checkbox = False |
||||
try: |
||||
debug_checkbox = st.checkbox( |
||||
"Enable trajectories diagnostics (show extra info)", |
||||
value=get_debug_trajectories_enabled(), |
||||
) |
||||
except Exception: |
||||
debug_checkbox = get_debug_trajectories_enabled() |
||||
if debug_checkbox: |
||||
try: |
||||
with st.expander( |
||||
"DEBUG: Trajectories data (showing diagnostics)", expanded=False |
||||
): |
||||
st.write("windows (count):", len(windows)) |
||||
st.write("windows sample:", windows[:10]) |
||||
st.write("party_map entries:", len(party_map)) |
||||
st.write("parties with centroids:", len(all_parties_sorted)) |
||||
st.write("default_parties:", default_parties) |
||||
st.write("selected_parties:", selected_parties) |
||||
st.write("min_mps setting:", 3) |
||||
sample = { |
||||
p: len(centroids.get(p, {})) |
||||
for p in list(all_parties_sorted)[:8] |
||||
} |
||||
st.write("sample centroid window counts per party:", sample) |
||||
except Exception: |
||||
pass |
||||
except Exception: |
||||
pass |
||||
|
||||
smoothing_method = st.selectbox( |
||||
"Smoothing methode", |
||||
options=["EMA", "Spline", "None"], |
||||
index=0, |
||||
help="EMA = exponential moving average; Spline = low-degree polynomial spline fit; None = raw centroids", |
||||
) |
||||
|
||||
smooth_alpha = 1.0 |
||||
if smoothing_method == "EMA": |
||||
smooth_alpha = st.slider( |
||||
"Glad maken (EMA-\u03b1)", |
||||
min_value=0.1, |
||||
max_value=1.0, |
||||
value=0.35, |
||||
step=0.05, |
||||
help=( |
||||
"\u03b1=1.0 toont de ruwe data; lagere waarden maken de lijn gladder. " |
||||
"Standaard 0.35 voor een goed evenwicht tussen detail en ruis." |
||||
), |
||||
) |
||||
|
||||
def _spline_smooth(values: List[float]) -> List[float]: |
||||
n = len(values) |
||||
if n <= 2: |
||||
return values |
||||
deg = min(3, n - 1) |
||||
try: |
||||
idx = np.arange(n, dtype=float) |
||||
coeffs = np.polyfit(idx, np.array(values, dtype=float), deg=deg) |
||||
smooth = np.polyval(coeffs, idx) |
||||
return [float(v) for v in smooth] |
||||
except Exception: |
||||
return values |
||||
|
||||
fig = go.Figure() |
||||
trace_count = 0 |
||||
helper_succeeded = False |
||||
try: |
||||
fig2, trace_count2, banner_text = select_trajectory_plot_data( |
||||
positions_by_window, party_map, windows, selected_parties, smooth_alpha |
||||
) |
||||
if fig2 is not None: |
||||
fig = fig2 |
||||
trace_count = trace_count2 |
||||
helper_succeeded = True |
||||
if banner_text: |
||||
try: |
||||
st.caption(banner_text) |
||||
except Exception: |
||||
pass |
||||
try: |
||||
_last_trajectories_diagnostics.update({"banner_text": banner_text}) |
||||
except Exception: |
||||
pass |
||||
except Exception as e: |
||||
tb = traceback.format_exc() |
||||
try: |
||||
select_trajectory_plot_data._last_diagnostics = {"exception": tb} |
||||
except Exception: |
||||
pass |
||||
try: |
||||
_last_trajectories_diagnostics.update( |
||||
{"stage": "select_helper_exception", "exception": tb} |
||||
) |
||||
except Exception: |
||||
pass |
||||
logger.exception("select_trajectory_plot_data failed") |
||||
debug_enabled = get_debug_trajectories_enabled() |
||||
if debug_enabled: |
||||
try: |
||||
st.text_area("select_trajectory_plot_data traceback", tb, height=240) |
||||
except Exception: |
||||
pass |
||||
logging.getLogger(__name__).debug( |
||||
"[TRAJ DEBUG] helper_succeeded=%s", helper_succeeded |
||||
) |
||||
if not helper_succeeded: |
||||
for party in selected_parties: |
||||
if party not in centroids: |
||||
continue |
||||
wids_sorted = sorted(centroids[party].keys()) |
||||
xs_raw = [centroids[party][w][0] for w in wids_sorted] |
||||
ys_raw = [centroids[party][w][1] for w in wids_sorted] |
||||
xs = _ema_smooth(xs_raw, smooth_alpha) |
||||
ys = _ema_smooth(ys_raw, smooth_alpha) |
||||
custom_raw = [(float(rx), float(ry)) for rx, ry in zip(xs_raw, ys_raw)] |
||||
colour = PARTY_COLOURS.get(party, "#9E9E9E") |
||||
fig.add_trace( |
||||
go.Scatter( |
||||
x=xs, |
||||
y=ys, |
||||
mode="lines+markers", |
||||
name=party, |
||||
text=wids_sorted, |
||||
customdata=custom_raw, |
||||
line=dict(color=colour, shape="spline", smoothing=1.3), |
||||
marker=dict(color=colour, size=8), |
||||
hovertemplate=( |
||||
f"<b>{party}</b><br>" |
||||
"venster: %{text}<br>" |
||||
"x (smoothed): %{x:.3f}<br>" |
||||
"x (raw): %{customdata[0]:.3f}<br>" |
||||
"y (smoothed): %{y:.3f}<br>" |
||||
"y (raw): %{customdata[1]:.3f}<extra></extra>" |
||||
), |
||||
) |
||||
) |
||||
trace_count += 1 |
||||
|
||||
_THRESHOLD = 0.65 |
||||
x_conf_map = axis_def.get("x_label_confidence", {}) or {} |
||||
y_conf_map = axis_def.get("y_label_confidence", {}) or {} |
||||
|
||||
def _mean_conf(m: dict) -> Optional[float]: |
||||
vals = [v for v in m.values() if v is not None] |
||||
if not vals: |
||||
return None |
||||
return float(sum(vals) / len(vals)) |
||||
|
||||
x_mean = _mean_conf(x_conf_map) |
||||
y_mean = _mean_conf(y_conf_map) |
||||
|
||||
x_title = trajectory.choose_trajectory_title(axis_def, "x", threshold=_THRESHOLD) |
||||
y_title = trajectory.choose_trajectory_title(axis_def, "y", threshold=_THRESHOLD) |
||||
|
||||
fig.update_layout( |
||||
title="Partij trajectories", |
||||
xaxis_title=x_title, |
||||
yaxis_title=y_title, |
||||
height=600, |
||||
legend_title_text="Partij", |
||||
) |
||||
_add_y_direction_annotations(fig) |
||||
try: |
||||
_last_trajectories_diagnostics.update({"trace_count": trace_count}) |
||||
except Exception: |
||||
pass |
||||
debug_enabled = get_debug_trajectories_enabled() |
||||
if trace_count == 0: |
||||
_last_trajectories_diagnostics.update( |
||||
{ |
||||
"stage": "zero_traces", |
||||
"positions_count": sum(len(pos) for pos in positions_by_window.values()) |
||||
if positions_by_window |
||||
else 0, |
||||
"party_map_count": len(party_map) if party_map else 0, |
||||
"centroids_count": len(centroids) if centroids else 0, |
||||
"selected_parties_count": len(selected_parties) |
||||
if selected_parties |
||||
else 0, |
||||
"timestamp": datetime.now().isoformat(), |
||||
} |
||||
) |
||||
if positions_by_window and party_map and not centroids: |
||||
sample_mps = [] |
||||
for window, positions in list(positions_by_window.items())[:1]: |
||||
sample_mps = list(positions.keys())[:5] |
||||
break |
||||
matched = sum(1 for mp in sample_mps if mp in party_map) |
||||
_last_trajectories_diagnostics["name_match_check"] = { |
||||
"sample_mps": sample_mps, |
||||
"matched_in_party_map": matched, |
||||
"sample_size": len(sample_mps), |
||||
} |
||||
if trace_count == 0: |
||||
st.info("📊 **Geen trajecten getekend**") |
||||
|
||||
with st.expander("🔍 Diagnostische informatie"): |
||||
st.write("**Data status:**") |
||||
st.write( |
||||
f"- Positie vensters: {len(positions_by_window) if positions_by_window else 0}" |
||||
) |
||||
st.write(f"- Party mappings: {len(party_map) if party_map else 0}") |
||||
st.write( |
||||
f"- Geselecteerde partijen: {len(selected_parties) if selected_parties else 0}" |
||||
) |
||||
|
||||
if "centroid_diagnostics" in locals(): |
||||
st.write("**Centroid berekening:**") |
||||
st.write( |
||||
f"- Partijen met posities: {len(centroid_diagnostics.get('parties_with_positions', []))}" |
||||
) |
||||
st.write( |
||||
f"- Partijen met alleen NaN: {len(centroid_diagnostics.get('parties_all_nan', []))}" |
||||
) |
||||
|
||||
st.write("\n**Mogelijke oorzaken:**") |
||||
st.write("1. Geen SVD vectoren berekend voor de geselecteerde vensters") |
||||
st.write("2. MP namen in posities komen niet overeen met party_map") |
||||
st.write("3. Alle geselecteerde partijen hebben te weinig MPs (< 5)") |
||||
|
||||
if st.button("🔧 Database diagnostiek uitvoeren"): |
||||
with st.spinner("Bezig met diagnostiek..."): |
||||
from scripts.diagnose_trajectories_cli import ( |
||||
run as diagnose_trajectories, |
||||
) |
||||
|
||||
explorer.build_trajectories_tab(db_path, window_size) |
||||
results = diagnose_trajectories(db_path) |
||||
st.json(results) |
||||
else: |
||||
try: |
||||
st.info( |
||||
f"[DEBUG] trace_count={trace_count}, fig data count={len(fig.data)}, layout title={fig.layout.title.text if fig.layout.title else 'none'}" |
||||
) |
||||
except Exception: |
||||
pass |
||||
try: |
||||
logging.getLogger(__name__).debug( |
||||
"[TRAJ DEBUG] About to render plotly chart — trace_count=%d, banner=%s, fig has %d traces", |
||||
trace_count, |
||||
banner_text, |
||||
len(fig.data), |
||||
) |
||||
st.plotly_chart(fig, use_container_width=True) |
||||
except Exception as e: |
||||
st.error(f"Trajectories rendering failed: {e}") |
||||
if get_debug_trajectories_enabled(): |
||||
try: |
||||
st.json(_last_trajectories_diagnostics) |
||||
except Exception: |
||||
st.text_area( |
||||
"Trajectories diagnostics (JSON failed)", |
||||
json.dumps(_last_trajectories_diagnostics, default=str), |
||||
height=240, |
||||
) |
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,170 @@ |
||||
"""Automated pipeline scheduling. |
||||
|
||||
Runs the parliamentary embedding pipeline and motion summarization |
||||
on a configurable schedule using the `schedule` library. |
||||
|
||||
Usage: |
||||
uv run python scheduler.py # start scheduler loop |
||||
uv run python scheduler.py --once # run once and exit |
||||
uv run python scheduler.py --pipeline-time 03:00 --summarizer-every 6 |
||||
""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import argparse |
||||
import logging |
||||
import signal |
||||
import sys |
||||
import time |
||||
from typing import Callable |
||||
|
||||
import schedule |
||||
|
||||
from config import config |
||||
import argparse |
||||
|
||||
from pipeline.run_pipeline import run as run_pipeline |
||||
from summarizer import summarizer |
||||
|
||||
_logger = logging.getLogger(__name__) |
||||
|
||||
|
||||
class PipelineScheduler:
    """Schedule and run the embedding pipeline and the motion summarizer.

    Jobs are registered on the module-level ``schedule`` registry and executed
    by the blocking loop in :meth:`start`.
    """

    def __init__(self, db_path: str = "data/motions.db"):
        """Store *db_path* for pipeline runs; the loop starts out stopped."""
        self.db_path = db_path
        self._running = False

    def run_pipeline(self) -> int:
        """Run the full embedding pipeline.

        Returns the pipeline's integer result, ``0`` when the pipeline returns
        a non-integer value (e.g. ``None``), or ``1`` when it raises. The
        exception is logged and never propagates to the caller.
        """
        _logger.info("Starting scheduled pipeline run")
        try:
            # Namespace handed to the pipeline entry point; every stage enabled.
            args = argparse.Namespace(
                db_path=self.db_path,
                window_size="annual",
                start_date=None,
                end_date=None,
                svd_k=50,
                svd_workers=None,
                text_model=None,
                text_batch_size=200,
                skip_metadata=False,
                skip_extract=False,
                skip_svd=False,
                skip_text=False,
                skip_fusion=False,
                dry_run=False,
            )
            # NOTE: resolves to the module-level ``run_pipeline`` import,
            # not to this method.
            result = run_pipeline(args)
            _logger.info("Pipeline run completed with code %s", result)
            return result if isinstance(result, int) else 0
        except Exception:
            _logger.exception("Pipeline run failed")
            return 1

    def run_summarizer(self) -> None:
        """Run motion summarization for missing explanations.

        Failures are logged and swallowed.
        """
        _logger.info("Starting scheduled summarizer run")
        try:
            summarizer.update_motion_summaries()
            _logger.info("Summarizer run completed")
        except Exception:
            _logger.exception("Summarizer run failed")

    def schedule_daily(self, time_str: str = "02:00") -> None:
        """Schedule the pipeline to run daily at *time_str*."""
        _logger.info("Scheduling daily pipeline run at %s", time_str)
        schedule.every().day.at(time_str).do(self.run_pipeline)

    def schedule_summarizer(self, every_n_hours: int = 6) -> None:
        """Schedule the summarizer to run every *every_n_hours* hours."""
        _logger.info("Scheduling summarizer every %s hours", every_n_hours)
        schedule.every(every_n_hours).hours.do(self.run_summarizer)

    def _signal_handler(self, signum, frame) -> None:
        """Handle shutdown signals gracefully by requesting the loop to stop."""
        _logger.info("Received signal %s, shutting down", signum)
        self.stop()

    def start(self) -> None:
        """Start the scheduler loop.

        Blocks until :meth:`stop` is called or a signal is received. SIGTERM
        and SIGINT handlers are installed for the duration of the loop; the
        handlers that were active before are restored when the loop exits,
        including when it exits through an exception.
        """
        self._running = True
        previous_handlers = {}
        try:
            # Register signal handlers for graceful shutdown, remembering the
            # handlers they replace so they can be put back afterwards.
            for signum in (signal.SIGTERM, signal.SIGINT):
                previous_handlers[signum] = signal.signal(
                    signum, self._signal_handler
                )

            _logger.info("Scheduler started")
            while self._running:
                schedule.run_pending()
                time.sleep(1)
        finally:
            self._running = False
            for signum, handler in previous_handlers.items():
                # ``None`` means the previous handler was not installed from
                # Python and therefore cannot be re-installed.
                if handler is not None:
                    signal.signal(signum, handler)
        _logger.info("Scheduler stopped")

    def stop(self) -> None:
        """Stop the scheduler loop."""
        self._running = False
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scheduler script.

    Options: ``--db-path``, ``--pipeline-time``, ``--summarizer-every`` and the
    ``--once`` flag. Defaults are shown in ``--help`` output.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automated pipeline scheduler",
    )
    option_specs = (
        (
            "--db-path",
            {"default": "data/motions.db", "help": "Path to the DuckDB file"},
        ),
        (
            "--pipeline-time",
            {"default": "02:00", "help": "Daily pipeline run time (HH:MM)"},
        ),
        (
            "--summarizer-every",
            {"type": int, "default": 6, "help": "Run summarizer every N hours"},
        ),
        (
            "--once",
            {
                "action": "store_true",
                "help": "Run pipeline + summarizer once and exit (no scheduling loop)",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Entry point: parse *argv*, configure logging, then run or schedule jobs.

    With ``--once`` the pipeline and summarizer each run a single time and the
    pipeline's exit code is returned. Otherwise the daily pipeline job (and the
    summarizer job, when ``--summarizer-every`` is positive) is registered and
    the blocking scheduler loop is started; ``0`` is returned once it ends.
    """
    options = build_parser().parse_args(argv)

    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=logging.INFO,
    )

    runner = PipelineScheduler(db_path=options.db_path)

    if not options.once:
        runner.schedule_daily(options.pipeline_time)
        if options.summarizer_every > 0:
            runner.schedule_summarizer(options.summarizer_every)
        runner.start()
        return 0

    _logger.info("Running in single-shot mode")
    exit_code = runner.run_pipeline()
    runner.run_summarizer()
    return exit_code
||||
|
||||
|
||||
if __name__ == "__main__": |
||||
sys.exit(main()) |
||||
@ -0,0 +1,95 @@ |
||||
"""Tests for explorer.py decomposition (P3-001). |
||||
|
||||
Acceptance criteria: |
||||
- explorer.py must be under 1500 lines. |
||||
- Tab modules must define their build functions locally (not re-export from explorer). |
||||
- No circular imports between explorer.py and analysis.tabs. |
||||
""" |
||||
|
||||
import ast |
||||
import inspect |
||||
import pathlib |
||||
|
||||
|
||||
class TestExplorerDecomposition:
    """Structural acceptance checks for the explorer.py decomposition.

    All paths are relative to the current working directory.
    """

    @staticmethod
    def _imports_explorer(source: str) -> bool:
        """Return True when *source* imports the top-level ``explorer`` module.

        Import statements are inspected via the AST so that look-alike names
        (``explorer_data``, ``analysis.explorer_data``) and mentions inside
        comments or strings are not mistaken for an import of ``explorer``.
        """
        for node in ast.walk(ast.parse(source)):
            if isinstance(node, ast.Import):
                if any(alias.name.split(".")[0] == "explorer" for alias in node.names):
                    return True
            elif isinstance(node, ast.ImportFrom):
                # Relative imports (level > 0) resolve inside the package and
                # therefore cannot refer to the top-level explorer.py.
                if node.level == 0 and (node.module or "").split(".")[0] == "explorer":
                    return True
        return False

    def test_explorer_line_count_under_1500(self):
        """explorer.py must be under 1500 lines."""
        path = pathlib.Path("explorer.py")
        lines = path.read_text(encoding="utf-8").splitlines()
        assert len(lines) < 1500, (
            f"explorer.py has {len(lines)} lines; target is < 1500. "
            f"Extract tab functions and rendering helpers into analysis/tabs/."
        )

    def test_tab_modules_define_functions_locally(self):
        """Each tab module must define its build_*_tab without delegating to explorer."""
        tabs = [
            ("analysis/tabs/compass.py", "build_compass_tab"),
            ("analysis/tabs/trajectories.py", "build_trajectories_tab"),
            ("analysis/tabs/search.py", "build_search_tab"),
            ("analysis/tabs/browser.py", "build_browser_tab"),
            ("analysis/tabs/components.py", "build_svd_components_tab"),
            ("analysis/tabs/quiz.py", "build_mp_quiz_tab"),
        ]
        for module_path, func_name in tabs:
            source = pathlib.Path(module_path).read_text(encoding="utf-8")
            tree = ast.parse(source)
            func_def = None
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef) and node.name == func_name:
                    func_def = node
                    break
            assert func_def is not None, (
                f"{module_path} must define {func_name}"
            )
            # Ensure it's not a short stub that merely forwards to explorer.
            # ``body`` holds top-level statements of the function, not lines.
            body = func_def.body
            assert len(body) > 3, (
                f"{module_path}.{func_name} looks like a stub "
                f"({len(body)} top-level statements). "
                f"Extract the real implementation from explorer.py."
            )

    def test_rendering_helpers_extracted(self):
        """Rendering helpers should not live in explorer.py."""
        helpers = [
            "_render_scree_plot",
            "_build_party_axis_figure",
            "_render_party_axis_chart",
            "_render_party_axis_chart_1d",
            "_render_svd_time_trajectory",
            "_render_voting_results",
            "_add_y_direction_annotations",
        ]
        source = pathlib.Path("explorer.py").read_text(encoding="utf-8")
        tree = ast.parse(source)
        defined = {node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)}
        for helper in helpers:
            assert helper not in defined, (
                f"{helper} should be extracted from explorer.py "
                f"into analysis/tabs/_rendering.py"
            )

    def test_no_circular_import_tabs_to_explorer(self):
        """Tab modules must not import from explorer."""
        tab_modules = [
            "analysis/tabs/compass.py",
            "analysis/tabs/trajectories.py",
            "analysis/tabs/search.py",
            "analysis/tabs/browser.py",
            "analysis/tabs/components.py",
            "analysis/tabs/quiz.py",
            "analysis/tabs/_rendering.py",
        ]
        for module_path in tab_modules:
            path = pathlib.Path(module_path)
            if not path.exists():
                continue
            source = path.read_text(encoding="utf-8")
            assert not self._imports_explorer(source), (
                f"{module_path} imports explorer.py — "
                f"move shared helpers to explorer_data.py or _rendering.py instead"
            )
||||
@ -0,0 +1,159 @@ |
||||
"""Tests for scheduler.py — automated pipeline scheduling. |
||||
|
||||
TDD: write failing test, implement, refactor. |
||||
""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import signal |
||||
from unittest.mock import MagicMock, patch |
||||
|
||||
import pytest |
||||
|
||||
|
||||
class TestPipelineSchedulerInit:
    """Constructor behaviour of ``PipelineScheduler``."""

    def test_default_db_path(self):
        """Without arguments the default database path is used and the loop is idle."""
        from scheduler import PipelineScheduler

        instance = PipelineScheduler()
        assert instance.db_path == "data/motions.db"
        assert not instance._running

    def test_custom_db_path(self):
        """An explicit ``db_path`` is stored unchanged."""
        from scheduler import PipelineScheduler

        custom_path = "/tmp/test.db"
        instance = PipelineScheduler(db_path=custom_path)
        assert instance.db_path == "/tmp/test.db"
||||
|
||||
|
||||
class TestPipelineSchedulerRunPipeline:
    """``PipelineScheduler.run_pipeline`` delegates to the pipeline and reports failures."""

    def test_calls_pipeline_run_with_db_path(self):
        """The pipeline entry point receives a namespace carrying the db path."""
        from scheduler import PipelineScheduler

        target = PipelineScheduler(db_path="/tmp/test.db")
        with patch("scheduler.run_pipeline", return_value=0) as fake_run:
            target.run_pipeline()
            fake_run.assert_called_once()
            # The first positional argument is the argparse-style namespace.
            namespace = fake_run.call_args[0][0]
            assert namespace.db_path == "/tmp/test.db"

    def test_logs_error_on_pipeline_failure(self):
        """A raising pipeline yields exit code 1 and one logged exception."""
        from scheduler import PipelineScheduler

        target = PipelineScheduler()
        failure = RuntimeError("pipeline failed")
        with patch("scheduler.run_pipeline", side_effect=failure), patch(
            "scheduler._logger"
        ) as fake_logger:
            outcome = target.run_pipeline()
            assert outcome == 1
            fake_logger.exception.assert_called_once()
||||
|
||||
|
||||
class TestPipelineSchedulerRunSummarizer:
    """``PipelineScheduler.run_summarizer`` delegates to the summarizer and logs failures."""

    def test_calls_summarizer_update(self):
        """The summarizer's ``update_motion_summaries`` is invoked exactly once."""
        from scheduler import PipelineScheduler

        target = PipelineScheduler()
        with patch("scheduler.summarizer") as fake_summarizer:
            target.run_summarizer()
            fake_summarizer.update_motion_summaries.assert_called_once()

    def test_logs_error_on_summarizer_failure(self):
        """A raising summarizer is swallowed and logged via ``_logger.exception``."""
        from scheduler import PipelineScheduler

        target = PipelineScheduler()
        with patch("scheduler.summarizer") as fake_summarizer, patch(
            "scheduler._logger"
        ) as fake_logger:
            fake_summarizer.update_motion_summaries.side_effect = RuntimeError(
                "summarizer failed"
            )
            target.run_summarizer()
            fake_logger.exception.assert_called_once()
||||
|
||||
|
||||
class TestPipelineSchedulerSchedule:
    """Job registration on the ``schedule`` registry.

    The assertions follow the exact call chain used by the scheduler
    (``every().day.at(t).do(job)`` and ``every(n).hours.do(job)``) so that a
    wrong interval, time or job callable makes the test fail.
    """

    def test_schedule_daily_adds_job(self):
        """The pipeline job is registered daily at the requested time."""
        from scheduler import PipelineScheduler

        sched = PipelineScheduler()
        with patch("scheduler.schedule") as mock_schedule:
            sched.schedule_daily("02:00")
            mock_schedule.every.assert_called_once_with()
            at_call = mock_schedule.every.return_value.day.at
            at_call.assert_called_once_with("02:00")
            # Bound methods of the same instance compare equal.
            at_call.return_value.do.assert_called_once_with(sched.run_pipeline)

    def test_schedule_summarizer_adds_job(self):
        """The summarizer job is registered with the requested hour interval."""
        from scheduler import PipelineScheduler

        sched = PipelineScheduler()
        with patch("scheduler.schedule") as mock_schedule:
            sched.schedule_summarizer(every_n_hours=6)
            mock_schedule.every.assert_called_once_with(6)
            # The scheduler uses the plural ``hours`` unit.
            hours_do = mock_schedule.every.return_value.hours.do
            hours_do.assert_called_once_with(sched.run_summarizer)
||||
|
||||
|
||||
class TestPipelineSchedulerLoop:
    """Start/stop behaviour of the blocking scheduler loop."""

    def test_start_runs_pending_jobs(self):
        """The loop polls ``schedule.run_pending`` until ``stop`` is called."""
        from scheduler import PipelineScheduler

        sched = PipelineScheduler()
        sleeps_seen = 0

        def _stop_on_third_sleep(*args, **kwargs):
            # Stand-in for time.sleep: ends the loop once it has slept 3 times.
            nonlocal sleeps_seen
            sleeps_seen += 1
            if sleeps_seen >= 3:
                sched.stop()

        with patch("scheduler.schedule.run_pending") as mock_run_pending, patch(
            "scheduler.time.sleep", side_effect=_stop_on_third_sleep
        ), patch("scheduler.signal.signal"):
            sched.start()
            assert mock_run_pending.called
            assert not sched._running

    def test_stop_sets_running_false(self):
        """``stop`` clears the running flag."""
        from scheduler import PipelineScheduler

        sched = PipelineScheduler()
        sched._running = True
        sched.stop()
        assert not sched._running

    def test_signal_handler_stops_scheduler(self):
        """The signal handler delegates to ``stop`` exactly once."""
        from scheduler import PipelineScheduler

        sched = PipelineScheduler()
        sched._running = True
        with patch.object(sched, "stop") as fake_stop:
            sched._signal_handler(signal.SIGINT, None)
            fake_stop.assert_called_once()
||||
|
||||
|
||||
class TestSchedulerCLI:
    """Command-line entry point (``scheduler.main``)."""

    def test_main_parses_args(self):
        """Loop mode: the daily job uses ``--pipeline-time`` and the loop is started."""
        from scheduler import main

        with patch("scheduler.PipelineScheduler") as scheduler_factory:
            instance = MagicMock()
            scheduler_factory.return_value = instance
            exit_code = main(["--pipeline-time", "03:00"])
            assert exit_code == 0
            scheduler_factory.assert_called_once_with(db_path="data/motions.db")
            instance.schedule_daily.assert_called_once_with("03:00")
            instance.start.assert_called_once()

    def test_main_custom_db_path(self):
        """Single-shot mode: ``--db-path`` is forwarded and the pipeline runs once."""
        from scheduler import main

        with patch("scheduler.PipelineScheduler") as scheduler_factory:
            instance = MagicMock()
            instance.run_pipeline.return_value = 0
            scheduler_factory.return_value = instance
            exit_code = main(["--db-path", "/tmp/test.db", "--once"])
            assert exit_code == 0
            scheduler_factory.assert_called_once_with(db_path="/tmp/test.db")
            instance.run_pipeline.assert_called_once()
||||
Loading…
Reference in new issue