diff --git a/explorer.py b/explorer.py index 7d2475e..7c01fce 100644 --- a/explorer.py +++ b/explorer.py @@ -252,23 +252,27 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]: @st.cache_data(show_spinner="Scree-plot laden…") def load_scree_data(db_path: str) -> List[float]: - """Return a list of component importances (L2-norm of party scores per dimension). + """Return component importances (L2-norm per SVD dimension), sorted descending. - Uses the same svd_vectors data as load_party_axis_scores but aggregates across - all components (0-indexed). Returns a list of length == vector dimensionality (50). + Uses ALL individual MP vectors (entity_type='mp', window='current_parliament'), + excluding party-aggregated rows. Since the stored vectors are U*s (scaled by + singular values), the L2-norm of all MP scores per dimension approximates the + singular value for that dimension. Sorting descending gives the proper scree shape. + + Note: Procrustes alignment across sub-windows may scramble the original dimension + ordering, so we sort by magnitude rather than relying on dimension index order. """ try: con = duckdb.connect(database=db_path, read_only=True) - party_list = sorted(CURRENT_PARLIAMENT_PARTIES) - placeholders = ", ".join("?" for _ in party_list) rows = con.execute( - f"SELECT vector FROM svd_vectors " - f"WHERE entity_type='mp' AND window_id='current_parliament' " - f"AND entity_id IN ({placeholders})", - party_list, + "SELECT entity_id, vector FROM svd_vectors " + "WHERE entity_type='mp' AND window_id='current_parliament'" ).fetchall() + # Individual MPs have "Lastname, F." format; party rows are short codes without commas vectors: List[List[float]] = [] - for (raw_vec,) in rows: + for entity_id, raw_vec in rows: + if "," not in entity_id: + continue # skip party-aggregated rows if isinstance(raw_vec, str): vec = json.loads(raw_vec) elif isinstance(raw_vec, (bytes, bytearray)): @@ -289,7 +293,7 @@ def load_scree_data(db_path: str) -> List[float]: col = [v[dim] for v in vectors if dim < len(v)] l2 = sum(x**2 for x in col) ** 0.5 importances.append(l2) - return importances + return sorted(importances, reverse=True) except Exception: logger.exception("Failed to load scree data") return [] @@ -301,33 +305,47 @@ def load_scree_data(db_path: str) -> List[float]: def _render_scree_plot(importances: List[float], n_show: int = 15) -> None: - """Render a bar chart showing relative component importance (scree plot). + """Render a bar+line combo chart showing relative SVD component importance. + + Bars show the L2-norm (singular value proxy) per rank; a line connects the tops + of the bars to make the 'elbow' in the scree curve easy to spot. Args: - importances: List of L2-norm scores per component (0-indexed). + importances: List of importance values sorted descending (from load_scree_data). n_show: How many components to display (default: first 15). """ if not importances: return data = importances[:n_show] - components = list(range(1, len(data) + 1)) - colours = [ - PARTY_COLOURS.get("PVV", "#1565C0") if i == 0 else "#90CAF9" - for i in range(len(data)) - ] - fig = go.Figure( + ranks = list(range(1, len(data) + 1)) + bar_colour = "#90CAF9" + line_colour = "#1565C0" + fig = go.Figure() + fig.add_trace( go.Bar( - x=components, + x=ranks, y=data, - marker_color=colours, - hovertemplate="As %{x}
Gewicht: %{y:.2f}", + marker_color=bar_colour, + hovertemplate="Rang %{x}
Gewicht: %{y:.2f}", + showlegend=False, + ) + ) + fig.add_trace( + go.Scatter( + x=ranks, + y=data, + mode="lines+markers", + line={"color": line_colour, "width": 2}, + marker={"size": 6, "color": line_colour}, + hoverinfo="skip", + showlegend=False, ) ) fig.update_layout( height=220, margin={"l": 10, "r": 10, "t": 10, "b": 30}, xaxis={ - "title": "SVD-as", + "title": "Rang", "tickmode": "linear", "tick0": 1, "dtick": 1, @@ -342,6 +360,7 @@ def _render_scree_plot(importances: List[float], n_show: int = 15) -> None: }, plot_bgcolor="rgba(0,0,0,0)", paper_bgcolor="rgba(0,0,0,0)", + bargap=0.2, ) st.plotly_chart(fig, use_container_width=True)