fix: scree plot now shows true EVR from Procrustes-aligned multi-window SVD

Previously load_scree_data computed L2-norms per dimension on current_parliament vectors only, giving ~11% for PC1. This was inconsistent with the compass which uses all windows + Procrustes alignment and gets PC1=24.1%. Added compute_svd_spectrum() helper to political_axis.py that reuses the same alignment pipeline. load_scree_data now delegates to it. _render_scree_plot no longer re-normalizes (inputs are already EVR percentages). Hover label updated to 'verklaarde variantie'.
1 month ago · 98b2583efd
parent e0f17e8b83
commit 98b2583efd
2 changed files with 78 additions and 43 deletions
--- a/analysis/political_axis.py
+++ b/analysis/political_axis.py
@ -551,3 +551,71 @@ def compute_2d_axes(
    else:
        raise ValueError("Unknown method '%s'" % method)
 def compute_svd_spectrum(
    db_path: str,
    window_ids: Optional[List[str]] = None,
    normalize_vectors: bool = True,
 ) -> List[float]:
    """Return explained variance ratios (%) for all SVD components, sorted descending.

    MP vectors are loaded for every requested window, zero-padded to a common
    dimension, passed through the trajectory module's Procrustes window
    alignment, stacked into one matrix, mean-centred per column and decomposed
    with SVD. The squared singular values, normalised to sum to 100, are the
    returned percentages. This is intended to mirror the pipeline behind
    compute_2d_axes so the scree plot is consistent with the compass axes.

    Args:
        db_path: path to duckdb
        window_ids: optional ordered list of windows (defaults to every window
            reported by the trajectory module)
        normalize_vectors: whether to L2-normalise each MP vector before stacking

    Returns:
        List of EVR percentages as plain floats, sorted descending
        (e.g. [24.1, 10.4, 7.2, ...]). An empty list is returned when there are
        no windows, when no window contains any vector, or when the SVD fails.
    """
    # Resolved at call time through importlib rather than a module-level import
    # (presumably to avoid a circular import with analysis.trajectory -- TODO confirm).
    import importlib

    _trajectory = importlib.import_module("analysis.trajectory")

    if window_ids is None:
        window_ids = _trajectory._load_window_ids(db_path)

    raw_window_vecs: Dict[str, Dict[str, np.ndarray]] = {
        wid: _trajectory._load_mp_vectors_for_window(db_path, wid)
        for wid in window_ids
    }

    # Collect every vector length up front. An empty list covers both "no
    # windows" and "windows without any vectors"; the latter used to crash in
    # max() with "ValueError: max() arg is an empty sequence".
    dims = [v.shape[0] for d in raw_window_vecs.values() for v in d.values()]
    if not dims:
        return []

    # Pad to uniform dimension before Procrustes alignment
    max_dim = max(dims)
    padded: Dict[str, Dict[str, np.ndarray]] = {}
    for wid, d in raw_window_vecs.items():
        padded[wid] = {
            e: np.pad(v, (0, max_dim - v.shape[0])) if v.shape[0] < max_dim else v
            for e, v in d.items()
        }

    aligned = _trajectory._procrustes_align_windows(padded)

    all_vecs = []
    for d in aligned.values():
        for v in d.values():
            if normalize_vectors:
                n = np.linalg.norm(v)
                # Near-zero vectors are kept unscaled to avoid dividing by ~0.
                all_vecs.append(v / n if n > 1e-10 else v)
            else:
                all_vecs.append(v)
    if not all_vecs:
        return []

    M = np.vstack(all_vecs)
    Mc = M - M.mean(axis=0)  # centre each column so s**2 is proportional to variance
    try:
        _, s, _ = np.linalg.svd(Mc, full_matrices=False)
    except np.linalg.LinAlgError:
        _logger.exception("SVD failed in compute_svd_spectrum")
        return []

    sv2 = s**2
    # The tiny epsilon keeps the division finite when every singular value is 0.
    evr = sv2 / (sv2.sum() + 1e-20) * 100
    # Singular values come back in descending order, so no extra sort is needed;
    # tolist() converts numpy scalars to built-in floats.
    return evr.tolist()
--- a/explorer.py
+++ b/explorer.py
@ -487,51 +487,18 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]:
@st.cache_data(show_spinner="Scree-plot laden…")
 def load_scree_data(db_path: str) -> List[float]:
-    """Return per-component importances (L2-norm per SVD dim), sorted descending.
+    """Return explained variance ratios (%) for all SVD components, sorted descending.
-    Uses individual MP vectors from current_parliament (entity_id LIKE '%,%').
+    Uses the same Procrustes-aligned multi-window matrix as the compass axes so the
-    Computes L2-norm per SVD dimension across all MPs, then sorts descending
+    scree plot is consistent with what the compass actually uses.
    so the elbow shape is visible in the scree chart.
    """
    try:
-        con = duckdb.connect(database=db_path, read_only=True)
+        from analysis.political_axis import compute_svd_spectrum
-        rows = con.execute(
+
-            "SELECT entity_id, vector FROM svd_vectors "
+        return compute_svd_spectrum(db_path)
            "WHERE entity_type='mp' AND window_id='current_parliament' "
            "AND entity_id LIKE '%,%'"
        ).fetchall()
        vectors: List[List[float]] = []
        for entity_id, raw_vec in rows:
            if isinstance(raw_vec, str):
                vec = json.loads(raw_vec)
            elif isinstance(raw_vec, (bytes, bytearray)):
                vec = json.loads(raw_vec.decode())
            elif isinstance(raw_vec, list):
                vec = raw_vec
            else:
                try:
                    vec = list(raw_vec)
                except Exception:
                    continue
            fvec = [float(v) if v is not None else 0.0 for v in vec]
            vectors.append(fvec)
        if not vectors:
            return []
        n_dims = len(vectors[0])
        importances: List[float] = []
        for dim in range(n_dims):
            col = [v[dim] for v in vectors if dim < len(v)]
            l2 = sum(x**2 for x in col) ** 0.5
            importances.append(l2)
        return sorted(importances, reverse=True)
    except Exception:
        logger.exception("Failed to load scree data")
        return []
    finally:
        try:
            con.close()
        except Exception:
            pass
 def _render_scree_plot(importances: List[float], n_show: int = 15) -> None:
@ -547,9 +514,9 @@ def _render_scree_plot(importances: List[float], n_show: int = 15) -> None:
    """
    if not importances:
        return
-    total = sum(importances) or 1.0
+    # importances are already EVR percentages summing to ~100 over all components.
-    raw = importances[:n_show]
+    # Slice to n_show for display; cumulative line shows how much variance is covered.
-    data = [v / total * 100 for v in raw]
+    data = list(importances[:n_show])
    ranks = list(range(1, len(data) + 1))
    # Cumulative variance for the dashed overlay line
@ -573,7 +540,7 @@ def _render_scree_plot(importances: List[float], n_show: int = 15) -> None:
            x=ranks,
            y=data,
            marker_color=bar_colours,
-            hovertemplate="As %{x}<br><b>%{y:.1f}%</b> van totaal<extra></extra>",
+            hovertemplate="As %{x}<br><b>%{y:.1f}%</b> verklaarde variantie<extra></extra>",
            showlegend=False,
        )
    )