fix(analysis): improve PCA handling when PC1 dominates, add pca_residual option and plot autoscaling/variance annotation

1 month ago · bf68e48460
parent 23a1234314
commit bf68e48460
2 changed files with 81 additions and 6 deletions
--- a/analysis/political_axis.py
+++ b/analysis/political_axis.py
@@ -134,6 +134,7 @@ def compute_2d_axes(
    method: str = "pca",
    anchor_kwargs: Optional[Dict] = None,
    normalize_vectors: bool = True,
+    pca_residual: bool = False,
 ) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict[str, np.ndarray]]:
    """Compute 2D coordinates for MPs per window.

@@ -192,19 +193,59 @@ def compute_2d_axes(
        # centre globally
        Mc = M - M.mean(axis=0)
        try:
-            _, _, Vt = np.linalg.svd(Mc, full_matrices=False)
+            U, s, Vt = np.linalg.svd(Mc, full_matrices=False)
        except np.linalg.LinAlgError:
            _logger.exception("SVD failed in compute_2d_axes (pca)")
            return ({}, {})
-        # take top-2 components as axes (shape k,)
+
+        # explained variance ratio from singular values
+        sv2 = s**2
+        evr = sv2 / (sv2.sum() + 1e-20)
+        evr1 = float(evr[0]) if evr.size > 0 else 0.0
+        evr2 = float(evr[1]) if evr.size > 1 else 0.0
+
+        # take top-1 component as primary axis
        comp1 = Vt[0]
+        comp1_hat = comp1 / (np.linalg.norm(comp1) + 1e-12)
+
+        # By default take the second Vt row as the second axis, but optionally
+        # compute the second axis from residuals (PCA on data with PC1 removed)
        comp2 = Vt[1] if Vt.shape[0] > 1 else np.zeros_like(comp1)
+
+        # compute residual-based second component if requested or if PC1 dominates (explains > 85% of variance)
+        if pca_residual or evr1 > 0.85:
+            # Project out PC1 from centred data and compute PCA on residuals
+            proj1 = (Mc.dot(comp1_hat)).reshape(-1, 1) * comp1_hat.reshape(1, -1)
+            residual = Mc - proj1
+            try:
+                _, s_res, Vt_res = np.linalg.svd(residual, full_matrices=False)
+                comp2 = Vt_res[0]
+                _logger.info(
+                    "Using residual PCA for second axis (pca_residual=%s)", pca_residual
+                )
+            except Exception:
+                _logger.exception(
+                    "Residual PCA failed; falling back to raw second component"
+                )
+
+        comp2_hat = comp2 / (np.linalg.norm(comp2) + 1e-12)
+
        axes = {
-            "x_axis": comp1 / (np.linalg.norm(comp1) + 1e-12),
-            "y_axis": comp2 / (np.linalg.norm(comp2) + 1e-12),
+            "x_axis": comp1_hat,
+            "y_axis": comp2_hat,
            "method": "pca",
+            "explained_variance_ratio": [evr1, evr2],
+            "pca_residual_used": bool(pca_residual or evr1 > 0.85),
        }

+        # warn if PCA is effectively 1-D
+        if evr1 > 0.85 and not pca_residual:
+            _logger.warning(
+                "PCA first component explains %.1f%% of variance — data is near-1D;\n"
+                "consider using pca_residual=True or the anchor method for a second axis",
+                evr1 * 100,
+            )
+
        # project per-window vectors (centre by global mean)
        global_mean = M.mean(axis=0)
        positions_by_window: Dict[str, Dict[str, Tuple[float, float]]] = {
--- a/analysis/visualize.py
+++ b/analysis/visualize.py
@@ -168,6 +168,8 @@ def plot_political_compass(
    positions_by_window: Dict,
    window_id: str,
    party_of: Optional[Dict] = None,
+    axis_def: Optional[Dict] = None,
+    y_scale: Optional[float] = None,
    output_path: str = "analysis_compass.html",
 ) -> str:
    """Plot 2D political compass scatter for a single window.
@@ -215,19 +217,51 @@ def plot_political_compass(

    parties = [party_of.get(n, "Unknown") if party_of else "Unknown" for n in names]

+    # If axis_def is given without an explicit y_scale, stretch y by sqrt(evr1/evr2) (clamped to [1, 8]) for visibility
+    scaled_ys = ys
+    if axis_def and y_scale is None:
+        evr = axis_def.get("explained_variance_ratio") if axis_def else None
+        if evr and isinstance(evr, (list, tuple)) and len(evr) >= 2:
+            evr1, evr2 = evr[0], evr[1]
+            if evr2 < 1e-6:
+                scale_guess = 1.0
+            else:
+                scale_guess = min(max(1.0, float(evr1 / (evr2 + 1e-9)) ** 0.5), 8.0)
+            scaled_ys = [y * scale_guess for y in ys]
+            _logger.info(
+                "Auto-scaling Y by %.2f for visibility (evr1=%.3f evr2=%.3f)",
+                scale_guess,
+                evr1,
+                evr2,
+            )
+    elif axis_def and y_scale is not None:
+        scaled_ys = [y * float(y_scale) for y in ys]
+
+    # mark unknowns differently
+    unknown_flags = [1 if parties[i] == "Unknown" else 0 for i in range(len(names))]
+
    fig = px.scatter(
        x=xs,
-        y=ys,
+        y=scaled_ys,
        color=parties,
+        symbol=unknown_flags,
        hover_name=names,
        title=f"Political Compass ({window_id})",
        labels={
            "x": "Left ← — → Right",
            "y": "Progressive ← — → Conservative",
            "color": "Party",
+            "symbol": "Unknown",
        },
    )
-    fig.update_traces(marker=dict(size=8, opacity=0.8))
+    fig.update_traces(marker=dict(size=8, opacity=0.85))
+    # annotate explained variance if available
+    if axis_def and axis_def.get("method") == "pca":
+        evr = axis_def.get("explained_variance_ratio")
+        if evr and len(evr) >= 2:
+            fig.update_layout(
+                title=f"Political Compass ({window_id}) — PCA EVR PC1={evr[0] * 100:.1f}%, PC2={evr[1] * 100:.1f}%"
+            )
    fig.write_html(output_path, include_plotlyjs="cdn")
    _logger.info("Political compass written to %s", output_path)
    return output_path