diff --git a/analysis/political_axis.py b/analysis/political_axis.py index 6113f56..1b46cbd 100644 --- a/analysis/political_axis.py +++ b/analysis/political_axis.py @@ -134,6 +134,7 @@ def compute_2d_axes( method: str = "pca", anchor_kwargs: Optional[Dict] = None, normalize_vectors: bool = True, + pca_residual: bool = False, ) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict[str, np.ndarray]]: """Compute 2D coordinates for MPs per window. @@ -192,19 +193,59 @@ def compute_2d_axes( # centre globally Mc = M - M.mean(axis=0) try: - _, _, Vt = np.linalg.svd(Mc, full_matrices=False) + U, s, Vt = np.linalg.svd(Mc, full_matrices=False) except np.linalg.LinAlgError: _logger.exception("SVD failed in compute_2d_axes (pca)") return ({}, {}) - # take top-2 components as axes (shape k,) + + # explained variance ratio from singular values + sv2 = s**2 + evr = sv2 / (sv2.sum() + 1e-20) + evr1 = float(evr[0]) if evr.size > 0 else 0.0 + evr2 = float(evr[1]) if evr.size > 1 else 0.0 + + # take top-1 component as primary axis comp1 = Vt[0] + comp1_hat = comp1 / (np.linalg.norm(comp1) + 1e-12) + + # By default take the second Vt row as the second axis, but optionally + # compute the second axis from residuals (PCA on data with PC1 removed) comp2 = Vt[1] if Vt.shape[0] > 1 else np.zeros_like(comp1) + + # compute residual-based second component if requested or if PC1 dominates + if pca_residual or evr1 > 0.85: + # Project out PC1 from centred data and compute PCA on residuals + proj1 = (Mc.dot(comp1_hat)).reshape(-1, 1) * comp1_hat.reshape(1, -1) + residual = Mc - proj1 + try: + _, s_res, Vt_res = np.linalg.svd(residual, full_matrices=False) + comp2 = Vt_res[0] + _logger.info( + "Using residual PCA for second axis (pca_residual=%s)", pca_residual + ) + except Exception: + _logger.exception( + "Residual PCA failed; falling back to raw second component" + ) + + comp2_hat = comp2 / (np.linalg.norm(comp2) + 1e-12) + axes = { - "x_axis": comp1 / (np.linalg.norm(comp1) + 1e-12), - "y_axis": comp2 / (np.linalg.norm(comp2) + 1e-12), + "x_axis": comp1_hat, + "y_axis": comp2_hat, "method": "pca", + "explained_variance_ratio": [evr1, evr2], + "pca_residual_used": bool(pca_residual or evr1 > 0.85), } + # warn if PCA is effectively 1-D + if evr1 > 0.85 and not pca_residual: + _logger.warning( + "PCA first component explains %.1f%% of variance — data is near-1D;\n" + "consider using pca_residual=True or the anchor method for a second axis", + evr1 * 100, + ) + # project per-window vectors (centre by global mean) global_mean = M.mean(axis=0) positions_by_window: Dict[str, Dict[str, Tuple[float, float]]] = { diff --git a/analysis/visualize.py b/analysis/visualize.py index e93bc78..538ceb1 100644 --- a/analysis/visualize.py +++ b/analysis/visualize.py @@ -168,6 +168,8 @@ def plot_political_compass( positions_by_window: Dict, window_id: str, party_of: Optional[Dict] = None, + axis_def: Optional[Dict] = None, + y_scale: Optional[float] = None, output_path: str = "analysis_compass.html", ) -> str: """Plot 2D political compass scatter for a single window. @@ -215,19 +217,51 @@ def plot_political_compass( parties = [party_of.get(n, "Unknown") if party_of else "Unknown" for n in names] + # If axis_def provided and evr small, optionally scale y for visibility + scaled_ys = ys + if axis_def and y_scale is None: + evr = axis_def.get("explained_variance_ratio") if axis_def else None + if evr and isinstance(evr, (list, tuple)) and len(evr) >= 2: + evr1, evr2 = evr[0], evr[1] + if evr2 < 1e-6: + scale_guess = 1.0 + else: + scale_guess = min(max(1.0, float(evr1 / (evr2 + 1e-9)) ** 0.5), 8.0) + scaled_ys = [y * scale_guess for y in ys] + _logger.info( + "Auto-scaling Y by %.2f for visibility (evr1=%.3f evr2=%.3f)", + scale_guess, + evr1, + evr2, + ) + elif axis_def and y_scale is not None: + scaled_ys = [y * float(y_scale) for y in ys] + + # mark unknowns differently + unknown_flags = [1 if parties[i] == "Unknown" else 0 for i in range(len(names))] + fig = px.scatter( x=xs, - y=ys, + y=scaled_ys, color=parties, + symbol=unknown_flags, hover_name=names, title=f"Political Compass ({window_id})", labels={ "x": "Left ← — → Right", "y": "Progressive ← — → Conservative", "color": "Party", + "symbol": "Unknown", }, ) - fig.update_traces(marker=dict(size=8, opacity=0.8)) + fig.update_traces(marker=dict(size=8, opacity=0.85)) + # annotate explained variance if available + if axis_def and axis_def.get("method") == "pca": + evr = axis_def.get("explained_variance_ratio") + if evr and len(evr) >= 2: + fig.update_layout( + title=f"Political Compass ({window_id}) — PCA EVR PC1={evr[0] * 100:.1f}%, PC2={evr[1] * 100:.1f}%" + ) fig.write_html(output_path, include_plotlyjs="cdn") _logger.info("Political compass written to %s", output_path) return output_path