fix(analysis): improve PCA handling when PC1 dominates, add pca_residual option and plot autoscaling/variance annotation

main
Sven Geboers 1 month ago
parent 23a1234314
commit bf68e48460
  1. 49
      analysis/political_axis.py
  2. 38
      analysis/visualize.py

@ -134,6 +134,7 @@ def compute_2d_axes(
method: str = "pca",
anchor_kwargs: Optional[Dict] = None,
normalize_vectors: bool = True,
pca_residual: bool = False,
) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict[str, np.ndarray]]:
"""Compute 2D coordinates for MPs per window.
@ -192,19 +193,59 @@ def compute_2d_axes(
# centre globally
Mc = M - M.mean(axis=0)
try:
_, _, Vt = np.linalg.svd(Mc, full_matrices=False)
U, s, Vt = np.linalg.svd(Mc, full_matrices=False)
except np.linalg.LinAlgError:
_logger.exception("SVD failed in compute_2d_axes (pca)")
return ({}, {})
# take top-2 components as axes (shape k,)
# explained variance ratio from singular values
sv2 = s**2
evr = sv2 / (sv2.sum() + 1e-20)
evr1 = float(evr[0]) if evr.size > 0 else 0.0
evr2 = float(evr[1]) if evr.size > 1 else 0.0
# take top-1 component as primary axis
comp1 = Vt[0]
comp1_hat = comp1 / (np.linalg.norm(comp1) + 1e-12)
# By default take the second Vt row as the second axis, but optionally
# compute the second axis from residuals (PCA on data with PC1 removed)
comp2 = Vt[1] if Vt.shape[0] > 1 else np.zeros_like(comp1)
# compute residual-based second component if requested or if PC1 dominates
if pca_residual or evr1 > 0.85:
# Project out PC1 from centred data and compute PCA on residuals
proj1 = (Mc.dot(comp1_hat)).reshape(-1, 1) * comp1_hat.reshape(1, -1)
residual = Mc - proj1
try:
_, s_res, Vt_res = np.linalg.svd(residual, full_matrices=False)
comp2 = Vt_res[0]
_logger.info(
"Using residual PCA for second axis (pca_residual=%s)", pca_residual
)
except Exception:
_logger.exception(
"Residual PCA failed; falling back to raw second component"
)
comp2_hat = comp2 / (np.linalg.norm(comp2) + 1e-12)
axes = {
"x_axis": comp1 / (np.linalg.norm(comp1) + 1e-12),
"y_axis": comp2 / (np.linalg.norm(comp2) + 1e-12),
"x_axis": comp1_hat,
"y_axis": comp2_hat,
"method": "pca",
"explained_variance_ratio": [evr1, evr2],
"pca_residual_used": bool(pca_residual or evr1 > 0.85),
}
# warn if PCA is effectively 1-D
if evr1 > 0.85 and not pca_residual:
_logger.warning(
"PCA first component explains %.1f%% of variance — data is near-1D;\n"
"consider using pca_residual=True or the anchor method for a second axis",
evr1 * 100,
)
# project per-window vectors (centre by global mean)
global_mean = M.mean(axis=0)
positions_by_window: Dict[str, Dict[str, Tuple[float, float]]] = {

@ -168,6 +168,8 @@ def plot_political_compass(
positions_by_window: Dict,
window_id: str,
party_of: Optional[Dict] = None,
axis_def: Optional[Dict] = None,
y_scale: Optional[float] = None,
output_path: str = "analysis_compass.html",
) -> str:
"""Plot 2D political compass scatter for a single window.
@ -215,19 +217,51 @@ def plot_political_compass(
parties = [party_of.get(n, "Unknown") if party_of else "Unknown" for n in names]
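# If axis_def carries PCA explained-variance ratios and no explicit y_scale is given, stretch y by sqrt(evr1 / evr2), clamped to [1, 8], so a weak second axis stays visible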
scaled_ys = ys
if axis_def and y_scale is None:
evr = axis_def.get("explained_variance_ratio") if axis_def else None
if evr and isinstance(evr, (list, tuple)) and len(evr) >= 2:
evr1, evr2 = evr[0], evr[1]
if evr2 < 1e-6:
scale_guess = 1.0
else:
scale_guess = min(max(1.0, float(evr1 / (evr2 + 1e-9)) ** 0.5), 8.0)
scaled_ys = [y * scale_guess for y in ys]
_logger.info(
"Auto-scaling Y by %.2f for visibility (evr1=%.3f evr2=%.3f)",
scale_guess,
evr1,
evr2,
)
elif axis_def and y_scale is not None:
scaled_ys = [y * float(y_scale) for y in ys]
# mark unknowns differently
unknown_flags = [1 if parties[i] == "Unknown" else 0 for i in range(len(names))]
fig = px.scatter(
x=xs,
y=ys,
y=scaled_ys,
color=parties,
symbol=unknown_flags,
hover_name=names,
title=f"Political Compass ({window_id})",
labels={
"x": "Left ← — → Right",
"y": "Progressive ← — → Conservative",
"color": "Party",
"symbol": "Unknown",
},
)
fig.update_traces(marker=dict(size=8, opacity=0.8))
fig.update_traces(marker=dict(size=8, opacity=0.85))
# annotate explained variance if available
if axis_def and axis_def.get("method") == "pca":
evr = axis_def.get("explained_variance_ratio")
if evr and len(evr) >= 2:
fig.update_layout(
title=f"Political Compass ({window_id}) — PCA EVR PC1={evr[0] * 100:.1f}%, PC2={evr[1] * 100:.1f}%"
)
fig.write_html(output_path, include_plotlyjs="cdn")
_logger.info("Political compass written to %s", output_path)
return output_path

Loading…
Cancel
Save