fix: use annual-only windows for SVD to restore EVR (~20% PC1)

Quarterly windows (29 of 41 total) diluted the PC1 explained variance ratio (EVR)
from ~20% down to ~14.6%. The fix makes the vector-collection loop build two lists:
- pca_vecs: annual windows only (window id matches re.match(r"^\d{4}$", wid)) -> stacked into M_pca, used for SVD
- all_vecs: every window -> stacked into M, used for projections onto the derived axes

Centering for SVD and global_mean for projection both now use M_pca.mean(axis=0),
so projected positions are centred consistently with the derived axes. If no annual
windows exist, M_pca falls back to M (all windows) and a warning is logged.
main
Sven Geboers 1 month ago
parent 2cca1000ca
commit ffd8b191ef
  1. 31
      analysis/political_axis.py

@ -14,6 +14,7 @@ Both modes return a dict mapping mp_name → scalar score for the given window.
import json
import logging
import re
from typing import Dict, List, Optional, Tuple
import numpy as np
@ -190,17 +191,23 @@ def compute_2d_axes(
aligned_window_vecs = _trajectory._procrustes_align_windows(raw_window_vecs)
# Stack all vectors across windows into a single matrix for PCA if needed
# Stack all vectors across windows into a single matrix for PCA if needed.
# pca_vecs / pca_index: annual windows only (e.g. "2024") — used for SVD axis derivation.
# all_vecs / entity_index: every window — used for projection onto the derived axes.
pca_vecs = []
all_vecs = []
entity_index = [] # parallel list of (window_id, entity)
for wid, d in aligned_window_vecs.items():
for ent, v in d.items():
if normalize_vectors:
n = np.linalg.norm(v)
all_vecs.append(v / n if n > 1e-10 else v)
vec = v / n if n > 1e-10 else v
else:
all_vecs.append(v)
vec = v
all_vecs.append(vec)
entity_index.append((wid, ent))
if re.match(r"^\d{4}$", wid):
pca_vecs.append(vec)
if len(all_vecs) == 0:
_logger.info("No vectors loaded for windows %s", window_ids)
@ -208,9 +215,19 @@ def compute_2d_axes(
M = np.vstack(all_vecs)
# If no annual windows found, fall back to all windows for SVD.
if len(pca_vecs) == 0:
_logger.warning(
"No annual windows found; falling back to all %d windows for SVD axis derivation",
len(aligned_window_vecs),
)
M_pca = M
else:
M_pca = np.vstack(pca_vecs)
if method == "pca":
# centre globally
Mc = M - M.mean(axis=0)
# centre using annual-only mean so SVD axes are not diluted by quarterly windows
Mc = M_pca - M_pca.mean(axis=0)
try:
U, s, Vt = np.linalg.svd(Mc, full_matrices=False)
except np.linalg.LinAlgError:
@ -358,8 +375,8 @@ def compute_2d_axes(
evr1 * 100,
)
# project per-window vectors (centre by global mean)
global_mean = M.mean(axis=0)
# project per-window vectors (centre by annual-window global mean, consistent with SVD axes)
global_mean = M_pca.mean(axis=0)
axes["global_mean"] = global_mean
positions_by_window: Dict[str, Dict[str, Tuple[float, float]]] = {
wid: {} for wid in window_ids

Loading…
Cancel
Save