fix: use annual-only windows for SVD to restore PC1 explained variance ratio (~20%)

Quarterly windows (29 of 41 total) diluted the PC1 explained variance ratio (EVR)
from ~20% down to ~14.6%. The fix splits the vector collection loop into two lists:
- pca_vecs: annual windows only (window id matches r"^\d{4}$") -> stacked into M_pca, used for SVD
- all_vecs: every window -> stacked into M, used for projections onto the derived axes

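Both the centering applied before SVD and the global_mean subtracted before projection
now use M_pca.mean(axis=0), so projected points are centred consistently with the derived
axes. If no annual windows exist, M_pca falls back to M (all windows) and a warning is logged.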
main
Sven Geboers 1 month ago
parent 2cca1000ca
commit ffd8b191ef
  1. 31
      analysis/political_axis.py

@ -14,6 +14,7 @@ Both modes return a dict mapping mp_name → scalar score for the given window.
import json import json
import logging import logging
import re
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
import numpy as np import numpy as np
@ -190,17 +191,23 @@ def compute_2d_axes(
aligned_window_vecs = _trajectory._procrustes_align_windows(raw_window_vecs) aligned_window_vecs = _trajectory._procrustes_align_windows(raw_window_vecs)
# Stack all vectors across windows into a single matrix for PCA if needed # Stack all vectors across windows into a single matrix for PCA if needed.
# pca_vecs / pca_index: annual windows only (e.g. "2024") — used for SVD axis derivation.
# all_vecs / entity_index: every window — used for projection onto the derived axes.
pca_vecs = []
all_vecs = [] all_vecs = []
entity_index = [] # parallel list of (window_id, entity) entity_index = [] # parallel list of (window_id, entity)
for wid, d in aligned_window_vecs.items(): for wid, d in aligned_window_vecs.items():
for ent, v in d.items(): for ent, v in d.items():
if normalize_vectors: if normalize_vectors:
n = np.linalg.norm(v) n = np.linalg.norm(v)
all_vecs.append(v / n if n > 1e-10 else v) vec = v / n if n > 1e-10 else v
else: else:
all_vecs.append(v) vec = v
all_vecs.append(vec)
entity_index.append((wid, ent)) entity_index.append((wid, ent))
if re.match(r"^\d{4}$", wid):
pca_vecs.append(vec)
if len(all_vecs) == 0: if len(all_vecs) == 0:
_logger.info("No vectors loaded for windows %s", window_ids) _logger.info("No vectors loaded for windows %s", window_ids)
@ -208,9 +215,19 @@ def compute_2d_axes(
M = np.vstack(all_vecs) M = np.vstack(all_vecs)
# If no annual windows found, fall back to all windows for SVD.
if len(pca_vecs) == 0:
_logger.warning(
"No annual windows found; falling back to all %d windows for SVD axis derivation",
len(aligned_window_vecs),
)
M_pca = M
else:
M_pca = np.vstack(pca_vecs)
if method == "pca": if method == "pca":
# centre globally # centre using annual-only mean so SVD axes are not diluted by quarterly windows
Mc = M - M.mean(axis=0) Mc = M_pca - M_pca.mean(axis=0)
try: try:
U, s, Vt = np.linalg.svd(Mc, full_matrices=False) U, s, Vt = np.linalg.svd(Mc, full_matrices=False)
except np.linalg.LinAlgError: except np.linalg.LinAlgError:
@ -358,8 +375,8 @@ def compute_2d_axes(
evr1 * 100, evr1 * 100,
) )
# project per-window vectors (centre by global mean) # project per-window vectors (centre by annual-window global mean, consistent with SVD axes)
global_mean = M.mean(axis=0) global_mean = M_pca.mean(axis=0)
axes["global_mean"] = global_mean axes["global_mean"] = global_mean
positions_by_window: Dict[str, Dict[str, Tuple[float, float]]] = { positions_by_window: Dict[str, Dict[str, Tuple[float, float]]] = {
wid: {} for wid in window_ids wid: {} for wid in window_ids

Loading…
Cancel
Save