fix: exclude quarterly windows from all PCA/SVD computation

- analysis/explorer_data.py: add AND window_id NOT LIKE '%-Q%' to
  _UNIFORM_DIM_SQL so quarterly windows are filtered at the source
- explorer.py: remove stale comment justifying quarterly inclusion;
  remove redundant '-Q' guard in SVD tab trajectory view
- scripts/recompute_svd.py: replace quarter_bounds() with year_bounds()
  that handles annual window IDs like '2024'; filter window list to
  annual-only before recomputing SVD
main
Sven Geboers 2 weeks ago
parent be4375b303
commit 62d8e15e03
  1. 1
      analysis/explorer_data.py
  2. 10
      explorer.py
  3. 35
      scripts/recompute_svd.py

@ -64,6 +64,7 @@ _UNIFORM_DIM_SQL = """
SELECT window_id SELECT window_id
FROM dominant FROM dominant
WHERE dim >= 25 AND cnt >= 10 WHERE dim >= 25 AND cnt >= 10
AND window_id NOT LIKE '%-Q%'
ORDER BY window_id ORDER BY window_id
""" """

@ -481,10 +481,7 @@ def load_positions(
""" """
from analysis.political_axis import compute_2d_axes from analysis.political_axis import compute_2d_axes
# Always compute PCA on ALL uniform-dim windows (quarterly + annual) so that # Use only annual windows (quarterly windows are excluded by get_uniform_dim_windows).
# the principal components are determined by the full temporal spread of data.
# Using only annual windows (11) causes PC1 to capture cross-temporal drift
# instead of left-right ideology, resulting in a ~90° rotation.
all_available = get_uniform_dim_windows(db_path) all_available = get_uniform_dim_windows(db_path)
if not all_available: if not all_available:
@ -2715,11 +2712,8 @@ def build_svd_components_tab(db_path: str) -> None:
# Render party axis chart (single window or time trajectory) # Render party axis chart (single window or time trajectory)
if view_mode == "Tijdtraject" and selected_parties_for_trajectory: if view_mode == "Tijdtraject" and selected_parties_for_trajectory:
# Load party scores for all windows and render time trajectory # Load party scores for all windows and render time trajectory
# Filter to annual windows only (exclude quarters)
available_windows = get_uniform_dim_windows(db_path) available_windows = get_uniform_dim_windows(db_path)
year_windows = sorted( year_windows = sorted(w for w in available_windows if w != "current_parliament")
w for w in available_windows if w != "current_parliament" and "-Q" not in w
)
has_current = "current_parliament" in available_windows has_current = "current_parliament" in available_windows
all_windows = year_windows + (["current_parliament"] if has_current else []) all_windows = year_windows + (["current_parliament"] if has_current else [])

@ -28,17 +28,20 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(mess
logger = logging.getLogger("recompute_svd") logger = logging.getLogger("recompute_svd")
def quarter_bounds(window_id: str) -> Tuple[str, str]: def year_bounds(window_id: str) -> Tuple[str, str]:
# window_id like '2026-Q1' """Return (start_date, end_date) for an annual window_id like '2024'.
year, q = window_id.split("-Q")
y = int(year) Quarterly window IDs (containing '-Q') are not supported — this script
qn = int(q) only processes annual windows.
starts = {1: (1, 1), 2: (4, 1), 3: (7, 1), 4: (10, 1)} """
ends = {1: (3, 31), 2: (6, 30), 3: (9, 30), 4: (12, 31)} if "-Q" in window_id:
s_m, s_d = starts[qn] raise ValueError(
e_m, e_d = ends[qn] f"Quarterly window '{window_id}' is not supported. "
start = date(y, s_m, s_d).isoformat() "Only annual windows should be recomputed."
end = date(y, e_m, e_d).isoformat() )
y = int(window_id)
start = date(y, 1, 1).isoformat()
end = date(y, 12, 31).isoformat()
return start, end return start, end
@ -76,12 +79,14 @@ def main(argv: List[str] | None = None) -> int:
db = MotionDatabase(dst) db = MotionDatabase(dst)
# find windows from original DB via trajectory helper # find windows from original DB via trajectory helper
window_ids = traj._load_window_ids(src) all_window_ids = traj._load_window_ids(src)
# Only process annual windows — quarterly windows are excluded from all PCA/SVD computation
window_ids = [w for w in all_window_ids if "-Q" not in w]
if not window_ids: if not window_ids:
logger.error("No windows found in source DB %s", src) logger.error("No annual windows found in source DB %s", src)
return 3 return 3
logger.info("Will recompute SVD for windows: %s", window_ids) logger.info("Will recompute SVD for annual windows: %s", window_ids)
# clear existing svd_vectors rows for these windows in dst DB # clear existing svd_vectors rows for these windows in dst DB
import duckdb import duckdb
@ -100,7 +105,7 @@ def main(argv: List[str] | None = None) -> int:
# Run SVD per window # Run SVD per window
for wid in window_ids: for wid in window_ids:
start, end = quarter_bounds(wid) start, end = year_bounds(wid)
logger.info("Running SVD for %s (%s -> %s) k=%d", wid, start, end, args.k) logger.info("Running SVD for %s (%s -> %s) k=%d", wid, start, end, args.k)
res = run_svd_for_window( res = run_svd_for_window(
db=db, window_id=wid, start_date=start, end_date=end, k=args.k db=db, window_id=wid, start_date=start, end_date=end, k=args.k

Loading…
Cancel
Save