@st.cache_data(show_spinner=False)
def get_uniform_dim_windows(db_path: str) -> List[str]:
    """Return the window ids whose MP vectors all share the dominant dimension.

    ``np.vstack`` requires every stacked vector to have the same length, so
    windows with a different vector length must be kept away from
    ``compute_2d_axes``. According to the original author's note, early or
    small windows can have a lower SVD rank and therefore shorter vectors.

    The dimension of a window is derived from *all* of its ``entity_type =
    'mp'`` rows rather than from a single sampled row: a window is only
    considered when the minimum and maximum ``json_array_length(vector)``
    agree. Among those internally uniform windows the most frequent dimension
    wins (ties go to the larger dimension), and only windows at that dimension
    are returned.

    Args:
        db_path: Path of the DuckDB database file; opened read-only.

    Returns:
        Window ids at the dominant dimension, sorted ascending by
        ``window_id``. An empty list is returned when the query fails (the
        error is logged) or when no window qualifies.

    Raises:
        Whatever ``duckdb.connect`` raises if the database cannot be opened;
        only query failures are converted into the empty-list fallback.
    """
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        # MIN/MAX per window replaces the former DISTINCT ON sampling, which
        # looked at one arbitrary row per window and could therefore let a
        # window with mixed vector lengths through. MIN/MAX skip NULLs, and a
        # window whose vectors are all NULL fails `min_dim = max_dim`, so it
        # can neither be returned nor take part in the dominance vote.
        rows = con.execute(
            """
            WITH window_dims AS (
                SELECT
                    window_id,
                    MIN(json_array_length(vector)) AS min_dim,
                    MAX(json_array_length(vector)) AS max_dim
                FROM svd_vectors
                WHERE entity_type = 'mp'
                GROUP BY window_id
            ),
            uniform_windows AS (
                SELECT window_id, max_dim AS dim
                FROM window_dims
                WHERE min_dim = max_dim
            ),
            dim_counts AS (
                SELECT dim, COUNT(*) AS cnt FROM uniform_windows GROUP BY dim
            ),
            dominant AS (
                SELECT dim FROM dim_counts ORDER BY cnt DESC, dim DESC LIMIT 1
            )
            SELECT uw.window_id
            FROM uniform_windows uw
            JOIN dominant d ON uw.dim = d.dim
            ORDER BY uw.window_id
            """
        ).fetchall()
        return [window_id for (window_id,) in rows]
    except Exception:
        # Best-effort: the caller simply sees "no usable windows".
        # NOTE(review): because of st.cache_data this empty fallback is
        # memoized like any other return value — confirm that is acceptable
        # for transient failures.
        logger.exception("Failed to query uniform-dim windows")
        return []
    finally:
        con.close()
dimension (dim=50). + # Mixed-dim windows cause np.vstack to fail in compute_2d_axes. + available = get_uniform_dim_windows(db_path) if window_size == "annual": # Keep only Q4 windows (one representative window per year) available = [w for w in available if w.endswith("-Q4")]