From 559e1adb82dcfffc702729d5bf6c988390db79df Mon Sep 17 00:00:00 2001 From: Sven Geboers Date: Wed, 25 Mar 2026 22:41:35 +0100 Subject: [PATCH] fix(compass): fix annual window detection in get_uniform_dim_windows Previous query used DISTINCT ON without ordering by dim, picking arbitrary (often non-50) dim per window. Rewritten to find the dominant dim per window (highest count) and include only windows where dominant dim = 50 with >= 10 entities. This surfaces annual windows 2016/2018/2019/2022-2026 that were previously excluded due to mixed-dim rows from multiple pipeline runs. --- explorer.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/explorer.py b/explorer.py index 5bc6f8d..8ca248c 100644 --- a/explorer.py +++ b/explorer.py @@ -118,34 +118,37 @@ def get_available_windows(db_path: str) -> List[str]: @st.cache_data(show_spinner=False) def get_uniform_dim_windows(db_path: str) -> List[str]: - """Return only windows whose vector dimension equals the most common dimension. + """Return only windows whose dominant MP-vector dimension is 50. - np.vstack requires all vectors to have the same shape. Early or small windows - have lower SVD rank (dim < 50). This helper filters to only windows at the - dominant (max-count) dimension so compute_2d_axes never sees mixed shapes. + Some windows contain a mix of vector lengths due to multiple pipeline runs + (e.g. 2016 has both dim=1 and dim=50 rows). We find the most common dimension + per window and include only windows where that dominant dim equals 50. + Windows with too few dim-50 entities (< 10) are also excluded to avoid + degenerate PCA inputs. """ con = duckdb.connect(database=db_path, read_only=True) try: rows = con.execute( """ - WITH window_dims AS ( - SELECT DISTINCT ON (window_id) - window_id, - json_array_length(vector) AS dim + WITH vec_dims AS ( + SELECT window_id, json_array_length(vector) AS dim FROM svd_vectors WHERE entity_type = 'mp' - ORDER BY window_id ), - dim_counts AS ( - SELECT dim, COUNT(*) AS cnt FROM window_dims GROUP BY dim + window_dim_counts AS ( + SELECT window_id, dim, COUNT(*) AS cnt + FROM vec_dims + GROUP BY window_id, dim ), dominant AS ( - SELECT dim FROM dim_counts ORDER BY cnt DESC, dim DESC LIMIT 1 + SELECT DISTINCT ON (window_id) window_id, dim, cnt + FROM window_dim_counts + ORDER BY window_id, cnt DESC, dim DESC ) - SELECT wd.window_id - FROM window_dims wd - JOIN dominant d ON wd.dim = d.dim - ORDER BY wd.window_id + SELECT window_id + FROM dominant + WHERE dim = 50 AND cnt >= 10 + ORDER BY window_id """ ).fetchall() return [r[0] for r in rows]