diff --git a/explorer.py b/explorer.py index 5bc6f8d..8ca248c 100644 --- a/explorer.py +++ b/explorer.py @@ -118,34 +118,37 @@ def get_available_windows(db_path: str) -> List[str]: @st.cache_data(show_spinner=False) def get_uniform_dim_windows(db_path: str) -> List[str]: - """Return only windows whose vector dimension equals the most common dimension. + """Return only windows whose dominant MP-vector dimension is 50. - np.vstack requires all vectors to have the same shape. Early or small windows - have lower SVD rank (dim < 50). This helper filters to only windows at the - dominant (max-count) dimension so compute_2d_axes never sees mixed shapes. + Some windows contain a mix of vector lengths due to multiple pipeline runs + (e.g. 2016 has both dim=1 and dim=50 rows). We find the most common dimension + per window and include only windows where that dominant dim equals 50. + Windows with too few dim-50 entities (< 10) are also excluded to avoid + degenerate PCA inputs. """ con = duckdb.connect(database=db_path, read_only=True) try: rows = con.execute( """ - WITH window_dims AS ( - SELECT DISTINCT ON (window_id) - window_id, - json_array_length(vector) AS dim + WITH vec_dims AS ( + SELECT window_id, json_array_length(vector) AS dim FROM svd_vectors WHERE entity_type = 'mp' - ORDER BY window_id ), - dim_counts AS ( - SELECT dim, COUNT(*) AS cnt FROM window_dims GROUP BY dim + window_dim_counts AS ( + SELECT window_id, dim, COUNT(*) AS cnt + FROM vec_dims + GROUP BY window_id, dim ), dominant AS ( - SELECT dim FROM dim_counts ORDER BY cnt DESC, dim DESC LIMIT 1 + SELECT DISTINCT ON (window_id) window_id, dim, cnt + FROM window_dim_counts + ORDER BY window_id, cnt DESC, dim DESC ) - SELECT wd.window_id - FROM window_dims wd - JOIN dominant d ON wd.dim = d.dim - ORDER BY wd.window_id + SELECT window_id + FROM dominant + WHERE dim = 50 AND cnt >= 10 + ORDER BY window_id """ ).fetchall() return [r[0] for r in rows]