fix(compass): fix annual window detection in get_uniform_dim_windows

The previous query used DISTINCT ON without ordering by dim, so it picked an
arbitrary (often non-50) dim per window. The query now finds the dominant dim
per window (the one with the highest count) and includes only windows whose
dominant dim is 50 with >= 10 entities. This surfaces the annual windows
2016/2018/2019/2022-2026, which were previously excluded because they contain
mixed-dim rows from multiple pipeline runs.
main
Sven Geboers 1 month ago
parent c386073430
commit 559e1adb82
  1. 35
      explorer.py

@ -118,34 +118,37 @@ def get_available_windows(db_path: str) -> List[str]:
@st.cache_data(show_spinner=False) @st.cache_data(show_spinner=False)
def get_uniform_dim_windows(db_path: str) -> List[str]: def get_uniform_dim_windows(db_path: str) -> List[str]:
"""Return only windows whose vector dimension equals the most common dimension. """Return only windows whose dominant MP-vector dimension is 50.
np.vstack requires all vectors to have the same shape. Early or small windows Some windows contain a mix of vector lengths due to multiple pipeline runs
have lower SVD rank (dim < 50). This helper filters to only windows at the (e.g. 2016 has both dim=1 and dim=50 rows). We find the most common dimension
dominant (max-count) dimension so compute_2d_axes never sees mixed shapes. per window and include only windows where that dominant dim equals 50.
Windows with too few dim-50 entities (< 10) are also excluded to avoid
degenerate PCA inputs.
""" """
con = duckdb.connect(database=db_path, read_only=True) con = duckdb.connect(database=db_path, read_only=True)
try: try:
rows = con.execute( rows = con.execute(
""" """
WITH window_dims AS ( WITH vec_dims AS (
SELECT DISTINCT ON (window_id) SELECT window_id, json_array_length(vector) AS dim
window_id,
json_array_length(vector) AS dim
FROM svd_vectors FROM svd_vectors
WHERE entity_type = 'mp' WHERE entity_type = 'mp'
ORDER BY window_id
), ),
dim_counts AS ( window_dim_counts AS (
SELECT dim, COUNT(*) AS cnt FROM window_dims GROUP BY dim SELECT window_id, dim, COUNT(*) AS cnt
FROM vec_dims
GROUP BY window_id, dim
), ),
dominant AS ( dominant AS (
SELECT dim FROM dim_counts ORDER BY cnt DESC, dim DESC LIMIT 1 SELECT DISTINCT ON (window_id) window_id, dim, cnt
FROM window_dim_counts
ORDER BY window_id, cnt DESC, dim DESC
) )
SELECT wd.window_id SELECT window_id
FROM window_dims wd FROM dominant
JOIN dominant d ON wd.dim = d.dim WHERE dim = 50 AND cnt >= 10
ORDER BY wd.window_id ORDER BY window_id
""" """
).fetchall() ).fetchall()
return [r[0] for r in rows] return [r[0] for r in rows]

Loading…
Cancel
Save