fix(compass): fix annual window detection in get_uniform_dim_windows

The previous query used DISTINCT ON without ordering by dim, so it picked an
arbitrary (often non-50) dim per window. It has been rewritten to find the
dominant dim per window (the dim with the highest count) and to include only
windows where that dominant dim is 50 and has >= 10 entities. This surfaces
annual windows 2016/2018/2019/2022-2026, which were previously excluded
because of mixed-dim rows from multiple pipeline runs.
main
Sven Geboers 1 month ago
parent c386073430
commit 559e1adb82
  1. 35
      explorer.py

@ -118,34 +118,37 @@ def get_available_windows(db_path: str) -> List[str]:
@st.cache_data(show_spinner=False)
def get_uniform_dim_windows(db_path: str) -> List[str]:
"""Return only windows whose vector dimension equals the most common dimension.
"""Return only windows whose dominant MP-vector dimension is 50.
np.vstack requires all vectors to have the same shape. Early or small windows
have lower SVD rank (dim < 50). This helper filters to only windows at the
dominant (max-count) dimension so compute_2d_axes never sees mixed shapes.
Some windows contain a mix of vector lengths due to multiple pipeline runs
(e.g. 2016 has both dim=1 and dim=50 rows). We find the most common dimension
per window and include only windows where that dominant dim equals 50.
Windows with too few dim-50 entities (< 10) are also excluded to avoid
degenerate PCA inputs.
"""
con = duckdb.connect(database=db_path, read_only=True)
try:
rows = con.execute(
"""
WITH window_dims AS (
SELECT DISTINCT ON (window_id)
window_id,
json_array_length(vector) AS dim
WITH vec_dims AS (
SELECT window_id, json_array_length(vector) AS dim
FROM svd_vectors
WHERE entity_type = 'mp'
ORDER BY window_id
),
dim_counts AS (
SELECT dim, COUNT(*) AS cnt FROM window_dims GROUP BY dim
window_dim_counts AS (
SELECT window_id, dim, COUNT(*) AS cnt
FROM vec_dims
GROUP BY window_id, dim
),
dominant AS (
SELECT dim FROM dim_counts ORDER BY cnt DESC, dim DESC LIMIT 1
SELECT DISTINCT ON (window_id) window_id, dim, cnt
FROM window_dim_counts
ORDER BY window_id, cnt DESC, dim DESC
)
SELECT wd.window_id
FROM window_dims wd
JOIN dominant d ON wd.dim = d.dim
ORDER BY wd.window_id
SELECT window_id
FROM dominant
WHERE dim = 50 AND cnt >= 10
ORDER BY window_id
"""
).fetchall()
return [r[0] for r in rows]

Loading…
Cancel
Save