fix(explorer): filter to uniform-dim windows before PCA to prevent np.vstack shape mismatch

main
Sven Geboers 1 month ago
parent 2891e9ee70
commit cd7bb3b1e0
  1. 44
      explorer.py

@ -72,6 +72,46 @@ def get_available_windows(db_path: str) -> List[str]:
con.close()
@st.cache_data(show_spinner=False)
def get_uniform_dim_windows(db_path: str) -> List[str]:
    """List the window ids whose vectors have the dominant dimension.

    For every window, one ``entity_type = 'mp'`` row of ``svd_vectors`` is
    sampled and the JSON array length of its ``vector`` is taken as that
    window's dimension. The dimension shared by the most windows is the
    dominant one (a tie goes to the larger dimension); only windows at that
    dimension are returned, ordered by window id.

    Per the original author's note, this exists so that ``compute_2d_axes``
    never has to ``np.vstack`` vectors of different lengths.

    Args:
        db_path: Path of the DuckDB database; it is opened read-only.

    Returns:
        The window ids at the dominant dimension, or an empty list when the
        query fails (the failure is logged with its traceback).

    NOTE(review): with ``st.cache_data`` on this function the empty fallback
    is presumably cached as well — confirm that is intended.
    """
    # The query text is kept verbatim (flush-left) so the SQL sent to DuckDB
    # is unchanged.
    dominant_dim_sql = """
WITH window_dims AS (
SELECT DISTINCT ON (window_id)
window_id,
json_array_length(vector) AS dim
FROM svd_vectors
WHERE entity_type = 'mp'
ORDER BY window_id
),
dim_counts AS (
SELECT dim, COUNT(*) AS cnt FROM window_dims GROUP BY dim
),
dominant AS (
SELECT dim FROM dim_counts ORDER BY cnt DESC, dim DESC LIMIT 1
)
SELECT wd.window_id
FROM window_dims wd
JOIN dominant d ON wd.dim = d.dim
ORDER BY wd.window_id
"""
    connection = duckdb.connect(database=db_path, read_only=True)
    try:
        records = connection.execute(dominant_dim_sql).fetchall()
        # Each record is a one-column row holding the window id.
        return [window_id for (window_id,) in records]
    except Exception:
        # Best-effort: report the problem and let the caller see "no windows".
        logger.exception("Failed to query uniform-dim windows")
        return []
    finally:
        # Runs on both the success and the failure path.
        connection.close()
@st.cache_data(show_spinner="2D posities berekenen (kan even duren)…")
def load_positions(
db_path: str, window_size: str = "quarterly"
@ -84,7 +124,9 @@ def load_positions(
"""
from analysis.political_axis import compute_2d_axes
available = get_available_windows(db_path)
# Only use windows whose vectors share the dominant dimension (usually dim=50).
# Mixed-dim windows cause np.vstack to fail in compute_2d_axes.
available = get_uniform_dim_windows(db_path)
if window_size == "annual":
# Keep only Q4 windows (one representative window per year)
available = [w for w in available if w.endswith("-Q4")]

Loading…
Cancel
Save