@ -118,34 +118,37 @@ def get_available_windows(db_path: str) -> List[str]:
@st . cache_data ( show_spinner = False )
def get_uniform_dim_windows ( db_path : str ) - > List [ str ] :
""" Return only windows whose vector dimension equals the most common dimension .
""" Return only windows whose dominant MP-vector dimension is 50 .
np . vstack requires all vectors to have the same shape . Early or small windows
have lower SVD rank ( dim < 50 ) . This helper filters to only windows at the
dominant ( max - count ) dimension so compute_2d_axes never sees mixed shapes .
Some windows contain a mix of vector lengths due to multiple pipeline runs
( e . g . 2016 has both dim = 1 and dim = 50 rows ) . We find the most common dimension
per window and include only windows where that dominant dim equals 50.
Windows with too few dim - 50 entities ( < 10 ) are also excluded to avoid
degenerate PCA inputs .
"""
con = duckdb . connect ( database = db_path , read_only = True )
try :
rows = con . execute (
"""
WITH window_dims AS (
SELECT DISTINCT ON ( window_id )
window_id ,
json_array_length ( vector ) AS dim
WITH vec_dims AS (
SELECT window_id , json_array_length ( vector ) AS dim
FROM svd_vectors
WHERE entity_type = ' mp '
ORDER BY window_id
) ,
dim_counts AS (
SELECT dim , COUNT ( * ) AS cnt FROM window_dims GROUP BY dim
window_dim_counts AS (
SELECT window_id , dim , COUNT ( * ) AS cnt
FROM vec_dims
GROUP BY window_id , dim
) ,
dominant AS (
SELECT dim FROM dim_counts ORDER BY cnt DESC , dim DESC LIMIT 1
SELECT DISTINCT ON ( window_id ) window_id , dim , cnt
FROM window_dim_counts
ORDER BY window_id , cnt DESC , dim DESC
)
SELECT wd . window_id
FROM window_dims wd
JOIN dominant d ON wd . dim = d . dim
ORDER BY wd . w indow_id
SELECT window_id
FROM dominant
WHERE dim = 50 AND cnt > = 10
ORDER BY window_id
"""
) . fetchall ( )
return [ r [ 0 ] for r in rows ]