From 26acd8b96409407b3e9ea40b042f0eaa3059f0be Mon Sep 17 00:00:00 2001 From: Sven Geboers Date: Wed, 25 Mar 2026 22:44:57 +0100 Subject: [PATCH] fix: scree plot uses party vectors instead of individual MPs current_parliament has two separate SVD data spaces mixed together. Party vectors (entity_id without comma) carry the between-party signal in dims 0-15. Individual MP vectors only have signal in dim 3 and dims 16-49 (within-party variance). The axis chart uses party vectors, so the scree must too. --- explorer.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/explorer.py b/explorer.py index 8ca248c..7765992 100644 --- a/explorer.py +++ b/explorer.py @@ -262,27 +262,27 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]: @st.cache_data(show_spinner="Scree-plot laden…") def load_scree_data(db_path: str) -> List[float]: - """Return component importances (L2-norm per SVD dimension), sorted descending. + """Return party-level component importances (L2-norm per SVD dim), sorted descending. - Uses ALL individual MP vectors (entity_type='mp', window='current_parliament'), - excluding party-aggregated rows. Since the stored vectors are U*s (scaled by - singular values), the L2-norm of all MP scores per dimension approximates the - singular value for that dimension. Sorting descending gives the proper scree shape. + The current_parliament window contains two separate SVD data spaces: + - Party vectors (entity_id without comma): dims 0–15 have political signal + - Individual MP vectors (entity_id with comma): signal in dim 3 + dims 16–49 + (within-party variance, unrelated to between-party differences) - Note: Procrustes alignment across sub-windows may scramble the original dimension - ordering, so we sort by magnitude rather than relying on dimension index order. + Since the SVD tab axis chart uses party vectors exclusively, the scree plot + must also use party vectors. We filter to entries with L2-norm > 1 (excludes + near-empty/historical party entries), compute L2-norm per dim, then sort + descending so the elbow shape is visible. """ try: con = duckdb.connect(database=db_path, read_only=True) rows = con.execute( "SELECT entity_id, vector FROM svd_vectors " - "WHERE entity_type='mp' AND window_id='current_parliament'" + "WHERE entity_type='mp' AND window_id='current_parliament' " + "AND entity_id NOT LIKE '%,%'" ).fetchall() - # Individual MPs have "Lastname, F." format; party rows are short codes without commas vectors: List[List[float]] = [] for entity_id, raw_vec in rows: - if "," not in entity_id: - continue # skip party-aggregated rows if isinstance(raw_vec, str): vec = json.loads(raw_vec) elif isinstance(raw_vec, (bytes, bytearray)): @@ -294,7 +294,10 @@ def load_scree_data(db_path: str) -> List[float]: vec = list(raw_vec) except Exception: continue - vectors.append([float(v) if v is not None else 0.0 for v in vec]) + fvec = [float(v) if v is not None else 0.0 for v in vec] + l2 = sum(x**2 for x in fvec) ** 0.5 + if l2 > 1.0: # skip near-empty / historical party entries + vectors.append(fvec) if not vectors: return [] n_dims = len(vectors[0])