diff --git a/explorer.py b/explorer.py index 186ed13..31da354 100644 --- a/explorer.py +++ b/explorer.py @@ -1067,7 +1067,7 @@ def build_svd_components_tab(db_path: str) -> None: for m in left_motions: mid = m.get("motion_id") raw_title = m.get("title") or f"Motie #{mid}" - with st.expander(f"{left_arrow} {raw_title[:80]}"): + with st.expander(f"{left_arrow} {raw_title}"): row = motion_details.get(int(mid)) if mid is not None else None if row: try: @@ -1089,7 +1089,7 @@ def build_svd_components_tab(db_path: str) -> None: for m in right_motions: mid = m.get("motion_id") raw_title = m.get("title") or f"Motie #{mid}" - with st.expander(f"{right_arrow} {raw_title[:80]}"): + with st.expander(f"{right_arrow} {raw_title}"): row = motion_details.get(int(mid)) if mid is not None else None if row: try: diff --git a/scripts/generate_svd_json.py b/scripts/generate_svd_json.py new file mode 100644 index 0000000..15e31ad --- /dev/null +++ b/scripts/generate_svd_json.py @@ -0,0 +1,173 @@ +"""Generate thoughts/explorer/top_svd_top_motions.json from svd_vectors. + +For each SVD component, finds the top N motions by absolute score (split +equally between positive and negative pole), joins with the motions table, +and writes the result to the output JSON file. + +Usage: + uv run python3 scripts/generate_svd_json.py --db data/motions.db --window current_parliament + uv run python3 scripts/generate_svd_json.py --db data/motions.db --window 2025 +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import sys +from typing import Any, Dict, List, Optional, Tuple + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if ROOT not in sys.path: + sys.path.insert(0, ROOT) + +logger = logging.getLogger("generate_svd_json") +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + + +def main(argv: Optional[List[str]] = None) -> int: + p = argparse.ArgumentParser( + description="Generate SVD top-motions JSON for a window." + ) + p.add_argument("--db", default="data/motions.db", help="Path to motions.db") + p.add_argument( + "--window", default="current_parliament", help="SVD window_id to use" + ) + p.add_argument( + "--top-n", + type=int, + default=10, + help="Top N motions per component (split pos/neg)", + ) + p.add_argument( + "--components", type=int, default=10, help="Number of SVD components to include" + ) + p.add_argument( + "--out", + default="thoughts/explorer/top_svd_top_motions.json", + help="Output JSON file path", + ) + args = p.parse_args(argv) + + try: + import duckdb + except ImportError: + logger.error("duckdb not available") + return 2 + + con = duckdb.connect(database=args.db, read_only=True) + + # Load all motion SVD vectors for the window + logger.info("Loading motion SVD vectors for window='%s' ...", args.window) + rows = con.execute( + "SELECT entity_id, vector FROM svd_vectors " + "WHERE entity_type='motion' AND window_id=?", + [args.window], + ).fetchall() + + if not rows: + logger.error( + "No motion vectors found for window='%s' in %s", args.window, args.db + ) + con.close() + return 3 + + logger.info("Loaded %d motion vectors", len(rows)) + + # Parse vectors into {motion_id: list[float]} + motion_scores: Dict[int, List[float]] = {} + for entity_id, raw_vec in rows: + try: + if isinstance(raw_vec, str): + vec = json.loads(raw_vec) + elif isinstance(raw_vec, (bytes, bytearray)): + vec = json.loads(raw_vec.decode()) + elif isinstance(raw_vec, list): + vec = raw_vec + else: + vec = list(raw_vec) + motion_scores[int(entity_id)] = [ + float(v) if v is not None else 0.0 for v in vec + ] + except Exception: + logger.warning("Failed to parse vector for motion_id=%s", entity_id) + + logger.info("Parsed %d motion vectors", len(motion_scores)) + + n_positive = args.top_n // 2 + n_negative = args.top_n - n_positive + + output_rows: List[Dict[str, Any]] = [] + all_motion_ids: List[int] = [] + + # Collect top motions per component + per_component: List[List[Tuple[int, float]]] = [] + for comp_idx in range(args.components): + scored: List[Tuple[int, float]] = [] + for mid, vec in motion_scores.items(): + if comp_idx < len(vec): + scored.append((mid, vec[comp_idx])) + + scored.sort(key=lambda x: x[1], reverse=True) + top_positive = scored[:n_positive] + top_negative = scored[-n_negative:] + combined = top_positive + list(reversed(top_negative)) + per_component.append(combined) + all_motion_ids.extend(mid for mid, _ in combined) + + # Batch-fetch motion details + unique_ids = list(set(all_motion_ids)) + if not unique_ids: + logger.error("No motion IDs to fetch") + con.close() + return 4 + + logger.info("Fetching details for %d unique motions ...", len(unique_ids)) + placeholders = ", ".join("?" for _ in unique_ids) + detail_rows = con.execute( + f"SELECT id, title, body_text, date, policy_area FROM motions WHERE id IN ({placeholders})", + unique_ids, + ).fetchall() + con.close() + + details_map: Dict[int, tuple] = {row[0]: row for row in detail_rows} + logger.info("Fetched details for %d motions", len(details_map)) + + # Build output rows + for comp_idx, top_motions in enumerate(per_component): + comp_num = comp_idx + 1 + for mid, score in top_motions: + detail = details_map.get(mid) + output_rows.append( + { + "component": comp_num, + "motion_id": mid, + "score": score, + "title": detail[1] if detail else None, + "body_text": detail[2] if detail else None, + "date": str(detail[3])[:10] if detail and detail[3] else None, + "policy_area": detail[4] if detail else None, + } + ) + + output: Dict[str, Any] = {"window": args.window, "rows": output_rows} + + out_dir = os.path.dirname(args.out) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + with open(args.out, "w", encoding="utf-8") as f: + json.dump(output, f, ensure_ascii=False, indent=2) + + logger.info( + "Written %d rows (%d components) to %s", + len(output_rows), + args.components, + args.out, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())