Removes the raw_title[:80] cap on expander labels so full titles show. Adds scripts/generate_svd_json.py to regenerate top_svd_top_motions.json from any SVD window after a recompute.main
parent
49a1f2f67d
commit
9daa899885
@ -0,0 +1,173 @@ |
|||||||
|
"""Generate thoughts/explorer/top_svd_top_motions.json from svd_vectors. |
||||||
|
|
||||||
|
For each SVD component, finds the top N motions by absolute score (split |
||||||
|
equally between positive and negative pole), joins with the motions table, |
||||||
|
and writes the result to the output JSON file. |
||||||
|
|
||||||
|
Usage: |
||||||
|
uv run python3 scripts/generate_svd_json.py --db data/motions.db --window current_parliament |
||||||
|
uv run python3 scripts/generate_svd_json.py --db data/motions.db --window 2025 |
||||||
|
""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
|
||||||
|
import argparse |
||||||
|
import json |
||||||
|
import logging |
||||||
|
import os |
||||||
|
import sys |
||||||
|
from typing import Any, Dict, List, Optional, Tuple |
||||||
|
|
||||||
|
# Ensure the repository root is importable when this script is executed
# directly (e.g. ``python scripts/generate_svd_json.py``), so project
# packages resolve without installation.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

# Timestamped INFO-level logging for CLI progress output.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("generate_svd_json")
||||||
|
|
||||||
|
|
||||||
|
def _parse_vector(raw_vec: Any) -> List[float]:
    """Decode one stored SVD vector into a list of floats.

    Accepts JSON text (``str`` or ``bytes``), an already-decoded ``list``,
    or any other iterable; ``None`` components are mapped to ``0.0``.

    Raises whatever ``json.loads`` / ``float`` raise on malformed input;
    the caller logs and skips such rows.
    """
    if isinstance(raw_vec, str):
        vec = json.loads(raw_vec)
    elif isinstance(raw_vec, (bytes, bytearray)):
        vec = json.loads(raw_vec.decode())
    elif isinstance(raw_vec, list):
        vec = raw_vec
    else:
        vec = list(raw_vec)
    return [float(v) if v is not None else 0.0 for v in vec]


def _top_for_component(
    motion_scores: Dict[int, List[float]],
    comp_idx: int,
    n_positive: int,
    n_negative: int,
) -> List[Tuple[int, float]]:
    """Return top motions for one component: strongest positive scores
    first, then negative-pole scores ordered most-negative first.

    Fixes two edge cases of the naive ``scored[-n_negative:]`` slice:
    with ``n_negative == 0`` that slice would return the WHOLE list, and
    with fewer motions than requested the positive and negative slices
    would overlap, duplicating motions within a component.
    """
    scored = [
        (mid, vec[comp_idx])
        for mid, vec in motion_scores.items()
        if comp_idx < len(vec)  # skip vectors shorter than requested rank
    ]
    scored.sort(key=lambda item: item[1], reverse=True)
    top_positive = scored[:n_positive]
    # Never let the negative slice reach back into the positive one.
    neg_start = max(len(scored) - n_negative, n_positive)
    top_negative = scored[neg_start:] if n_negative > 0 else []
    return top_positive + list(reversed(top_negative))


def main(argv: Optional[List[str]] = None) -> int:
    """Generate the top-motions-per-SVD-component JSON file.

    Parameters:
        argv: CLI arguments (defaults to ``sys.argv[1:]`` via argparse).

    Returns exit code: 0 on success, 2 if duckdb is unavailable, 3 if no
    motion vectors exist for the window, 4 if no motion IDs were found.
    """
    p = argparse.ArgumentParser(
        description="Generate SVD top-motions JSON for a window."
    )
    p.add_argument("--db", default="data/motions.db", help="Path to motions.db")
    p.add_argument(
        "--window", default="current_parliament", help="SVD window_id to use"
    )
    p.add_argument(
        "--top-n",
        type=int,
        default=10,
        help="Top N motions per component (split pos/neg)",
    )
    p.add_argument(
        "--components", type=int, default=10, help="Number of SVD components to include"
    )
    p.add_argument(
        "--out",
        default="thoughts/explorer/top_svd_top_motions.json",
        help="Output JSON file path",
    )
    args = p.parse_args(argv)

    # Import lazily so the script fails with a clear exit code instead of
    # a traceback when duckdb is not installed.
    try:
        import duckdb
    except ImportError:
        logger.error("duckdb not available")
        return 2

    con = duckdb.connect(database=args.db, read_only=True)
    try:
        # Load all motion SVD vectors for the window.
        logger.info("Loading motion SVD vectors for window='%s' ...", args.window)
        rows = con.execute(
            "SELECT entity_id, vector FROM svd_vectors "
            "WHERE entity_type='motion' AND window_id=?",
            [args.window],
        ).fetchall()

        if not rows:
            logger.error(
                "No motion vectors found for window='%s' in %s", args.window, args.db
            )
            return 3

        logger.info("Loaded %d motion vectors", len(rows))

        # Parse vectors into {motion_id: list[float]}; skip unparseable rows.
        motion_scores: Dict[int, List[float]] = {}
        for entity_id, raw_vec in rows:
            try:
                motion_scores[int(entity_id)] = _parse_vector(raw_vec)
            except Exception:
                logger.warning("Failed to parse vector for motion_id=%s", entity_id)

        logger.info("Parsed %d motion vectors", len(motion_scores))

        n_positive = args.top_n // 2
        n_negative = args.top_n - n_positive

        # Collect top motions per component.
        per_component: List[List[Tuple[int, float]]] = []
        all_motion_ids: List[int] = []
        for comp_idx in range(args.components):
            combined = _top_for_component(
                motion_scores, comp_idx, n_positive, n_negative
            )
            per_component.append(combined)
            all_motion_ids.extend(mid for mid, _ in combined)

        unique_ids = list(set(all_motion_ids))
        if not unique_ids:
            logger.error("No motion IDs to fetch")
            return 4

        # Batch-fetch motion details; values are bound parameters, the
        # f-string only expands the placeholder list, so no injection risk.
        logger.info("Fetching details for %d unique motions ...", len(unique_ids))
        placeholders = ", ".join("?" for _ in unique_ids)
        detail_rows = con.execute(
            f"SELECT id, title, body_text, date, policy_area FROM motions WHERE id IN ({placeholders})",
            unique_ids,
        ).fetchall()
    finally:
        # Fix: the original leaked the connection when a query raised;
        # close on every exit path.
        con.close()

    details_map: Dict[int, tuple] = {row[0]: row for row in detail_rows}
    logger.info("Fetched details for %d motions", len(details_map))

    # Build output rows (components are reported 1-based).
    output_rows: List[Dict[str, Any]] = []
    for comp_idx, top_motions in enumerate(per_component):
        comp_num = comp_idx + 1
        for mid, score in top_motions:
            detail = details_map.get(mid)
            output_rows.append(
                {
                    "component": comp_num,
                    "motion_id": mid,
                    "score": score,
                    "title": detail[1] if detail else None,
                    # Truncate timestamps to the ISO date portion.
                    "body_text": detail[2] if detail else None,
                    "date": str(detail[3])[:10] if detail and detail[3] else None,
                    "policy_area": detail[4] if detail else None,
                }
            )

    output: Dict[str, Any] = {"window": args.window, "rows": output_rows}

    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    logger.info(
        "Written %d rows (%d components) to %s",
        len(output_rows),
        args.components,
        args.out,
    )
    return 0
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": |
||||||
|
raise SystemExit(main()) |
||||||
Loading…
Reference in new issue