"""Generate thoughts/explorer/top_svd_top_motions.json from svd_vectors.
|
|
|
|
For each SVD component, finds the top N motions by absolute score (split
|
|
equally between positive and negative pole), joins with the motions table,
|
|
and writes the result to the output JSON file.
|
|
|
|
Usage:
|
|
uv run python3 scripts/generate_svd_json.py --db data/motions.db --window current_parliament
|
|
uv run python3 scripts/generate_svd_json.py --db data/motions.db --window 2025
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
if ROOT not in sys.path:
|
|
sys.path.insert(0, ROOT)
|
|
|
|
logger = logging.getLogger("generate_svd_json")
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
|
|
|
|
def main(argv: Optional[List[str]] = None) -> int:
|
|
p = argparse.ArgumentParser(
|
|
description="Generate SVD top-motions JSON for a window."
|
|
)
|
|
p.add_argument("--db", default="data/motions.db", help="Path to motions.db")
|
|
p.add_argument(
|
|
"--window", default="current_parliament", help="SVD window_id to use"
|
|
)
|
|
p.add_argument(
|
|
"--top-n",
|
|
type=int,
|
|
default=10,
|
|
help="Top N motions per component (split pos/neg)",
|
|
)
|
|
p.add_argument(
|
|
"--components", type=int, default=10, help="Number of SVD components to include"
|
|
)
|
|
p.add_argument(
|
|
"--out",
|
|
default="thoughts/explorer/top_svd_top_motions.json",
|
|
help="Output JSON file path",
|
|
)
|
|
args = p.parse_args(argv)
|
|
|
|
try:
|
|
import duckdb
|
|
except ImportError:
|
|
logger.error("duckdb not available")
|
|
return 2
|
|
|
|
con = duckdb.connect(database=args.db, read_only=True)
|
|
|
|
# Load all motion SVD vectors for the window
|
|
logger.info("Loading motion SVD vectors for window='%s' ...", args.window)
|
|
rows = con.execute(
|
|
"SELECT entity_id, vector FROM svd_vectors "
|
|
"WHERE entity_type='motion' AND window_id=?",
|
|
[args.window],
|
|
).fetchall()
|
|
|
|
if not rows:
|
|
logger.error(
|
|
"No motion vectors found for window='%s' in %s", args.window, args.db
|
|
)
|
|
con.close()
|
|
return 3
|
|
|
|
logger.info("Loaded %d motion vectors", len(rows))
|
|
|
|
# Parse vectors into {motion_id: list[float]}
|
|
motion_scores: Dict[int, List[float]] = {}
|
|
for entity_id, raw_vec in rows:
|
|
try:
|
|
if isinstance(raw_vec, str):
|
|
vec = json.loads(raw_vec)
|
|
elif isinstance(raw_vec, (bytes, bytearray)):
|
|
vec = json.loads(raw_vec.decode())
|
|
elif isinstance(raw_vec, list):
|
|
vec = raw_vec
|
|
else:
|
|
vec = list(raw_vec)
|
|
motion_scores[int(entity_id)] = [
|
|
float(v) if v is not None else 0.0 for v in vec
|
|
]
|
|
except Exception:
|
|
logger.warning("Failed to parse vector for motion_id=%s", entity_id)
|
|
|
|
logger.info("Parsed %d motion vectors", len(motion_scores))
|
|
|
|
n_positive = args.top_n // 2
|
|
n_negative = args.top_n - n_positive
|
|
|
|
output_rows: List[Dict[str, Any]] = []
|
|
all_motion_ids: List[int] = []
|
|
|
|
# Collect top motions per component
|
|
per_component: List[List[Tuple[int, float]]] = []
|
|
for comp_idx in range(args.components):
|
|
scored: List[Tuple[int, float]] = []
|
|
for mid, vec in motion_scores.items():
|
|
if comp_idx < len(vec):
|
|
scored.append((mid, vec[comp_idx]))
|
|
|
|
scored.sort(key=lambda x: x[1], reverse=True)
|
|
top_positive = scored[:n_positive]
|
|
top_negative = scored[-n_negative:]
|
|
combined = top_positive + list(reversed(top_negative))
|
|
per_component.append(combined)
|
|
all_motion_ids.extend(mid for mid, _ in combined)
|
|
|
|
# Batch-fetch motion details
|
|
unique_ids = list(set(all_motion_ids))
|
|
if not unique_ids:
|
|
logger.error("No motion IDs to fetch")
|
|
con.close()
|
|
return 4
|
|
|
|
logger.info("Fetching details for %d unique motions ...", len(unique_ids))
|
|
placeholders = ", ".join("?" for _ in unique_ids)
|
|
detail_rows = con.execute(
|
|
f"SELECT id, title, body_text, date, policy_area FROM motions WHERE id IN ({placeholders})",
|
|
unique_ids,
|
|
).fetchall()
|
|
con.close()
|
|
|
|
details_map: Dict[int, tuple] = {row[0]: row for row in detail_rows}
|
|
logger.info("Fetched details for %d motions", len(details_map))
|
|
|
|
# Build output rows
|
|
for comp_idx, top_motions in enumerate(per_component):
|
|
comp_num = comp_idx + 1
|
|
for mid, score in top_motions:
|
|
detail = details_map.get(mid)
|
|
output_rows.append(
|
|
{
|
|
"component": comp_num,
|
|
"motion_id": mid,
|
|
"score": score,
|
|
"title": detail[1] if detail else None,
|
|
"body_text": detail[2] if detail else None,
|
|
"date": str(detail[3])[:10] if detail and detail[3] else None,
|
|
"policy_area": detail[4] if detail else None,
|
|
}
|
|
)
|
|
|
|
output: Dict[str, Any] = {"window": args.window, "rows": output_rows}
|
|
|
|
out_dir = os.path.dirname(args.out)
|
|
if out_dir:
|
|
os.makedirs(out_dir, exist_ok=True)
|
|
|
|
with open(args.out, "w", encoding="utf-8") as f:
|
|
json.dump(output, f, ensure_ascii=False, indent=2)
|
|
|
|
logger.info(
|
|
"Written %d rows (%d components) to %s",
|
|
len(output_rows),
|
|
args.components,
|
|
args.out,
|
|
)
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
raise SystemExit(main())
|
|
|