# motief/scripts/compute_test_batch.py

"""Compute summaries and embeddings for a small test batch of motions.

Usage:
  # dry-run (no network calls)
  python scripts/compute_test_batch.py --limit 20 --dry-run

  # run (will call AI provider; requires OPENROUTER_API_KEY)
  python scripts/compute_test_batch.py --limit 20

This script is intentionally simple and intended for manual invocation.
It will update motions.layman_explanation and store embeddings via db.store_embedding if available.
"""

from __future__ import annotations

import argparse
import logging
import sys
from typing import List

import duckdb

from config import config
import ai_provider
from database import db
from summarizer import MotionSummarizer


# Configure root logging once at import time, then grab this script's named logger.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("compute_test_batch")


def fetch_motion_candidates(limit: int) -> List[dict]:
    """Return up to ``limit`` motions that have no layman explanation yet.

    Opens its own DuckDB connection to ``config.DATABASE_PATH`` and closes it
    before returning. Rows whose ``layman_explanation`` is NULL or the empty
    string are selected, ordered by ``created_at`` descending.

    Args:
        limit: Value bound to the query's ``LIMIT`` placeholder.

    Returns:
        A list of dicts with keys ``id``, ``title`` and ``description``. A
        falsy description (e.g. NULL) is replaced by ``""``.
    """
    # Selecting only unexplained motions keeps already-processed ones from
    # being picked up again.
    query = (
        "SELECT id, title, description FROM motions "
        "WHERE layman_explanation IS NULL OR layman_explanation = '' "
        "ORDER BY created_at DESC LIMIT ?"
    )
    connection = duckdb.connect(config.DATABASE_PATH)
    try:
        candidates = []
        for motion_id, title, description in connection.execute(
            query, (limit,)
        ).fetchall():
            candidates.append(
                {"id": motion_id, "title": title, "description": description or ""}
            )
        return candidates
    finally:
        connection.close()


def process_batch(limit: int = 20, dry_run: bool = False):
    """Summarise and embed up to ``limit`` motions that lack an explanation.

    For each candidate returned by ``fetch_motion_candidates`` this generates
    a layman explanation, writes it to ``motions.layman_explanation`` and,
    when ``db.store_embedding`` exists, stores an embedding of that
    explanation. A failed UPDATE and an ``ai_provider.ProviderError`` raised
    by the embedding step are logged and the batch moves on to the next
    motion; any other exception propagates.

    Args:
        limit: Maximum number of motions to fetch and process.
        dry_run: When true, only log which motions would be processed. No
            summarizer is constructed and no second DB connection is opened.
    """
    # A dry run must not depend on provider setup, so the summarizer is only
    # built for a real run.
    # NOTE(review): assumes MotionSummarizer() may need credentials — confirm.
    summarizer = None if dry_run else MotionSummarizer()
    motions = fetch_motion_candidates(limit)
    total = len(motions)
    logger.info("Found %d motions to process", total)

    # Resolve the optional storage hook once; it does not change per motion.
    store_fn = getattr(db, "store_embedding", None)
    can_store = callable(store_fn)

    # The connection is only needed for the UPDATE statements of a real run.
    conn = None if dry_run else duckdb.connect(config.DATABASE_PATH)
    try:
        for i, m in enumerate(motions, start=1):
            mid = m["id"]
            title = m["title"]
            desc = m["description"]
            logger.info(
                "[%d/%d] Processing motion id=%s title=%s", i, total, mid, title
            )

            if dry_run:
                logger.info(
                    "Dry run: would generate summary and embedding for motion %s", mid
                )
                continue

            # Generate summary
            summary = summarizer.generate_layman_explanation(title, desc)
            if not summary:
                # Nothing useful to persist or embed; the motion stays a
                # candidate for a later run because its explanation is unset.
                logger.warning(
                    "Empty summary for motion %s; skipping update and embedding", mid
                )
                continue

            # Update DB (best effort: a failed write does not stop the batch)
            try:
                conn.execute(
                    "UPDATE motions SET layman_explanation = ? WHERE id = ?",
                    (summary, mid),
                )
            except Exception as e:
                logger.exception("Failed to update motion %s: %s", mid, e)

            # Without a storage hook the embedding would be thrown away, so
            # skip the provider call instead of paying for an unused result.
            if not can_store:
                logger.warning("No store_embedding available on db; skipping storage")
                continue

            # Compute embedding and store
            try:
                emb = ai_provider.get_embedding(summary)
                # NOTE(review): the model label is hard-coded here; confirm it
                # matches the model ai_provider.get_embedding actually uses.
                store_fn(mid, "text-embedding-3-small", emb)
                logger.info("Stored embedding for motion %s", mid)
            except ai_provider.ProviderError as e:
                logger.exception(
                    "Failed to compute/store embedding for motion %s: %s", mid, e
                )

    finally:
        if conn is not None:
            conn.close()


def main(argv=None):
    """Parse command-line options and run the batch.

    Unless ``--dry-run`` is given, the user is asked to confirm on stdin
    before any motions are processed, because a real run calls the AI
    provider and may incur cost.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 0 when the user declines the confirmation,
            with status 1 when no answer can be read from stdin, or as raised
            by ``argparse`` for invalid arguments.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--limit", type=int, default=20, help="Number of motions to process")
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Do not call external APIs; just show what would run",
    )
    args = p.parse_args(argv)

    if args.dry_run:
        logger.info("Running in dry-run mode; no network calls will be made")
    else:
        # Safety: confirm when not dry-run
        try:
            answer = input(
                f"This will call the AI provider for {args.limit} motions and may incur cost. Continue? (y/N): "
            )
        except EOFError:
            # stdin is closed or not interactive: nobody can confirm, so stop
            # with a clear message instead of an unhandled traceback.
            logger.error("No confirmation could be read from stdin; aborting")
            sys.exit(1)
        if answer.strip().lower() not in ("y", "yes"):
            logger.info("Aborting per user choice")
            sys.exit(0)

    process_batch(limit=args.limit, dry_run=args.dry_run)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()