"""Compute summaries and embeddings for a small test batch of motions. Usage: # dry-run (no network calls) python scripts/compute_test_batch.py --limit 20 --dry-run # run (will call AI provider; requires OPENROUTER_API_KEY) python scripts/compute_test_batch.py --limit 20 This script is intentionally simple and intended for manual invocation. It will update motions.layman_explanation and store embeddings via db.store_embedding if available. """ from __future__ import annotations import argparse import logging import sys from typing import List import duckdb from config import config import ai_provider from database import db from summarizer import MotionSummarizer logger = logging.getLogger("compute_test_batch") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") def fetch_motion_candidates(limit: int) -> List[dict]: conn = duckdb.connect(config.DATABASE_PATH) try: # Prefer motions that still lack a layman_explanation so we don't re-process recent ones rows = conn.execute( "SELECT id, title, description FROM motions WHERE layman_explanation IS NULL OR layman_explanation = '' ORDER BY created_at DESC LIMIT ?", (limit,), ).fetchall() return [{"id": r[0], "title": r[1], "description": r[2] or ""} for r in rows] finally: conn.close() def process_batch(limit: int = 20, dry_run: bool = False): summarizer = MotionSummarizer() motions = fetch_motion_candidates(limit) logger.info("Found %d motions to process", len(motions)) conn = duckdb.connect(config.DATABASE_PATH) try: for i, m in enumerate(motions, start=1): mid = m["id"] title = m["title"] desc = m["description"] logger.info( "[%d/%d] Processing motion id=%s title=%s", i, len(motions), mid, title ) if dry_run: logger.info( "Dry run: would generate summary and embedding for motion %s", mid ) continue # Generate summary summary = summarizer.generate_layman_explanation(title, desc) # Update DB try: conn.execute( "UPDATE motions SET layman_explanation = ? WHERE id = ?", (summary, mid), ) except Exception as e: logger.exception("Failed to update motion %s: %s", mid, e) # Compute embedding and store try: emb = ai_provider.get_embedding(summary) store_fn = getattr(db, "store_embedding", None) if callable(store_fn): store_fn(mid, "text-embedding-3-small", emb) logger.info("Stored embedding for motion %s", mid) else: logger.warning( "No store_embedding available on db; skipping storage" ) except ai_provider.ProviderError as e: logger.exception( "Failed to compute/store embedding for motion %s: %s", mid, e ) finally: conn.close() def main(argv=None): p = argparse.ArgumentParser() p.add_argument("--limit", type=int, default=20, help="Number of motions to process") p.add_argument( "--dry-run", action="store_true", help="Do not call external APIs; just show what would run", ) args = p.parse_args(argv) if args.dry_run: logger.info("Running in dry-run mode; no network calls will be made") # Safety: confirm when not dry-run if not args.dry_run: confirm = ( input( f"This will call the AI provider for {args.limit} motions and may incur cost. Continue? (y/N): " ) .strip() .lower() ) if confirm not in ("y", "yes"): logger.info("Aborting per user choice") sys.exit(0) process_batch(limit=args.limit, dry_run=args.dry_run) if __name__ == "__main__": main()