You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
motief/scripts/compute_test_batch.py

128 lines
4.1 KiB

"""Compute summaries and embeddings for a small test batch of motions.

Usage:
    # dry-run (no network calls)
    python scripts/compute_test_batch.py --limit 20 --dry-run

    # real run (calls the AI provider; requires OPENROUTER_API_KEY)
    python scripts/compute_test_batch.py --limit 20

This script is intentionally simple and intended for manual invocation.
It updates motions.layman_explanation and stores embeddings via
db.store_embedding when that function is available.
"""
from __future__ import annotations
import argparse
import logging
import sys
from typing import List
import duckdb
from config import config
import ai_provider
from database import db
from summarizer import MotionSummarizer
# Script-wide logger; the name is given explicitly rather than via __name__.
logger = logging.getLogger("compute_test_batch")

# Configure the root logger once at import: INFO and above, with each record
# rendered as "<timestamp> <level> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
def fetch_motion_candidates(limit: int) -> List[dict]:
    """Return up to ``limit`` motions that still need a layman explanation.

    Opens its own DuckDB connection to ``config.DATABASE_PATH`` and closes it
    before returning, whether or not the query succeeds.

    Args:
        limit: Maximum number of rows to fetch (bound to the SQL ``LIMIT``).

    Returns:
        A list of dicts with the keys ``id``, ``title`` and ``description``.
        A falsy description (e.g. NULL) is normalised to ``""``.
    """
    # Only motions without a layman_explanation (NULL or empty) are selected,
    # newest created_at first, so already-summarised rows are not re-processed.
    query = "SELECT id, title, description FROM motions WHERE layman_explanation IS NULL OR layman_explanation = '' ORDER BY created_at DESC LIMIT ?"

    connection = duckdb.connect(config.DATABASE_PATH)
    try:
        candidates = []
        for motion_id, title, description in connection.execute(
            query, (limit,)
        ).fetchall():
            candidates.append(
                {
                    "id": motion_id,
                    "title": title,
                    "description": description or "",
                }
            )
        return candidates
    finally:
        connection.close()
def process_batch(
    limit: int = 20,
    dry_run: bool = False,
    *,
    embedding_model: str = "text-embedding-3-small",
) -> None:
    """Generate layman summaries and embeddings for a batch of motions.

    Candidates come from ``fetch_motion_candidates(limit)``. For each motion
    the summary is written to ``motions.layman_explanation``; an embedding of
    the summary is then requested from ``ai_provider.get_embedding`` and passed
    to ``db.store_embedding`` when ``db`` exposes such a callable.

    Args:
        limit: Maximum number of motions to process.
        dry_run: When true, only log what would be done. No summarizer is
            created and no write connection is opened.
        embedding_model: Model label passed to ``db.store_embedding`` as its
            second argument. Defaults to the previously hard-coded value.

    Failures of the ``UPDATE`` statement and ``ai_provider.ProviderError``
    raised while computing/storing the embedding are logged and the batch
    continues with the next motion.
    """
    motions = fetch_motion_candidates(limit)
    total = len(motions)
    logger.info("Found %d motions to process", total)

    if dry_run:
        # A dry run only reports what would happen, so the summarizer and the
        # write connection are deliberately never created on this path.
        for index, motion in enumerate(motions, start=1):
            logger.info(
                "[%d/%d] Processing motion id=%s title=%s",
                index,
                total,
                motion["id"],
                motion["title"],
            )
            logger.info(
                "Dry run: would generate summary and embedding for motion %s",
                motion["id"],
            )
        return

    summarizer = MotionSummarizer()
    # Loop-invariant: look up the optional storage hook once.
    store_fn = getattr(db, "store_embedding", None)

    conn = duckdb.connect(config.DATABASE_PATH)
    try:
        for index, motion in enumerate(motions, start=1):
            mid = motion["id"]
            title = motion["title"]
            logger.info(
                "[%d/%d] Processing motion id=%s title=%s", index, total, mid, title
            )

            # NOTE(review): an exception raised here is not caught and aborts
            # the rest of the batch - confirm whether that is intended.
            summary = summarizer.generate_layman_explanation(
                title, motion["description"]
            )

            # Persist the summary first.
            try:
                conn.execute(
                    "UPDATE motions SET layman_explanation = ? WHERE id = ?",
                    (summary, mid),
                )
            except Exception as e:
                logger.exception("Failed to update motion %s: %s", mid, e)
                # The summary was not saved, so the motion stays a candidate
                # for the next run. Skip the embedding to avoid paying for and
                # storing a vector that matches no persisted text.
                continue

            # Compute the embedding of the summary and store it if possible.
            try:
                emb = ai_provider.get_embedding(summary)
                if callable(store_fn):
                    store_fn(mid, embedding_model, emb)
                    logger.info("Stored embedding for motion %s", mid)
                else:
                    logger.warning(
                        "No store_embedding available on db; skipping storage"
                    )
            except ai_provider.ProviderError as e:
                logger.exception(
                    "Failed to compute/store embedding for motion %s: %s", mid, e
                )
    finally:
        conn.close()
def main(argv=None):
    """Parse command-line arguments, confirm paid runs, and process the batch.

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``; ``None``
            lets argparse fall back to the process arguments.

    A non-dry run asks for interactive confirmation first. Any answer other
    than ``y``/``yes`` - including no answer at all because stdin is closed or
    exhausted - ends the process with exit status 0 without processing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--limit", type=int, default=20, help="Number of motions to process"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Do not call external APIs; just show what would run",
    )
    args = parser.parse_args(argv)

    if args.dry_run:
        logger.info("Running in dry-run mode; no network calls will be made")
    else:
        # Safety: a real run may cost money, so require explicit confirmation.
        prompt = f"This will call the AI provider for {args.limit} motions and may incur cost. Continue? (y/N): "
        try:
            confirm = input(prompt).strip().lower()
        except EOFError:
            # input() raises EOFError when stdin has no data (cron, pipes,
            # "< /dev/null"). Treat that as "no" instead of crashing.
            logger.info("No input available on stdin; treating as 'no'")
            confirm = ""
        if confirm not in ("y", "yes"):
            logger.info("Aborting per user choice")
            sys.exit(0)

    process_batch(limit=args.limit, dry_run=args.dry_run)
# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()