You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
128 lines
4.1 KiB
128 lines
4.1 KiB
"""Compute summaries and embeddings for a small test batch of motions.
|
|
|
|
Usage:
|
|
# dry-run (no network calls)
|
|
python scripts/compute_test_batch.py --limit 20 --dry-run
|
|
|
|
# run (will call AI provider; requires OPENROUTER_API_KEY)
|
|
python scripts/compute_test_batch.py --limit 20
|
|
|
|
This script is intentionally simple and intended for manual invocation.
|
|
It will update motions.layman_explanation and store embeddings via db.store_embedding if available.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
from typing import List
|
|
|
|
import duckdb
|
|
|
|
from config import config
|
|
import ai_provider
|
|
from database import db
|
|
from summarizer import MotionSummarizer
|
|
|
|
|
|
# Configure root logging once at import time so every message emitted by this
# script carries a timestamp and level, then grab the script's named logger.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("compute_test_batch")
|
|
|
|
|
|
def fetch_motion_candidates(limit: int) -> List[dict]:
    """Return up to ``limit`` motions that have no layman explanation yet.

    A dedicated DuckDB connection to ``config.DATABASE_PATH`` is opened for the
    query and closed before returning.

    Args:
        limit: Value bound to the query's ``LIMIT`` placeholder.

    Returns:
        A list of dicts with ``id``, ``title`` and ``description`` keys. A
        falsy description (e.g. NULL) is normalised to ``""``.
    """
    # Select only rows whose layman_explanation is NULL or empty so motions
    # that were already summarised are not processed again; newest first.
    query = (
        "SELECT id, title, description FROM motions "
        "WHERE layman_explanation IS NULL OR layman_explanation = '' "
        "ORDER BY created_at DESC LIMIT ?"
    )
    connection = duckdb.connect(config.DATABASE_PATH)
    try:
        candidates = []
        for record in connection.execute(query, (limit,)).fetchall():
            candidates.append(
                {
                    "id": record[0],
                    "title": record[1],
                    "description": record[2] or "",
                }
            )
        return candidates
    finally:
        connection.close()
|
|
|
|
|
|
def process_batch(limit: int = 20, dry_run: bool = False):
    """Generate and persist summaries and embeddings for a batch of motions.

    Candidates come from ``fetch_motion_candidates(limit)``. For each motion a
    layman explanation is generated, written to ``motions.layman_explanation``,
    and then embedded and handed to ``db.store_embedding`` when that function
    exists.

    Failures are isolated per motion: an error while summarising, updating the
    row, or computing/storing the embedding is logged and the batch moves on to
    the next motion instead of aborting.

    Args:
        limit: Maximum number of candidate motions to fetch.
        dry_run: When true, only log what would be processed. The summarizer
            is not constructed and no write connection is opened.
    """
    motions = fetch_motion_candidates(limit)
    total = len(motions)
    logger.info("Found %d motions to process", total)

    if dry_run:
        # Dry-run only reports; it must not need the summarizer or a DB handle.
        for index, motion in enumerate(motions, start=1):
            logger.info(
                "[%d/%d] Processing motion id=%s title=%s",
                index,
                total,
                motion["id"],
                motion["title"],
            )
            logger.info(
                "Dry run: would generate summary and embedding for motion %s",
                motion["id"],
            )
        return

    if not motions:
        # Nothing to do; skip building the summarizer and opening a connection.
        return

    summarizer = MotionSummarizer()

    # The storage hook is optional on `db`. Resolve it once, and if it is
    # missing skip embedding entirely rather than paying for a vector that
    # would be thrown away.
    store_fn = getattr(db, "store_embedding", None)
    if not callable(store_fn):
        store_fn = None
        logger.warning("No store_embedding available on db; skipping storage")

    conn = duckdb.connect(config.DATABASE_PATH)
    try:
        for index, motion in enumerate(motions, start=1):
            mid = motion["id"]
            title = motion["title"]
            logger.info(
                "[%d/%d] Processing motion id=%s title=%s", index, total, mid, title
            )

            # Generate summary. Batch boundary: one failing motion must not
            # stop the remaining ones, so log and move on.
            try:
                summary = summarizer.generate_layman_explanation(
                    title, motion["description"]
                )
            except Exception as e:
                logger.exception(
                    "Failed to generate summary for motion %s: %s", mid, e
                )
                continue

            if not summary:
                # An empty summary would leave the row looking unprocessed and
                # gives the embedding call nothing meaningful to embed.
                logger.warning(
                    "Empty summary for motion %s; skipping update and embedding",
                    mid,
                )
                continue

            # Update DB
            try:
                conn.execute(
                    "UPDATE motions SET layman_explanation = ? WHERE id = ?",
                    (summary, mid),
                )
            except Exception as e:
                logger.exception("Failed to update motion %s: %s", mid, e)
                # Do not store an embedding for a summary that was not saved;
                # the motion stays a candidate and is retried on the next run.
                continue

            if store_fn is None:
                continue

            # Compute embedding and store. Caught broadly because the storage
            # hook can fail with errors other than the provider's own type.
            # NOTE(review): the model label below is hard-coded; confirm it
            # matches the model ai_provider.get_embedding actually uses.
            try:
                emb = ai_provider.get_embedding(summary)
                store_fn(mid, "text-embedding-3-small", emb)
            except Exception as e:
                logger.exception(
                    "Failed to compute/store embedding for motion %s: %s", mid, e
                )
                continue

            logger.info("Stored embedding for motion %s", mid)
    finally:
        conn.close()
|
|
|
|
|
|
def main(argv=None):
    """Command-line entry point: parse arguments, confirm, run the batch.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Exits with status 2 (argparse usage error) when ``--limit`` is not a
    positive integer, and with status 0 when the user declines the
    confirmation prompt shown for non-dry-run invocations.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--limit", type=int, default=20, help="Number of motions to process")
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Do not call external APIs; just show what would run",
    )
    args = p.parse_args(argv)

    # Reject nonsensical limits up front, before prompting about cost or
    # handing the value to the SQL LIMIT clause.
    if args.limit < 1:
        p.error("--limit must be a positive integer")

    if args.dry_run:
        logger.info("Running in dry-run mode; no network calls will be made")
    else:
        # Safety: confirm when not dry-run
        try:
            confirm = (
                input(
                    f"This will call the AI provider for {args.limit} motions and may incur cost. Continue? (y/N): "
                )
                .strip()
                .lower()
            )
        except EOFError:
            # No interactive stdin (piped/cron): treat as the default "No"
            # instead of crashing with a traceback.
            confirm = ""
        if confirm not in ("y", "yes"):
            logger.info("Aborting per user choice")
            sys.exit(0)

    process_batch(limit=args.limit, dry_run=args.dry_run)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|