motief/pipeline/text_pipeline.py

import logging
import json
from typing import Optional, List, Tuple

import duckdb

from database import MotionDatabase, db as default_db
import ai_provider

_logger = logging.getLogger(__name__)

DEFAULT_MODEL = "qwen/qwen3-embedding-4b"


def _select_text(
    db: MotionDatabase, model: str, limit: Optional[int] = None
) -> List[Tuple[int, Optional[str]]]:
    """Select motions that do not yet have an embedding for `model`.

    Returns list of (motion_id, text).
    """
    conn = duckdb.connect(db.db_path)
    params = [model]
    # prefer layman_explanation > description > title (keep compatibility with existing tests)
    sql = (
        "SELECT m.id, COALESCE(m.layman_explanation, m.description, m.title) AS text"
        " FROM motions m"
        " LEFT JOIN embeddings e ON e.motion_id = m.id AND e.model = ?"
        " WHERE e.id IS NULL"
    )
    if limit:
        sql += " LIMIT ?"
        params.append(limit)

    try:
        rows = conn.execute(sql, params).fetchall()
        conn.close()
        results: List[Tuple[int, Optional[str]]] = []
        for r in rows:
            text_val = r[1]
            # treat empty strings as no text
            if text_val is None:
                text = None
            else:
                text = str(text_val).strip() or None
            results.append((int(r[0]), text))
        return results
    except Exception as exc:
        _logger.error("Error selecting motions for embeddings: %s", exc)
        try:
            conn.close()
        except Exception:
            pass
        return []


def ensure_text_embeddings(
    db_path: Optional[str] = None, model: Optional[str] = None
) -> Tuple[int, int, int, int]:
    """Ensure all motions have text embeddings for `model`.

    Returns tuple (stored_count, skipped_existing, skipped_no_text, errors).
    """
    model = model or DEFAULT_MODEL
    db = MotionDatabase(db_path) if db_path else default_db

    # motions to process
    to_process = _select_text(db, model)

    # how many already exist
    conn = duckdb.connect(db.db_path)
    try:
        total_motions = conn.execute("SELECT COUNT(*) FROM motions").fetchone()[0]
    except Exception:
        total_motions = 0

    try:
        existing = conn.execute(
            "SELECT COUNT(DISTINCT motion_id) FROM embeddings WHERE model = ?", (model,)
        ).fetchone()[0]
    except Exception:
        existing = 0

    conn.close()

    stored = 0
    skipped_no_text = 0
    errors = 0

    for motion_id, text in to_process:
        if not text:
            _logger.info("Skipping motion %s: no text available", motion_id)
            skipped_no_text += 1
            continue

        try:
            vec = ai_provider.get_embedding(text, model=model)
            if not isinstance(vec, list):
                _logger.warning(
                    "Embedding provider returned non-list for motion %s", motion_id
                )
                errors += 1
                continue

            res = db.store_embedding(motion_id, model, vec)
            if res and res > 0:
                stored += 1
            else:
                _logger.error(
                    "Failed to store embedding for motion %s (store returned %s)",
                    motion_id,
                    res,
                )
                errors += 1
        except Exception as exc:
            _logger.error(
                "Error computing/storing embedding for motion %s: %s", motion_id, exc
            )
            errors += 1

    skipped_existing = int(existing)
    return stored, skipped_existing, skipped_no_text, errors