# motief/tests/integration/test_pipeline_end_to_end.py

import json
import os
import numpy as np
import pytest

# duckdb is an optional dependency in some environments. importorskip returns
# the imported module when it is available; otherwise the tests in this module
# are skipped instead of failing with an ImportError.
duckdb = pytest.importorskip("duckdb")


def test_pipeline_end_to_end(tmp_path, monkeypatch):
    """Run ``ensure_text_embeddings`` end to end against a temporary DuckDB file.

    The test seeds three motions, gives the first one a pre-existing embedding
    for the test model, swaps ``ai_provider.get_embedding`` for a local fake,
    and then checks the counters returned by the pipeline as well as the
    embedding rows found in the database afterwards.

    Args:
        tmp_path: pytest fixture providing a per-test temporary directory.
        monkeypatch: pytest fixture used to replace the embedding provider.
    """
    embedding_dim = 16
    model_name = "test-model"
    seed_motions = [
        ("t1", "d1", "u1", "ex1"),
        ("t2", "d2", "u2", "ex2"),
        ("t3", "d3", "u3", "ex3"),
    ]

    # ensure determinism for any random embedding generation
    np.random.seed(0)

    # prepare temp db
    db_path = str(tmp_path / "motions.db")

    # create the minimal MotionDatabase schema using existing code where possible
    # NOTE(review): the inserts below rely on MotionDatabase having created the
    # `motions` table -- confirm against the database module.
    from database import MotionDatabase

    db = MotionDatabase(db_path)

    # The connection is used as a context manager so it is closed even when a
    # statement below raises; a leaked handle could keep the temp file locked.
    with duckdb.connect(db.db_path) as conn:
        # create embeddings table (migration would normally do this)
        conn.execute("CREATE SEQUENCE IF NOT EXISTS embeddings_id_seq START 1")
        conn.execute(
            "CREATE TABLE IF NOT EXISTS embeddings (id INTEGER PRIMARY KEY DEFAULT nextval('embeddings_id_seq'), motion_id INTEGER, model TEXT, vector JSON, created_at TIMESTAMP)"
        )

        # insert three motions
        for motion in seed_motions:
            conn.execute(
                "INSERT INTO motions (title, description, url, layman_explanation) VALUES (?, ?, ?, ?)",
                motion,
            )

        # fetch ids
        ids = [
            row[0]
            for row in conn.execute("SELECT id FROM motions ORDER BY id").fetchall()
        ]

        # insert existing embedding for first motion so the pipeline has
        # something to skip
        conn.execute(
            "INSERT INTO embeddings (motion_id, model, vector) VALUES (?, ?, ?)",
            (ids[0], model_name, json.dumps([0.1] * embedding_dim)),
        )

    # monkeypatch ai_provider.get_embedding to deterministic vector
    import ai_provider

    def fake_get_embedding(text, model=None):
        # Draws from the seeded global NumPy RNG; .tolist() yields native
        # Python floats rather than NumPy scalars.
        return np.random.rand(embedding_dim).tolist()

    monkeypatch.setattr(ai_provider, "get_embedding", fake_get_embedding)

    # run ensure_text_embeddings
    from pipeline.text_pipeline import ensure_text_embeddings

    stored, skipped_existing, skipped_no_text, errors = ensure_text_embeddings(
        db_path=db_path, model=model_name
    )

    assert stored == 2
    assert skipped_existing == 1
    assert skipped_no_text == 0
    assert errors == 0

    # verify stored vectors length
    with duckdb.connect(db.db_path) as conn:
        rows = conn.execute(
            "SELECT vector FROM embeddings WHERE model = ? ORDER BY motion_id",
            (model_name,),
        ).fetchall()

    assert len(rows) == len(seed_motions)
    for (raw_vector,) in rows:
        assert len(json.loads(raw_vector)) == embedding_dim