You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
motief/tests/integration/test_pipeline_end_to_end.py

87 lines
2.7 KiB

import json
import os
import numpy as np
import pytest
# duckdb is an optional dependency in some environments; skip test if not available
duckdb = pytest.importorskip("duckdb")
def test_pipeline_end_to_end(tmp_path, monkeypatch):
    """Run ``ensure_text_embeddings`` end to end against a temporary DuckDB file.

    The test inserts three motions, pre-stores an embedding for the first one,
    replaces ``ai_provider.get_embedding`` with a seeded fake, and then checks
    that the pipeline reports two newly stored embeddings, one skipped as
    already existing, none skipped for missing text, and no errors. Finally it
    verifies that three 16-element vectors exist for the test model.

    Args:
        tmp_path: pytest fixture providing a per-test temporary directory.
        monkeypatch: pytest fixture used to replace ``ai_provider.get_embedding``.
    """
    # Seed the global NumPy RNG so the fake embeddings below (and any other
    # np.random use during the run) are reproducible.
    np.random.seed(0)

    db_path = str(tmp_path / "motions.db")

    # Reuse the project's database class to set up the schema; presumably its
    # constructor creates the `motions` table used below -- verify in database.py.
    from database import MotionDatabase

    db = MotionDatabase(db_path)

    conn = duckdb.connect(db.db_path)
    try:
        # Create the embeddings table by hand (a migration would normally do this).
        conn.execute("CREATE SEQUENCE IF NOT EXISTS embeddings_id_seq START 1")
        conn.execute(
            "CREATE TABLE IF NOT EXISTS embeddings ("
            "id INTEGER PRIMARY KEY DEFAULT nextval('embeddings_id_seq'), "
            "motion_id INTEGER, model TEXT, vector JSON, created_at TIMESTAMP)"
        )

        # Insert three motions.
        motions = (
            ("t1", "d1", "u1", "ex1"),
            ("t2", "d2", "u2", "ex2"),
            ("t3", "d3", "u3", "ex3"),
        )
        for motion in motions:
            conn.execute(
                "INSERT INTO motions (title, description, url, layman_explanation) VALUES (?, ?, ?, ?)",
                motion,
            )

        # Fetch the generated ids in insertion order.
        id_rows = conn.execute("SELECT id FROM motions ORDER BY id").fetchall()
        ids = [row[0] for row in id_rows]
        assert len(ids) == len(motions), "fixture motions were not inserted as expected"

        # Pre-store an embedding for the first motion so the pipeline has one
        # row to skip as already existing.
        existing_vector = json.dumps([0.1] * 16)
        conn.execute(
            "INSERT INTO embeddings (motion_id, model, vector) VALUES (?, ?, ?)",
            (ids[0], "test-model", existing_vector),
        )
    finally:
        # Close even when setup fails so the temp database file is not left open.
        conn.close()

    import ai_provider

    def fake_get_embedding(text, model=None):
        # 16 floats drawn from the seeded global RNG; .tolist() yields plain
        # Python floats rather than NumPy scalars.
        return np.random.rand(16).tolist()

    monkeypatch.setattr(ai_provider, "get_embedding", fake_get_embedding)

    # Imported after patching, as in the original ordering.
    from pipeline.text_pipeline import ensure_text_embeddings

    stored, skipped_existing, skipped_no_text, errors = ensure_text_embeddings(
        db_path=db_path, model="test-model"
    )
    assert stored == 2
    assert skipped_existing == 1
    assert skipped_no_text == 0
    assert errors == 0

    # Verify that every stored vector has the expected length.
    conn = duckdb.connect(db.db_path)
    try:
        vector_rows = conn.execute(
            "SELECT vector FROM embeddings WHERE model = ? ORDER BY motion_id",
            ("test-model",),
        ).fetchall()
    finally:
        conn.close()

    assert len(vector_rows) == 3
    for (raw_vector,) in vector_rows:
        assert len(json.loads(raw_vector)) == 16