You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
motief/tests/test_migration_pipeline_tab...

219 lines
7.7 KiB

from pathlib import Path

# Prefer DuckDB when it is installed; otherwise fall back to the stdlib
# sqlite3 driver.  DB_BACKEND records which one the tests will exercise.
try:
    import duckdb
except Exception:
    import sqlite3

    DB_BACKEND = "sqlite3"
else:
    DB_BACKEND = "duckdb"
# Migration manifest exercised by the test below.  Each entry is a tuple of:
#   (path to the migration SQL file,
#    name of the table that migration creates,
#    column names that must exist in that table after the migration runs).
MIGRATIONS: list[tuple[str, str, list[str]]] = [
    (
        "migrations/2026_03_21__create_mp_votes.sql",
        "mp_votes",
        [
            "id",
            "motion_id",
            "mp_name",
            "party",
            "vote",
            "date",
            "created_at",
        ],
    ),
    (
        "migrations/2026_03_21__create_mp_metadata.sql",
        "mp_metadata",
        [
            "mp_name",
            "party",
            "van",
            "tot_en_met",
            "persoon_id",
        ],
    ),
    (
        "migrations/2026_03_21__create_svd_vectors.sql",
        "svd_vectors",
        [
            "id",
            "window_id",
            "entity_type",
            "entity_id",
            "vector",
            "model",
            "created_at",
        ],
    ),
    (
        "migrations/2026_03_21__create_fused_embeddings.sql",
        "fused_embeddings",
        [
            "id",
            "motion_id",
            "window_id",
            "vector",
            "svd_dims",
            "text_dims",
            "created_at",
        ],
    ),
]
def test_run_migrations_and_tables(tmp_path):
    """Apply every migration to a fresh database and smoke-test each table.

    For each ``(sql_path, table_name, expected_cols)`` entry in MIGRATIONS:
    execute the migration, verify the expected columns exist (via PRAGMA
    table_info, which both backends support), then perform a small
    insert + select round trip.  Runs against DuckDB when available,
    otherwise against sqlite3 after a compatibility rewrite of the SQL.

    Fixes over the previous version: ``import re`` is no longer re-executed
    on every loop iteration, the SQL rewrite lives in a named helper, the
    duplicated PRAGMA branches are merged, and the connection is closed in
    a ``finally`` so a failing assert does not leak it.
    """
    import re  # hoisted: previously imported inside the migration loop

    def _sqlite_compat(sql):
        """Rewrite DuckDB migration DDL so sqlite3's executescript accepts it."""
        # sqlite has no sequences: drop CREATE SEQUENCE statements...
        kept = [
            line
            for line in sql.splitlines()
            if not line.strip().upper().startswith("CREATE SEQUENCE")
        ]
        sql2 = "\n".join(kept)
        # ...and the DEFAULT nextval('...') column defaults that used them.
        sql2 = re.sub(
            r"DEFAULT\s+nextval\('[^']+'\)", "", sql2, flags=re.IGNORECASE
        )
        # sqlite has no JSON column type; TEXT gives equivalent affinity here.
        return re.sub(r"\bJSON\b", "TEXT", sql2, flags=re.IGNORECASE)

    db_path = tmp_path / "test.db"
    if DB_BACKEND == "duckdb":
        conn = duckdb.connect(str(db_path))
    else:
        conn = sqlite3.connect(str(db_path))
    try:
        for sql_path, table_name, expected_cols in MIGRATIONS:
            p = Path(sql_path)
            assert p.exists(), f"Migration file {sql_path} must exist"
            sql = p.read_text()
            if DB_BACKEND == "sqlite3":
                # executescript runs multi-statement migration files in one call
                conn.executescript(_sqlite_compat(sql))
            else:
                # execute migration SQL
                conn.execute(sql)
            # Column check: PRAGMA table_info works identically on both backends
            # (row[1] is the column name in each).
            rows = conn.execute(f"PRAGMA table_info('{table_name}')").fetchall()
            col_names = [r[1] for r in rows]
            for col in expected_cols:
                assert col in col_names, (
                    f"Column {col} missing in table {table_name}, got {col_names}"
                )
            # Perform a simple insert + select to validate a basic round trip.
            if table_name == "mp_votes":
                if DB_BACKEND == "duckdb":
                    conn.execute(
                        "INSERT INTO mp_votes (motion_id, mp_name, party, vote, date) VALUES (1, 'Jane Doe', 'PartyX', 'Yea', '2026-03-21')"
                    )
                    res = conn.execute(
                        "SELECT motion_id, mp_name, party, vote, date FROM mp_votes WHERE motion_id=1"
                    ).fetchone()
                    # DuckDB returns datetime.date for DATE columns; normalise to string
                    assert (
                        res[:4] == (1, "Jane Doe", "PartyX", "Yea")
                        and str(res[4]) == "2026-03-21"
                    )
                else:
                    # sqlite: id has no default after transformation, provide id explicitly
                    conn.execute(
                        "INSERT INTO mp_votes (id, motion_id, mp_name, party, vote, date) VALUES (1, 1, 'Jane Doe', 'PartyX', 'Yea', '2026-03-21')"
                    )
                    res = conn.execute(
                        "SELECT motion_id, mp_name, party, vote, date FROM mp_votes WHERE id=1"
                    ).fetchone()
                    assert res == (1, "Jane Doe", "PartyX", "Yea", "2026-03-21")
            elif table_name == "mp_metadata":
                conn.execute(
                    "INSERT INTO mp_metadata (mp_name, party, van, tot_en_met, persoon_id) VALUES ('Jane Doe', 'PartyX', '2020-01-01', '2024-12-31', 'pid-123')"
                )
                res = conn.execute(
                    "SELECT mp_name, party, van, tot_en_met, persoon_id FROM mp_metadata WHERE mp_name='Jane Doe'"
                ).fetchone()
                # DuckDB returns datetime.date for DATE columns; normalise to string
                assert (
                    res[0] == "Jane Doe"
                    and res[1] == "PartyX"
                    and str(res[2]) == "2020-01-01"
                    and str(res[3]) == "2024-12-31"
                    and res[4] == "pid-123"
                )
            elif table_name == "svd_vectors":
                # JSON value stored as text
                if DB_BACKEND == "duckdb":
                    conn.execute(
                        "INSERT INTO svd_vectors (window_id, entity_type, entity_id, vector, model) VALUES ('w1', 'typeA', 'e1', '[1,2,3]', 'm1')"
                    )
                    res = conn.execute(
                        "SELECT window_id, entity_type, entity_id, vector, model FROM svd_vectors WHERE window_id='w1'"
                    ).fetchone()
                    # Note: DuckDB may return the JSON column as string; compare string form
                    assert (
                        res[0] == "w1"
                        and res[1] == "typeA"
                        and res[2] == "e1"
                        and (str(res[3]) == "[1,2,3]" or res[3] == "[1,2,3]")
                        and res[4] == "m1"
                    )
                else:
                    # sqlite: provide id explicitly
                    conn.execute(
                        "INSERT INTO svd_vectors (id, window_id, entity_type, entity_id, vector, model) VALUES (1, 'w1', 'typeA', 'e1', '[1,2,3]', 'm1')"
                    )
                    res = conn.execute(
                        "SELECT window_id, entity_type, entity_id, vector, model FROM svd_vectors WHERE id=1"
                    ).fetchone()
                    assert (
                        res[0] == "w1"
                        and res[1] == "typeA"
                        and res[2] == "e1"
                        and str(res[3]) == "[1,2,3]"
                        and res[4] == "m1"
                    )
            elif table_name == "fused_embeddings":
                if DB_BACKEND == "duckdb":
                    conn.execute(
                        "INSERT INTO fused_embeddings (motion_id, window_id, vector, svd_dims, text_dims) VALUES (2, 'w2', '[0.1,0.2]', 16, 128)"
                    )
                    res = conn.execute(
                        "SELECT motion_id, window_id, vector, svd_dims, text_dims FROM fused_embeddings WHERE motion_id=2"
                    ).fetchone()
                    assert (
                        res[0] == 2
                        and res[1] == "w2"
                        and (str(res[2]) == "[0.1,0.2]" or res[2] == "[0.1,0.2]")
                        and res[3] == 16
                        and res[4] == 128
                    )
                else:
                    conn.execute(
                        "INSERT INTO fused_embeddings (id, motion_id, window_id, vector, svd_dims, text_dims) VALUES (1, 2, 'w2', '[0.1,0.2]', 16, 128)"
                    )
                    res = conn.execute(
                        "SELECT motion_id, window_id, vector, svd_dims, text_dims FROM fused_embeddings WHERE id=1"
                    ).fetchone()
                    assert (
                        res[0] == 2
                        and res[1] == "w2"
                        and str(res[2]) == "[0.1,0.2]"
                        and res[3] == 16
                        and res[4] == 128
                    )
    finally:
        # Close even when an assert above fails, so the DB file is released.
        conn.close()