# motief/analysis/explorer_data.py

"""Data loading functions for the parliamentary explorer.

This module contains all data loading functions extracted from explorer.py.
It is intentionally free of Streamlit side-effects to be easy to unit test.
"""

from __future__ import annotations

import logging
from typing import Dict, List, Set, Tuple

try:
    import duckdb
except (
    Exception
):  # pragma: no cover - allow lightweight import without duckdb installed
    duckdb = None  # type: ignore
import numpy as np
import pandas as pd

from analysis.config import CURRENT_PARLIAMENT_PARTIES, _PARTY_NORMALIZE

# Names exported by ``from <this module> import *``.
# ``load_scree_data`` is a public loader defined in this module and was
# missing from the list, so star-imports silently skipped it.
__all__ = [
    "get_available_windows",
    "get_uniform_dim_windows",
    "load_positions",
    "load_party_map",
    "load_active_mps",
    "load_mp_vectors_by_window",
    "load_mp_vectors_by_party",
    "load_mp_vectors_by_party_for_window",
    "load_party_axis_scores",
    "load_party_axis_scores_for_window",
    "load_party_scores_all_windows",
    "load_party_scores_all_windows_aligned",
    "load_party_mp_vectors",
    "load_scree_data",
    "build_window_party_scores",
    "load_motions_df",
    "query_similar",
    "compute_party_axis_scores",
    "get_aligned_party_scores",
    "compute_party_discipline",
    "_get_aligned_trajectory_scores",
]

logger = logging.getLogger(__name__)

# Every distinct window_id stored in svd_vectors, in ascending order.
_WINDOW_SQL = """
    SELECT DISTINCT window_id FROM svd_vectors ORDER BY window_id
"""

# Windows whose MP vectors have a usable, dominant dimensionality.
#   vec_dims          - JSON array length of each MP vector, per window.
#   window_dim_counts - number of MP rows for each (window, dim) pair.
#   dominant          - one row per window: the dim with the highest row
#                       count (ties broken in favour of the larger dim).
# A window is kept when its dominant dim is >= 25, at least 10 rows have that
# dim, and its window_id does not contain "-Q".
_UNIFORM_DIM_SQL = """
    WITH vec_dims AS (
        SELECT window_id, json_array_length(vector) AS dim
        FROM svd_vectors
        WHERE entity_type = 'mp'
    ),
    window_dim_counts AS (
        SELECT window_id, dim, COUNT(*) AS cnt
        FROM vec_dims
        GROUP BY window_id, dim
    ),
    dominant AS (
        SELECT DISTINCT ON (window_id) window_id, dim, cnt
        FROM window_dim_counts
        ORDER BY window_id, cnt DESC, dim DESC
    )
    SELECT window_id
    FROM dominant
    WHERE dim >= 25 AND cnt >= 10
      AND window_id NOT LIKE '%-Q%'
    ORDER BY window_id
"""


def get_available_windows(db_path: str) -> List[str]:
    """List the distinct window_ids stored in svd_vectors, in sorted order.

    A failing query is logged and yields an empty list. The read-only
    connection is closed on every path.
    """
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        window_ids = [record[0] for record in con.execute(_WINDOW_SQL).fetchall()]
    except Exception:
        logger.exception("Failed to query available windows")
        window_ids = []
    finally:
        con.close()
    return window_ids


def get_uniform_dim_windows(db_path: str) -> List[str]:
    """List the windows whose dominant MP-vector dimension is at least 25.

    A window can hold vectors of several lengths when the pipeline ran more
    than once (for example both dim=1 and dim=50 rows). The query picks the
    most frequent dimension per window and keeps the window only when that
    dimension is >= 25 and backed by at least 10 rows, which avoids feeding
    degenerate inputs to PCA.

    A failing query is logged and yields an empty list.
    """
    con = duckdb.connect(database=db_path, read_only=True)
    try:
        usable = [record[0] for record in con.execute(_UNIFORM_DIM_SQL).fetchall()]
    except Exception:
        logger.exception("Failed to query uniform-dim windows")
        usable = []
    finally:
        con.close()
    return usable


def load_party_map(db_path: str) -> Dict[str, str]:
    """Return {mp_name: party} with party names normalised to abbreviations.

    Rows with a NULL/empty name or party are skipped. Party names are mapped
    through ``_PARTY_NORMALIZE``; names without a mapping are kept unchanged.

    Returns:
        The mapping, or an empty dict if the database cannot be read
        (the error is logged).
    """
    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            rows = con.execute(
                "SELECT mp_name, party FROM mp_metadata WHERE party IS NOT NULL"
            ).fetchall()
        finally:
            # Release the connection even when the query raises.
            con.close()
        return {
            mp: _PARTY_NORMALIZE.get(party, party) for mp, party in rows if mp and party
        }
    except Exception:
        logger.exception("Failed to load party map")
        return {}


def load_active_mps(db_path: str) -> Set[str]:
    """Return the set of mp_name values that are currently seated in parliament.

    An MP is considered active if their mp_metadata row has tot_en_met IS NULL,
    meaning they have no recorded end date for their current seat.

    Returns:
        The set of non-empty names, or an empty set if the database cannot
        be read (the error is logged).
    """
    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            rows = con.execute(
                "SELECT mp_name FROM mp_metadata WHERE tot_en_met IS NULL"
            ).fetchall()
        finally:
            # Release the connection even when the query raises.
            con.close()
        return {r[0] for r in rows if r[0]}
    except Exception:
        logger.exception("Failed to load active MPs")
        return set()


def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]:
    """Return per-party mean MP vectors (non-aligned).

    Feeds the output of ``load_mp_vectors_by_party`` into
    ``compute_party_axis_scores``, so each party maps to the component-wise
    mean of its MPs' vectors.

    Returns:
        {party_abbrev: [mean per vector component]}; empty dict on error.
    """
    try:
        vectors_by_party = load_mp_vectors_by_party(db_path)
        return compute_party_axis_scores(vectors_by_party)
    except Exception:
        logger.exception("Failed to load party axis scores")
        return {}


def load_party_axis_scores_for_window(
    db_path: str, window: str
) -> Dict[str, List[float]]:
    """Return per-party mean MP vectors for one window.

    Feeds the output of ``load_mp_vectors_by_party_for_window`` into
    ``compute_party_axis_scores``.

    Returns:
        {party_abbrev: [mean per vector component]}; empty dict on error.
    """
    try:
        vectors_by_party = load_mp_vectors_by_party_for_window(db_path, window)
        return compute_party_axis_scores(vectors_by_party)
    except Exception:
        logger.exception("Failed to load party axis scores for window %s", window)
        return {}


def load_party_scores_all_windows(db_path: str) -> Dict[str, List[List[float]]]:
    """Return non-aligned party scores across all windows.

    Reads ``x_axis``/``y_axis`` from the ``party_axis_scores`` table when it
    exists; a row with a NULL coordinate is recorded as ``[0.0, 0.0]``.
    When the table is missing, or reading it fails, the scores are instead
    computed as per-party means of the MP positions returned by
    ``load_positions(db_path, "annual")``.

    Returns:
        {party: [[x, y], ...]}. On the table path the entries of each party
        are in ascending window_id order. Empty dict if the fallback
        computation fails as well.
    """
    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            table_exists = con.execute(
                "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'party_axis_scores'"
            ).fetchone()[0]
            rows = None
            if table_exists:
                rows = con.execute(
                    """
                    SELECT party_abbrev, window_id, x_axis, y_axis
                    FROM party_axis_scores
                    ORDER BY party_abbrev, window_id
                    """
                ).fetchall()
        finally:
            # Release the connection on every path, including a failed query.
            con.close()

        if rows is not None:
            table_scores: Dict[str, List[List[float]]] = {}
            for party, _window_id, x, y in rows:
                point = [x, y] if x is not None and y is not None else [0.0, 0.0]
                # setdefault groups by party without relying on row order.
                table_scores.setdefault(party, []).append(point)
            return table_scores
    except Exception:
        logger.exception("Failed to load party scores all windows from table")

    # Fallback: compute from positions when table does not exist
    try:
        positions_by_window, _ = load_positions(db_path, "annual")
        _party_map = load_party_map(db_path)
        computed: Dict[str, List[List[float]]] = {}
        # NOTE(review): a party with no MPs in a window gets no entry for that
        # window, so list indices here need not line up with window order.
        for window_pos in positions_by_window.values():
            party_coords: Dict[str, List[Tuple[float, float]]] = {}
            for mp_name, (x, y) in window_pos.items():
                # Retry with any "(...)" suffix stripped from the MP name.
                party = _party_map.get(
                    mp_name, _party_map.get(mp_name.split("(")[0].strip(), None)
                )
                if party:
                    party_coords.setdefault(party, []).append((x, y))
            for party, coords in party_coords.items():
                if coords:
                    mean_x = float(np.mean([c[0] for c in coords]))
                    mean_y = float(np.mean([c[1] for c in coords]))
                    computed.setdefault(party, []).append([mean_x, mean_y])
        return computed
    except Exception:
        logger.exception("Failed to compute party scores all windows from positions")
        return {}


def load_party_scores_all_windows_aligned(
    db_path: str,
) -> Dict[str, List[List[float]]]:
    """Return Procrustes-aligned party scores across all windows.

    Reads ``x_axis_aligned``/``y_axis_aligned`` from the ``party_axis_scores``
    table when it exists; a row with a NULL coordinate is recorded as
    ``[0.0, 0.0]``. When the table is missing, or reading it fails, the scores
    are instead computed as per-party means of the MP positions returned by
    ``load_positions(db_path, "annual")``.

    Returns:
        {party: [[x, y], ...]}. On the table path the entries of each party
        are in ascending window_id order. Empty dict if the fallback
        computation fails as well.
    """
    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            table_exists = con.execute(
                "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'party_axis_scores'"
            ).fetchone()[0]
            rows = None
            if table_exists:
                rows = con.execute(
                    """
                    SELECT party_abbrev, window_id, x_axis_aligned, y_axis_aligned
                    FROM party_axis_scores
                    ORDER BY party_abbrev, window_id
                    """
                ).fetchall()
        finally:
            # Release the connection on every path, including a failed query.
            con.close()

        if rows is not None:
            table_scores: Dict[str, List[List[float]]] = {}
            for party, _window_id, x, y in rows:
                point = [x, y] if x is not None and y is not None else [0.0, 0.0]
                # setdefault groups by party without relying on row order.
                table_scores.setdefault(party, []).append(point)
            return table_scores
    except Exception:
        logger.exception("Failed to load aligned party scores all windows from table")

    # Fallback: compute from positions when table does not exist
    try:
        positions_by_window, _ = load_positions(db_path, "annual")
        _party_map = load_party_map(db_path)
        computed: Dict[str, List[List[float]]] = {}
        # NOTE(review): a party with no MPs in a window gets no entry for that
        # window, so list indices here need not line up with window order.
        for window_pos in positions_by_window.values():
            party_coords: Dict[str, List[Tuple[float, float]]] = {}
            for mp_name, (x, y) in window_pos.items():
                # Retry with any "(...)" suffix stripped from the MP name.
                party = _party_map.get(
                    mp_name, _party_map.get(mp_name.split("(")[0].strip(), None)
                )
                if party:
                    party_coords.setdefault(party, []).append((x, y))
            for party, coords in party_coords.items():
                if coords:
                    mean_x = float(np.mean([c[0] for c in coords]))
                    mean_y = float(np.mean([c[1] for c in coords]))
                    computed.setdefault(party, []).append([mean_x, mean_y])
        return computed
    except Exception:
        logger.exception("Failed to compute aligned party scores all windows from positions")
        return {}


def build_window_party_scores(
    scores_by_party: Dict[str, List[List[float]]],
    window_idx: int,
) -> Dict[str, List[float]]:
    """Slice one window out of per-party score lists.

    Args:
        scores_by_party: {party: [[x, y], [x, y], ...]}, one entry per
            window, as produced by load_party_scores_all_windows_aligned.
        window_idx: Zero-based position of the window to pick.

    Returns:
        {party: [x, y]} for that window, in the shape compute_flip_direction
        consumes. Parties whose list is too short are left out; a negative
        index gives an empty dict.
    """
    # Negative indices would wrap around in Python; treat them as out of range.
    if window_idx < 0:
        return {}
    return {
        party: per_window[window_idx]
        for party, per_window in scores_by_party.items()
        if window_idx < len(per_window)
    }


def load_party_mp_vectors(db_path: str) -> Dict[str, List[np.ndarray]]:
    """Load individual MP SVD vectors for current_parliament, grouped by party.

    The party of each MP comes from mp_metadata rows that start on or after
    2023-11-22, have no end date, or end on or after 2023-11-22; rows are read
    in ascending ``van`` order, so a later row overrides an earlier one for
    the same MP. Party names are normalised through ``_PARTY_NORMALIZE``.
    No filtering against a list of current parties is applied here.

    Stored vectors may arrive as JSON text, bytes, or an already-decoded
    sequence; all are converted to a float array, with NULL components
    replaced by 0.0.

    Returns:
        {party_name: [np.ndarray, ...]} with one array per MP, or an empty
        dict if a query or decode fails (the error is logged).
    """
    import json as _json

    con = duckdb.connect(database=db_path, read_only=True)
    try:
        meta_rows = con.execute(
            "SELECT mp_name, party FROM mp_metadata "
            "WHERE van >= '2023-11-22' OR tot_en_met IS NULL OR tot_en_met >= '2023-11-22' "
            "ORDER BY van ASC"
        ).fetchall()
        mp_party: Dict[str, str] = {}
        for mp_name, party in meta_rows:
            if mp_name and party:
                mp_party[mp_name] = _PARTY_NORMALIZE.get(party, party)

        rows = con.execute(
            "SELECT entity_id, vector FROM svd_vectors "
            "WHERE entity_type = 'mp' AND window_id = 'current_parliament'"
        ).fetchall()

        vectors_by_party: Dict[str, List[np.ndarray]] = {}
        for entity_id, raw_vec in rows:
            party = mp_party.get(entity_id)
            if party is None:
                continue
            # np.array() on raw JSON text would produce a 0-d string array,
            # so decode the stored value into a list of numbers first.
            if isinstance(raw_vec, str):
                vec = _json.loads(raw_vec)
            elif isinstance(raw_vec, (bytes, bytearray)):
                vec = _json.loads(raw_vec.decode())
            elif isinstance(raw_vec, list):
                vec = raw_vec
            else:
                try:
                    vec = list(raw_vec)
                except Exception:
                    # Not iterable: skip this row rather than abort the load.
                    continue
            fvec = np.array([float(v) if v is not None else 0.0 for v in vec])
            vectors_by_party.setdefault(party, []).append(fvec)

        return vectors_by_party
    except Exception:
        logger.exception("Failed to load party MP vectors")
        return {}
    finally:
        con.close()


def load_scree_data(db_path: str) -> List[float]:
    """Load scree plot data (explained variance) for current_parliament.

    First tries the cached row in svd_vectors (entity_type 'metadata',
    entity_id 'explained_variance'). If that row is absent or empty, falls
    back to ``compute_svd_spectrum(db_path)`` for databases that have not
    stored it yet.

    Returns:
        The cached or computed values, or an empty list on any error
        (the error is logged).
    """
    import json

    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            row = con.execute(
                """
                SELECT vector FROM svd_vectors
                WHERE window_id = 'current_parliament'
                  AND entity_type = 'metadata'
                  AND entity_id = 'explained_variance'
                LIMIT 1
                """
            ).fetchone()
        finally:
            # Release the connection even when the query raises.
            con.close()

        if row and row[0]:
            cached = row[0]
            if isinstance(cached, (str, bytes, bytearray)):
                return json.loads(cached)
            # Already decoded by the driver (e.g. a list column).
            return list(cached)

        # Fallback: compute dynamically for backward compatibility
        from analysis.political_axis import compute_svd_spectrum

        return compute_svd_spectrum(db_path)
    except Exception:
        logger.exception("Failed to load scree data")
        return []


def load_motions_df(db_path: str) -> pd.DataFrame:
    """Load the motions table as a pandas DataFrame (read-only).

    The ``date`` column is parsed with ``pd.to_datetime`` (unparseable values
    become NaT) and a derived ``year`` column is added.

    Returns:
        The DataFrame, or an empty DataFrame on any error (the error is
        logged).
    """
    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            df = con.execute(
                """
                SELECT id, title, description, date, policy_area,
                       voting_results, layman_explanation,
                       winning_margin, controversy_score, url
                FROM motions
                """
            ).fetchdf()
        finally:
            # Release the connection even when the query raises.
            con.close()
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        df["year"] = df["date"].dt.year
        return df
    except Exception:
        logger.exception("Failed to load motions DataFrame")
        return pd.DataFrame()


def load_mp_vectors_by_window(db_path: str, window: str) -> Dict[str, np.ndarray]:
    """Load individual MP SVD vectors for a specific window.

    Stored vectors may arrive as JSON text, bytes, or an already-decoded
    sequence; all are converted to a float array, with NULL components
    replaced by 0.0. Values that cannot be iterated are skipped.

    Args:
        db_path: Path to DuckDB database
        window: Window ID (e.g., "2015", "current_parliament")

    Returns:
        {mp_name: np.ndarray} with one vector per MP, or an empty dict on
        any error (the error is logged).
    """
    import json as _json

    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            rows = con.execute(
                """
                SELECT entity_id, vector FROM svd_vectors
                WHERE entity_type = 'mp' AND window_id = ?
                """,
                [window],
            ).fetchall()
        finally:
            # Release the connection even when the query raises.
            con.close()

        mp_vecs: Dict[str, np.ndarray] = {}
        for entity_id, raw_vec in rows:
            if isinstance(raw_vec, str):
                vec = _json.loads(raw_vec)
            elif isinstance(raw_vec, (bytes, bytearray)):
                vec = _json.loads(raw_vec.decode())
            elif isinstance(raw_vec, list):
                vec = raw_vec
            else:
                try:
                    vec = list(raw_vec)
                except Exception:
                    continue
            fvec = np.array([float(v) if v is not None else 0.0 for v in vec])
            mp_vecs[entity_id] = fvec

        return mp_vecs
    except Exception:
        logger.exception("Failed to load MP vectors for window %s", window)
        return {}


def query_similar(
    db_path: str,
    source_motion_id: int,
    vector_type: str = "fused",
    top_k: int = 10,
) -> pd.DataFrame:
    """Return the top-k most similar motions from similarity_cache (read-only).

    Args:
        db_path: Path to the DuckDB database.
        source_motion_id: Motion whose neighbours are requested.
        vector_type: Value matched against ``similarity_cache.vector_type``.
        top_k: Maximum number of rows to return.

    Returns:
        DataFrame with columns target_motion_id, score, window_id, title,
        date and policy_area, ordered by descending score. An empty DataFrame
        is returned on any error (the error is logged).
    """
    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            similar_df = con.execute(
                """
                SELECT sc.target_motion_id, sc.score, sc.window_id,
                       m.title, m.date, m.policy_area
                FROM similarity_cache sc
                JOIN motions m ON m.id = sc.target_motion_id
                WHERE sc.source_motion_id = ?
                  AND sc.vector_type = ?
                ORDER BY sc.score DESC
                LIMIT ?
                """,
                [source_motion_id, vector_type, top_k],
            ).fetchdf()
        finally:
            # Release the connection even when the query raises.
            con.close()
        return similar_df
    except Exception:
        logger.exception(
            "Failed to query similarity cache for motion %s", source_motion_id
        )
        return pd.DataFrame()


def load_mp_vectors_by_party(db_path: str) -> Dict[str, List[np.ndarray]]:
    """Load individual MP SVD vectors grouped by party for current_parliament.

    The party of each MP comes from mp_metadata rows that start on or after
    2023-11-22, have no end date, or end on or after 2023-11-22; rows are read
    in ascending ``van`` order, so a later row overrides an earlier one for
    the same MP. MPs whose normalised party is not in
    ``CURRENT_PARLIAMENT_PARTIES`` are dropped.

    Returns:
        {party_name: [np.ndarray, ...]} with one array per MP, or an empty
        dict on any error (the error is logged).
    """
    import json as _json

    try:
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            meta_rows = con.execute(
                "SELECT mp_name, party FROM mp_metadata "
                "WHERE van >= '2023-11-22' OR tot_en_met IS NULL OR tot_en_met >= '2023-11-22' "
                "ORDER BY van ASC"
            ).fetchall()
            rows = con.execute(
                "SELECT entity_id, vector FROM svd_vectors "
                "WHERE entity_type='mp' AND window_id='current_parliament'"
            ).fetchall()
        finally:
            # Release the connection even when a query raises.
            con.close()

        mp_party: Dict[str, str] = {}
        for mp_name, party in meta_rows:
            if mp_name and party:
                mp_party[mp_name] = _PARTY_NORMALIZE.get(party, party)

        party_vecs: Dict[str, List[np.ndarray]] = {}
        for entity_id, raw_vec in rows:
            party = mp_party.get(entity_id)
            if party is None or party not in CURRENT_PARLIAMENT_PARTIES:
                continue
            # Vectors may be JSON text, bytes, or an already-decoded sequence.
            if isinstance(raw_vec, str):
                vec = _json.loads(raw_vec)
            elif isinstance(raw_vec, (bytes, bytearray)):
                vec = _json.loads(raw_vec.decode())
            elif isinstance(raw_vec, list):
                vec = raw_vec
            else:
                try:
                    vec = list(raw_vec)
                except Exception:
                    continue
            fvec = np.array([float(v) if v is not None else 0.0 for v in vec])
            party_vecs.setdefault(party, []).append(fvec)
        return party_vecs
    except Exception:
        logger.exception("Failed to load MP vectors by party")
        return {}


def load_mp_vectors_by_party_for_window(
    db_path: str, window: str
) -> Dict[str, List[np.ndarray]]:
    """Load individual MP SVD vectors grouped by party for a specific window.

    For ``current_parliament`` the MP→party mapping uses mp_metadata rows
    relevant from 2023-11-22 onwards, and parties outside
    ``CURRENT_PARLIAMENT_PARTIES`` are dropped. For any other window the year
    is taken from the part of the window ID before the first "-" (falling
    back to 2023 when that is not an integer), and the mapping uses rows whose
    membership period overlaps that calendar year.

    Returns:
        {party_name: [np.ndarray, ...]} with one array per MP, or an empty
        dict on any error (the error is logged).
    """
    import json as _json

    try:
        is_current = window == "current_parliament"
        con = duckdb.connect(database=db_path, read_only=True)
        try:
            if is_current:
                meta_rows = con.execute(
                    "SELECT mp_name, party FROM mp_metadata "
                    "WHERE van >= '2023-11-22' OR tot_en_met IS NULL OR tot_en_met >= '2023-11-22' "
                    "ORDER BY van ASC"
                ).fetchall()
            else:
                try:
                    year = int(window.split("-")[0])
                except ValueError:
                    year = 2023
                meta_rows = con.execute(
                    "SELECT mp_name, party FROM mp_metadata "
                    "WHERE van <= ? AND (tot_en_met IS NULL OR tot_en_met >= ?) "
                    "ORDER BY van ASC",
                    [f"{year}-12-31", f"{year}-01-01"],
                ).fetchall()

            rows = con.execute(
                "SELECT entity_id, vector FROM svd_vectors "
                "WHERE entity_type='mp' AND window_id=?",
                [window],
            ).fetchall()
        finally:
            # Release the connection even when a query raises.
            con.close()

        # Rows are in ascending ``van`` order, so the latest row wins per MP.
        mp_party: Dict[str, str] = {}
        for mp_name, party in meta_rows:
            if mp_name and party:
                mp_party[mp_name] = _PARTY_NORMALIZE.get(party, party)

        party_vecs: Dict[str, List[np.ndarray]] = {}
        for entity_id, raw_vec in rows:
            party = mp_party.get(entity_id)
            if party is None:
                continue
            if is_current and party not in CURRENT_PARLIAMENT_PARTIES:
                continue
            # Vectors may be JSON text, bytes, or an already-decoded sequence.
            if isinstance(raw_vec, str):
                vec = _json.loads(raw_vec)
            elif isinstance(raw_vec, (bytes, bytearray)):
                vec = _json.loads(raw_vec.decode())
            elif isinstance(raw_vec, list):
                vec = raw_vec
            else:
                try:
                    vec = list(raw_vec)
                except Exception:
                    continue
            fvec = np.array([float(v) if v is not None else 0.0 for v in vec])
            party_vecs.setdefault(party, []).append(fvec)
        return party_vecs
    except Exception:
        logger.exception("Failed to load MP vectors by party for window %s", window)
        return {}


def compute_party_axis_scores(
    party_vecs: Dict[str, List[np.ndarray]],
) -> Dict[str, List[float]]:
    """Average each party's MP vectors component by component.

    Args:
        party_vecs: {party_name: [vector, ...]} with one vector per MP.

    Returns:
        {party_name: [mean of each component]}. If averaging raises for any
        party, the error is logged and an empty dict is returned.
    """
    try:
        party_means: Dict[str, List[float]] = {}
        for party, vecs in party_vecs.items():
            stacked = np.array(vecs)
            party_means[party] = np.mean(stacked, axis=0).tolist()
        return party_means
    except Exception:
        logger.exception("Failed to compute party axis scores")
        return {}


def load_positions(
    db_path: str, window_size: str = "annual"
) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict]:
    """Compute per-window 2D positions via ``compute_2d_axes`` (PCA method).

    Only the windows reported by ``get_uniform_dim_windows`` are used. The
    axis definition is passed through ``classify_axes`` when that succeeds;
    if it raises, the unclassified definition is kept and the error logged.

    Args:
        db_path: Path to the DuckDB database.
        window_size: When "annual", windows whose ID contains "-Q" are
            removed from the result; any other value returns all windows.

    Returns:
        positions_by_window: {window_id: {entity_name: (x, y)}}
        axis_def: dict with x_axis, y_axis, method keys
        Both are empty when no usable windows exist.
    """
    from analysis.political_axis import compute_2d_axes

    window_ids = get_uniform_dim_windows(db_path)
    if not window_ids:
        return {}, {}

    positions_by_window, axis_def = compute_2d_axes(
        db_path,
        window_ids=window_ids,
        method="pca",
        pca_residual=True,
        normalize_vectors=True,
    )

    # Axis labelling is best-effort; positions stay usable without it.
    try:
        from analysis.axis_classifier import classify_axes

        axis_def = classify_axes(positions_by_window, axis_def, db_path)
    except Exception:
        logger.exception("classify_axes failed; using generic axis labels")

    if window_size != "annual":
        return positions_by_window, axis_def

    annual_ids = {wid for wid in window_ids if "-Q" not in wid}
    annual_positions = {
        wid: coords for wid, coords in positions_by_window.items() if wid in annual_ids
    }
    return annual_positions, axis_def


def get_aligned_party_scores(
    db_path: str,
    window: str,
    active_mps: set | None = None,
    n_components: int = 10,
) -> Dict[str, np.ndarray]:
    """Get per-party mean scores on the aligned PCA components for one window.

    Scores come from ``compute_nd_axes`` run over all windows returned by
    ``get_uniform_dim_windows``; each party's score is the mean of its MPs'
    scores, truncated to the first ``n_components`` components.

    Args:
        db_path: Path to the DuckDB database.
        window: Window ID to extract.
        active_mps: For ``current_parliament`` only, restricts the MPs to
            this set (matching the compass behaviour). Ignored for other
            windows and when None.
        n_components: Number of components to compute and keep. Defaults to
            10, the value that was previously hard-coded.

    Returns:
        {party: np.ndarray of mean scores}; empty dict when the window has
        no scores.
    """
    from analysis.political_axis import compute_nd_axes

    annual_windows = get_uniform_dim_windows(db_path)
    scores_by_window, _ = compute_nd_axes(
        db_path, window_ids=annual_windows, n_components=n_components
    )
    window_scores = scores_by_window.get(window, {})
    if not window_scores:
        return {}

    if window == "current_parliament" and active_mps is not None:
        window_scores = {mp: sc for mp, sc in window_scores.items() if mp in active_mps}

    _party_map = load_party_map(db_path)

    party_scores_agg: Dict[str, List[np.ndarray]] = {}
    for mp_name, scores in window_scores.items():
        # Retry with any "(...)" suffix stripped from the MP name.
        party = _party_map.get(
            mp_name, _party_map.get(mp_name.split("(")[0].strip(), None)
        )
        if party:
            party_scores_agg.setdefault(party, []).append(scores[:n_components])

    return {
        party: np.mean(np.vstack(score_list), axis=0)
        for party, score_list in party_scores_agg.items()
        if score_list
    }


def compute_party_discipline(
    db_path: str,
    start_date: str,
    end_date: str,
) -> pd.DataFrame:
    """Compute per-party voting discipline (Rice index) for a date range.

    Thin wrapper that forwards its arguments unchanged to
    ``analysis.trajectory.compute_party_discipline``; the import is deferred
    to call time.

    The delegate is expected to use only individual MP vote rows
    (mp_name LIKE '%,%') and to return a DataFrame with columns
    [party, n_motions, discipline] sorted by discipline ascending, or an
    empty DataFrame when no qualifying motion exists or on a DB error.
    """
    from analysis import trajectory

    discipline_df = trajectory.compute_party_discipline(db_path, start_date, end_date)
    return discipline_df


def _get_aligned_trajectory_scores(
    db_path: str, windows: List[str], n_components: int = 10
) -> Dict[str, Dict[str, List[float]]]:
    """Return aligned PCA scores as {window: {party: [score per component]}}.

    ``compute_nd_axes`` is always run over every uniform-dim window, not just
    the requested ones, so the basis is the same one get_aligned_party_scores
    uses and the trajectory numbers agree with the single-window view even
    when ``windows`` is only a subset chosen for display.

    Requested windows that have no scores are omitted from the result.
    """
    from analysis.political_axis import compute_nd_axes

    basis_windows = get_uniform_dim_windows(db_path)
    scores_by_window, _ = compute_nd_axes(
        db_path, window_ids=basis_windows, n_components=n_components
    )
    if not scores_by_window:
        return {}

    mp_to_party = load_party_map(db_path)
    seated_mps = load_active_mps(db_path)

    trajectories: Dict[str, Dict[str, List[float]]] = {}
    for window in windows:
        mp_scores = scores_by_window.get(window, {})
        if not mp_scores:
            continue

        # current_parliament keeps only MPs who are still seated, mirroring
        # the single-window view; historical windows keep every MP present.
        if window == "current_parliament":
            mp_scores = {
                name: vec for name, vec in mp_scores.items() if name in seated_mps
            }

        grouped: Dict[str, List[np.ndarray]] = {}
        for mp_name, scores in mp_scores.items():
            # Retry with any "(...)" suffix stripped from the MP name.
            bare_name = mp_name.split("(")[0].strip()
            party = mp_to_party.get(mp_name, mp_to_party.get(bare_name, None))
            if not party:
                continue
            grouped.setdefault(party, []).append(scores[:n_components])

        per_party: Dict[str, List[float]] = {}
        for party, score_list in grouped.items():
            if score_list:
                per_party[party] = np.mean(np.vstack(score_list), axis=0).tolist()
        trajectories[window] = per_party

    return trajectories