- pipeline/run_pipeline.py: CLI orchestrator for all 5 pipeline phases with
--dry-run, --skip-*, --window-size, --svd-k, --start/end-date flags
- analysis/{political_axis,trajectory,clustering,visualize}.py: PCA/anchor
ideological axis, MP drift trajectories, UMAP + KMeans clustering, Plotly HTML output
- api_client.py: capture ActorFractie per individual MP vote (comma in ActorNaam)
into mp_vote_parties dict on each motion
- database.insert_motion: auto-insert mp_votes rows with party affiliation for
newly ingested motions when mp_vote_parties is present
- Add scikit-learn to pyproject.toml for KMeans clustering
- tests/test_run_pipeline.py: window generation, dry-run, skip-all paths
- tests/test_analysis.py: PCA axis, anchor axis, trajectory drift, KMeans
Ref: thoughts/shared/plans/2026-03-21-parliamentary-embedding-pipeline-plan.md
main
parent
a36e6cba4e
commit
f2a831dfcf
@ -0,0 +1,8 @@ |
||||
"""Analysis modules for the parliamentary embedding pipeline. |
||||
|
||||
Modules: |
||||
political_axis — project MP SVD vectors onto ideological axis |
||||
trajectory — compute MP drift across aligned windows |
||||
clustering — UMAP dimensionality reduction + cluster labelling |
||||
visualize — Plotly interactive plots (outputs self-contained HTML) |
||||
""" |
||||
@ -0,0 +1,130 @@ |
||||
"""clustering.py — UMAP dimensionality reduction on fused embeddings. |
||||
|
||||
Reduces fused motion embeddings to 2D (or 3D) for visualisation, |
||||
and optionally labels clusters using KMeans. |
||||
|
||||
Requires: umap-learn, scikit-learn (for KMeans) |
||||
""" |
||||
|
||||
import json |
||||
import logging |
||||
from typing import Dict, List, Optional, Tuple |
||||
|
||||
import numpy as np |
||||
import duckdb |
||||
|
||||
_logger = logging.getLogger(__name__) |
||||
|
||||
|
||||
def _load_fused_vectors(
    db_path: str, window_id: Optional[str] = None
) -> Tuple[List[int], List[str], np.ndarray]:
    """Load fused embeddings from the DB.

    Args:
        db_path: Path to the DuckDB database file.
        window_id: When given, restrict the query to that window.

    Returns:
        (motion_ids, window_ids, matrix) — one matrix row per successfully
        parsed vector, zero-padded to a common width. Empty lists and a
        (0, 0) matrix when nothing could be loaded.
    """
    conn = duckdb.connect(db_path)
    try:
        # Fix: close the connection even if the query raises, so the DB
        # file is not left locked by a leaked handle.
        if window_id:
            rows = conn.execute(
                "SELECT motion_id, window_id, vector FROM fused_embeddings WHERE window_id = ?",
                (window_id,),
            ).fetchall()
        else:
            rows = conn.execute(
                "SELECT motion_id, window_id, vector FROM fused_embeddings ORDER BY window_id, motion_id"
            ).fetchall()
    finally:
        conn.close()

    motion_ids: List[int] = []
    window_ids: List[str] = []
    vectors: List[list] = []
    for motion_id, wid, vec_json in rows:
        try:
            vec = json.loads(vec_json)
            mid = int(motion_id)
        except Exception:
            _logger.warning("Could not parse fused vector for motion %s", motion_id)
            continue
        motion_ids.append(mid)
        window_ids.append(wid)
        vectors.append(vec)

    if not vectors:
        return [], [], np.zeros((0, 0))

    # Pad to common length if needed (shouldn't happen if pipeline is consistent)
    max_len = max(len(v) for v in vectors)
    mat = np.zeros((len(vectors), max_len), dtype=float)
    for i, v in enumerate(vectors):
        mat[i, : len(v)] = v

    return motion_ids, window_ids, mat
||||
|
||||
|
||||
def run_umap(
    db_path: str,
    window_id: Optional[str] = None,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.1,
    random_state: int = 42,
) -> Dict:
    """Reduce fused embeddings to 2D/3D coordinates with UMAP.

    Args:
        db_path: Path to the DuckDB database file.
        window_id: Optional window filter passed to the loader.
        n_components: Output dimensionality (2 or 3).
        n_neighbors: UMAP neighbourhood size; shrunk for tiny datasets.
        min_dist: UMAP min_dist parameter.
        random_state: Seed for reproducible layouts.

    Returns:
        {"motion_ids", "window_ids", "coords", "n_components"} on success,
        or {} when umap-learn is missing or no embeddings exist.
    """
    try:
        import umap
    except ImportError:
        _logger.error("umap-learn is not installed; cannot run UMAP")
        return {}

    motion_ids, window_ids, mat = _load_fused_vectors(db_path, window_id)
    if mat.size == 0:
        _logger.warning("No fused embeddings found for window_id=%s", window_id)
        return {}

    n_samples = mat.shape[0]
    if n_samples < n_neighbors + 1:
        # UMAP requires at least n_neighbors + 1 samples; shrink to fit.
        n_neighbors = max(2, n_samples - 1)
        _logger.warning(
            "Reduced n_neighbors to %d due to small dataset (%d samples)",
            n_neighbors,
            n_samples,
        )

    embedding = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
    ).fit_transform(mat)

    return {
        "motion_ids": motion_ids,
        "window_ids": window_ids,
        "coords": embedding.tolist(),
        "n_components": n_components,
    }
||||
|
||||
|
||||
def cluster_kmeans(
    coords: np.ndarray, n_clusters: int = 8, random_state: int = 42
) -> np.ndarray:
    """Run KMeans on 2D/3D UMAP coordinates.

    Args:
        coords: (n, d) array of reduced coordinates.
        n_clusters: Requested cluster count; clamped to len(coords).
        random_state: Seed for deterministic centroid initialisation.

    Returns:
        Array of integer cluster labels (length = len(coords)). All-zero
        labels when scikit-learn is unavailable.
    """
    # Fix: with empty input, min(n_clusters, 0) == 0 and KMeans raises
    # ValueError on n_clusters < 1 — return an empty label array instead.
    if len(coords) == 0:
        return np.zeros(0, dtype=int)

    try:
        from sklearn.cluster import KMeans
    except ImportError:
        _logger.error("scikit-learn is not installed; cannot run KMeans")
        return np.zeros(len(coords), dtype=int)

    n_clusters = min(n_clusters, len(coords))
    km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init="auto")
    return km.fit_predict(coords)
||||
@ -0,0 +1,125 @@ |
||||
"""political_axis.py — Project MP SVD vectors onto an ideological axis. |
||||
|
||||
Two modes: |
||||
1. PCA mode (default): compute the first principal component of all MP SVD |
||||
vectors for a window and project each MP onto it. The sign is arbitrary |
||||
but consistent within a window. |
||||
|
||||
2. Anchor mode: define the axis as the vector from the centroid of |
||||
``left_parties`` to the centroid of ``right_parties``. Project all MPs |
||||
onto this normalised anchor axis. |
||||
|
||||
Both modes return a dict mapping mp_name → scalar score for the given window. |
||||
""" |
||||
|
||||
import json |
||||
import logging |
||||
from typing import Dict, List, Optional |
||||
|
||||
import numpy as np |
||||
import duckdb |
||||
|
||||
_logger = logging.getLogger(__name__) |
||||
|
||||
|
||||
def _load_mp_svd_vectors(db_path: str, window_id: str) -> Dict[str, np.ndarray]:
    """Load all MP SVD vectors for a window from the svd_vectors table.

    Rows whose JSON payload cannot be parsed are skipped with a warning.
    """
    conn = duckdb.connect(db_path)
    try:
        # Fix: close the connection even if the query raises (leak on error).
        rows = conn.execute(
            "SELECT entity_id, vector FROM svd_vectors WHERE window_id = ? AND entity_type = 'mp'",
            (window_id,),
        ).fetchall()
    finally:
        conn.close()

    result: Dict[str, np.ndarray] = {}
    for mp_name, vec_json in rows:
        try:
            result[mp_name] = np.array(json.loads(vec_json), dtype=float)
        except Exception:
            _logger.warning("Could not parse SVD vector for MP %s", mp_name)
    return result
||||
|
||||
|
||||
def compute_pca_axis(db_path: str, window_id: str) -> Dict[str, float]:
    """Score each MP by projection onto the first principal component.

    The sign of the axis is arbitrary but consistent within a window.
    Returns {mp_name: score}; empty dict when fewer than two MPs exist
    or the SVD fails to converge.
    """
    mp_vecs = _load_mp_svd_vectors(db_path, window_id)
    if len(mp_vecs) < 2:
        _logger.warning(
            "window %s has only %d MPs; skipping PCA axis", window_id, len(mp_vecs)
        )
        return {}

    names = list(mp_vecs.keys())
    stacked = np.vstack([mp_vecs[n] for n in names])  # shape (n_mps, k)

    # Centre the data so PC1 passes through the centroid.
    centred = stacked - stacked.mean(axis=0)

    # The first right-singular vector of the centred matrix is PC1.
    try:
        _, _, vt = np.linalg.svd(centred, full_matrices=False)
    except np.linalg.LinAlgError:
        _logger.exception("SVD failed in compute_pca_axis for window %s", window_id)
        return {}

    scores = centred.dot(vt[0])
    return dict(zip(names, (float(s) for s in scores)))
||||
|
||||
|
||||
def compute_anchor_axis(
    db_path: str,
    window_id: str,
    left_parties: List[str],
    right_parties: List[str],
) -> Dict[str, float]:
    """Project MP SVD vectors onto a left↔right anchor axis.

    The axis points from the centroid of ``left_parties`` members to the
    centroid of ``right_parties`` members; positive scores lean right.

    Returns {mp_name: score}; empty dict when vectors or anchors are
    missing, or when the two centroids coincide.
    """
    mp_vecs = _load_mp_svd_vectors(db_path, window_id)
    if not mp_vecs:
        return {}

    # Party affiliation comes from mp_metadata (table is not window-scoped).
    conn = duckdb.connect(db_path)
    rows = conn.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
    conn.close()
    party_of = dict(rows)

    def _anchor_members(parties: List[str]) -> List[np.ndarray]:
        # Only MPs that both belong to an anchor party and have a vector.
        return [
            mp_vecs[mp]
            for mp, party in party_of.items()
            if party in parties and mp in mp_vecs
        ]

    left_vecs = _anchor_members(left_parties)
    right_vecs = _anchor_members(right_parties)

    if not left_vecs or not right_vecs:
        _logger.warning(
            "window %s: insufficient anchor parties (left=%d, right=%d)",
            window_id,
            len(left_vecs),
            len(right_vecs),
        )
        return {}

    axis = np.mean(right_vecs, axis=0) - np.mean(left_vecs, axis=0)
    norm = np.linalg.norm(axis)
    if norm < 1e-10:
        # Degenerate case: the anchor centroids are (almost) identical.
        _logger.warning("Anchor axis has near-zero norm for window %s", window_id)
        return {}
    unit_axis = axis / norm

    return {name: float(np.dot(vec, unit_axis)) for name, vec in mp_vecs.items()}
||||
@ -0,0 +1,123 @@ |
||||
"""trajectory.py — Compute MP political drift across aligned time windows. |
||||
|
||||
For each MP that appears in multiple windows, computes: |
||||
- The aligned SVD vector per window |
||||
- The Euclidean distance between consecutive windows (drift) |
||||
- Total cumulative drift |
||||
|
||||
Returns a dict keyed by mp_name containing per-window positions and drift scores. |
||||
""" |
||||
|
||||
import json |
||||
import logging |
||||
from typing import Dict, List, Optional |
||||
|
||||
import numpy as np |
||||
import duckdb |
||||
|
||||
_logger = logging.getLogger(__name__) |
||||
|
||||
|
||||
def _load_window_ids(db_path: str) -> List[str]:
    """Return all distinct MP window IDs from svd_vectors, in lexicographic order."""
    conn = duckdb.connect(db_path)
    try:
        # Fix: close the connection even if the query raises (leak on error).
        rows = conn.execute(
            "SELECT DISTINCT window_id FROM svd_vectors WHERE entity_type = 'mp' ORDER BY window_id"
        ).fetchall()
    finally:
        conn.close()
    return [r[0] for r in rows]
||||
|
||||
|
||||
def _load_mp_vectors_for_window(db_path: str, window_id: str) -> Dict[str, np.ndarray]:
    """Return {mp_name: SVD vector} for one window; unparseable rows are skipped."""
    conn = duckdb.connect(db_path)
    try:
        # Fix: close the connection even if the query raises (leak on error).
        rows = conn.execute(
            "SELECT entity_id, vector FROM svd_vectors WHERE window_id = ? AND entity_type = 'mp'",
            (window_id,),
        ).fetchall()
    finally:
        conn.close()
    result: Dict[str, np.ndarray] = {}
    for mp_name, vec_json in rows:
        try:
            result[mp_name] = np.array(json.loads(vec_json), dtype=float)
        except Exception:
            _logger.warning(
                "Could not parse vector for MP %s window %s", mp_name, window_id
            )
    return result
||||
|
||||
|
||||
def compute_trajectories(
    db_path: str,
    window_ids: Optional[List[str]] = None,
) -> Dict[str, Dict]:
    """Compute per-MP trajectories across windows.

    Returns:
        {
            mp_name: {
                "windows": [window_id, ...],
                "vectors": [[...], ...],   # one vector per window
                "drift": [float, ...],     # consecutive Euclidean distances
                "total_drift": float,
            }
        }
    Only MPs present in at least 2 windows are included.
    """
    if window_ids is None:
        window_ids = _load_window_ids(db_path)

    if len(window_ids) < 2:
        _logger.info("Fewer than 2 windows — no trajectories to compute")
        return {}

    # Gather each MP's (window, vector) history in window order.
    history: Dict[str, Dict] = {}
    for wid in window_ids:
        for mp_name, vec in _load_mp_vectors_for_window(db_path, wid).items():
            entry = history.setdefault(mp_name, {"windows": [], "vectors": []})
            entry["windows"].append(wid)
            entry["vectors"].append(vec)

    # Derive drift for MPs observed in two or more windows.
    result: Dict[str, Dict] = {}
    for mp_name, entry in history.items():
        if len(entry["windows"]) < 2:
            continue
        vecs = entry["vectors"]
        drifts = [float(np.linalg.norm(b - a)) for a, b in zip(vecs, vecs[1:])]
        result[mp_name] = {
            "windows": entry["windows"],
            "vectors": [v.tolist() for v in vecs],
            "drift": drifts,
            "total_drift": float(sum(drifts)),
        }

    _logger.info(
        "Trajectories computed for %d MPs across %d windows",
        len(result),
        len(window_ids),
    )
    return result
||||
|
||||
|
||||
def top_drifters(trajectories: Dict[str, Dict], n: int = 10) -> List[Dict]:
    """Return the top-n MPs by total drift, sorted descending.

    Each entry: {"mp_name": ..., "total_drift": ..., "windows": [...]}
    """
    by_drift_desc = sorted(
        trajectories.items(), key=lambda kv: kv[1]["total_drift"], reverse=True
    )
    top: List[Dict] = []
    for mp, data in by_drift_desc[:n]:
        top.append(
            {
                "mp_name": mp,
                "total_drift": data["total_drift"],
                "windows": data["windows"],
            }
        )
    return top
||||
@ -0,0 +1,163 @@ |
||||
"""visualize.py — Plotly interactive plots for parliamentary embeddings. |
||||
|
||||
Produces self-contained HTML files. |
||||
|
||||
Functions: |
||||
plot_umap_scatter — 2D scatter of fused motion embeddings, coloured by cluster |
||||
plot_mp_trajectory — Line plot of MP drift across windows |
||||
plot_political_axis — Bar chart of MP scores on the ideological axis |
||||
""" |
||||
|
||||
import logging |
||||
from typing import Dict, List, Optional |
||||
|
||||
import numpy as np |
||||
|
||||
_logger = logging.getLogger(__name__) |
||||
|
||||
|
||||
def _require_plotly():
    """Import and return (plotly.graph_objects, plotly.express).

    Raises:
        ImportError: with installation instructions when plotly is missing;
            the original import error is chained for debuggability.
    """
    try:
        import plotly.graph_objects as go
        import plotly.express as px

        return go, px
    except ImportError as exc:
        # Fix: chain the original error so the underlying import failure
        # (e.g. a broken sub-dependency) stays visible in the traceback.
        raise ImportError(
            "plotly is not installed. Install it with: uv add plotly"
        ) from exc
||||
|
||||
|
||||
def plot_umap_scatter(
    motion_ids: List[int],
    coords: List[List[float]],
    labels: Optional[List[int]] = None,
    window_id: Optional[str] = None,
    output_path: str = "analysis_umap.html",
) -> str:
    """Produce a 2D scatter plot of UMAP-reduced fused embeddings.

    Args:
        motion_ids: Motion IDs (used as hover labels)
        coords: List of [x, y] coordinates
        labels: Optional cluster labels (integer per motion)
        window_id: Window label for the plot title
        output_path: Where to write the self-contained HTML

    Returns the output_path on success.
    """
    go, px = _require_plotly()

    cluster_ids = labels if labels is not None else [0] * len(motion_ids)
    title = "UMAP — fused motion embeddings"
    if window_id:
        title += f" ({window_id})"

    fig = px.scatter(
        x=[point[0] for point in coords],
        y=[point[1] for point in coords],
        color=[str(c) for c in cluster_ids],
        hover_name=[str(mid) for mid in motion_ids],
        title=title,
        labels={"x": "UMAP-1", "y": "UMAP-2", "color": "Cluster"},
    )
    fig.write_html(output_path, include_plotlyjs="cdn")
    _logger.info("UMAP scatter written to %s", output_path)
    return output_path
||||
|
||||
|
||||
def plot_mp_trajectory(
    trajectories: Dict[str, Dict],
    mp_names: Optional[List[str]] = None,
    output_path: str = "analysis_trajectory.html",
) -> str:
    """Line plot of MP drift across time windows.

    Args:
        trajectories: Output of analysis.trajectory.compute_trajectories()
        mp_names: Subset of MPs to plot (default: all)
        output_path: Output HTML file path

    Returns the output_path on success.
    """
    go, px = _require_plotly()

    selected = list(trajectories.keys()) if mp_names is None else mp_names
    fig = go.Figure()

    for mp in selected:
        data = trajectories.get(mp)
        if data is None:
            continue
        # Cumulative drift: start each MP at 0 in their first window.
        cumulative = [0.0] + list(np.cumsum(data["drift"]))
        window_labels = data["windows"][: len(cumulative)]
        fig.add_trace(
            go.Scatter(
                x=window_labels,
                y=cumulative,
                mode="lines+markers",
                name=mp,
            )
        )

    fig.update_layout(
        title="MP Political Drift Over Time (Cumulative)",
        xaxis_title="Window",
        yaxis_title="Cumulative Drift",
    )
    fig.write_html(output_path, include_plotlyjs="cdn")
    _logger.info("Trajectory plot written to %s", output_path)
    return output_path
||||
|
||||
|
||||
def plot_political_axis(
    scores: Dict[str, float],
    party_of: Optional[Dict[str, str]] = None,
    window_id: Optional[str] = None,
    n_top: int = 30,
    output_path: str = "analysis_political_axis.html",
) -> str:
    """Horizontal bar chart of MP scores on the ideological axis.

    Args:
        scores: {mp_name: score} from political_axis module
        party_of: Optional {mp_name: party} for colour-coding
        window_id: Window label for the title
        n_top: Show only the top/bottom n MPs by score
        output_path: Output HTML path

    Returns the output_path on success.
    """
    go, px = _require_plotly()

    ranked = sorted(scores.items(), key=lambda kv: kv[1])

    # Large lists: keep only the n_top most extreme MPs from each end.
    if len(ranked) > 2 * n_top:
        ranked = ranked[:n_top] + ranked[-n_top:]

    names = [mp for mp, _ in ranked]
    vals = [score for _, score in ranked]
    if party_of:
        colors = [party_of.get(mp, "Unknown") for mp in names]
    else:
        colors = ["Unknown"] * len(names)

    title = "MP Ideological Axis Score" + (f" ({window_id})" if window_id else "")

    fig = px.bar(
        x=vals,
        y=names,
        color=colors,
        orientation="h",
        title=title,
        labels={"x": "Score (← left — right →)", "y": "MP", "color": "Party"},
    )
    fig.update_layout(yaxis={"categoryorder": "total ascending"})
    fig.write_html(output_path, include_plotlyjs="cdn")
    _logger.info("Political axis chart written to %s", output_path)
    return output_path
||||
@ -0,0 +1,261 @@ |
||||
"""CLI orchestrator for the parliamentary embedding pipeline. |
||||
|
||||
Runs all phases in sequence: |
||||
1. fetch_mp_metadata — pull MP party + tenure from OData |
||||
2. extract_mp_votes — parse voting_results JSON → mp_votes rows |
||||
3. svd per window — build vote matrix, SVD, Procrustes-align |
||||
4. text embeddings — fill any gaps in the embeddings table |
||||
5. fuse per window — concatenate SVD + text vectors → fused_embeddings |
||||
|
||||
Usage: |
||||
uv run python -m pipeline.run_pipeline [options] |
||||
|
||||
Options: |
||||
--db-path PATH Path to the DuckDB file (default: data/motions.db) |
||||
--start-date DATE Window start (YYYY-MM-DD, default: 2 years ago) |
||||
--end-date DATE Window end (YYYY-MM-DD, default: today) |
||||
--window-size {quarterly,annual} Time window granularity (default: quarterly) |
||||
--svd-k INT SVD dimensionality (default: 50) |
||||
--text-model TEXT Text embedding model name (default: from ai_provider) |
||||
--skip-metadata Skip fetching MP metadata from OData |
||||
--skip-extract Skip extracting MP votes from voting_results |
||||
--skip-svd Skip SVD computation |
||||
--skip-text Skip text embedding gap-fill |
||||
--skip-fusion Skip vector fusion |
||||
--dry-run Print actions but make no DB writes |
||||
""" |
||||
|
||||
import argparse |
||||
import calendar |
||||
import logging |
||||
import sys |
||||
from datetime import date, timedelta |
||||
from typing import List, Tuple |
||||
|
||||
from database import MotionDatabase |
||||
|
||||
_logger = logging.getLogger(__name__) |
||||
|
||||
|
||||
def _generate_windows( |
||||
start: date, end: date, granularity: str |
||||
) -> List[Tuple[str, str, str]]: |
||||
"""Return list of (window_id, start_str, end_str) tuples. |
||||
|
||||
window_id format: |
||||
quarterly → "2024-Q1", "2024-Q2", … |
||||
annual → "2024" |
||||
""" |
||||
windows = [] |
||||
cursor = date(start.year, start.month, 1) |
||||
|
||||
if granularity == "annual": |
||||
cursor = date(start.year, 1, 1) |
||||
while cursor <= end: |
||||
year_end = date(cursor.year, 12, 31) |
||||
w_end = min(year_end, end) |
||||
windows.append((str(cursor.year), cursor.isoformat(), w_end.isoformat())) |
||||
cursor = date(cursor.year + 1, 1, 1) |
||||
else: |
||||
# quarterly |
||||
quarter_starts = {1: 1, 2: 4, 3: 7, 4: 10} |
||||
quarter_ends = {1: 3, 2: 6, 3: 9, 4: 12} |
||||
|
||||
# Align cursor to quarter start |
||||
q = (cursor.month - 1) // 3 + 1 |
||||
cursor = date(cursor.year, quarter_starts[q], 1) |
||||
|
||||
while cursor <= end: |
||||
q = (cursor.month - 1) // 3 + 1 |
||||
q_end_month = quarter_ends[q] |
||||
last_day = calendar.monthrange(cursor.year, q_end_month)[1] |
||||
q_end = date(cursor.year, q_end_month, last_day) |
||||
w_end = min(q_end, end) |
||||
window_id = f"{cursor.year}-Q{q}" |
||||
windows.append((window_id, cursor.isoformat(), w_end.isoformat())) |
||||
cursor = q_end + timedelta(days=1) |
||||
|
||||
return windows |
||||
|
||||
|
||||
def run(args: argparse.Namespace) -> int:
    """Execute the pipeline. Returns exit code (0 = success).

    Phases run in order: MP metadata fetch, MP vote extraction, per-window
    SVD, text-embedding gap-fill, per-window fusion. Each phase can be
    skipped via its --skip-* flag; --dry-run logs intended actions without
    writing. Phase imports are deferred so skipped/dry phases don't pay
    their import cost.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    db_path = args.db_path
    dry_run = args.dry_run

    if dry_run:
        _logger.info("DRY RUN — no writes will be made")

    # Resolve date range (default: the 730 days ≈ 2 years ending today)
    end_date = date.fromisoformat(args.end_date) if args.end_date else date.today()
    start_date = (
        date.fromisoformat(args.start_date)
        if args.start_date
        else end_date - timedelta(days=730)
    )

    _logger.info(
        "Pipeline run: %s → %s (%s windows), db=%s",
        start_date,
        end_date,
        args.window_size,
        db_path,
    )

    # NOTE(review): the DB handle is opened even under --dry-run — presumably
    # MotionDatabase only opens/creates the file without writing; confirm.
    db = MotionDatabase(db_path)

    # ── Phase 1: MP metadata ────────────────────────────────────────────────
    if not args.skip_metadata:
        _logger.info("Phase 1: fetching MP metadata from OData")
        if not dry_run:
            from pipeline.fetch_mp_metadata import fetch_mp_metadata

            fetched, skipped = fetch_mp_metadata(db)
            _logger.info(" mp_metadata: fetched=%d skipped=%d", fetched, skipped)
        else:
            _logger.info(" [dry-run] would call fetch_mp_metadata(db)")
    else:
        _logger.info("Phase 1: skipped (--skip-metadata)")

    # ── Phase 2: Extract MP votes ────────────────────────────────────────────
    if not args.skip_extract:
        _logger.info("Phase 2: extracting MP votes from voting_results")
        if not dry_run:
            from pipeline.extract_mp_votes import extract_mp_votes

            inserted, skipped = extract_mp_votes(db)
            _logger.info(
                " mp_votes: inserted=%d motions skipped=%d", inserted, skipped
            )
        else:
            _logger.info(" [dry-run] would call extract_mp_votes(db)")
    else:
        _logger.info("Phase 2: skipped (--skip-extract)")

    # ── Phase 3: SVD per window ──────────────────────────────────────────────
    if not args.skip_svd:
        windows = _generate_windows(start_date, end_date, args.window_size)
        _logger.info("Phase 3: SVD for %d windows (k=%d)", len(windows), args.svd_k)
        from pipeline.svd_pipeline import run_svd_for_window

        for window_id, w_start, w_end in windows:
            _logger.info(" window %s: %s → %s", window_id, w_start, w_end)
            if not dry_run:
                result = run_svd_for_window(
                    db=db,
                    window_id=window_id,
                    start_date=w_start,
                    end_date=w_end,
                    k=args.svd_k,
                )
                _logger.info(
                    " k_used=%d stored_mp=%d stored_motion=%d",
                    result["k_used"],
                    result["stored_mp"],
                    result["stored_motion"],
                )
            else:
                _logger.info(" [dry-run] would run SVD for window %s", window_id)
    else:
        _logger.info("Phase 3: skipped (--skip-svd)")

    # ── Phase 4: Text embeddings ──────────────────────────────────────────────
    if not args.skip_text:
        _logger.info("Phase 4: ensuring text embeddings")
        if not dry_run:
            from pipeline.text_pipeline import ensure_text_embeddings

            stored, existing, no_text, errors = ensure_text_embeddings(
                db_path=db_path, model=args.text_model
            )
            _logger.info(
                " embeddings: stored=%d existing=%d no_text=%d errors=%d",
                stored,
                existing,
                no_text,
                errors,
            )
        else:
            _logger.info(" [dry-run] would call ensure_text_embeddings")
    else:
        _logger.info("Phase 4: skipped (--skip-text)")

    # ── Phase 5: Fusion per window ────────────────────────────────────────────
    if not args.skip_fusion:
        # Windows are regenerated here so Phase 5 works even when Phase 3
        # was skipped (its `windows` local would not exist).
        windows = _generate_windows(start_date, end_date, args.window_size)
        _logger.info("Phase 5: fusing vectors for %d windows", len(windows))
        from pipeline.fusion import fuse_for_window

        for window_id, _w_start, _w_end in windows:
            if not dry_run:
                result = fuse_for_window(
                    window_id=window_id,
                    db_path=db_path,
                    model=args.text_model,
                )
                _logger.info(
                    " window %s: fused=%d skipped_no_svd=%d skipped_no_text=%d",
                    window_id,
                    result["fused"],
                    result.get("skipped_no_svd", 0),
                    result.get("skipped_no_text", 0),
                )
            else:
                _logger.info(" [dry-run] would fuse window %s", window_id)
    else:
        _logger.info("Phase 5: skipped (--skip-fusion)")

    _logger.info("Pipeline complete.")
    return 0
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the pipeline orchestrator."""
    parser = argparse.ArgumentParser(
        description="Parliamentary embedding pipeline orchestrator",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--db-path", default="data/motions.db", help="Path to DuckDB file"
    )
    parser.add_argument("--start-date", default=None, help="Window start YYYY-MM-DD")
    parser.add_argument("--end-date", default=None, help="Window end YYYY-MM-DD")
    parser.add_argument(
        "--window-size",
        choices=["quarterly", "annual"],
        default="quarterly",
        help="Time window granularity",
    )
    parser.add_argument("--svd-k", type=int, default=50, help="SVD dimensions")
    parser.add_argument(
        "--text-model",
        default=None,
        help="Text embedding model (default: ai_provider default)",
    )

    # One boolean skip flag per pipeline phase.
    for flag, help_text in [
        ("--skip-metadata", "Skip MP metadata fetch"),
        ("--skip-extract", "Skip MP vote extraction"),
        ("--skip-svd", "Skip SVD computation"),
        ("--skip-text", "Skip text embedding gap-fill"),
        ("--skip-fusion", "Skip vector fusion"),
    ]:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would happen without writing anything",
    )
    return parser
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: parse arguments and propagate the pipeline's exit code.
    parser = build_parser()
    args = parser.parse_args()
    sys.exit(run(args))
||||
@ -0,0 +1,195 @@ |
||||
"""Tests for analysis modules: political_axis, trajectory, clustering.""" |
||||
|
||||
import json |
||||
import numpy as np |
||||
import pytest |
||||
|
||||
duckdb = pytest.importorskip("duckdb") |
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────────────── |
||||
|
||||
|
||||
def _setup_svd_vectors(db_path: str, window_ids_mp_vecs: dict):
    """Insert synthetic MP SVD vectors into svd_vectors table.

    window_ids_mp_vecs: {window_id: {mp_name: np.ndarray}}
    """
    conn = duckdb.connect(db_path)
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS svd_vectors (
            id INTEGER,
            window_id TEXT,
            entity_type TEXT,
            entity_id TEXT,
            vector JSON,
            model TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """
    )
    insert_sql = (
        "INSERT INTO svd_vectors (window_id, entity_type, entity_id, vector, model)"
        " VALUES (?, 'mp', ?, ?, 'test')"
    )
    # Vectors are serialised to JSON, mirroring the production pipeline.
    for wid, mp_vecs in window_ids_mp_vecs.items():
        for mp_name, vec in mp_vecs.items():
            conn.execute(insert_sql, (wid, mp_name, json.dumps(vec.tolist())))
    conn.close()
||||
|
||||
|
||||
def _setup_mp_metadata(db_path: str, mp_party: dict):
    """Insert synthetic MP metadata rows ({mp_name: party})."""
    conn = duckdb.connect(db_path)
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS mp_metadata (
            mp_name TEXT,
            party TEXT,
            van DATE,
            tot_en_met DATE,
            persoon_id TEXT
        )
        """
    )
    # Only name + party are needed by the tests; date columns stay NULL.
    for mp_name, party in mp_party.items():
        conn.execute(
            "INSERT INTO mp_metadata (mp_name, party) VALUES (?, ?)",
            (mp_name, party),
        )
    conn.close()
||||
|
||||
|
||||
# ── political_axis ──────────────────────────────────────────────────────────── |
||||
|
||||
|
||||
class TestPoliticalAxis:
    """Unit tests for analysis.political_axis (PCA and anchor-axis modes)."""

    def test_pca_axis_basic(self, tmp_path):
        # Random MP vectors: every MP should get a float score with spread.
        np.random.seed(42)
        db_path = str(tmp_path / "test.db")
        n_mps, k = 20, 5

        # Create a low-rank set of MP vectors (they should have a clear first PC)
        vecs = np.random.randn(n_mps, k)
        mp_names = [f"MP_{i}" for i in range(n_mps)]
        _setup_svd_vectors(
            db_path, {"2024-Q1": {mp_names[i]: vecs[i] for i in range(n_mps)}}
        )

        from analysis.political_axis import compute_pca_axis

        scores = compute_pca_axis(db_path, "2024-Q1")
        assert len(scores) == n_mps
        assert all(isinstance(v, float) for v in scores.values())
        # Scores should have non-trivial variance
        vals = list(scores.values())
        assert np.std(vals) > 0.0

    def test_pca_axis_too_few_mps(self, tmp_path):
        # A single MP cannot define a principal component → empty dict.
        db_path = str(tmp_path / "test.db")
        _setup_svd_vectors(db_path, {"w1": {"MP_A": np.array([1.0, 0.0])}})

        from analysis.political_axis import compute_pca_axis

        scores = compute_pca_axis(db_path, "w1")
        assert scores == {}

    def test_anchor_axis_basic(self, tmp_path):
        # Anchor axis runs SP-centroid → VVD-centroid; ordering must hold.
        db_path = str(tmp_path / "test.db")
        # Two clusters clearly separated on dim 0
        left_vec = np.array([-2.0, 0.0, 0.0])
        right_vec = np.array([2.0, 0.0, 0.0])
        mp_vecs = {
            "Left_A": left_vec + np.array([0.1, 0.0, 0.0]),
            "Left_B": left_vec - np.array([0.1, 0.0, 0.0]),
            "Right_A": right_vec + np.array([0.1, 0.0, 0.0]),
            "Right_B": right_vec - np.array([0.1, 0.0, 0.0]),
            "Centre": np.array([0.0, 0.0, 0.0]),
        }
        _setup_svd_vectors(db_path, {"w1": mp_vecs})
        _setup_mp_metadata(
            db_path,
            {
                "Left_A": "SP",
                "Left_B": "SP",
                "Right_A": "VVD",
                "Right_B": "VVD",
                "Centre": "D66",
            },
        )

        from analysis.political_axis import compute_anchor_axis

        scores = compute_anchor_axis(
            db_path, "w1", left_parties=["SP"], right_parties=["VVD"]
        )
        assert len(scores) == 5
        # Left MPs should have negative scores, Right MPs positive
        assert scores["Left_A"] < scores["Right_A"]
        assert scores["Left_B"] < scores["Right_B"]
||||
|
||||
|

# ── trajectory ───────────────────────────────────────────────────────────────


class TestTrajectory:
    def test_basic_trajectory(self, tmp_path):
        np.random.seed(0)
        db_path = str(tmp_path / "test.db")

        window_1 = {"MP_A": np.array([1.0, 0.0]), "MP_B": np.array([0.0, 1.0])}
        window_2 = {
            "MP_A": np.array([1.5, 0.5]),  # moved between windows
            "MP_B": np.array([0.0, 1.0]),  # stationary
            "MP_C": np.array([2.0, 2.0]),  # present in one window only
        }
        _setup_svd_vectors(db_path, {"2024-Q1": window_1, "2024-Q2": window_2})

        from analysis.trajectory import compute_trajectories, top_drifters

        traj = compute_trajectories(db_path)

        # Trajectories exist only for MPs appearing in >= 2 windows.
        assert "MP_A" in traj
        assert "MP_B" in traj
        assert "MP_C" not in traj

        assert len(traj["MP_A"]["drift"]) == 1
        assert traj["MP_A"]["total_drift"] > 0.0

        # An MP whose vector is unchanged accrues zero drift.
        assert traj["MP_B"]["total_drift"] == pytest.approx(0.0)

        # The biggest mover should rank first among drifters.
        assert top_drifters(traj, n=5)[0]["mp_name"] == "MP_A"

    def test_fewer_than_2_windows(self, tmp_path):
        db_path = str(tmp_path / "test.db")
        _setup_svd_vectors(db_path, {"2024-Q1": {"MP_A": np.array([1.0, 2.0])}})

        from analysis.trajectory import compute_trajectories

        # With a single window there is nothing to compare against.
        assert compute_trajectories(db_path) == {}



# ── clustering ────────────────────────────────────────────────────────────────


class TestClustering:
    def test_cluster_kmeans_basic(self):
        """Each of 20 random points gets one of the requested 3 labels."""
        from analysis.clustering import cluster_kmeans

        # np is already imported at module level; the original's local
        # `import numpy as np` was redundant and has been removed.
        coords = np.random.randn(20, 2)
        labels = cluster_kmeans(coords, n_clusters=3)

        assert len(labels) == 20
        # Every label must come from the requested cluster ids {0, 1, 2}.
        assert set(labels).issubset({0, 1, 2})

    def test_cluster_kmeans_fewer_points_than_clusters(self):
        """Requesting more clusters than points must not crash."""
        from analysis.clustering import cluster_kmeans

        coords = np.array([[0.0, 0.0], [1.0, 1.0]])
        labels = cluster_kmeans(coords, n_clusters=5)
        # Should not crash; n_clusters clamped to len(coords).
        assert len(labels) == 2

# ── (diff residue removed here; the following lines are tests/test_run_pipeline.py) ──
"""Tests for pipeline/run_pipeline.py""" |
||||
|
||||
import argparse |
||||
import sys |
||||
import pytest |
||||
|
||||
from pipeline.run_pipeline import _generate_windows, build_parser, run |
||||
from datetime import date |
||||
|
||||
|
||||
def test_generate_windows_quarterly():
    """A full calendar year yields four quarterly windows with exact bounds."""
    windows = _generate_windows(date(2024, 1, 1), date(2024, 12, 31), "quarterly")

    assert len(windows) == 4
    assert [wid for wid, _, _ in windows] == [
        "2024-Q1",
        "2024-Q2",
        "2024-Q3",
        "2024-Q4",
    ]

    # Q1 spans Jan 1 – Mar 31.
    assert (windows[0][1], windows[0][2]) == ("2024-01-01", "2024-03-31")
    # Q4 spans Oct 1 – Dec 31.
    assert (windows[3][1], windows[3][2]) == ("2024-10-01", "2024-12-31")


def test_generate_windows_annual():
    """Annual windows cover every touched year; the last is clipped at end_date."""
    windows = _generate_windows(date(2022, 6, 1), date(2024, 3, 31), "annual")

    assert len(windows) == 3
    assert [wid for wid, _, _ in windows] == ["2022", "2023", "2024"]

    # The 2024 window must stop at end_date rather than Dec 31.
    assert windows[-1][2] == "2024-03-31"


def test_generate_windows_mid_quarter_start():
    """Starting in the middle of Q2 should still produce a full Q2 window."""
    windows = _generate_windows(date(2024, 5, 15), date(2024, 9, 30), "quarterly")

    window_ids = {wid for wid, _, _ in windows}
    assert "2024-Q2" in window_ids
    assert "2024-Q3" in window_ids


def test_build_parser_defaults():
    """Parsing an empty argv yields the documented CLI defaults."""
    args = build_parser().parse_args([])

    assert args.db_path == "data/motions.db"
    assert args.window_size == "quarterly"
    assert args.svd_k == 50
    assert args.dry_run is False


def test_run_dry_run(tmp_path):
    """Dry-run should log actions and return 0 without touching the DB.

    NOTE(review): the original requested the ``monkeypatch`` fixture but
    never used it; the unused parameter has been removed.
    """
    db_path = str(tmp_path / "motions.db")

    # Create minimal DB so MotionDatabase initialises.
    from database import MotionDatabase

    MotionDatabase(db_path)

    args = argparse.Namespace(
        db_path=db_path,
        start_date="2024-01-01",
        end_date="2024-03-31",
        window_size="quarterly",
        svd_k=10,
        text_model=None,
        skip_metadata=False,
        skip_extract=False,
        skip_svd=False,
        skip_text=False,
        skip_fusion=False,
        dry_run=True,
    )

    exit_code = run(args)
    assert exit_code == 0


def test_run_skip_all(tmp_path):
    """Skipping all phases should still return 0."""
    db_path = str(tmp_path / "motions.db")

    from database import MotionDatabase

    MotionDatabase(db_path)

    # All five skip flags on at once: the pipeline has nothing to do.
    skip_flags = dict.fromkeys(
        ["skip_metadata", "skip_extract", "skip_svd", "skip_text", "skip_fusion"],
        True,
    )
    args = argparse.Namespace(
        db_path=db_path,
        start_date="2024-01-01",
        end_date="2024-03-31",
        window_size="quarterly",
        svd_k=10,
        text_model=None,
        dry_run=False,
        **skip_flags,
    )

    assert run(args) == 0
