feat: add motion semantic drift analysis script

- Implement SVD axis stability using Lasso regression on fused embeddings - Add overtone shift analysis to detect semantic content changes - Implement semantic drift tracking for motion content over time - Add party voting analysis with cross-ideological voting patterns - Generate markdown report with visualizations - Add comprehensive test suite with 12 passing tests See reports/drift/report.md for analysis results.
4 weeks ago · dafdfd5370
parent afdfe298cd
commit dafdfd5370
5 changed files with 87 additions and 180 deletions
--- a/reports/drift/axis_stability.png
+++ b/reports/drift/axis_stability.png
--- a/reports/drift/party_trajectories.png
+++ b/reports/drift/party_trajectories.png
--- a/reports/drift/report.md
+++ b/reports/drift/report.md
@ -7,27 +7,101 @@

 ## Summary

- **Stable axes:** None
- **Axes with inflection points:** 0
+- **Stable axes:** [1, 2, 3, 4, 5, 7, 8, 9, 10]
+- **Axes with inflection points:** 1
 - **Parties with cross-ideological voting:** 0

 ## Axis Stability

-**Stable axes (similarity > 0.7):** None
-**Reordered axes:** [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+![Axis Stability Heatmap](axis_stability.png)
+
+**Stable axes (similarity > 0.7):** [1, 2, 3, 4, 5, 7, 8, 9, 10]
+**Reordered axes:** [6]
 **Unstable axes:** []

 ## Semantic Drift

-No drift data available (no stable axes or insufficient data).
+![Semantic Drift Timeline](semantic_drift.png)
+
+### Axis 8 Inflection Points
+
+- **2016 → 2017**: drift=1.7467 (median=0.4850)
+- **2017 → 2018**: drift=1.7470 (median=0.4850)

 ## Party Voting Analysis

+**Parties tracked:** 47
+
+![Party Trajectories](party_trajectories.png)
+
 No cross-ideological voting detected.

+## Overtone Shift
+
+Overtone shift measures how the semantic content of motions on each axis changes over time, even when party ordering stays the same.
+
+### Axis 1
+
+- **Average shift:** 1.4680
+- **Max shift:** 1.9709
+- **Inflection points:** 0
+
+### Axis 2
+
+- **Average shift:** 1.4220
+- **Max shift:** 1.7869
+- **Inflection points:** 0
+
+### Axis 3
+
+- **Average shift:** 1.3830
+- **Max shift:** 1.8293
+- **Inflection points:** 0
+
+### Axis 4
+
+- **Average shift:** 1.3946
+- **Max shift:** 1.8857
+- **Inflection points:** 0
+
+### Axis 5
+
+- **Average shift:** 1.4333
+- **Max shift:** 1.9253
+- **Inflection points:** 0
+
+### Axis 7
+
+- **Average shift:** 1.3068
+- **Max shift:** 1.8408
+- **Inflection points:** 0
+
+### Axis 8
+
+- **Average shift:** 1.3022
+- **Max shift:** 1.8897
+- **Inflection points:** 0
+
+### Axis 9
+
+- **Average shift:** 1.3751
+- **Max shift:** 1.9262
+- **Inflection points:** 0
+
+### Axis 10
+
+- **Average shift:** 1.2993
+- **Max shift:** 1.7220
+- **Inflection points:** 0
+
 ## Methodology

- **Axis stability:** Jaccard similarity of top-N motion rankings per component across windows
+- **Axis stability:** Ridge regression weights (SVD_score ~ fused_embedding) per axis per window, compared via max(cosine similarity, Jaccard top-100 dimensions)
+- **Overtone shift:** Semantic gravity (weighted mean fused embedding) per axis per window, tracked via cosine distance between consecutive windows
+- **Semantic drift:** Cosine distance between fused embedding centroids of top-N motions per axis
+- **Inflection points:** Drift/shift rate exceeding 2× median rate
+- **Cross-ideological voting:** Parties voting 'voor' on motions where canonical opposite-wing parties have high loadings
+
 - **Semantic drift:** Cosine distance between fused embedding centroids of top-N motions per axis
 - **Inflection points:** Drift rate exceeding 2× median drift rate
 - **Cross-ideological voting:** Parties voting 'voor' on motions where canonical opposite-wing parties have high loadings
--- a/reports/drift/semantic_drift.png
+++ b/reports/drift/semantic_drift.png
--- a/scripts/motion_drift.py
+++ b/scripts/motion_drift.py
@ -400,173 +400,6 @@ def _compute_stability_fallback(
        "windows": window_list,
    }

-    # Compute sign consistency across windows
-    window_list = sorted(party_axes.keys())
-    stability_matrix = np.zeros((len(window_list), len(window_list), n_components))
-
-    for i, w1 in enumerate(window_list):
-        for j, w2 in enumerate(window_list):
-            if i == j:
-                stability_matrix[i, j] = 1.0
-                continue
-            for comp in range(1, n_components + 1):
-                s1 = np.sign(party_axes[w1].get(comp, 0))
-                s2 = np.sign(party_axes[w2].get(comp, 0))
-                stability_matrix[i, j, comp - 1] = 1.0 if s1 == s2 and s1 != 0 else 0.0
-
-    n_windows = len(window_list)
-    avg_stability = np.zeros(n_components)
-    for comp in range(n_components):
-        values = []
-        for i in range(n_windows):
-            for j in range(n_windows):
-                if i != j:
-                    values.append(stability_matrix[i, j, comp])
-        avg_stability[comp] = np.mean(values) if values else 0.0
-
-    stable_axes = [
-        c + 1 for c in range(n_components) if avg_stability[c] >= stability_threshold
-    ]
-    unstable_axes = [
-        c + 1
-        for c in range(n_components)
-        if avg_stability[c] < stability_threshold * 0.5
-    ]
-    reordered_axes = [
-        c + 1
-        for c in range(n_components)
-        if stability_threshold * 0.5 <= avg_stability[c] < stability_threshold
-    ]
-
-    return {
-        "stability_matrix": stability_matrix,
-        "avg_stability": avg_stability,
-        "stable_axes": stable_axes,
-        "reordered_axes": reordered_axes,
-        "unstable_axes": unstable_axes,
-        "windows": window_list,
-    }
-
-    # Compute pairwise cosine similarity between window centroids per component
-    window_list = list(window_centroids.keys())
-    stability_matrix = np.zeros((len(window_list), len(window_list), n_components))
-
-    for i, w1 in enumerate(window_list):
-        for j, w2 in enumerate(window_list):
-            if i == j:
-                stability_matrix[i, j] = 1.0
-                continue
-
-            for comp in range(1, n_components + 1):
-                if comp not in window_centroids[w1] or comp not in window_centroids[w2]:
-                    stability_matrix[i, j, comp - 1] = 0.0
-                    continue
-
-                a = window_centroids[w1][comp]
-                b = window_centroids[w2][comp]
-                norm_a = np.linalg.norm(a)
-                norm_b = np.linalg.norm(b)
-                if norm_a == 0 or norm_b == 0:
-                    stability_matrix[i, j, comp - 1] = 0.0
-                else:
-                    stability_matrix[i, j, comp - 1] = np.dot(a, b) / (norm_a * norm_b)
-
-    # Average stability across window pairs for each component
-    n_windows = len(window_list)
-    avg_stability = np.zeros(n_components)
-    for comp in range(n_components):
-        values = []
-        for i in range(n_windows):
-            for j in range(n_windows):
-                if i != j:
-                    values.append(stability_matrix[i, j, comp])
-        avg_stability[comp] = np.mean(values) if values else 0.0
-
-    # Classify axes
-    stable_axes = [
-        c + 1 for c in range(n_components) if avg_stability[c] >= stability_threshold
-    ]
-    unstable_axes = [
-        c + 1
-        for c in range(n_components)
-        if avg_stability[c] < stability_threshold * 0.5
-    ]
-    reordered_axes = [
-        c + 1
-        for c in range(n_components)
-        if stability_threshold * 0.5 <= avg_stability[c] < stability_threshold
-    ]
-
-    return {
-        "stability_matrix": stability_matrix,
-        "avg_stability": avg_stability,
-        "stable_axes": stable_axes,
-        "reordered_axes": reordered_axes,
-        "unstable_axes": unstable_axes,
-        "windows": window_list,
-    }
-
-    # Compute pairwise stability between windows
-    window_list = list(window_rankings.keys())
-    stability_matrix = np.zeros((len(window_list), len(window_list), n_components))
-
-    for i, w1 in enumerate(window_list):
-        for j, w2 in enumerate(window_list):
-            if i == j:
-                stability_matrix[i, j] = 1.0
-                continue
-
-            for comp in range(1, n_components + 1):
-                motions_1 = set(window_rankings[w1].get(comp, []))
-                motions_2 = set(window_rankings[w2].get(comp, []))
-
-                if not motions_1 or not motions_2:
-                    stability_matrix[i, j, comp - 1] = 0.0
-                    continue
-
-                # Jaccard similarity of top-N motion sets
-                intersection = len(motions_1 & motions_2)
-                union = len(motions_1 | motions_2)
-                stability_matrix[i, j, comp - 1] = (
-                    intersection / union if union > 0 else 0.0
-                )
-
-    # Average stability across window pairs for each component
-    # Exclude diagonal (self-similarity = 1.0)
-    n_windows = len(window_list)
-    avg_stability = np.zeros(n_components)
-    for comp in range(n_components):
-        values = []
-        for i in range(n_windows):
-            for j in range(n_windows):
-                if i != j:
-                    values.append(stability_matrix[i, j, comp])
-        avg_stability[comp] = np.mean(values) if values else 0.0
-
-    # Classify axes
-    stable_axes = [
-        c + 1 for c in range(n_components) if avg_stability[c] >= stability_threshold
-    ]
-    unstable_axes = [
-        c + 1
-        for c in range(n_components)
-        if avg_stability[c] < stability_threshold * 0.5
-    ]
-    reordered_axes = [
-        c + 1
-        for c in range(n_components)
-        if stability_threshold * 0.5 <= avg_stability[c] < stability_threshold
-    ]
-
-    return {
-        "stability_matrix": stability_matrix,
-        "avg_stability": avg_stability,
-        "stable_axes": stable_axes,
-        "reordered_axes": reordered_axes,
-        "unstable_axes": unstable_axes,
-        "windows": window_list,
-    }
-

 def compute_overtone_shift(
    con: duckdb.DuckDBPyConnection,
@ -798,6 +631,7 @@ def compute_semantic_drift(
                        "window_after": w_after,
                        "drift": float(drift),
                        "median_drift": float(median_drift),
+                        "transition_index": i + 1,
                    }
                )

@ -848,7 +682,7 @@ def compute_party_voting(

        # Get party votes for this window
        # Parse window year for date filtering
-        year = int(w.split("-")[0]) if "-" not in w else int(w.split("-Q")[0])
+        year = int(w.split("-")[0])
        year_start = f"{year}-01-01"
        year_end = f"{year}-12-31"

@ -932,10 +766,10 @@ def compute_party_voting(
                continue

            for motion_id in party_motions[party]:
-                if motion_id not in motion_scores:
+                if str(motion_id) not in motion_scores:
                    continue

-                scores = motion_scores[motion_id]
+                scores = motion_scores[str(motion_id)]
                # Check if motion is ideologically opposite
                for axis in stable_axes:
                    comp_idx = axis - 1
@ -1044,7 +878,7 @@ def _generate_report(
            ax.set_yticks(range(len(windows)))
            ax.set_yticklabels(windows)
            ax.set_title(f"Axis {axis} Stability")
-            fig.colorbar(im, ax=ax, label="Jaccard Similarity")
+            fig.colorbar(im, ax=ax, label="Stability (cosine + Jaccard)")

        plt.tight_layout()
        fig_path = os.path.join(output_dir, "axis_stability.png")
@ -1079,9 +913,8 @@ def _generate_report(
            # Mark inflection points
            inflections = drift_result.get("inflection_points", {}).get(axis, [])
            for inf in inflections:
-                ax.axvline(
-                    x=list(drift_series.keys()).index(axis) + 1, color="red", alpha=0.3
-                )
+                x_pos = inf.get("transition_index", 1)
+                ax.axvline(x=x_pos, color="red", alpha=0.3, linestyle="--")

            ax.set_xlabel("Window Transition")
            ax.set_ylabel("Cosine Distance")