feat(explorer): extend diagnostic inspector to surface mp samples/counts

chore(explorer): add get_debug_trajectories_enabled helper
feat(explorer): instrument trajectories with debug diagnostics and un-silence helper exceptions
1 month ago · baee50f3a5
parent 0f2db0a9be
commit baee50f3a5
4 changed files with 1256 additions and 106 deletions
--- a/explorer.py
+++ b/explorer.py
--- a/explorer_helpers.py
+++ b/explorer_helpers.py
@@ -0,0 +1,227 @@
+"""Helper utilities used by explorer.py.
+
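+Primary export:
+- compute_party_coords: compute per-party (x_mean, y_mean) from positions_by_window.
+
+Also exported:
+- inspect_positions_for_issues: summarize positions_by_window for diagnostics.
+- compute_party_centroids: per-party centroid series across several windows.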
+
+This module is intentionally free of Streamlit side-effects to be easy to unit test.
+"""
+
+from __future__ import annotations
+
+import logging
+import math
+import re
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+def _strip_paren(s: str) -> str:
+    """Return the part of ``s`` before its first "(", whitespace-stripped.
+
+    A string containing no "(" is returned whole (stripped), so both
+    "Jane Doe (Lab)" and "Jane Doe" yield "Jane Doe".
+    """
+    head, _sep, _tail = s.partition("(")
+    return head.strip()
+
+
+def inspect_positions_for_issues(
+    positions_by_window: Dict[str, Dict[str, Tuple[float, float]]],
+    party_map: Dict[str, str],
+) -> Dict[str, Any]:
+    """Summarize positions_by_window for diagnostics.
+
+    Entity names are resolved through party_map, first as-is and then (only
+    when the exact key is absent) with any parenthetical suffix stripped.
+    Only the entity names are examined; the coordinates are never read.
+
+    Returns a dictionary with these keys:
+      - windows_count: int (number of windows)
+      - window_labels: list[str] (first 10 window ids in sorted order)
+      - mp_id_set: set[str] (every non-empty entity name seen in any window)
+      - party_map_count: int (len(party_map))
+      - parties_with_centroid_counts: dict[str, int] (per party, the number of
+        windows containing at least one entity that resolves to it)
+      - mismatched_mp_ids_sample: list[str] (sorted sample up to 10 of entity
+        names that did not resolve to a non-empty party)
+      - mp_positions_sample: list[str] (sorted sample up to 10 of mp_id_set)
+      - mp_positions_count: int (num unique MP ids seen)
+      - windows_with_no_positions: list[str] (windows whose mapping is empty)
+
+    This helper remains pure and import-safe so unit tests can exercise it.
+    """
+    sample_size = 10
+    window_labels = sorted(positions_by_window)[:sample_size]
+
+    seen_ids: Set[str] = set()
+    unresolved: Set[str] = set()
+    empty_windows: List[str] = []
+    windows_per_party: Dict[str, int] = {}
+
+    for window_id, entities in positions_by_window.items():
+        if not entities:
+            empty_windows.append(window_id)
+            continue
+
+        parties_here: Set[str] = set()
+        for name in entities:
+            if not name:
+                continue
+            seen_ids.add(name)
+            resolved = party_map.get(name)
+            if resolved is None:
+                # exact key missing: retry without the parenthetical variant
+                resolved = party_map.get(_strip_paren(name))
+            if not resolved:
+                unresolved.add(name)
+            else:
+                parties_here.add(resolved)
+
+        # one tally per window, however many of the party's MPs appear in it
+        for party in parties_here:
+            windows_per_party[party] = windows_per_party.get(party, 0) + 1
+
+    ordered_ids = sorted(seen_ids)
+    return {
+        "windows_count": len(positions_by_window),
+        "window_labels": window_labels,
+        "mp_id_set": seen_ids,
+        "party_map_count": len(party_map),
+        "parties_with_centroid_counts": windows_per_party,
+        "mismatched_mp_ids_sample": sorted(unresolved)[:sample_size],
+        "mp_positions_sample": ordered_ids[:sample_size],
+        "mp_positions_count": len(seen_ids),
+        "windows_with_no_positions": empty_windows,
+    }
+
+
+def compute_party_coords(
+    positions_by_window: Dict[str, Dict[str, Tuple[float, float]]],
+    party_map: Dict[str, str],
+    window_id: str,
+    fallback_party_scores: Optional[Dict[str, List[float]]] = None,
+) -> Tuple[Dict[str, Tuple[float, float]], Set[str]]:
+    """
+    Compute per-party centroids (x_mean, y_mean) for a specific window.
+
+    A position contributes only when BOTH of its coordinates are finite, so the
+    x and y means are always taken over the same set of points. Entities are
+    resolved through party_map by exact name first, then with any parenthetical
+    suffix stripped; entities with no party, or with party "Unknown", are ignored.
+
+    Args:
+        positions_by_window: mapping window_id -> {entity_name: (x, y)}.
+            None or a missing window is treated as "no positions".
+        party_map: mapping mp_name -> party abbreviation (Normalized)
+        window_id: which window to compute centroids for (key into positions_by_window)
+        fallback_party_scores: optional mapping party -> numeric vector (len>=2),
+            e.g. a list, tuple or numpy array. When a party has no usable
+            position in the window and fallback_party_scores contains an entry,
+            the first two elements of that vector will be used as a fallback (x,y).
+            Vectors that are None, too short, non-numeric or non-finite are skipped.
+
+    Returns:
+        (party_coords, fallback_used) where:
+          - party_coords: {party: (x_mean, y_mean)} for parties with a computed coord or fallback.
+          - fallback_used: set of party names where fallback_party_scores was used.
+    """
+    # what indexing / float() raise for malformed coordinate data
+    bad_coord_errors = (TypeError, ValueError, IndexError, KeyError, OverflowError)
+
+    pos = (positions_by_window or {}).get(window_id) or {}
+
+    per_party: Dict[str, List[Tuple[float, float]]] = {}
+    for ent, xy in pos.items():
+        if not ent or xy is None:
+            continue
+        try:
+            x, y = float(xy[0]), float(xy[1])
+        except bad_coord_errors:
+            # skip malformed coords
+            continue
+        if not (math.isfinite(x) and math.isfinite(y)):
+            # drop the whole point: keeping the finite half would make the
+            # x mean and the y mean describe different sets of MPs
+            continue
+        party = party_map.get(ent)
+        if party is None:
+            # try stripped name fallback
+            party = party_map.get(_strip_paren(ent))
+        if not party or party == "Unknown":
+            continue
+        per_party.setdefault(party, []).append((x, y))
+
+    # every list in per_party is non-empty and fully finite at this point
+    party_coords: Dict[str, Tuple[float, float]] = {
+        party: (
+            float(np.mean([pt[0] for pt in pts])),
+            float(np.mean([pt[1] for pt in pts])),
+        )
+        for party, pts in per_party.items()
+    }
+
+    # fallback: use supplied party vectors if a party has no MPs in this window
+    fallback_used: Set[str] = set()
+    for party, vec in (fallback_party_scores or {}).items():
+        # compare with None instead of truth-testing: `not vec` raises for
+        # numpy arrays holding more than one element
+        if party in party_coords or vec is None:
+            continue
+        try:
+            if len(vec) < 2:
+                continue
+            x_f, y_f = float(vec[0]), float(vec[1])
+        except bad_coord_errors:
+            continue
+        if math.isfinite(x_f) and math.isfinite(y_f):
+            party_coords[party] = (x_f, y_f)
+            fallback_used.add(party)
+
+    if fallback_used:
+        logger.warning(
+            "compute_party_coords used fallback for parties: %s",
+            sorted(fallback_used),
+        )
+
+    return party_coords, fallback_used
+
+
+def compute_party_centroids(
+    positions_by_window: Dict[str, Dict[str, Tuple[float, float]]],
+    party_map: Dict[str, str],
+    windows: List[str],
+) -> Tuple[Dict[str, List[Tuple[float, float]]], Dict[str, Any]]:
+    """Build a per-party centroid series over an ordered list of windows.
+
+    The party set is taken from the values of party_map. For each window,
+    compute_party_coords is called (without fallback scores) and one entry is
+    appended to every party's series.
+
+    Returns (party_centroids, metadata)
+    - party_centroids: mapping party -> list of (x, y) float tuples, one per
+      entry of windows and in the same order; (nan, nan) where the party has
+      no centroid in that window. Empty dict when party_map has no values.
+    - metadata: dict with keys 'per_party_counts' (party -> number of windows
+      with a non-NaN centroid), 'total_windows' (len(windows)) and 'parties'
+      (sorted party names).
+    """
+    parties = sorted(set(party_map.values()))
+    total_windows = len(windows)
+    if not parties:
+        # nothing to track, but callers still get well-formed metadata
+        return {}, {
+            "per_party_counts": {},
+            "total_windows": total_windows,
+            "parties": [],
+        }
+
+    positions = positions_by_window or {}
+    party_centroids: Dict[str, List[Tuple[float, float]]] = {
+        name: [] for name in parties
+    }
+
+    for window_id in windows:
+        window_coords, _fallback = compute_party_coords(positions, party_map, window_id)
+        for name, series in party_centroids.items():
+            if name not in window_coords:
+                # placeholder keeps every series aligned with `windows`
+                series.append((float(np.nan), float(np.nan)))
+                continue
+            cx, cy = window_coords[name][0], window_coords[name][1]
+            series.append((float(cx), float(cy)))
+
+    # a window counts for a party only when neither coordinate is NaN
+    per_party_counts: Dict[str, int] = {
+        name: sum(1 for x, y in series if not (np.isnan(x) or np.isnan(y)))
+        for name, series in party_centroids.items()
+    }
+
+    return party_centroids, {
+        "per_party_counts": per_party_counts,
+        "total_windows": total_windows,
+        "parties": parties,
+    }
--- a/tests/test_diagnose_no_plot_trajectories.py
+++ b/tests/test_diagnose_no_plot_trajectories.py
@@ -0,0 +1,49 @@
+import os
+import types
+
+import explorer
+
+
+def test_load_positions_empty_sets_diagnostics(monkeypatch):
+    """With empty positions, the "load_positions_empty" stage is recorded."""
+
+    def no_positions(db_path, window_size):
+        # stand-in for explorer.load_positions: a pair of empty dicts
+        return {}, {}
+
+    monkeypatch.setenv("EXPLORER_DEBUG_TRAJECTORIES", "1")
+    monkeypatch.setattr(explorer, "load_positions", no_positions)
+
+    # Must return normally (no exception) and leave diagnostics behind.
+    explorer.build_trajectories_tab(db_path="unused", window_size="annual")
+
+    recorded_stage = explorer._last_trajectories_diagnostics.get("stage")
+    assert recorded_stage == "load_positions_empty"
+
+
+def test_select_helper_exception_is_captured(monkeypatch):
+    """A ValueError raised by select_trajectory_plot_data ends up in diagnostics."""
+    # Minimal non-empty input: one window holding one MP position.
+    single_window = {"W1": {"mp1": (0.1, 0.2)}}
+
+    def fake_positions(db_path, window_size):
+        return single_window, {}
+
+    def fake_party_map(db_path):
+        # mp1 gets a party; presumably required for build_trajectories_tab to
+        # reach select_trajectory_plot_data (confirm against explorer.py)
+        return {"mp1": "P1"}
+
+    def exploding_helper(*args, **kwargs):
+        raise ValueError("boom")
+
+    monkeypatch.setattr(explorer, "load_positions", fake_positions)
+    monkeypatch.setattr(explorer, "load_party_map", fake_party_map)
+    monkeypatch.setattr(explorer, "select_trajectory_plot_data", exploding_helper)
+    monkeypatch.setenv("EXPLORER_DEBUG_TRAJECTORIES", "1")
+
+    explorer.build_trajectories_tab(db_path="unused", window_size="annual")
+
+    # Diagnostics are expected on the (patched) helper object itself ...
+    helper_diag = getattr(
+        explorer.select_trajectory_plot_data, "_last_diagnostics", None
+    )
+    assert helper_diag
+    assert "exception" in helper_diag
+
+    # ... and mirrored in the module-level diagnostics.
+    module_diag = explorer._last_trajectories_diagnostics
+    assert module_diag.get("stage") == "select_helper_exception"
+    assert "ValueError" in module_diag.get("exception", "")
--- a/tests/test_explorer_helpers_diagnostics.py
+++ b/tests/test_explorer_helpers_diagnostics.py
@@ -0,0 +1,22 @@
+import numpy as np
+from explorer_helpers import inspect_positions_for_issues
+
+
+def test_inspect_positions_for_issues_basic():
+    """Pin the diagnostics for one populated window and one empty window."""
+    positions_by_window = {
+        "w1": {"mp1": (1.0, 2.0), "mp2": (float("nan"), float("nan"))},
+        "w2": {},
+    }
+    party_map = {"mp1": "P1"}
+    d = inspect_positions_for_issues(positions_by_window, party_map)
+
+    # basic keys still present
+    assert d["windows_count"] == 2
+    assert d["window_labels"] == ["w1", "w2"]
+    assert isinstance(d["mp_id_set"], set)
+    # mp2 is counted despite its NaN coords: only entity names are inspected
+    assert d["mp_id_set"] == {"mp1", "mp2"}
+    assert d["party_map_count"] == 1
+    assert d["parties_with_centroid_counts"] == {"P1": 1}
+    # mp2 has no party_map entry, so it is reported as mismatched
+    assert d["mismatched_mp_ids_sample"] == ["mp2"]
+    # new diagnostics
+    assert d["mp_positions_count"] == 2
+    assert d["mp_positions_sample"] == ["mp1", "mp2"]
+    assert d["windows_with_no_positions"] == ["w2"]