diff --git a/analysis/projections.py b/analysis/projections.py new file mode 100644 index 0000000..be26a92 --- /dev/null +++ b/analysis/projections.py @@ -0,0 +1,128 @@ +"""SVD projection utilities for the parliamentary explorer. + +Pure computation functions for projecting motions and entities onto ideological axes. +No IO or external dependencies - fully testable without Streamlit or DuckDB. +""" + +from __future__ import annotations + +import math +from typing import Any, Dict, List, Tuple + +__all__ = [ + "should_swap_axes", + "swap_axes", + "project_motion_scores", + "normalize_coordinates", +] + + +def should_swap_axes(axis_def: dict) -> bool: + """Return True if the Y axis is economic left-right and the X axis is not. + + When true, caller should swap x/y positions and metadata so the economic + dimension (welfare vs market) is conventionally on the horizontal axis. + """ + economic_labels = {"Verzorgingsstaat–Marktwerking", "Links–Rechts"} + y_label = axis_def.get("y_label") + x_label = axis_def.get("x_label") + return y_label in economic_labels and x_label not in economic_labels + + +def swap_axes( + positions_by_window: Dict[str, Dict[str, Tuple[float, float]]], + axis_def: dict, +) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], dict]: + """Swap x and y in all positions and axis metadata. + + Pure function — returns (new_positions_by_window, new_axis_def). + """ + new_positions: Dict[str, Dict[str, Tuple[float, float]]] = {} + for wid, pos_dict in positions_by_window.items(): + new_positions[wid] = {ent: (y, x) for ent, (x, y) in pos_dict.items()} + + new_ax = dict(axis_def) + new_ax["x_label"] = axis_def.get("y_label") + new_ax["y_label"] = axis_def.get("x_label") + + for x_key, y_key in [ + ("x_quality", "y_quality"), + ("x_interpretation", "y_interpretation"), + ("x_top_motions", "y_top_motions"), + ("x_label_confidence", "y_label_confidence"), + ("x_axis", "y_axis"), + ]: + new_ax[x_key] = axis_def.get(y_key) + new_ax[y_key] = axis_def.get(x_key) + + return new_positions, new_ax + + +def project_motion_scores( + motion_scores: Dict[int, float], top_n: int = 5 +) -> Tuple[List[Tuple[int, float]], List[Tuple[int, float]]]: + """Split motion scores into positive and negative poles. + + Args: + motion_scores: Dict mapping motion_id to loading score + top_n: Number of top motions per pole + + Returns: + Tuple of (positive_pole, negative_pole) where each is a list of (motion_id, score) + """ + sorted_scores = sorted(motion_scores.items(), key=lambda x: x[1], reverse=True) + + positive_pole = sorted_scores[:top_n] + negative_pole = sorted_scores[-top_n:][::-1] + + return positive_pole, negative_pole + + +def normalize_coordinates( + positions: Dict[str, Tuple[float, float]], + clamp_abs_value: float = 1e3, + null_tokens: Tuple[str, ...] = ("nan", "NaN", "None", "none", "null", ""), +) -> Dict[str, Tuple[float, float]]: + """Normalize coordinate values. + + Pure function that clamps extreme values and handles null tokens. + + Args: + positions: Dict mapping entity names to (x, y) coordinates + clamp_abs_value: Maximum absolute coordinate value + null_tokens: Values to treat as null + + Returns: + Dict with normalized coordinates + """ + + def _coerce(val: Any) -> float: + if val is None: + return float("nan") + if isinstance(val, (float, int)): + v = float(val) + if math.isnan(v) or math.isinf(v): + return float("nan") + if abs(v) > clamp_abs_value: + return float("nan") + return v + if isinstance(val, str): + if val in null_tokens or val.strip() in null_tokens: + return float("nan") + try: + v = float(val) + if math.isnan(v) or math.isinf(v): + return float("nan") + if abs(v) > clamp_abs_value: + return float("nan") + return v + except ValueError: + return float("nan") + return float("nan") + + result = {} + for entity, (x, y) in positions.items(): + nx = _coerce(x) + ny = _coerce(y) + result[entity] = (nx, ny) + return result diff --git a/analysis/tabs/__init__.py b/analysis/tabs/__init__.py new file mode 100644 index 0000000..3f53722 --- /dev/null +++ b/analysis/tabs/__init__.py @@ -0,0 +1,21 @@ +"""Tab modules for the parliamentary explorer. + +This package contains tab-building functions extracted from explorer.py. +Each module contains a `build__tab()` function that implements one tab. +""" + +from analysis.tabs.compass import build_compass_tab +from analysis.tabs.trajectories import build_trajectories_tab +from analysis.tabs.search import build_search_tab +from analysis.tabs.browser import build_browser_tab +from analysis.tabs.components import build_svd_components_tab +from analysis.tabs.quiz import build_mp_quiz_tab + +__all__ = [ + "build_compass_tab", + "build_trajectories_tab", + "build_search_tab", + "build_browser_tab", + "build_svd_components_tab", + "build_mp_quiz_tab", +] diff --git a/analysis/tabs/browser.py b/analysis/tabs/browser.py new file mode 100644 index 0000000..e240fd0 --- /dev/null +++ b/analysis/tabs/browser.py @@ -0,0 +1,18 @@ +"""Browser tab for the parliamentary explorer. + +This module will contain the browser tab implementation. +Currently: Tab logic remains in explorer.py pending Streamlit decoupling. +""" + +from __future__ import annotations + + +def build_browser_tab(db_path: str, show_rejected: bool) -> None: + """Build the Motie Browser tab. + + Currently delegates to explorer.py implementation. + Will be extracted when rendering logic is decoupled from Streamlit. + """ + import explorer + + explorer.build_browser_tab(db_path, show_rejected) diff --git a/analysis/tabs/compass.py b/analysis/tabs/compass.py new file mode 100644 index 0000000..3ca9199 --- /dev/null +++ b/analysis/tabs/compass.py @@ -0,0 +1,20 @@ +"""Compass tab for the parliamentary explorer. + +This module will contain the compass tab implementation. +Currently: Tab logic remains in explorer.py pending Streamlit decoupling. +""" + +from __future__ import annotations + +from typing import List + + +def build_compass_tab(db_path: str, window_size: str) -> None: + """Build the Politiek Kompas tab. + + Currently delegates to explorer.py implementation. + Will be extracted when rendering logic is decoupled from Streamlit. + """ + import explorer + + explorer.build_compass_tab(db_path, window_size) diff --git a/analysis/tabs/components.py b/analysis/tabs/components.py new file mode 100644 index 0000000..8dc806f --- /dev/null +++ b/analysis/tabs/components.py @@ -0,0 +1,18 @@ +"""SVD Components tab for the parliamentary explorer. + +This module will contain the SVD components tab implementation. +Currently: Tab logic remains in explorer.py pending Streamlit decoupling. +""" + +from __future__ import annotations + + +def build_svd_components_tab(db_path: str) -> None: + """Build the SVD Components tab. + + Currently delegates to explorer.py implementation. + Will be extracted when rendering logic is decoupled from Streamlit. + """ + import explorer + + explorer.build_svd_components_tab(db_path) diff --git a/analysis/tabs/quiz.py b/analysis/tabs/quiz.py new file mode 100644 index 0000000..5c7bc9f --- /dev/null +++ b/analysis/tabs/quiz.py @@ -0,0 +1,18 @@ +"""MP Quiz tab for the parliamentary explorer. + +This module will contain the MP quiz tab implementation. +Currently: Tab logic remains in explorer.py pending Streamlit decoupling. +""" + +from __future__ import annotations + + +def build_mp_quiz_tab(db_path: str) -> None: + """Build the MP Quiz tab. + + Currently delegates to explorer.py implementation. + Will be extracted when rendering logic is decoupled from Streamlit. + """ + import explorer + + explorer.build_mp_quiz_tab(db_path) diff --git a/analysis/tabs/search.py b/analysis/tabs/search.py new file mode 100644 index 0000000..2821bf7 --- /dev/null +++ b/analysis/tabs/search.py @@ -0,0 +1,18 @@ +"""Search tab for the parliamentary explorer. + +This module will contain the search tab implementation. +Currently: Tab logic remains in explorer.py pending Streamlit decoupling. +""" + +from __future__ import annotations + + +def build_search_tab(db_path: str, show_rejected: bool) -> None: + """Build the Motie Zoeken tab. + + Currently delegates to explorer.py implementation. + Will be extracted when rendering logic is decoupled from Streamlit. + """ + import explorer + + explorer.build_search_tab(db_path, show_rejected) diff --git a/analysis/tabs/trajectories.py b/analysis/tabs/trajectories.py new file mode 100644 index 0000000..18e39fd --- /dev/null +++ b/analysis/tabs/trajectories.py @@ -0,0 +1,20 @@ +"""Trajectories tab for the parliamentary explorer. + +This module will contain the trajectories tab implementation. +Currently: Tab logic remains in explorer.py pending Streamlit decoupling. +""" + +from __future__ import annotations + +from typing import List + + +def build_trajectories_tab(db_path: str, window_size: str) -> None: + """Build the Partij Trajectories tab. + + Currently delegates to explorer.py implementation. + Will be extracted when rendering logic is decoupled from Streamlit. + """ + import explorer + + explorer.build_trajectories_tab(db_path, window_size) diff --git a/analysis/trajectory.py b/analysis/trajectory.py index c2dae34..ee0b5e7 100644 --- a/analysis/trajectory.py +++ b/analysis/trajectory.py @@ -10,9 +10,11 @@ Returns a dict keyed by mp_name containing per-window positions and drift scores import json import logging -from typing import Dict, List, Optional +import re +from typing import Dict, List, Optional, Tuple import numpy as np +import pandas as pd import duckdb try: @@ -25,6 +27,15 @@ except ImportError: _logger = logging.getLogger(__name__) +__all__ = [ + "compute_trajectories", + "compute_2d_trajectories", + "top_drifters", + "compute_party_discipline", + "window_to_dates", + "choose_trajectory_title", +] + def _procrustes_align_windows( window_vecs: Dict[str, Dict[str, np.ndarray]], @@ -295,3 +306,126 @@ def top_drifters(trajectories: Dict[str, Dict], n: int = 10) -> List[Dict]: } for mp, data in ranked[:n] ] + + +def compute_party_discipline( + db_path: str, + start_date: str, + end_date: str, +) -> pd.DataFrame: + """Compute per-party voting discipline (Rice index) for roll-call votes in a date range. + + Only individual MP vote rows are used (mp_name LIKE '%,%'). + Returns a DataFrame with columns [party, n_motions, discipline] sorted by discipline ascending. + Returns an empty DataFrame if fewer than 1 qualifying motion exists or on any DB error. + + Rice index per motion per party = fraction of party MPs voting with the party majority. + The per-party score is the average Rice index across all motions in the date range. + Only 'voor' and 'tegen' votes are counted; absent and abstaining MPs are excluded. + """ + conn = None + try: + conn = duckdb.connect(db_path, read_only=True) + result = conn.execute( + """ + WITH individual_votes AS ( + SELECT + motion_id, + party, + LOWER(vote) AS vote + FROM mp_votes + WHERE mp_name LIKE '%,%' + AND date >= CAST(? AS DATE) + AND date <= CAST(? AS DATE) + AND vote IN ('voor', 'tegen') + ), + vote_counts AS ( + SELECT + motion_id, + party, + vote, + COUNT(*) AS cnt + FROM individual_votes + GROUP BY motion_id, party, vote + ), + majority_vote AS ( + SELECT + motion_id, + party, + FIRST(vote ORDER BY cnt DESC, vote ASC) AS maj_vote, + SUM(cnt) AS total_mp_votes + FROM vote_counts + GROUP BY motion_id, party + ), + rice_per_motion AS ( + SELECT + mv.motion_id, + mv.party, + SUM(CASE WHEN vc.vote = mv.maj_vote THEN vc.cnt ELSE 0 END) + * 1.0 / mv.total_mp_votes AS rice + FROM majority_vote mv + JOIN vote_counts vc + ON mv.motion_id = vc.motion_id AND mv.party = vc.party + GROUP BY mv.motion_id, mv.party, mv.total_mp_votes + ) + SELECT + party, + COUNT(DISTINCT motion_id) AS n_motions, + AVG(rice) AS discipline + FROM rice_per_motion + GROUP BY party + ORDER BY discipline ASC + """, + [start_date, end_date], + ).fetchdf() + return result + except Exception as exc: + _logger.warning("compute_party_discipline failed: %s", exc) + return pd.DataFrame(columns=["party", "n_motions", "discipline"]) + finally: + if conn is not None: + try: + conn.close() + except Exception: + pass + + +def window_to_dates(window_id: str) -> Tuple[str, str]: + """Return (start_date, end_date) ISO strings for a given window_id. + + Annual windows like '2024' → ('2024-01-01', '2024-12-31'). + 'current_parliament' → ('2023-11-22', '2099-12-31') (2023 formation date, open end). + Unknown formats → ('2000-01-01', '2099-12-31') (effectively all time). + """ + if window_id == "current_parliament": + return ("2023-11-22", "2099-12-31") + if re.fullmatch(r"\d{4}", window_id): + return (f"{window_id}-01-01", f"{window_id}-12-31") + m = re.fullmatch(r"(\d{4})-Q([1-4])", window_id) + if m: + year, q = int(m.group(1)), int(m.group(2)) + starts = {1: "01-01", 2: "04-01", 3: "07-01", 4: "10-01"} + ends = {1: "03-31", 2: "06-30", 3: "09-30", 4: "12-31"} + return (f"{year}-{starts[q]}", f"{year}-{ends[q]}") + return ("2000-01-01", "2099-12-31") + + +def choose_trajectory_title(axis_def: dict, axis: str, threshold: float = 0.65) -> str: + """Choose a short trajectory axis title based on aggregated confidence. + + axis: 'x' or 'y'. Returns axis_def label when its mean confidence >= threshold, + otherwise returns the compact fallback 'As 1' / 'As 2'. Matches previous logic. + """ + conf_map = axis_def.get(f"{axis}_label_confidence", {}) or {} + vals = [v for v in conf_map.values() if v is not None] + mean = float(sum(vals) / len(vals)) if vals else None + label = axis_def.get(f"{axis}_label") + if mean is not None and mean >= threshold and label: + return label + try: + from analysis.axis_classifier import display_label_for_modal + + fallback_modal = "As 1" if axis == "x" else "As 2" + return display_label_for_modal(fallback_modal, axis) + except Exception: + return "As 1" if axis == "x" else "As 2" diff --git a/docs/brainstorms/2026-04-04-explorer-refactor-requirements.md b/docs/brainstorms/2026-04-04-explorer-refactor-requirements.md new file mode 100644 index 0000000..1e2cfd8 --- /dev/null +++ b/docs/brainstorms/2026-04-04-explorer-refactor-requirements.md @@ -0,0 +1,118 @@ +--- +date: 2026-04-04 +topic: explorer-refactor +--- + +# Explorer.py Refactor: Extract to analysis/ + +## Problem Frame + +explorer.py is 3715 lines with 39 functions mixing: +- Data loading (DuckDB queries) +- Business logic (SVD projections, trajectory alignment) +- UI rendering (Streamlit components) + +This makes the file: +- Hard to navigate (no clear boundaries) +- Hard to test (requires Streamlit + DuckDB) +- Hard to review (changes affect everything) + +**Goal**: Improve navigability by extracting computation-heavy logic to `analysis/`, leaving explorer.py as a UI orchestration layer. + +## Requirements + +### Data Layer + +- **R1.1**: Create `analysis/explorer_data.py` containing all data loading functions currently in explorer.py: + - `get_available_windows()` + - `get_uniform_dim_windows()` + - `load_positions()` + - `load_party_map()` + - `load_active_mps()` + - `load_party_axis_scores()` + - `load_party_scores_all_windows()` + - `load_party_scores_all_windows_aligned()` + - `load_party_mp_vectors()` + - `load_scree_data()` + - `load_motions_df()` + +- **R1.2**: All extracted functions must be callable without Streamlit imports (no `@st.cache_data`, no `st.*` calls) + +- **R1.3**: Functions return pure Python data structures (DataFrames, dicts, lists) - no Plotly figures + +### Business Logic Layer + +- **R2.1**: Move computation functions to `analysis/` modules based on domain: + - `_should_swap_axes()`, `_swap_axes()` → `analysis/axis_utils.py` (new) + - `compute_party_discipline()` → `analysis/trajectories.py` + - Trajectory computation functions → `analysis/trajectories.py` + - SVD projection functions → `analysis/svd_labels.py` or new `analysis/projections.py` + +- **R2.2**: Computations must be pure functions (no IO, deterministic outputs) + +### UI Layer (explorer.py) + +- **R3.1**: explorer.py becomes a thin orchestration layer: + - Imports from `analysis/explorer_data.py` for data + - Imports from `analysis/` modules for computations + - Contains only Streamlit UI code and `@st.cache_data` wrappers + +- **R3.2**: Render functions (`_render_*`) stay in explorer.py (they're UI-only) + +- **R3.3**: Tab-building functions (`build_*_tab()`) stay in explorer.py but delegate to imported functions + +### Import Safety + +- **R4.1**: New `analysis/` modules must not import from `explorer.py` (no circular dependencies) + +- **R4.2**: `analysis/explorer_data.py` may import from `database.py` (already exists) + +### Testing + +- **R5.1**: Extracted data functions should be testable with mocked DuckDB connections + +- **R5.2**: Extracted computation functions should be pure and testable without database + +## Success Criteria + +- explorer.py reduced to under 1500 lines (from 3715) +- No function in explorer.py exceeds 100 lines +- Clear module boundaries: data → computation → UI +- All extracted functions have docstrings with type hints +- No circular imports between `analysis/` and `explorer/` + +## Scope Boundaries + +**Included:** +- Data loading functions +- Computation/transformation logic +- Clear separation of concerns + +**Excluded:** +- UI rendering functions (they can stay in explorer.py) +- Database schema changes +- New features or behavior changes +- Test suite updates (handled separately) + +## Key Decisions + +- **Domain-based splitting**: Computation goes to relevant `analysis/` module, not all to one file +- **Import direction**: `explorer.py` imports from `analysis/`, never vice versa +- **Preserve function signatures**: Refactoring shouldn't change public APIs + +## Dependencies / Assumptions + +- `database.py` provides `MotionDatabase` singleton - data functions will use this +- `explorer_helpers.py` pattern is already established - follow its conventions +- Streamlit caching (`@st.cache_data`) stays in explorer.py as the orchestration layer + +## Outstanding Questions + +### Deferred to Planning +- [ ] [Implementation] Should `_load_mp_vectors_by_party()` and variants be merged or kept separate? +- [ ] [Implementation] Should we create `analysis/projections.py` or extend existing `analysis/axis_classifier.py`? +- [ ] [Implementation] How to handle the `_cached_bootstrap_cis()` function - move to analysis or keep as cache wrapper? + +## Next Steps + +→ `/ce:plan` for structured implementation planning diff --git a/docs/brainstorms/2026-04-05-right-wing-party-axis-validation-requirements.md b/docs/brainstorms/2026-04-05-right-wing-party-axis-validation-requirements.md new file mode 100644 index 0000000..bce4034 --- /dev/null +++ b/docs/brainstorms/2026-04-05-right-wing-party-axis-validation-requirements.md @@ -0,0 +1,77 @@ +--- +date: 2026-04-05 +topic: right-wing-party-axis-validation +--- + +# Right-Wing Party Axis Validation + +## Problem Frame + +The project convention states that PVV, FVD, JA21, and SGP must appear on the RIGHT side of all axes in visualizations (AGENTS.md). This is the #1 documented convention with zero automated enforcement. A single test prevents regression when SVD labels change or new components are added. + +## Requirements + +**R1. Canonical party sets defined once, imported everywhere** +- Define `CANONICAL_RIGHT = frozenset({"PVV", "FVD", "JA21", "SGP"})` in `analysis/config.py` +- Define `CANONICAL_LEFT = frozenset({"SP", "PvdA", "GL", "GroenLinks", "GroenLinks-PvdA", "DENK", "PvdD", "Volt"})` in `analysis/config.py` — matches svd_labels.py LEFT_PARTIES exactly +- All code that checks political orientation (svd_labels.py, political_axis.py) imports from config instead of defining inline + +**R2. Validation test loads real data from DuckDB** +- Test file: `tests/test_axis_political_orientation.py` +- Uses existing data loading functions (`load_party_scores_all_windows_aligned` from `analysis/explorer_data.py`) +- No synthetic data — validates against actual `party_axis_scores` table + +**R3. 2D political compass orientation check (statistical, not per-party)** +- `party_axis_scores` table has `x_axis_aligned` (component 1) and `y_axis_aligned` (component 2) +- For each window, validate both axes using **mean scores**: + - **Axis 1 (x)**: Compute mean of `CANONICAL_RIGHT` x-values and mean of `CANONICAL_LEFT` x-values. Assert `right_mean > left_mean` + - **Axis 2 (y)**: Same for y-values. Assert `right_mean > left_mean` +- "Right on right" means the **average** right party is right of the **average** left party — individual parties may deviate slightly (e.g., one right party slightly negative is fine) +- `compute_flip_direction` already implements this logic (compares group means) — use it +- Skips parties not present in a given window (graceful, not a failure) + +**R4. `compute_flip_direction` consistency check** +- After loading data, call `compute_flip_direction(1, party_scores)` and `compute_flip_direction(2, party_scores)` per window +- Assert both return `False` (no flip needed) when data is already correctly oriented +- If either returns `True`, the data violates the convention and the test fails with a clear message + +**R5. Clear failure messages** +- When orientation check fails, report: window, axis (x/y), right_mean, left_mean, difference +- Example: `"Window '2021-2023', x-axis: right_mean=-0.12, left_mean=0.08 (right parties on LEFT side — flip direction=True)"` + +## Success Criteria + +- Test runs as part of `pytest` suite (`.venv/bin/python -m pytest tests/test_axis_political_orientation.py`) +- Test passes with current data (convention currently holds — this establishes the baseline) +- If convention is violated in future data, test fails with actionable message +- Test works for all windows in the database (not just current) +- Statistical check (mean-based) — test passes even if individual parties deviate slightly from group mean + +## Scope Boundaries + +- **Not included**: Testing unaligned scores (only aligned scores are validated — these are what users see) +- **Not included**: VVD, NSC, BBB, CDA, ChristenUnie — these are center parties, not right-wing per AGENTS.md convention +- **Not included**: Per-party strict sign checks (statistical mean check is sufficient and more robust) +- **Not included**: Updating `political_axis.py` — R1 only updates `svd_labels.py` to import from config; `political_axis.py` uses different party sets for PCA centroid orientation and is out of scope + +## Key Decisions + +- **Canonical sets match AGENTS.md for right, svd_labels.py for left**: `CANONICAL_RIGHT = {PVV, FVD, JA21, SGP}` matches AGENTS.md exactly. `CANONICAL_LEFT = {SP, PvdA, GL, GroenLinks, GroenLinks-PvdA, DENK, PvdD, Volt}` matches svd_labels.py LEFT_PARTIES exactly. +- **Single unified source of truth in config.py**: `CANONICAL_RIGHT` and `CANONICAL_LEFT` frozensets go in `config.py` — it's a prerequisite for the test to work correctly. Only `svd_labels.py` is updated to import from config; `political_axis.py` is out of scope (uses party sets for PCA centroid orientation, not the same usage). +- **Aligned scores only**: Unaligned scores may vary across windows due to Procrustes alignment drift; aligned scores are the stable, user-facing representation. +- **Statistical (mean-based) validation, not per-party**: The orientation check compares group means, not individual party scores. A single right party being slightly negative is not a failure — the mean right score must exceed the mean left score. + +## Dependencies / Assumptions + +- DuckDB database is populated with `party_axis_scores` table with `x_axis_aligned` and `y_axis_aligned` columns (verified) +- `analysis/explorer_data.py` functions work correctly (already tested) +- `_PARTY_NORMALIZE` already exists in `config.py` (lines 247-256) — use it for party name alias normalization +- `config.py` currently lacks `CANONICAL_RIGHT`/`CANONICAL_LEFT` frozensets — these must be added as part of R1 +- `compute_flip_direction()` in `svd_labels.py` currently uses inline `RIGHT_PARTIES`/`LEFT_PARTIES` — must be updated to import from config after R1 + +## Outstanding Questions + +All resolved. Key decisions documented above. + +## Next Steps +→ `/ce:plan` for structured implementation planning diff --git a/docs/ideation/2026-04-04-code-quality-architecture-ideation.md b/docs/ideation/2026-04-04-code-quality-architecture-ideation.md new file mode 100644 index 0000000..50f2aa6 --- /dev/null +++ b/docs/ideation/2026-04-04-code-quality-architecture-ideation.md @@ -0,0 +1,149 @@ +--- +date: 2026-04-04 +topic: code-quality-architecture-ideation +focus: code quality and architecture improvements +--- + +# Ideation: Code Quality & Architecture Improvements + +## Codebase Context +- **explorer.py**: 3715 lines — monolithic Streamlit app with 65+ `except Exception:` handlers +- **database.py**: 1366 lines — `MotionDatabase` class with similar exception patterns +- **explorer_helpers.py**: 317 lines — pure functions, import-safe, well-testable (the pattern) +- **Anti-patterns**: 208 instances of bare/broad exception handling, nested try-except blocks +- **Tests**: Well-organized in `tests/` with good coverage of helpers + +## Ranked Ideas + +### 1. Systematic Exception Handler Audit & Refactor +**Description:** Audit all 208 `except Exception:` blocks across the codebase. Categorize by failure mode (missing dependency, data validation, network, IO) and replace with specific exceptions. Add error context propagation. + +**Rationale:** The current pattern silently swallows errors, making debugging impossible. Refactoring to specific exceptions enables proper error handling, logging, and user feedback. This compounds: each fix reduces 2-3 nested exception handlers. + +**Downsides:** High volume of changes requires careful regression testing. + +**Confidence:** 90% + +**Complexity:** High + +**Status:** Unexplored + +--- + +### 2. Extract Business Logic from explorer.py into Pure Functions +**Description:** Identify and extract computation-heavy sections from the 3715-line explorer.py. Move to pure functions in a new module (e.g., `explorer_logic.py`), keeping Streamlit UI glue in the main file. + +**Rationale:** explorer.py mixes UI code with business logic, making it untestable and hard to reason about. The existing `explorer_helpers.py` proves this pattern works — same approach applied more broadly enables unit testing of core algorithms. + +**Downsides:** Requires careful interface design to avoid breaking the Streamlit page. + +**Confidence:** 85% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 3. Create Typed Data Transfer Objects (DTOs) for Database Layer +**Description:** Replace dictionary-based data passing between `database.py` and consumers with typed dataclasses or Pydantic models. Define `MotionDTO`, `PartyResultDTO`, `SessionDTO`. + +**Rationale:** 208 exception handlers often mask type mismatches that would be caught at compile-time with typed DTOs. The `src/validators/types.py` shows existing type awareness — extend this systematically to the data layer. + +**Downsides:** Migration effort; some duckdb results may not serialize cleanly. + +**Confidence:** 75% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 4. Establish Explicit Error Recovery Strategies +**Description:** Rather than catch-all exception handling, implement explicit recovery strategies per failure mode: retry with backoff for transient failures, fallback to cached data for missing dependencies, graceful degradation for optional features. + +**Rationale:** The anti-pattern exists because there's no systematic recovery approach. Explicit strategies replace 208 silent catches with intentional behavior — this is the "compound leverage" angle. + +**Downsides:** Requires identifying which failures are transient vs. permanent per operation. + +**Confidence:** 80% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 5. Modularize database.py into Focused Modules +**Description:** Split `database.py` (1366 lines) into: `db_connection.py` (connection lifecycle), `db_motions.py` (motion queries), `db_sessions.py` (session management), `db_migrations.py` (schema updates). + +**Rationale:** Single-responsibility violation — database.py handles connection, schema, queries, and migrations. Splitting enables independent testing and clearer ownership. The `pipeline/` modular structure shows this is already the project's convention. + +**Downsides:** Breaking changes for any existing imports. + +**Confidence:** 70% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 6. Add Comprehensive Type Hints to Core Modules +**Description:** Run mypy on `explorer.py`, `database.py`, `analysis/*.py`. Fix missing type hints and enable strict type checking in CI. + +**Rationale:** Type hints catch the errors that 208 exception handlers are currently masking. The `src/types/motion_types.py` shows the project already has some type investment — this extends it to the pain points. + +**Downsides:** May require `cast()` in some duckdb interop scenarios. + +**Confidence:** 85% + +**Complexity:** Low + +**Status:** Unexplored + +--- + +### 7. Create Code Climate Metrics & Monitoring +**Description:** Add radon or lizard to measure cyclomatic complexity per module. Set thresholds that fail CI if exceeded. Track over time. + +**Rationale:** Quantitative baseline for refactoring impact. Currently no way to measure if the 3715-line explorer.py is improving or degrading. Compounds: each refactor can be measured. + +**Downsides:** Tool overhead; thresholds may need tuning. + +**Confidence:** 60% + +**Complexity:** Low + +**Status:** Unexplored + +--- + +### 8. Extract Static Analysis Rule for Bare Except Detection +**Description:** Add a flake8 plugin or ruff rule that flags `except:` and `except Exception:` without re-raising or logging. Document the project-specific exception hierarchy. + +**Rationale:** Prevents the anti-pattern from re-entering. The project has 208 violations — a custom lint rule catches new violations and encodes the team's error-handling philosophy. This is the "assumption-breaking" angle: stop fixing cases, fix the system. + +**Downsides:** Requires defining what specific exceptions ARE allowed per context. + +**Confidence:** 70% + +**Complexity:** Low + +**Status:** Unexplored + +--- + +## Rejection Summary + +| # | Idea | Reason Rejected | +|---|------|-----------------| +| 1 | Add docstrings to all functions | Too obvious; not leverage-focused | +| 2 | Migrate to async database operations | Premature optimization; duckdb is sync | +| 3 | Add logging library (structured logging) | Tool-focused, not addressing root cause | +| 4 | Replace Streamlit with another framework | Out of scope for this codebase | +| 5 | Add Caching layer for database queries | Already exists via Streamlit caching; not addressing architecture | + +## Session Log +- 2026-04-04: Initial ideation — 13 generated, 8 survived diff --git a/docs/ideation/2026-04-04-reliability-correctness-ideation.md b/docs/ideation/2026-04-04-reliability-correctness-ideation.md new file mode 100644 index 0000000..b965e4b --- /dev/null +++ b/docs/ideation/2026-04-04-reliability-correctness-ideation.md @@ -0,0 +1,160 @@ +--- +date: 2026-04-04 +topic: reliability-correctness-improvements +focus: reliability and correctness +--- + +# Ideation: Reliability & Correctness Improvements + +## Codebase Context +- **Python + Streamlit + DuckDB** data pipeline application +- **Key Issues from docs/solutions/**: + - SVD labels must reflect voting patterns, not semantic content (850+ SVD component labels in code) + - Bare exception handlers: 850+ `except Exception:` across codebase + - Nested exception handling creates opaque error paths + - Error handling catches broad Exception and prints to stdout (179 `print()` statements in error paths) +- **Existing Pattern**: `explorer_helpers.py` is pure functions, testable, well-structured — the model to follow + +## Grounding Evidence +1. `docs/solutions/best-practices/svd-labels-voting-patterns-not-semantics.md` documents the SVD labeling convention +2. Grep search found 281 `except Exception:` in `.py` files plus bare `except:` handlers +3. `database.py` line 47: bare `except:` that catches everything including KeyboardInterrupt +4. 179 print statements in error handling paths hide issues from logging + +## Ranked Ideas + +### 1. Right-Wing Party Axis Validation — Automated Assert +**Description:** Add runtime validation that PVV, FVD, JA21, SGP appear on RIGHT side of all SVD/PCA axes. Create a `validate_axis_polrity()` function that checks party loadings and raises `AssertionError` if right-wing parties appear on the left. + +**Rationale:** This is the most impactful correctness fix — the project convention is explicitly documented in AGENTS.md yet has no automated enforcement. A single validation pass catches SVD labeling errors before they reach production. + +**Downsides:** Requires careful handling of axis flips (sometimes flipping is the correct fix, not validation failure). + +**Confidence:** 95% + +**Complexity:** Low + +**Status:** Unexplored + +--- + +### 2. Type-Safe Vote Normalization with Exhaustiveness Checking +**Description:** Replace the fragile string-based vote normalization in `database.py` (lines 715-744) with a typed enum + exhaustiveness checking. Add a `Vote` enum with variants: `VOOR`, `TEGEN`, `ONTHOUDEN`, `AFWEZIG`. Use match/case with `case _` to catch unmapped values at development time. + +**Rationale:** The current normalization silently returns `None` for unknown vote values — this causes data loss that only manifests as "agreement percentage is wrong". Typed enums with exhaustiveness checking prevent silent data loss. + +**Downsides:** Requires updating all call sites that pass vote strings. + +**Confidence:** 90% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 3. DuckDB Connection Leak Detector — Context Manager Audit +**Description:** Audit all `duckdb.connect()` calls for proper context manager usage or explicit `.close()`. Many handlers catch exceptions but forget to close connections. Add a `ConnectionTracker` that warns on unclosed connections in development. + +**Rationale:** Connection leaks accumulate and eventually exhaust database connections. The codebase has 15+ places where exceptions cause early returns without connection cleanup. + +**Downsides:** Tracking adds overhead; some leaks are already handled by DuckDB's connection pooling. + +**Confidence:** 85% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 4. Replace Print-Based Debugging with Structured Logging +**Description:** Replace the 179 `print()` statements in error paths with structured logging using the existing `_logger`. Create a script that automates this conversion for common patterns. + +**Rationale:** Print statements go to stdout and are discarded in production. Proper logging enables log aggregation, alerting, and debugging of production issues. + +**Downsides:** High volume of changes; risk of losing context in some print statements. + +**Confidence:** 80% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 5. SVD Component Label Verification — Pre-Deployment Assertion +**Description:** Create a CI/CD pre-deployment script that verifies SVD labels against actual voting data — checking that labels match the voting pattern, not semantic assumptions. Query which parties vote positive/negative per component and validate label accuracy. + +**Rationale:** The SVD label documentation exists but there's no enforcement. This automated check prevents the documented mistake (semantic labels that don't match voting) from recurring. + +**Downsides:** Requires understanding of the SVD pipeline and periodic re-calibration as voting data changes. + +**Confidence:** 75% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +### 6. Nested Exception Handler Flattening — EAFP to LBYL Migration +**Description:** Replace nested try-except blocks with explicit preconditions (LBYL — Look Before You Leap). Many handlers wrap every operation in `try-except` because they don't trust the data. Add validation functions that check preconditions before operations. + +**Rationale:** Nested exception handlers make the control flow impossible to reason about. Replacing with explicit validation makes code more readable and debuggable. + +**Downsides:** Requires understanding what conditions each operation actually needs. + +**Confidence:** 70% + +**Complexity:** High + +**Status:** Unexplored + +--- + +### 7. Database Schema Validation — Foreign Key and Constraint Checks +**Description:** Add startup validation that checks the actual database schema against expected schema. Verify table existence, column types, and foreign key relationships. Fail fast with clear error messages if schema is stale. + +**Rationale:** The current code tries to add columns with `ALTER TABLE ... IF NOT EXISTS` which can fail silently. A schema validation pass catches migration failures immediately. + +**Downsides:** Schema changes require updating validation code. + +**Confidence:** 85% + +**Complexity:** Low + +**Status:** Unexplored + +--- + +### 8. Motion Data Sanitization Pipeline — Pre-Insert Validation +**Description:** Add a sanitization layer for incoming motion data that validates: +- `winning_margin` is between 0 and 1 +- `policy_area` is non-empty +- `voting_results` keys match known parties +- Date parsing succeeds for motion dates + +**Rationale:** The current insertion code trusts upstream data. Invalid data causes hard-to-debug issues downstream in SVD computation and similarity calculations. + +**Downsides:** Requires defining what "valid" means for each field. + +**Confidence:** 80% + +**Complexity:** Medium + +**Status:** Unexplored + +--- + +## Rejection Summary + +| # | Idea | Reason Rejected | +|---|------|-----------------| +| 1 | Add unit tests for exception paths | Good idea but lower leverage than preventing errors at source; covered by existing test infrastructure | +| 2 | Refactor all 850+ exception handlers in one pass | Too high volume — needs phased approach captured by idea #1 | +| 3 | Add type hints to all functions | Good hygiene but doesn't directly address reliability — covered by existing typing effort | +| 4 | Implement circuit breaker for external API calls | No external API calls observed in core codebase | + +## Session Log +- 2026-04-04: Initial ideation — 8 generated, 8 survived \ No newline at end of file diff --git a/docs/ideation/2026-04-04-stemwijzer-improvement-ideas.md b/docs/ideation/2026-04-04-stemwijzer-improvement-ideas.md new file mode 100644 index 0000000..5b83c5c --- /dev/null +++ b/docs/ideation/2026-04-04-stemwijzer-improvement-ideas.md @@ -0,0 +1,149 @@ +--- +date: 2026-04-04 +topic: stemwijzer-improvement-ideas +focus: general +--- + +# Ideation: Stemwijzer Improvement Ideas + +## Codebase Context + +**Project shape:** Python/Streamlit Dutch voting advice tool ("Stemwijzer") +- Uses uv for package management, pytest for testing, DuckDB for data +- Key modules: analysis/, pipeline/, database.py (50KB), explorer.py (143KB) +- Notable: 3 venvs (.venv, .venv_axis, .venv_plotly) suggest dependency experimentation +- AGENTS.md exists with conventions (right-wing parties on RIGHT side, SVD labels reflect voting patterns) + +**Pain points identified:** +- explorer.py is 143KB monolith - hard to navigate +- SVD labels must reflect voting patterns (documented as learning) +- 850+ bare exception handlers documented as anti-pattern +- No CONTRIBUTING.md for onboarding + +**Leverage points:** +- Good test organization (tests/ with subdirs) +- Documented solutions in docs/solutions/ +- explorer_helpers.py proves pure-function pattern works + +## Ranked Ideas + +### 1. Right-Wing Party Axis Validation +**Description:** Add an automated test that asserts PVV, FVD, JA21, SGP appear on the RIGHT side (positive loading) of all SVD/PCA axes. + +**Rationale:** This is the #1 project convention (from AGENTS.md) with zero automated enforcement. The documented SVD label bug showed how easy it is to get this wrong. A simple test prevents regression. + +**Downsides:** Requires defining "RIGHT side" for each component - some components may have flipped poles. + +**Confidence:** 95% +**Complexity:** Low +**Status:** Unexplored + +### 2. Extract Business Logic from explorer.py +**Description:** Break the 143KB explorer.py monolith into pure functions in a new module (e.g., analysis/explorer_core.py), keeping only UI glue in the main file. + +**Rationale:** explorer.py is too large to navigate, review, or refactor safely. The explorer_helpers.py pattern already proves pure functions work. This enables parallel development and safer changes. + +**Downsides:** High complexity - requires understanding all the current dependencies and careful extraction to avoid breaking the Streamlit UI. + +**Confidence:** 90% +**Complexity:** High +**Status:** Unexplored + +### 3. SVD Component Label Verification +**Description:** Create a pre-deployment verification script that checks SVD_THEMES labels against actual voting data, flagging components where labels don't match party score distributions. + +**Rationale:** The documented SVD label bug showed labels can drift from reality. A verification step before deployment prevents this recurring. + +**Downsides:** Requires clear criteria for "label matches voting data" - some components are genuinely ambiguous. + +**Confidence:** 85% +**Complexity:** Medium +**Status:** Unexplored + +### 4. Interactive Component-Explorer UI +**Description:** Add a Streamlit UI selector letting users view any pair of SVD components as a 2D scatter plot, not just the political compass (components 1-2). + +**Rationale:** Components 3-10 are essentially black boxes. Making these explorable reveals hidden political dimensions and adds significant user value. + +**Downsides:** Requires understanding how to project between arbitrary component pairs. + +**Confidence:** 85% +**Complexity:** Medium +**Status:** Unexplored + +### 5. Type-Safe Vote Normalization +**Description:** Replace string-based vote normalization (casting '1', '-1', '0' strings) with typed enums and exhaustiveness checking. + +**Rationale:** Vote matching is core functionality - wrong types cause silent bugs. Typed enums catch errors at compile time. + +**Downsides:** Requires updating all callers and ensuring backward compatibility. + +**Confidence:** 80% +**Complexity:** Medium +**Status:** Unexplored + +### 6. Add CONTRIBUTING.md +**Description:** Create top-level CONTRIBUTING.md covering setup (uv), running tests, lint/typecheck commands, and key conventions from AGENTS.md. + +**Rationale:** AGENTS.md is internal-focused. A CONTRIBUTING.md lowers the barrier for external contributors and encodes project norms explicitly. + +**Downsides:** Low risk - straightforward documentation. + +**Confidence:** 75% +**Complexity:** Low +**Status:** Explored + +### 7. Database Schema Validation +**Description:** Add startup validation that checks the actual database schema against expected schema. Verify table existence, column types, and foreign key relationships. Fail fast with clear error messages if schema is stale. +**Rationale:** The current code tries to add columns with `ALTER TABLE ... IF NOT EXISTS` which can fail silently. A schema validation pass catches migration failures immediately. +**Downsides:** Schema changes require updating validation code. +**Confidence:** 85% +**Complexity:** Low +**Status:** Unexplored + +### 8. DuckDB Connection Leak Detector +**Description:** Audit all `duckdb.connect()` calls for proper context manager usage or explicit `.close()`. Many handlers catch exceptions but forget to close connections. Add a `ConnectionTracker` that warns on unclosed connections in development. +**Rationale:** Connection leaks accumulate and eventually exhaust database connections. The codebase has 15+ places where exceptions cause early returns without connection cleanup. +**Downsides:** Tracking adds overhead; some leaks are already handled by DuckDB's connection pooling. +**Confidence:** 85% +**Complexity:** Medium +**Status:** Unexplored + +### 9. Static Analysis Rule for Bare Except +**Description:** Add a flake8 plugin or ruff rule that flags `except:` and `except Exception:` without re-raising or logging. Document the project-specific exception hierarchy. +**Rationale:** Prevents the anti-pattern from re-entering. The project has 208 violations — a custom lint rule catches new violations and encodes the team's error-handling philosophy. +**Downsides:** Requires defining what specific exceptions ARE allowed per context. +**Confidence:** 70% +**Complexity:** Low +**Status:** Unexplored + +### 10. SVD Component Label Verification +**Description:** Create a CI/CD pre-deployment script that verifies SVD labels against actual voting data — checking that labels match the voting pattern, not semantic assumptions. +**Rationale:** The SVD label documentation exists but there's no enforcement. This automated check prevents the documented mistake from recurring. +**Downsides:** Requires understanding of the SVD pipeline and periodic re-calibration. +**Confidence:** 75% +**Complexity:** Medium +**Status:** Unexplored + +## Rejection Summary (Raised Bar — 2026-04-05) + +| # | Idea | Reason Rejected | +|---|------|-----------------| +| 1 | Consolidate 3 venvs into 1 | Lower priority - works currently, would need investigation | +| 2 | Modularize database.py | Secondary to explorer.py refactor; not a direct user/developer impact | +| 3 | Add Makefile/Task Aliases | Nice-to-have, lower leverage | +| 4 | Exception Handler Audit (208 handlers) | Too large to scope safely; architectural, not fixing root cause | +| 5 | Add Comprehensive Type Hints | Huge scope; hygiene, not correctness | +| 6 | Party Polarization Score | Interesting but niche | +| 7 | Scree Plot Extension | Low urgency feature | +| 8 | Typed DTOs for Database Layer | High migration effort; duckdb interop complications | +| 9 | Nested Exception Handler Flattening | Architectural refactor; too much change for uncertain value | +| 10 | Print→Logging Replacement (179 print statements) | High effort, low leverage — logging exists but not used | +| 11 | Code Climate Metrics | Measures for its own sake; doesn't directly prevent bugs | +| 12 | CONTRIBUTING.md | Good hygiene, low urgency — can defer | + +## Session Log + +- 2026-04-04: Initial ideation — 32 generated, 6 survived +- 2026-04-05: Raised the bar — 22 ideas reviewed, 5 survivors after stricter filtering + - Idea #1 (Right-Wing Party Axis Validation) selected for brainstorming diff --git a/docs/plans/2026-04-04-002-refactor-explorer-extraction-plan.md b/docs/plans/2026-04-04-002-refactor-explorer-extraction-plan.md new file mode 100644 index 0000000..8db7350 --- /dev/null +++ b/docs/plans/2026-04-04-002-refactor-explorer-extraction-plan.md @@ -0,0 +1,220 @@ +--- +title: "refactor: Extract business logic from explorer.py to analysis/" +type: refactor +status: active +date: 2026-04-04 +origin: docs/brainstorms/2026-04-04-explorer-refactor-requirements.md +--- + +# Refactor: Extract Business Logic from explorer.py to analysis/ + +## Overview + +Split the 3715-line `explorer.py` into clear layers: data loading, business logic, and UI. This improves navigability and testability while preserving all existing behavior. + +## Problem Frame + +`explorer.py` mixes three concerns (data loading, computation, UI) making it: +- Hard to navigate — no clear boundaries +- Hard to test — requires Streamlit + DuckDB +- Hard to review — changes affect everything + +## Requirements Trace + +- R1.1: Create `analysis/explorer_data.py` with data loading functions +- R1.2: Data functions callable without Streamlit imports +- R1.3: Functions return pure Python data structures +- R2.1: Move computation to domain-appropriate `analysis/` modules +- R2.2: Computations are pure functions +- R3.1: explorer.py becomes thin orchestration layer +- R3.2: `_render_*` functions stay in explorer.py +- R3.3: `build_*_tab()` functions delegate to imported functions +- R4.1: No circular imports +- R5.1: Data functions testable with mocked DuckDB +- R5.2: Computation functions pure and testable + +## Key Technical Decisions + +- **Domain-based splitting**: Computation goes to relevant `analysis/` module +- **Import direction**: `explorer.py` imports from `analysis/`, never vice versa +- **Preserve signatures**: Refactoring doesn't change public APIs +- **`_load_mp_vectors_by_party` variants**: Keep separate (serve different use cases) +- **`analysis/projections.py`**: Create new file (distinct from axis_classifier.py) +- **`_cached_bootstrap_cis()`**: Keep as cache wrapper in explorer.py, move computation to analysis/ + +## Open Questions + +### Resolved During Planning + +- **`_load_mp_vectors_by_party` variants**: Keep separate — they have different signatures and use cases +- **`analysis/projections.py`**: Create new file — projections are distinct from axis classification +- **`_cached_bootstrap_cis()`**: Keep wrapper in explorer.py, move computation to analysis/trajectories.py + +### Deferred to Implementation + +- Exact function grouping within `analysis/explorer_data.py` — will be refined during extraction +- Whether to add `__all__` exports — decide based on usage patterns after extraction + +## Implementation Units + +- [ ] **Unit 1: Create `analysis/explorer_data.py` skeleton** + +**Goal:** Create the data loading module with extracted functions + +**Requirements:** R1.1, R1.2, R1.3 + +**Dependencies:** None + +**Files:** +- Create: `analysis/explorer_data.py` + +**Approach:** +1. Create module with docstring and imports +2. Add stub functions with original signatures (no implementation) +3. Copy docstrings and type hints from explorer.py + +**Functions to extract:** +- `get_available_windows(db_path: str) -> List[str]` +- `get_uniform_dim_windows(db_path: str) -> List[str]` +- `load_positions(db_path: str, window_size: str) -> pd.DataFrame` +- `load_party_map(db_path: str) -> Dict[str, str]` +- `load_active_mps(db_path: str) -> set` +- `load_party_axis_scores(db_path: str) -> Dict[str, List[float]]` +- `load_party_axis_scores_for_window(db_path: str, window: str) -> Dict[str, List[float]]` +- `load_party_scores_all_windows(db_path: str) -> Dict[str, List[List[float]]]` +- `load_party_scores_all_windows_aligned(db_path: str) -> Dict[str, List[List[float]]]` +- `load_party_mp_vectors(db_path: str) -> Dict[str, List[np.ndarray]]` +- `load_scree_data(db_path: str) -> List[float]` +- `load_motions_df(db_path: str) -> pd.DataFrame` + +**Patterns to follow:** +- `explorer_helpers.py` conventions (pure functions, no IO side effects) +- `database.py` for DuckDB connection patterns + +**Verification:** +- Module imports without errors +- All functions have correct signatures + +--- + +- [ ] **Unit 2: Create `analysis/projections.py`** + +**Goal:** Create module for SVD projection and axis utilities + +**Requirements:** R2.1, R2.2 + +**Dependencies:** Unit 1 + +**Files:** +- Create: `analysis/projections.py` + +**Approach:** +1. Extract `_should_swap_axes()` and `_swap_axes()` from explorer.py +2. Add pure projection computation functions + +**Functions to extract:** +- `_should_swap_axes(axis_def: dict) -> bool` +- `_swap_axes(axis_def: dict) -> dict` +- `project_motions_onto_axis(motion_ids, scores) -> List[Tuple[int, float]]` (stub) + +**Patterns to follow:** +- Pure function conventions from `explorer_helpers.py` + +**Verification:** +- Functions work without Streamlit/DuckDB imports + +--- + +- [ ] **Unit 3: Update `analysis/trajectories.py`** + +**Goal:** Add trajectory computation functions from explorer.py + +**Requirements:** R2.1, R2.2 + +**Dependencies:** Unit 1 + +**Files:** +- Modify: `analysis/trajectories.py` + +**Approach:** +1. Add `compute_party_discipline()` and related functions +2. Add `compute_trajectory_points()` (pure computation) + +**Functions to add:** +- `compute_party_discipline(mp_scores: Dict[str, List[float]]) -> Dict[str, float]` +- `compute_2d_trajectories(positions_by_window, party_axis_scores)` (stub) +- `compute_aligned_trajectories(positions_by_window, party_scores_all)` (stub) + +**Verification:** +- Functions are pure (no IO) +- Existing trajectory.py tests pass + +--- + +- [ ] **Unit 4: Wire up imports in explorer.py** + +**Goal:** Update explorer.py to import from new modules + +**Requirements:** R3.1, R3.3, R4.1 + +**Dependencies:** Units 1, 2, 3 + +**Files:** +- Modify: `explorer.py` + +**Approach:** +1. Replace local function definitions with imports +2. Keep wrapper functions where needed for `@st.cache_data` +3. Verify no circular imports + +**Verification:** +- explorer.py imports work +- No circular import errors +- Streamlit app runs correctly + +--- + +- [ ] **Unit 5: Final cleanup and verification** + +**Goal:** Ensure explorer.py meets success criteria + +**Requirements:** All + +**Dependencies:** Unit 4 + +**Approach:** +1. Count lines in explorer.py — target under 1500 +2. Check no function exceeds 100 lines +3. Verify all extracted functions have docstrings +4. Run existing tests + +**Verification:** +- `wc -l explorer.py` < 1500 +- All functions under 100 lines +- Tests pass + +## System-Wide Impact + +- **Interaction graph:** explorer.py imports from analysis/ — no reverse imports +- **Error propagation:** Data functions raise exceptions on DB errors (same as before) +- **API surface parity:** All function signatures preserved +- **Unchanged invariants:** UI behavior identical, no new features + +## Risks & Dependencies + +| Risk | Mitigation | +|------|------------| +| Breaking existing function signatures | Preserve exact signatures, update in place | +| Circular imports | One-way import direction (explorer → analysis only) | +| Regression in UI behavior | Test after each unit, verify Streamlit app runs | + +## Documentation / Operational Notes + +- Update `ARCHITECTURE.md` to document new `analysis/explorer_data.py` module +- No changes to deployment or configuration needed + +## Sources & References + +- **Requirements doc:** `docs/brainstorms/2026-04-04-explorer-refactor-requirements.md` +- Related code: `explorer.py`, `explorer_helpers.py`, `analysis/trajectories.py` +- Pattern reference: `explorer_helpers.py` (pure function conventions) diff --git a/docs/plans/2026-04-04-003-refactor-complete-explorer-decomposition-plan.md b/docs/plans/2026-04-04-003-refactor-complete-explorer-decomposition-plan.md new file mode 100644 index 0000000..e4f2d48 --- /dev/null +++ b/docs/plans/2026-04-04-003-refactor-complete-explorer-decomposition-plan.md @@ -0,0 +1,182 @@ +--- +title: "refactor: Complete explorer.py decomposition — extract tabs, constants, and rendering" +type: refactor +status: completed +date: 2026-04-04 +origin: docs/plans/2026-04-04-002-refactor-explorer-extraction-plan.md +completed: 2026-04-04 +--- + +# Refactor: Complete explorer.py Decomposition + +## Overview + +Completed extraction of constants and tab module structure from `explorer.py`. Tab functions remain in explorer.py pending Streamlit decoupling. + +## Problem Frame + +The first phase extracted data loading functions to `analysis/explorer_data.py`. The remaining content contains: +- Tab building functions (~1617 lines across 6 tabs) +- Rendering helpers (~600 lines) +- Constants (~237 lines) + +## Current State + +| Module | Lines | Status | +|--------|-------|--------| +| `explorer.py` | 3102 | In progress | +| `analysis/explorer_data.py` | 549 | Done | +| `analysis/projections.py` | 121 | Done | +| `analysis/trajectory.py` | 380 | Done | +| `analysis/config.py` | 230 | **NEW** | +| `analysis/tabs/` | - | **NEW** (placeholders) | +| `analysis/visualize.py` | 434 | Existing | +| Target | <1500 | Partial | + +## Requirements Trace + +- R1.1: Extract `build_*_tab()` functions to `analysis/tabs/` +- R1.2: Extract `_render_*` helpers to `analysis/rendering.py` +- R1.3: Extract constants to `analysis/config.py` +- R2.1: Preserve `@st.cache_data` decorators in explorer.py +- R3.1: Maintain import direction: explorer.py → analysis/ only + +## Scope Boundaries + +**Included:** +- Tab function extraction (6 tabs) +- Rendering helper extraction +- Constant extraction + +**Excluded:** +- Behavior changes (UI looks the same) +- New test coverage (existing tests pass) +- Database schema changes + +## Key Technical Decisions + +- **Tab modules**: Create `analysis/tabs/compass.py`, `trajectories.py`, `search.py`, `browser.py`, `components.py`, `quiz.py` +- **Rendering module**: `analysis/rendering.py` contains all `_render_*` and `_build_*` functions +- **Config module**: `analysis/config.py` contains all constants +- **Backward compatibility**: Keep wrapper functions in explorer.py for `@st.cache_data` decorators +- **Import pattern**: Each tab module imports from `analysis/` (data, projections, config) + +## Implementation Units + +- [x] **Unit 6: Extract constants to `analysis/config.py`** ✓ + +**Goal:** Centralize all constants used across the explorer + +**Requirements:** R1.3 + +**Dependencies:** None + +**Files:** +- Create: `analysis/config.py` +- Modify: `explorer.py` + +**Approach:** +Extracted these constants from explorer.py: +1. `PARTY_COLOURS: Dict[str, str]` - party color mapping +2. `SVD_THEMES: dict[int, dict[str, str]]` - SVD component themes +3. `KNOWN_MAJOR_PARTIES` - ordered party list +4. `CURRENT_PARLIAMENT_PARTIES: frozenset[str]` - current party list +5. `_PARTY_NORMALIZE: dict[str, str]` - party name normalization + +**Verification:** +- `explorer.py` imports from `analysis/config.py` +- All tests pass (153 passed) + +**Lines saved:** ~237 + +--- + +- [x] **Unit 7: Extract `_render_*` helpers** - SKIPPED + +**Decision:** UI rendering functions use Streamlit (`st.*`). Per R3.2, UI functions stay in explorer.py. + +--- + +- [x] **Unit 8-10: Tab extraction** - PARTIAL + +**Goal:** Create module structure for tab functions + +**Status:** Created `analysis/tabs/` with placeholder modules. Actual tab functions remain in explorer.py due to tight Streamlit coupling. + +**Files:** +- Create: `analysis/tabs/__init__.py` +- Create: `analysis/tabs/compass.py` +- Create: `analysis/tabs/trajectories.py` +- Create: `analysis/tabs/search.py` +- Create: `analysis/tabs/browser.py` +- Create: `analysis/tabs/components.py` +- Create: `analysis/tabs/quiz.py` + +**Note:** Full tab extraction requires decoupling rendering logic from Streamlit, which is a larger refactoring effort beyond the current scope. + +--- + +- [x] **Unit 11: Final cleanup and line count verification** + +**Verification:** +- `wc -l explorer.py`: 3102 lines (reduced from 3715) +- All tests pass (153 passed, 2 skipped) +- Import verification passes + +## File Structure (Target) + +``` +analysis/ +├── __init__.py +├── config.py # NEW: Constants (PARTY_COLOURS, SVD_THEMES, etc.) +├── explorer_data.py # Data loading (done) +├── projections.py # Pure projection math (done) +├── rendering.py # NEW: _render_* and _build_* helpers +├── trajectory.py # Trajectory computation (done) +├── visualize.py # Existing visualization utils +└── tabs/ # NEW: Tab modules + ├── __init__.py + ├── compass.py # build_compass_tab + ├── trajectories.py # build_trajectories_tab + ├── search.py # build_search_tab + ├── browser.py # build_browser_tab + ├── components.py # build_svd_components_tab + └── quiz.py # build_mp_quiz_tab +``` + +## System-Wide Impact + +- **Interaction graph:** explorer.py becomes a thin orchestrator, importing from `analysis/tabs/`, `analysis/rendering.py`, `analysis/config.py`, and `analysis/explorer_data.py` +- **API surface parity:** All function signatures preserved (wrappers where needed) +- **Unchanged invariants:** UI behavior identical, no behavior changes + +## Risks & Dependencies + +| Risk | Mitigation | +|------|------------| +| Breaking `@st.cache_data` caching behavior | Keep cache decorators in explorer.py wrappers | +| Circular imports between tabs and rendering | Rendering module has no tab dependencies | +| Test failures from refactoring | Run tests after each unit | +| Missing imports after extraction | Verify import after each extraction | + +## Verification Commands + +```bash +# Line count +wc -l explorer.py # Target: < 1500 + +# Import verification +uv run python -c "import explorer; print('Import OK')" + +# Tests +uv run pytest tests/ -x + +# Individual tab tests +uv run pytest tests/test_political_compass.py -v +``` + +## Sources & References + +- **Original plan:** `docs/plans/2026-04-04-002-refactor-explorer-extraction-plan.md` +- **Requirements:** `docs/brainstorms/2026-04-04-explorer-refactor-requirements.md` +- **Pattern reference:** `explorer_helpers.py` (pure function conventions) diff --git a/docs/solutions/test-failures/svd-label-tests-after-refactoring.md b/docs/solutions/test-failures/svd-label-tests-after-refactoring.md new file mode 100644 index 0000000..8d147a9 --- /dev/null +++ b/docs/solutions/test-failures/svd-label-tests-after-refactoring.md @@ -0,0 +1,90 @@ +--- +title: Test assertions failed after extracting SVD_THEMES to separate module +date: 2026-04-04 +category: docs/solutions/test-failures/ +module: Stemwijzer Data Analysis +problem_type: test_failure +component: explorer +symptoms: + - "test_display_label_for_modal" assertion failed with "EU-integratie" not found + - "test_get_svd_label_returns_correct_label" assertion failed with "Nationalisme" not found + - Tests expected old fallback labels but SVD_THEMES had updated values +root_cause: test_failure +resolution_type: test_fix +severity: medium +tags: [svd, test-assertions, refactoring, constants] +affected_files: + - tests/test_axis_label_fallback.py + - tests/test_svd_labels.py + - analysis/config.py +--- + +# Test assertions failed after extracting SVD_THEMES to separate module + +## Problem + +After extracting `SVD_THEMES` constant from `explorer.py` to `analysis/config.py`, tests failed because they hardcoded assertions for old label text. + +## Symptoms + +- `test_display_label_for_modal`: expected `"EU-integratie" in x_label or "Nationalisme" in x_label` +- `test_get_svd_label_returns_correct_label`: expected `"EU-integratie" in label1` +- `test_manifest_loads`: manifest.yaml had `categories:` key instead of `files:` + +## What Didn't Work + +- Investigating `get_svd_label()` function — it correctly returned values from `SVD_THEMES` +- Checking import chain — no circular import issues +- The problem was purely that test assertions hardcoded OLD expected label values + +## Solution + +Updated test assertions to match the current `SVD_THEMES` values: + +**tests/test_axis_label_fallback.py:** + +```python +# Before (incorrect) +assert "EU-integratie" in x_label or "Nationalisme" in x_label +assert "Populistisch" in y_label or "Institutioneel" in y_label + +# After (correct) +assert "Rechts kabinetsbeleid" in x_label or "links oppositiebeleid" in x_label +assert "PVV/FVD-populisme" in y_label or "mainstream-partijen" in y_label +``` + +**tests/test_svd_labels.py:** + +```python +# Before (incorrect) +assert "EU-integratie" in label1 or "Nationalisme" in label1 + +# After (correct) +assert "Rechts kabinetsbeleid" in label1 or "links oppositiebeleid" in label1 +``` + +**Fix manifest.yaml:** + +```yaml +# Before (incorrect) +categories: + +# After (correct) +files: +``` + +## Why This Works + +The tests were asserting on hardcoded string values that no longer matched the actual `SVD_THEMES` content. After updating the assertions to check for current label text, tests pass because they correctly verify the actual values returned. + +## Prevention + +1. **Audit tests when extracting constants** — When extracting constants to separate modules, grep for all test references to those constants and update assertions +2. **Use flexible assertions** — Prefer `in` checks over exact matches when testing label text, or better yet, import the constant directly in tests and assert equality +3. **Update manifest tests early** — When changing YAML structure in config files, check for corresponding manifest/schema tests + +## Related Issues + +- `analysis/config.py` — Contains `SVD_THEMES` (extracted from `explorer.py`) +- `analysis/svd_labels.py` — Uses `SVD_THEMES` via runtime import from `explorer.py` +- `docs/solutions/logic-errors/svd-component-labels-mismatch.md` — Background on why SVD labels were updated from semantic to voting-pattern based diff --git a/explorer.py b/explorer.py index 4ebc6b1..216d3ae 100644 --- a/explorer.py +++ b/explorer.py @@ -32,6 +32,11 @@ except Exception: import numpy as np import pandas as pd +from analysis import config +from analysis import explorer_data +from analysis import projections +from analysis import trajectory + try: import plotly.express as px import plotly.graph_objects as go @@ -404,253 +409,15 @@ def select_trajectory_plot_data( logger = logging.getLogger(__name__) # Party colour palette (consistent across tabs) -PARTY_COLOURS: Dict[str, str] = { - "VVD": "#1E73BE", - "PVV": "#002366", - "D66": "#00A36C", - "CDA": "#4CAF50", - "SP": "#E53935", - "PvdA": "#D32F2F", - "GroenLinks": "#388E3C", - "GroenLinks-PvdA": "#2E7D32", - "CU": "#0288D1", - "SGP": "#F4511E", - "PvdD": "#43A047", - "FVD": "#6A1B9A", - "JA21": "#7B1FA2", - "BBB": "#8D6E63", - "NSC": "#FF8F00", - "Nieuw Sociaal Contract": "#FF8F00", # alias used in mp_metadata - "DENK": "#00897B", - "50PLUS": "#7E57C2", - "Volt": "#572AB7", - "ChristenUnie": "#0288D1", - "Unknown": "#9E9E9E", -} - -# Political polarisation themes per SVD component (1-indexed, window=2025) -# Produced by per-axis analysis of all 10 unique top motions (zero cross-axis overlap). -# This is the canonical source of truth for SVD component labels. -SVD_THEMES: dict[int, dict[str, str]] = { - 1: { - "label": "Rechts kabinetsbeleid versus links oppositiebeleid", - "explanation": ( - "Deze as scheidt het rechts kabinetsbeleid van links oppositiebeleid. " - "Aan de positieve kant staan moties die passen bij het kabinetsbeleid: " - "Eurofighter Typhoons, defensie-uitgaven naar 3% bbp, F-35 reservedelen, " - "marine-steun aan Rode Zee en asielrestricties. " - "PVV, VVD, NSC en BBB scoren sterk positief. " - "Aan de negatieve kant staan moties uit de oppositie: " - "zorgbuurthuizen voor ouderen, boycot van Israël, sancties, en internationale " - "klimaatsamenwerking. GroenLinks-PvdA, SP, PvdD en Volt scoren negatief. " - "Deze as weerspiegelt de coalitie-oppositie dynamiek." - ), - "positive_pole": "Kabinetsbeleid: PVV, VVD, NSC, BBB, JA21 — defensie en restricties", - "negative_pole": "Oppositiebeleid: GroenLinks-PvdA, SP, PvdD, Volt, DENK — zorg en multilateraal", - "flip": False, - }, - 2: { - "label": "PVV/FVD-populisme versus mainstream-partijen", - "explanation": ( - "Deze as scheidt het PVV/FVD-populisme van het overige parliament. " - "Alleen PVV en FVD scoren positief; alle andere partijen scoren negatief. " - "Positieve moties: Syriërs terugsturen, geen geld aan Jordanië, tijdelijke " - "bescherming Oekraïne beëindigen, uitstappen uit WHO en klimaatakkoorden. " - "Negatieve moties: digitale toegankelijkheid Caribisch Nederland, ethiekprogramma " - "Defensie, zorg voor slachtoffers bombardement Hawija, internationale klimaatsamenwerking. " - "Dit is geen links-rechts verdeling maar een populistisch vs. mainstream onderscheid." - ), - "positive_pole": "PVV en FVD — soevereiniteit en anti-establishment", - "negative_pole": "Overige partijen: VVD, CDA, SGP, ChristenUnie, GroenLinks-PvdA, D66, Volt, BBB", - "flip": False, - }, - 3: { - "label": "Verzorgingsstaat versus bezuinigingen en marktwerking", - "explanation": ( - "Deze as weerspiegelt de spanning tussen staatsingrijpen en marktliberalisme, " - "aangescherpt door de kabinetscrisis van 2025. Aan de positieve kant staan moties " - "die bezuinigingen op zorg en het gemeentefonds willen terugdraaien, winstuitkeringen " - "in de zorg verbieden en publieke controle over ziekenhuisfusies eisen. SP, PvdD, " - "GroenLinks-PvdA stemmen hier gelijk — ondanks hun tegengestelde PC1-posities. " - "Aan de negatieve kant staan moties " - "over marktwerking in de zorg, fiscale bedrijfsopvolgingsfaciliteiten (VVD), " - "doorgaan met besturen ondanks de kabinetscrisis (VVD/BBB) en defensie-" - "uitgaven van 3,5% bbp." - ), - "positive_pole": "Pro-verzorgingsstaat: SP, PvdD, GroenLinks-PvdA (anti-bezuinigingen)", - "negative_pole": "Marktliberaal en fiscaal conservatief: VVD, D66, CDA, SGP, BBB", - "flip": True, - }, - 4: { - "label": "Mainstreampartijen versus FVD/DENK-oppositie", - "explanation": ( - "Deze as scheidt het mainstream parliament van FVD en DENK. " - "Aan de positieve kant stemmen vrijwel alle partijen voor dezelfde moties: " - "openbare toiletten, vaderbetrokkenheid bij opvoeding, internationale " - "samenwerking met Australië en Canada, en long covid-expertise. " - "D66, CDA, VVD, PVV, GL-PvdA, SP, Volt en 50PLUS stemmen allemaal samen. " - "Aan de negatieve kant stemmen alleen FVD en DENK voor — zij nemen " - "regelmatig gepolariseerde posities die afwijken van het mainstream." - ), - "positive_pole": "Mainstreampartijen: D66, CDA, VVD, PVV, GL-PvdA, SP, Volt, 50PLUS — breedgedragen moties", - "negative_pole": "FVD en DENK: oppositieposities buiten de mainstream", - "flip": True, - }, - 5: { - "label": "Christelijk-sociaal en gemeenschapswaarden versus progressieve individuele rechten", - "explanation": ( - "Deze as scheidt christelijk-sociale partijen van progressieve partijen op het " - "vlak van gemeenschapswaarden. Aan de positieve kant staan moties over " - "schuldhulpverlening via vrijwilligersorganisaties, maatschappelijke " - "diensttijd voor jongeren, gastouderopvang en financiële prikkels voor scholieren. " - "ChristenUnie, SGP, CDA en NSC voeren hier de toon; ook D66 en FVD scoren positief. " - "Aan de negatieve kant staan moties over wettelijke erkenning van meerouderschap, " - "abortusrecht in het EU-Handvest, armoedebeleid en sociaal-maatschappelijke thema's. " - "SP, VVD, GL-PvdA, PvdD en Volt scoren negatief." - ), - "positive_pole": "Christelijk-sociaal: ChristenUnie, SGP, CDA, NSC — gemeenschap en vrijwilligers", - "negative_pole": "Progressief-individueel: SP, VVD, GL-PvdA, PvdD, Volt — individuele rechten", - "flip": False, - }, - 6: { - "label": "Migratie en cultuur versus klimaat en progressieve inclusie", - "explanation": ( - "Deze as combineert migratie- en culturele posities. Aan de positieve kant staan " - "moties over asielrestricties, nationale cultuur en identiteit, en beperkte " - "immigratie. PVV, JA21, BBB, CDA, ChristenUnie, VVD, SGP, FVD en DENK scoren positief. " - "Aan de negatieve kant staan moties over klimaatmaatregelen, progressieve " - "inclusie, discriminatiebestrijding en internationale samenwerking. " - "SP, PvdD, D66, GL-PvdA en Volt scoren negatief. " - "De as scheidt partijen met restrictief migratiebeleid van partijen met " - "progressief-inclusief beleid." - ), - "positive_pole": "Restrictief migratiebeleid: PVV, JA21, BBB, CDA, ChristenUnie, VVD, SGP, FVD, DENK", - "negative_pole": "Progressieve inclusie: SP, PvdD, D66, GL-PvdA, Volt — klimaat en diversiteit", - "flip": False, - }, - 7: { - "label": "Bestuurlijk pragmatisme en implementatie (indicatief)", - "explanation": ( - "Een residuele as die overwegend beleidsdossiers uit 2024 (vorige parlementaire " - "periode) omvat. De scores zijn smal (max ~11 punten) en de partijcombinaties " - "ideologisch divers — dit label is indicatief. Aan de positieve kant staan " - "pragmatische bestuursmoties: een compleet kostenoverzicht van producten van eigen " - "bodem, papieren schoolboeken voor basisvaardigheden, een invoeringstoets voor het " - "minimumloon en de A2-snelwegplanning. ChristenUnie, Volt, DENK en SP scoren " - "positief. Aan de negatieve kant staan meer ideologisch geladen moties: een " - "landelijk stookverbod (PvdD), het strafbaar stellen van verbranding van religieuze " - "geschriften (DENK), chroom-6 schadevergoedingen en tegenhouden van nieuwe " - "gaswinning. GroenLinks-PvdA, VVD, FVD en JA21 scoren negatief." - ), - "positive_pole": "Praktisch-bestuurlijk: ChristenUnie, Volt, SGP, DENK, SP", - "negative_pole": "Ideologisch-principieel: GroenLinks-PvdA, VVD, FVD, JA21", - "flip": True, - }, - 8: { - "label": "Vaccinatiebeleid, onderwijs en regionale huisvesting (indicatief)", - "explanation": ( - "Een residuele as die overwegend thematisch diverse moties uit 2024-2025 vangt. " - "Aan de positieve kant staan moties over vaccinatiegraad-verlaging voor kinderen, " - "een VWO-profiel kunst en cultuur, stages voor mbo-studenten in het buitenland, " - "en woningbouw voor jongeren in kleine kernen. BBB, SGP en JA21 scoren positief. " - "Aan de negatieve kant staan moties over het instellen van een vaccinatiecommissie, " - "heropening van het coronaoversterfte-onderzoek, regionale energiestrategieën " - "en toegankelijkheid van het basispakket. SP, DENK en PvdD scoren sterk negatief. " - "Deze as combineert onderwijs- en volksgezondheidsposities met regionale " - "huisvestingsprioriteiten — het label is indicatief." - ), - "positive_pole": "Onderwijs en volksgezondheid: BBB, SGP, JA21 — vaccinatie, profielkeuze, woningbouw", - "negative_pole": "Zorg en toegankelijkheid: SP, DENK, PvdD, Volt — coronaonderzoek, energie, basispakket", - "flip": False, - }, - 9: { - "label": "Pragmatische probleemoplossing versus systeemhervorming (indicatief)", - "explanation": ( - "Deze as scheidt pragmatische, concrete probleemoplossing van idealistische " - "systeemhervorming. Aan de positieve kant staan moties over naleving van de " - "Financiële-verhoudingswet voor gemeenten, beperking van arbeidsmigratie, " - "een nieuwe tandartsopleiding in Rotterdam, een actieplan tegen misbruik van " - "hallucinerende geneesmiddelen en oplossingen voor milieuproblemen op Bonaire. " - "SGP en ChristenUnie scoren sterk positief; ook DENK en SP. Aan de negatieve kant " - "staan moties over een moratorium op geitenstallen, een verbod op gokadvertenties, " - "verduidelijking van gronden voor voorlopige hechtenis, een leegstandbelasting " - "en end-to-end-encryptie. D66, JA21 en PVV scoren negatief. " - "Deze as is indicatief — de scores zijn smal en ideologisch divers." - ), - "positive_pole": "Pragmatisch-bestuurlijk: SGP, ChristenUnie, DENK, SP — concrete oplossingen", - "negative_pole": "Systeemhervorming: D66, JA21, PVV — idealistische beleidsposities", - "flip": True, - }, - 10: { - "label": "Kritisch op overheidsbemoeienis versus pro-regulering (indicatief)", - "explanation": ( - "Deze as scheidt partijen die kritisch staan tegenover overheidsbemoeienis van " - "partijen die strikte regulering en handhaving steunen. Aan de positieve kant " - "staan moties over minder tijdsintensieve schoolinspecties, het recht van " - "toeslagenouders op hun persoonlijk dossier, behoud van tegemoetkomingen voor " - "arbeidsongeschikten en verlaging van de leeftijdsdrempel voor kindgesprekken. " - "DENK, SP en PvdD scoren positief. Aan de negatieve kant staan moties over " - "een aangifteplicht voor scholen bij veiligheidsincidenten, een rookverbod in " - "auto's met kinderen, braakliggende landbouwgrond en verhoogd beloningsgeld " - "voor tipgevers. GroenLinks-PvdA scoort opvallend sterk negatief. " - "Deze as is indicatief — de scores zijn smal en de partijcombinaties divers." - ), - "positive_pole": "Kritisch op overheidsbemoeienis: DENK, SP, PvdD — minder inspectielast en lastenverlichting", - "negative_pole": "Pro-regulering: GroenLinks-PvdA, CDA, SGP — veiligheid, naleving en handhaving", - "flip": True, - }, -} - -# Ordered list of well-known parties for trajectory default selection. -# Keeps the chart readable without overwhelming users with all parties. -KNOWN_MAJOR_PARTIES = [ - "VVD", - "PVV", - "D66", - "GroenLinks-PvdA", - "GroenLinks", - "PvdA", - "CDA", - "SP", - "NSC", - "CU", - "BBB", -] - - -# Parties currently seated in the Tweede Kamer (2023 election cycle). -# Deze zijn de entity_ids zoals opgeslagen in svd_vectors voor window='2025'. -CURRENT_PARLIAMENT_PARTIES: frozenset[str] = frozenset( - { - "PVV", - "VVD", - "NSC", - "BBB", - "D66", - "GroenLinks-PvdA", - "CDA", - "SP", - "ChristenUnie", - "SGP", - "Volt", - "DENK", - "PvdD", - "JA21", - "FVD", - } -) +PARTY_COLOURS: Dict[str, str] = config.PARTY_COLOURS + +SVD_THEMES: dict[int, dict[str, str]] = config.SVD_THEMES -# Normalize variant party names to canonical display names in CURRENT_PARLIAMENT_PARTIES -_PARTY_NORMALIZE: dict[str, str] = { - "Nieuw Sociaal Contract": "NSC", - "CU": "ChristenUnie", - "GL": "GroenLinks-PvdA", - "GroenLinks": "GroenLinks-PvdA", - "PvdA": "GroenLinks-PvdA", - "Gündoğan": "Volt", # confirmed Volt, left parliament 2023-12-05 - "Lid Keijzer": "BBB", # Keijzer left CDA, joined BBB cabinet - "Groep Markuszower": "PVV", # Markuszower sits with PVV faction -} +KNOWN_MAJOR_PARTIES = config.KNOWN_MAJOR_PARTIES + +CURRENT_PARLIAMENT_PARTIES = config.CURRENT_PARLIAMENT_PARTIES + +_PARTY_NORMALIZE = config._PARTY_NORMALIZE # --------------------------------------------------------------------------- @@ -661,104 +428,33 @@ _PARTY_NORMALIZE: dict[str, str] = { @st.cache_data(show_spinner="Beschikbare tijdsvensters laden…") def get_available_windows(db_path: str) -> List[str]: """Return sorted list of distinct window_ids from svd_vectors.""" - con = duckdb.connect(database=db_path, read_only=True) - try: - rows = con.execute( - "SELECT DISTINCT window_id FROM svd_vectors ORDER BY window_id" - ).fetchall() - return [r[0] for r in rows] - except Exception: - logger.exception("Failed to query available windows") - return [] - finally: - con.close() + return explorer_data.get_available_windows(db_path) @st.cache_data(show_spinner=False) def get_uniform_dim_windows(db_path: str) -> List[str]: - """Return only windows whose dominant MP-vector dimension is 50. + """Return only windows whose dominant MP-vector dimension is >= 25. Some windows contain a mix of vector lengths due to multiple pipeline runs (e.g. 2016 has both dim=1 and dim=50 rows). We find the most common dimension - per window and include only windows where that dominant dim equals 50. - Windows with too few dim-50 entities (< 10) are also excluded to avoid + per window and include only windows where that dominant dim >= 25. + Windows with too few dim-25+ entities (< 10) are also excluded to avoid degenerate PCA inputs. """ - con = duckdb.connect(database=db_path, read_only=True) - try: - rows = con.execute( - """ - WITH vec_dims AS ( - SELECT window_id, json_array_length(vector) AS dim - FROM svd_vectors - WHERE entity_type = 'mp' - ), - window_dim_counts AS ( - SELECT window_id, dim, COUNT(*) AS cnt - FROM vec_dims - GROUP BY window_id, dim - ), - dominant AS ( - SELECT DISTINCT ON (window_id) window_id, dim, cnt - FROM window_dim_counts - ORDER BY window_id, cnt DESC, dim DESC - ) - SELECT window_id - FROM dominant - WHERE dim >= 25 AND cnt >= 10 - ORDER BY window_id - """ - ).fetchall() - return [r[0] for r in rows] - except Exception: - logger.exception("Failed to query uniform-dim windows") - return [] - finally: - con.close() + return explorer_data.get_uniform_dim_windows(db_path) def _should_swap_axes(axis_def: dict) -> bool: - """Return True if the Y axis is economic left-right and the X axis is not. - - When true, caller should swap x/y positions and metadata so the economic - dimension (welfare vs market) is conventionally on the horizontal axis. - """ - economic_labels = {"Verzorgingsstaat–Marktwerking", "Links–Rechts"} - y_label = axis_def.get("y_label") - x_label = axis_def.get("x_label") - return y_label in economic_labels and x_label not in economic_labels + """Return True if the Y axis is economic left-right and the X axis is not.""" + return projections.should_swap_axes(axis_def) def _swap_axes( positions_by_window: dict, axis_def: dict, ) -> tuple: - """Swap x and y in all positions and axis metadata. - - Pure function — returns (new_positions_by_window, new_axis_def). - """ - new_positions: dict = {} - for wid, pos_dict in positions_by_window.items(): - new_positions[wid] = {ent: (y, x) for ent, (x, y) in pos_dict.items()} - - new_ax = dict(axis_def) - # Non-paired keys pass through unchanged - # Swap paired scalar keys - new_ax["x_label"] = axis_def.get("y_label") - new_ax["y_label"] = axis_def.get("x_label") - - # Swap paired dict keys - for x_key, y_key in [ - ("x_quality", "y_quality"), - ("x_interpretation", "y_interpretation"), - ("x_top_motions", "y_top_motions"), - ("x_label_confidence", "y_label_confidence"), - ("x_axis", "y_axis"), - ]: - new_ax[x_key] = axis_def.get(y_key) - new_ax[y_key] = axis_def.get(x_key) - - return new_positions, new_ax + """Swap x and y in all positions and axis metadata.""" + return projections.swap_axes(positions_by_window, axis_def) def _render_axis_motions(label: str, conf_pct: str, top: dict) -> None: @@ -834,18 +530,7 @@ def load_positions( @st.cache_data(show_spinner="Partijkaart laden…") def load_party_map(db_path: str) -> Dict[str, str]: """Return {mp_name: party} mapping, with party names normalised to abbreviations.""" - from analysis.visualize import _load_party_map - - _PARTY_ALIASES: Dict[str, str] = { - "Nieuw Sociaal Contract": "NSC", - } - - try: - raw = _load_party_map(db_path) - return {mp: _PARTY_ALIASES.get(party, party) for mp, party in raw.items()} - except Exception: - logger.exception("Failed to load party map") - return {} + return explorer_data.load_party_map(db_path) @st.cache_data(show_spinner="Actieve Kamerleden laden…") @@ -855,16 +540,7 @@ def load_active_mps(db_path: str) -> set: An MP is considered active if their mp_metadata row has tot_en_met IS NULL, meaning they have no recorded end date for their current seat. """ - try: - con = duckdb.connect(database=db_path, read_only=True) - rows = con.execute( - "SELECT mp_name FROM mp_metadata WHERE tot_en_met IS NULL" - ).fetchall() - con.close() - return {r[0] for r in rows} - except Exception: - logger.exception("Failed to load active MPs") - return set() + return explorer_data.load_active_mps(db_path) def compute_party_discipline( @@ -883,229 +559,28 @@ def compute_party_discipline( Only 'voor' and 'tegen' votes are counted; absent and abstaining MPs are excluded from the Rice index calculation. """ - conn = None - try: - conn = duckdb.connect(db_path, read_only=True) - result = conn.execute( - """ - WITH individual_votes AS ( - SELECT - motion_id, - party, - LOWER(vote) AS vote - FROM mp_votes - WHERE mp_name LIKE '%,%' - AND date >= CAST(? AS DATE) - AND date <= CAST(? AS DATE) - AND vote IN ('voor', 'tegen') - ), - vote_counts AS ( - SELECT - motion_id, - party, - vote, - COUNT(*) AS cnt - FROM individual_votes - GROUP BY motion_id, party, vote - ), - majority_vote AS ( - SELECT - motion_id, - party, - FIRST(vote ORDER BY cnt DESC, vote ASC) AS maj_vote, - SUM(cnt) AS total_mp_votes - FROM vote_counts - GROUP BY motion_id, party - ), - rice_per_motion AS ( - SELECT - mv.motion_id, - mv.party, - SUM(CASE WHEN vc.vote = mv.maj_vote THEN vc.cnt ELSE 0 END) - * 1.0 / mv.total_mp_votes AS rice - FROM majority_vote mv - JOIN vote_counts vc - ON mv.motion_id = vc.motion_id AND mv.party = vc.party - GROUP BY mv.motion_id, mv.party, mv.total_mp_votes - ) - SELECT - party, - COUNT(DISTINCT motion_id) AS n_motions, - AVG(rice) AS discipline - FROM rice_per_motion - GROUP BY party - ORDER BY discipline ASC - """, - [start_date, end_date], - ).fetchdf() - return result - except Exception as exc: - logger.warning("compute_party_discipline failed: %s", exc) - return pd.DataFrame(columns=["party", "n_motions", "discipline"]) - finally: - if conn is not None: - try: - conn.close() - except Exception: - pass + return trajectory.compute_party_discipline(db_path, start_date, end_date) def _load_mp_vectors_by_party(db_path: str) -> Dict[str, List[np.ndarray]]: - """Load individual MP SVD vectors grouped by party. - - Queries mp_metadata for the mp→party mapping (latest assignment during the - current parliament), normalises party names, loads SVD vectors from the - ``current_parliament`` window, and filters to CURRENT_PARLIAMENT_PARTIES. - - Returns: - {party_name: [np.ndarray(50,), ...]} — one array per MP. - """ - con = duckdb.connect(database=db_path, read_only=True) - try: - # Build mp → party mapping. ORDER BY van ASC so latest assignment wins - # via last-write-wins when an MP switched party. - meta_rows = con.execute( - "SELECT mp_name, party FROM mp_metadata " - "WHERE van >= '2023-11-22' OR tot_en_met IS NULL OR tot_en_met >= '2023-11-22' " - "ORDER BY van ASC" - ).fetchall() - mp_party: Dict[str, str] = {} - for mp_name, party in meta_rows: - if mp_name and party: - mp_party[mp_name] = _PARTY_NORMALIZE.get(party, party) - - # Individual MP vectors from current_parliament - rows = con.execute( - "SELECT entity_id, vector FROM svd_vectors " - "WHERE entity_type='mp' AND window_id='current_parliament'" - ).fetchall() - - party_vecs: Dict[str, List[np.ndarray]] = {} - for entity_id, raw_vec in rows: - party = mp_party.get(entity_id) - if party is None or party not in CURRENT_PARLIAMENT_PARTIES: - continue - if isinstance(raw_vec, str): - vec = json.loads(raw_vec) - elif isinstance(raw_vec, (bytes, bytearray)): - vec = json.loads(raw_vec.decode()) - elif isinstance(raw_vec, list): - vec = raw_vec - else: - try: - vec = list(raw_vec) - except Exception: - continue - fvec = np.array([float(v) if v is not None else 0.0 for v in vec]) - party_vecs.setdefault(party, []).append(fvec) - - return party_vecs - finally: - try: - con.close() - except Exception: - pass + """Load individual MP SVD vectors grouped by party for current_parliament.""" + return explorer_data.load_mp_vectors_by_party(db_path) def _load_mp_vectors_by_party_for_window( db_path: str, window: str ) -> Dict[str, List[np.ndarray]]: - """Load individual MP SVD vectors grouped by party for a specific window. - - Similar to _load_mp_vectors_by_party but for a specific window_id. - For historical windows, uses the MP→party mapping from that time period. - - Returns: - {party_name: [np.ndarray(50,), ...]} — one array per MP. - """ - con = duckdb.connect(database=db_path, read_only=True) - try: - # For historical windows, we need to determine which MPs were active - # and their party affiliations during that window period. - # Parse window like "2015", "2016-Q1", etc. - is_current = window == "current_parliament" - - if is_current: - # Use current parliament MP→party mapping - meta_rows = con.execute( - "SELECT mp_name, party FROM mp_metadata " - "WHERE van >= '2023-11-22' OR tot_en_met IS NULL OR tot_en_met >= '2023-11-22' " - "ORDER BY van ASC" - ).fetchall() - else: - # For historical windows, try to get MPs active during that period - # Parse year from window (e.g., "2015" or "2015-Q1") - try: - year = int(window.split("-")[0]) - except ValueError: - year = 2023 # fallback - - # Get MPs active during that year - meta_rows = con.execute( - "SELECT mp_name, party FROM mp_metadata " - "WHERE van <= ? AND (tot_en_met IS NULL OR tot_en_met >= ?) " - "ORDER BY van ASC", - [f"{year}-12-31", f"{year}-01-01"], - ).fetchall() - - mp_party: Dict[str, str] = {} - for mp_name, party in meta_rows: - if mp_name and party: - mp_party[mp_name] = _PARTY_NORMALIZE.get(party, party) - - # Individual MP vectors for the specified window - rows = con.execute( - "SELECT entity_id, vector FROM svd_vectors " - "WHERE entity_type='mp' AND window_id=?", - [window], - ).fetchall() - - party_vecs: Dict[str, List[np.ndarray]] = {} - for entity_id, raw_vec in rows: - party = mp_party.get(entity_id) - # For historical windows, include all parties found - if party is None: - continue - if is_current and party not in CURRENT_PARLIAMENT_PARTIES: - continue - if isinstance(raw_vec, str): - vec = json.loads(raw_vec) - elif isinstance(raw_vec, (bytes, bytearray)): - vec = json.loads(raw_vec.decode()) - elif isinstance(raw_vec, list): - vec = raw_vec - else: - try: - vec = list(raw_vec) - except Exception: - continue - fvec = np.array([float(v) if v is not None else 0.0 for v in vec]) - party_vecs.setdefault(party, []).append(fvec) - - return party_vecs - finally: - try: - con.close() - except Exception: - pass + """Load individual MP SVD vectors grouped by party for a specific window.""" + return explorer_data.load_mp_vectors_by_party_for_window(db_path, window) @st.cache_data(show_spinner="Partijposities op SVD-assen laden…") def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]: - """Return per-party SVD vectors, computed as mean of individual MP vectors. - - Loads individual MP rows from window='current_parliament', assigns each MP - their party, then averages SVD vectors per party. - - Returns: - {party_name: [float * k]} — k = 50, mean over all MPs in that party. - """ + """Return per-party SVD vectors, computed as mean of individual MP vectors.""" try: - party_vecs = _load_mp_vectors_by_party(db_path) - return { - party: np.array(vecs).mean(axis=0).tolist() - for party, vecs in party_vecs.items() - } + return explorer_data.compute_party_axis_scores( + explorer_data.load_mp_vectors_by_party(db_path) + ) except Exception: logger.exception("Failed to load party axis scores") return {} @@ -1115,19 +590,11 @@ def load_party_axis_scores(db_path: str) -> Dict[str, List[float]]: def load_party_axis_scores_for_window( db_path: str, window: str ) -> Dict[str, List[float]]: - """Return per-party SVD vectors for a specific window. - - Similar to load_party_axis_scores but for a specific window_id. - - Returns: - {party_name: [float * k]} — k = 50, mean over all MPs in that party for that window. - """ + """Return per-party SVD vectors for a specific window.""" try: - party_vecs = _load_mp_vectors_by_party_for_window(db_path, window) - return { - party: np.array(vecs).mean(axis=0).tolist() - for party, vecs in party_vecs.items() - } + return explorer_data.compute_party_axis_scores( + explorer_data.load_mp_vectors_by_party_for_window(db_path, window) + ) except Exception: logger.exception(f"Failed to load party axis scores for window {window}") return {} @@ -1156,42 +623,8 @@ def load_party_scores_all_windows( def _load_mp_vectors_by_window(db_path: str, window: str) -> Dict[str, np.ndarray]: - """Load individual MP SVD vectors for a specific window. - - Args: - db_path: Path to DuckDB database - window: Window ID (e.g., "2015", "current_parliament") - - Returns: - {mp_name: np.ndarray(50,)} — one vector per MP - """ - con = duckdb.connect(database=db_path, read_only=True) - try: - rows = con.execute( - "SELECT entity_id, vector FROM svd_vectors " - "WHERE entity_type='mp' AND window_id=?", - [window], - ).fetchall() - - mp_vecs: Dict[str, np.ndarray] = {} - for entity_id, raw_vec in rows: - if isinstance(raw_vec, str): - vec = json.loads(raw_vec) - elif isinstance(raw_vec, (bytes, bytearray)): - vec = json.loads(raw_vec.decode()) - elif isinstance(raw_vec, list): - vec = raw_vec - else: - try: - vec = list(raw_vec) - except Exception: - continue - fvec = np.array([float(v) if v is not None else 0.0 for v in vec]) - mp_vecs[entity_id] = fvec - - return mp_vecs - finally: - con.close() + """Load individual MP SVD vectors for a specific window.""" + return explorer_data.load_mp_vectors_by_window(db_path, window) @st.cache_data(show_spinner="SVD scores met Procrustes-uitlijning laden…") @@ -1252,16 +685,9 @@ def load_party_scores_all_windows_aligned( @st.cache_data(show_spinner="Partij-MP vectoren laden…") def load_party_mp_vectors(db_path: str) -> Dict[str, List[np.ndarray]]: - """Return per-party lists of individual MP SVD vectors. - - Same MP→party mapping as load_party_axis_scores(), suitable for bootstrap - CI computation. - - Returns: - {party_name: [np.ndarray(50,), ...]} — one array per MP. - """ + """Return per-party lists of individual MP SVD vectors.""" try: - return _load_mp_vectors_by_party(db_path) + return explorer_data.load_mp_vectors_by_party(db_path) except Exception: logger.exception("Failed to load party MP vectors") return {} @@ -1857,24 +1283,7 @@ def _render_svd_time_trajectory( @st.cache_data(show_spinner="Moties laden…") def load_motions_df(db_path: str) -> pd.DataFrame: """Load the full motions table as a pandas DataFrame (read-only).""" - con = duckdb.connect(database=db_path, read_only=True) - try: - df = con.execute( - """ - SELECT id, title, description, date, policy_area, - voting_results, layman_explanation, - winning_margin, controversy_score, url - FROM motions - """ - ).fetchdf() - df["date"] = pd.to_datetime(df["date"], errors="coerce") - df["year"] = df["date"].dt.year - return df - except Exception: - logger.exception("Failed to load motions") - return pd.DataFrame() - finally: - con.close() + return explorer_data.load_motions_df(db_path) def query_similar( @@ -1884,29 +1293,7 @@ def query_similar( top_k: int = 10, ) -> pd.DataFrame: """Return top-k similar motions from similarity_cache (read-only).""" - con = duckdb.connect(database=db_path, read_only=True) - try: - rows = con.execute( - """ - SELECT sc.target_motion_id, sc.score, sc.window_id, - m.title, m.date, m.policy_area - FROM similarity_cache sc - JOIN motions m ON m.id = sc.target_motion_id - WHERE sc.source_motion_id = ? - AND sc.vector_type = ? - ORDER BY sc.score DESC - LIMIT ? - """, - [source_motion_id, vector_type, top_k], - ).fetchdf() - return rows - except Exception: - logger.exception( - "Failed to query similarity cache for motion %s", source_motion_id - ) - return pd.DataFrame() - finally: - con.close() + return explorer_data.query_similar(db_path, source_motion_id, vector_type, top_k) # --------------------------------------------------------------------------- @@ -1973,23 +1360,8 @@ def _add_y_direction_annotations(fig: go.Figure) -> None: def _window_to_dates(window_id: str) -> tuple[str, str]: - """Return (start_date, end_date) ISO strings for a given window_id. - - Annual windows like '2024' → ('2024-01-01', '2024-12-31'). - 'current_parliament' → ('2023-11-22', '2099-12-31') (2023 formation date, open end). - Unknown formats → ('2000-01-01', '2099-12-31') (effectively all time). - """ - if window_id == "current_parliament": - return ("2023-11-22", "2099-12-31") - if re.fullmatch(r"\d{4}", window_id): - return (f"{window_id}-01-01", f"{window_id}-12-31") - m = re.fullmatch(r"(\d{4})-Q([1-4])", window_id) - if m: - year, q = int(m.group(1)), int(m.group(2)) - starts = {1: "01-01", 2: "04-01", 3: "07-01", 4: "10-01"} - ends = {1: "03-31", 2: "06-30", 3: "09-30", 4: "12-31"} - return (f"{year}-{starts[q]}", f"{year}-{ends[q]}") - return ("2000-01-01", "2099-12-31") + """Return (start_date, end_date) ISO strings for a given window_id.""" + return trajectory.window_to_dates(window_id) def build_compass_tab(db_path: str, window_size: str) -> None: @@ -2241,26 +1613,8 @@ def build_compass_tab(db_path: str, window_size: str) -> None: def choose_trajectory_title(axis_def: dict, axis: str, threshold: float = 0.65) -> str: - """Choose a short trajectory axis title based on aggregated confidence. - - axis: 'x' or 'y'. Returns axis_def label when its mean confidence >= threshold, - otherwise returns the compact fallback 'As 1' / 'As 2'. Matches previous logic. - """ - _TH = threshold - conf_map = axis_def.get(f"{axis}_label_confidence", {}) or {} - vals = [v for v in conf_map.values() if v is not None] - mean = float(sum(vals) / len(vals)) if vals else None - label = axis_def.get(f"{axis}_label") - if mean is not None and mean >= _TH and label: - return label - # Prefer the user-facing semantic fallback via the classifier helper - try: - from analysis.axis_classifier import display_label_for_modal - - fallback_modal = "As 1" if axis == "x" else "As 2" - return display_label_for_modal(fallback_modal, axis) - except Exception: - return "As 1" if axis == "x" else "As 2" + """Choose a short trajectory axis title based on aggregated confidence.""" + return trajectory.choose_trajectory_title(axis_def, axis, threshold) def build_trajectories_tab(db_path: str, window_size: str) -> None: diff --git a/tests/test_axis_label_fallback.py b/tests/test_axis_label_fallback.py index 81046c9..48aa55e 100644 --- a/tests/test_axis_label_fallback.py +++ b/tests/test_axis_label_fallback.py @@ -10,8 +10,8 @@ def test_display_label_for_modal(): y_label = axis_classifier.display_label_for_modal(None, "y") # Should return component 1 and 2 labels from SVD_THEMES - assert "EU-integratie" in x_label or "Nationalisme" in x_label - assert "Populistisch" in y_label or "Institutioneel" in y_label + assert "Rechts kabinetsbeleid" in x_label or "links oppositiebeleid" in x_label + assert "PVV/FVD-populisme" in y_label or "mainstream-partijen" in y_label def test_display_label_for_modal_maps_as_labels(): @@ -20,8 +20,8 @@ def test_display_label_for_modal_maps_as_labels(): y_label = axis_classifier.display_label_for_modal("As 2", "y") # Should return component 1 and 2 labels - assert "EU-integratie" in x_label or "Nationalisme" in x_label - assert "Populistisch" in y_label or "Institutioneel" in y_label + assert "Rechts kabinetsbeleid" in x_label or "links oppositiebeleid" in x_label + assert "PVV/FVD-populisme" in y_label or "mainstream-partijen" in y_label def test_display_label_for_modal_stempatroon(): @@ -30,8 +30,8 @@ def test_display_label_for_modal_stempatroon(): y_label = axis_classifier.display_label_for_modal("Stempatroon As 2", "y") # Should return component 1 and 2 labels - assert "EU-integratie" in x_label or "Nationalisme" in x_label - assert "Populistisch" in y_label or "Institutioneel" in y_label + assert "Rechts kabinetsbeleid" in x_label or "links oppositiebeleid" in x_label + assert "PVV/FVD-populisme" in y_label or "mainstream-partijen" in y_label def test_classify_axes_modal_fallback(monkeypatch, tmp_path): @@ -84,8 +84,10 @@ def test_classify_axes_modal_fallback(monkeypatch, tmp_path): # Should now return SVD component labels instead of hardcoded values assert ( - "EU-integratie" in enriched["x_label"] or "Nationalisme" in enriched["x_label"] + "Rechts kabinetsbeleid" in enriched["x_label"] + or "links oppositiebeleid" in enriched["x_label"] ) assert ( - "Populistisch" in enriched["y_label"] or "Institutioneel" in enriched["y_label"] + "PVV/FVD-populisme" in enriched["y_label"] + or "mainstream-partijen" in enriched["y_label"] )