From 154762a4c8b76e8b9fe692a83cd521b60d02031d Mon Sep 17 00:00:00 2001
From: Sven Geboers <geboers.sven@gmail.com>
Date: Sun, 5 Apr 2026 00:46:57 +0200
Subject: [PATCH] docs: refresh 2 stale references in solutions docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- refactoring-streamlit-data-loading.md: update test count
  164/164 → 173/173 (7 new axis validation tests added)
- svd-component-labels-mismatch.md: SVD_THEMES moved from
  explorer.py:434-611 → analysis/config.py:67+ per the
  refactoring that extracted constants to analysis/config.py
---
 .../refactoring-streamlit-data-loading.md     | 171 ++++++++++++++++++
 .../svd-component-labels-mismatch.md          |   2 +-
 2 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 docs/solutions/best-practices/refactoring-streamlit-data-loading.md

diff --git a/docs/solutions/best-practices/refactoring-streamlit-data-loading.md b/docs/solutions/best-practices/refactoring-streamlit-data-loading.md
new file mode 100644
index 0000000..a7632f0
--- /dev/null
+++ b/docs/solutions/best-practices/refactoring-streamlit-data-loading.md
@@ -0,0 +1,171 @@
+---
+module: explorer
+component: explorer.py
+problem_type: developer_experience
+component: development_workflow
+category: best-practices
+severity: medium
+tags:
+  - streamlit
+  - refactoring
+  - horizontal-decomposition
+  - duckdb
+  - data-loading
+  - constants
+title: "Refactoring Large Streamlit Apps: Extract Pure Data Logic"
+date: "2026-04-04"
+last_updated: "2026-04-05"
+applies_when:
+  - Streamlit apps with large files (>1500 lines)
+  - Mixed UI and business logic
+  - Heavy data loading functions decorated with @st.cache_data
+  - DuckDB connections scattered throughout
+---
+
+## Context
+
+The `explorer.py` Streamlit app (originally 3715 lines) had 39 functions mixing:
+- Streamlit UI rendering calls
+- DuckDB database queries  
+- Business logic computations
+- Pure data transformations
+
+This pattern makes the code:
+- Hard to unit test (requires Streamlit runtime)
+- Difficult to understand (UI interwoven with logic)
+- Challenging to maintain (changes ripple unpredictably)
+
+## Guidance
+
+### Pattern: Horizontal Decomposition
+
+Extract pure functions by type, not just by moving code. The key insight is **separating concerns by computation type**, not just file size:
+
+**1. Data Loading Functions → `analysis/explorer_data.py`**
+```python
+# Before: Mixed in explorer.py with @st.cache_data
+@st.cache_data(show_spinner="Partijkaart laden…")
+def load_party_map(db_path: str) -> Dict[str, str]:
+    con = duckdb.connect(database=db_path, read_only=True)
+    try:
+        rows = con.execute("SELECT mp_name, party FROM mp_metadata...").fetchall()
+        return {mp: _PARTY_NORMALIZE.get(party, party) for mp, party in rows if mp and party}
+    finally:
+        con.close()
+
+# After: Pure data access in analysis/explorer_data.py
+def load_party_map(db_path: str) -> Dict[str, str]:
+    """Return {mp_name: party} mapping, with party names normalised to abbreviations."""
+    try:
+        con = duckdb.connect(database=db_path, read_only=True)
+        rows = con.execute("SELECT mp_name, party FROM mp_metadata WHERE party IS NOT NULL").fetchall()
+        con.close()
+        return {mp: _PARTY_NORMALIZE.get(party, party) for mp, party in rows if mp and party}
+    except Exception:
+        logger.exception("Failed to load party map")
+        return {}
+```
+
+**2. Pure Computation → `analysis/projections.py`**
+```python
+# Before: Mixed with UI
+def _should_swap_axes(axis_def: dict) -> bool:
+    economic_labels = {"Verzorgingsstaat–Marktwerking", "Links–Rechts"}
+    return axis_def.get("y_label") in economic_labels and axis_def.get("x_label") not in economic_labels
+
+# After: Pure function, no external dependencies
+def should_swap_axes(axis_def: dict) -> bool:
+    """Return True if the Y axis is economic left-right and the X axis is not."""
+    economic_labels = {"Verzorgingsstaat–Marktwerking", "Links–Rechts"}
+    return axis_def.get("y_label") in economic_labels and axis_def.get("x_label") not in economic_labels
+```
+
+**4. Constants → `analysis/config.py`**
+```python
+# Before: Defined inline in the data loading module
+_PARTY_NORMALIZE: Dict[str, str] = {
+    "Nieuw Sociaal Contract": "NSC",
+    "CU": "ChristenUnie",
+}
+
+# After: Import from config, define once
+from analysis.config import _PARTY_NORMALIZE
+
+def load_party_map(db_path: str) -> Dict[str, str]:
+    """Return {mp_name: party} mapping, with party names normalised to abbreviations."""
+    try:
+        con = duckdb.connect(database=db_path, read_only=True)
+        rows = con.execute("SELECT mp_name, party FROM mp_metadata WHERE party IS NOT NULL").fetchall()
+        con.close()
+        return {mp: _PARTY_NORMALIZE.get(party, party) for mp, party in rows if mp and party}
+    except Exception:
+        logger.exception("Failed to load party map")
+        return {}
+```
+
+**Why this matters:** DRY enforcement. If `_PARTY_NORMALIZE` or `CURRENT_PARLIAMENT_PARTIES` are defined in multiple modules, changes must be made in all places. Importing from a single source of truth (`config.py`) prevents drift.
+
+**Anti-pattern to avoid:** Creating new constant definitions in extracted modules without checking if they already exist in `config.py`. Before adding a constant to any `analysis/*.py` module, grep for existing definitions of that constant name across all `analysis/` modules.
+When tests import from the original module, preserve wrappers:
+```python
+# In explorer.py - preserves test compatibility
+def _should_swap_axes(axis_def: dict) -> bool:
+    """Return True if the Y axis is economic left-right and the X axis is not."""
+    return projections.should_swap_axes(axis_def)
+```
+
+### Import Direction Rule
+
+```
+explorer.py → analysis/*.py  (one-way)
+analysis/*.py ↛ explorer.py  (never)
+```
+
+This prevents circular imports and clarifies module boundaries.
+
+### Preserve Function Signatures
+
+Refactoring shouldn't change public APIs:
+```python
+# Keep these unchanged:
+@st.cache_data(show_spinner="...")
+def load_party_map(db_path: str) -> Dict[str, str]:
+    return explorer_data.load_party_map(db_path)
+```
+
+## Why This Matters
+
+1. **Testability**: Pure functions in `analysis/` can be unit tested without Streamlit runtime
+2. **Reusability**: Data loading functions can be imported by other modules
+3. **Maintainability**: Changes to business logic are isolated from UI changes
+4. **Readability**: Single responsibility - each function does one thing
+
+## Prevention
+
+1. **Audit existing constants before creating new ones**: Before adding a constant to any `analysis/*.py` module, grep across `analysis/` for existing definitions. DRY violations cause maintenance burden and drift.
+2. **Define `__all__` in every analysis module**: Without `__all__`, it's hard to know what's public API. Add it when creating the module, not as an afterthought. All 6 public functions in `analysis/trajectory.py` (compute_trajectories, compute_2d_trajectories, top_drifters, compute_party_discipline, window_to_dates, choose_trajectory_title) should be exported.
+3. **Test assertions may need updates**: When extracting constants, grep for all test references to those constants and update assertions. See `docs/solutions/test-failures/svd-label-tests-after-refactoring.md` for an example.
+4. **Verify tests pass after extraction**: Run the full test suite to catch any broken import chains or assertion mismatches.
+
+## Results
+
+| Metric | Before | After |
+|--------|--------|-------|
+| explorer.py lines | 3715 | 3069 |
+| analysis/ modules | 0 | 12 modules |
+| Functions extracted | 0 | 15+ |
+| Test pass rate | 164/164 | 173/173 |
+
+**Practical minimum reached at ~3000 lines.** Tab functions (`build_*_tab()`) and their `_render_*` helpers are inherently Streamlit-coupled — they mix UI rendering with business logic. Further reduction requires decoupling data preparation from rendering (prepare data in `analysis/`, render in `explorer.py`), which is a significant architectural refactor.
+
+## When to Apply
+
+Apply this pattern when:
+- A Streamlit file exceeds 1500 lines
+- Functions mix `@st.cache_data` with DuckDB queries and business logic
+- Tests require Streamlit runtime to run
+- Multiple tabs share similar data loading patterns
+
+## Examples
+
+See `analysis/config.py` for constants (PARTY_COLOURS, SVD_THEMES, KNOWN_MAJOR_PARTIES, CURRENT_PARLIAMENT_PARTIES, CANONICAL_RIGHT, CANONICAL_LEFT, _PARTY_NORMALIZE), `analysis/explorer_data.py` for extracted data loading functions, and `analysis/projections.py` for pure computation utilities.
diff --git a/docs/solutions/logic-errors/svd-component-labels-mismatch.md b/docs/solutions/logic-errors/svd-component-labels-mismatch.md
index aef945d..f048c79 100644
--- a/docs/solutions/logic-errors/svd-component-labels-mismatch.md
+++ b/docs/solutions/logic-errors/svd-component-labels-mismatch.md
@@ -93,4 +93,4 @@ The fix works because it aligns labels with actual voting data:
 ## Related Issues
 - Analysis: `thoughts/explorer/top_svd_top_motions_report.md`
 - JSON generator: `scripts/generate_svd_json.py`
-- Labels source: `explorer.py:434-611` (SVD_THEMES dictionary)
\ No newline at end of file
+- Labels source: `analysis/config.py:67+` (SVD_THEMES dictionary)
\ No newline at end of file