From 154762a4c8b76e8b9fe692a83cd521b60d02031d Mon Sep 17 00:00:00 2001 From: Sven Geboers Date: Sun, 5 Apr 2026 00:46:57 +0200 Subject: [PATCH] docs: refresh 2 stale references in solutions docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - refactoring-streamlit-data-loading.md: update test count 164/164 → 173/173 (7 new axis validation tests added) - svd-component-labels-mismatch.md: SVD_THEMES moved from explorer.py:434-611 → analysis/config.py:67+ per the refactoring that extracted constants to analysis/config.py --- .../refactoring-streamlit-data-loading.md | 171 ++++++++++++++++++ .../svd-component-labels-mismatch.md | 2 +- 2 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 docs/solutions/best-practices/refactoring-streamlit-data-loading.md diff --git a/docs/solutions/best-practices/refactoring-streamlit-data-loading.md b/docs/solutions/best-practices/refactoring-streamlit-data-loading.md new file mode 100644 index 0000000..a7632f0 --- /dev/null +++ b/docs/solutions/best-practices/refactoring-streamlit-data-loading.md @@ -0,0 +1,171 @@ +--- +module: explorer +component: explorer.py +problem_type: developer_experience +component: development_workflow +category: best-practices +severity: medium +tags: + - streamlit + - refactoring + - horizontal-decomposition + - duckdb + - data-loading + - constants +title: "Refactoring Large Streamlit Apps: Extract Pure Data Logic" +date: "2026-04-04" +last_updated: "2026-04-05" +applies_when: + - Streamlit apps with large files (>1500 lines) + - Mixed UI and business logic + - Heavy data loading functions decorated with @st.cache_data + - DuckDB connections scattered throughout +--- + +## Context + +The `explorer.py` Streamlit app (originally 3715 lines) had 39 functions mixing: +- Streamlit UI rendering calls +- DuckDB database queries +- Business logic computations +- Pure data transformations + +This pattern makes the code: +- Hard to unit test (requires Streamlit runtime) +- Difficult to understand (UI interwoven with logic) +- Challenging to maintain (changes ripple unpredictably) + +## Guidance + +### Pattern: Horizontal Decomposition + +Extract pure functions by type, not just by moving code. The key insight is **separating concerns by computation type**, not just file size: + +**1. Data Loading Functions → `analysis/explorer_data.py`** +```python +# Before: Mixed in explorer.py with @st.cache_data +@st.cache_data(show_spinner="Partijkaart laden…") +def load_party_map(db_path: str) -> Dict[str, str]: + con = duckdb.connect(database=db_path, read_only=True) + try: + rows = con.execute("SELECT mp_name, party FROM mp_metadata...").fetchall() + return {mp: _PARTY_NORMALIZE.get(party, party) for mp, party in rows if mp and party} + finally: + con.close() + +# After: Pure data access in analysis/explorer_data.py +def load_party_map(db_path: str) -> Dict[str, str]: + """Return {mp_name: party} mapping, with party names normalised to abbreviations.""" + try: + con = duckdb.connect(database=db_path, read_only=True) + rows = con.execute("SELECT mp_name, party FROM mp_metadata WHERE party IS NOT NULL").fetchall() + con.close() + return {mp: _PARTY_NORMALIZE.get(party, party) for mp, party in rows if mp and party} + except Exception: + logger.exception("Failed to load party map") + return {} +``` + +**2. Pure Computation → `analysis/projections.py`** +```python +# Before: Mixed with UI +def _should_swap_axes(axis_def: dict) -> bool: + economic_labels = {"Verzorgingsstaat–Marktwerking", "Links–Rechts"} + return axis_def.get("y_label") in economic_labels and axis_def.get("x_label") not in economic_labels + +# After: Pure function, no external dependencies +def should_swap_axes(axis_def: dict) -> bool: + """Return True if the Y axis is economic left-right and the X axis is not.""" + economic_labels = {"Verzorgingsstaat–Marktwerking", "Links–Rechts"} + return axis_def.get("y_label") in economic_labels and axis_def.get("x_label") not in economic_labels +``` + +**4. Constants → `analysis/config.py`** +```python +# Before: Defined inline in the data loading module +_PARTY_NORMALIZE: Dict[str, str] = { + "Nieuw Sociaal Contract": "NSC", + "CU": "ChristenUnie", +} + +# After: Import from config, define once +from analysis.config import _PARTY_NORMALIZE + +def load_party_map(db_path: str) -> Dict[str, str]: + """Return {mp_name: party} mapping, with party names normalised to abbreviations.""" + try: + con = duckdb.connect(database=db_path, read_only=True) + rows = con.execute("SELECT mp_name, party FROM mp_metadata WHERE party IS NOT NULL").fetchall() + con.close() + return {mp: _PARTY_NORMALIZE.get(party, party) for mp, party in rows if mp and party} + except Exception: + logger.exception("Failed to load party map") + return {} +``` + +**Why this matters:** DRY enforcement. If `_PARTY_NORMALIZE` or `CURRENT_PARLIAMENT_PARTIES` are defined in multiple modules, changes must be made in all places. Importing from a single source of truth (`config.py`) prevents drift. + +**Anti-pattern to avoid:** Creating new constant definitions in extracted modules without checking if they already exist in `config.py`. Before adding a constant to any `analysis/*.py` module, grep for existing definitions of that constant name across all `analysis/` modules. +When tests import from the original module, preserve wrappers: +```python +# In explorer.py - preserves test compatibility +def _should_swap_axes(axis_def: dict) -> bool: + """Return True if the Y axis is economic left-right and the X axis is not.""" + return projections.should_swap_axes(axis_def) +``` + +### Import Direction Rule + +``` +explorer.py → analysis/*.py (one-way) +analysis/*.py ↛ explorer.py (never) +``` + +This prevents circular imports and clarifies module boundaries. + +### Preserve Function Signatures + +Refactoring shouldn't change public APIs: +```python +# Keep these unchanged: +@st.cache_data(show_spinner="...") +def load_party_map(db_path: str) -> Dict[str, str]: + return explorer_data.load_party_map(db_path) +``` + +## Why This Matters + +1. **Testability**: Pure functions in `analysis/` can be unit tested without Streamlit runtime +2. **Reusability**: Data loading functions can be imported by other modules +3. **Maintainability**: Changes to business logic are isolated from UI changes +4. **Readability**: Single responsibility - each function does one thing + +## Prevention + +1. **Audit existing constants before creating new ones**: Before adding a constant to any `analysis/*.py` module, grep across `analysis/` for existing definitions. DRY violations cause maintenance burden and drift. +2. **Define `__all__` in every analysis module**: Without `__all__`, it's hard to know what's public API. Add it when creating the module, not as an afterthought. All 6 public functions in `analysis/trajectory.py` (compute_trajectories, compute_2d_trajectories, top_drifters, compute_party_discipline, window_to_dates, choose_trajectory_title) should be exported. +3. **Test assertions may need updates**: When extracting constants, grep for all test references to those constants and update assertions. See `docs/solutions/test-failures/svd-label-tests-after-refactoring.md` for an example. +4. **Verify tests pass after extraction**: Run the full test suite to catch any broken import chains or assertion mismatches. + +## Results + +| Metric | Before | After | +|--------|--------|-------| +| explorer.py lines | 3715 | 3069 | +| analysis/ modules | 0 | 12 modules | +| Functions extracted | 0 | 15+ | +| Test pass rate | 164/164 | 173/173 | + +**Practical minimum reached at ~3000 lines.** Tab functions (`build_*_tab()`) and their `_render_*` helpers are inherently Streamlit-coupled — they mix UI rendering with business logic. Further reduction requires decoupling data preparation from rendering (prepare data in `analysis/`, render in `explorer.py`), which is a significant architectural refactor. + +## When to Apply + +Apply this pattern when: +- A Streamlit file exceeds 1500 lines +- Functions mix `@st.cache_data` with DuckDB queries and business logic +- Tests require Streamlit runtime to run +- Multiple tabs share similar data loading patterns + +## Examples + +See `analysis/config.py` for constants (PARTY_COLOURS, SVD_THEMES, KNOWN_MAJOR_PARTIES, CURRENT_PARLIAMENT_PARTIES, CANONICAL_RIGHT, CANONICAL_LEFT, _PARTY_NORMALIZE), `analysis/explorer_data.py` for extracted data loading functions, and `analysis/projections.py` for pure computation utilities. diff --git a/docs/solutions/logic-errors/svd-component-labels-mismatch.md b/docs/solutions/logic-errors/svd-component-labels-mismatch.md index aef945d..f048c79 100644 --- a/docs/solutions/logic-errors/svd-component-labels-mismatch.md +++ b/docs/solutions/logic-errors/svd-component-labels-mismatch.md @@ -93,4 +93,4 @@ The fix works because it aligns labels with actual voting data: ## Related Issues - Analysis: `thoughts/explorer/top_svd_top_motions_report.md` - JSON generator: `scripts/generate_svd_json.py` -- Labels source: `explorer.py:434-611` (SVD_THEMES dictionary) \ No newline at end of file +- Labels source: `analysis/config.py:67+` (SVD_THEMES dictionary) \ No newline at end of file