diff --git a/.mindmodel/anti-patterns.yaml b/.mindmodel/anti-patterns.yaml
new file mode 100644
index 0000000..338d219
--- /dev/null
+++ b/.mindmodel/anti-patterns.yaml
@@ -0,0 +1,43 @@
+# Known anti-patterns and recommended remediation (Phase 1 findings)
+
+anti_patterns:
+  - id: broad_except_swallows_errors
+    description: "Wide except: clauses that swallow exceptions without logging or re-raising."
+    examples:
+      - path: multiple
+        note: "Observed in various pipeline and ingestion spots where except Exception: returns a default without context."
+    remediation:
+      - "Replace broad except clauses with specific exceptions."
+      - "When a broad except is absolutely needed, call logger.exception(...) and re-raise or convert to a typed domain error."
+      - "Add unit tests to ensure critical errors are visible in CI logs."
+
+  - id: mixed_print_and_logging
+    description: "Mixing print() and the logging module for error and info messages."
+    examples:
+      - path: api_client.py
+        excerpt: |
+          ```python
+          print(f"Fetched {len(voting_records)} voting records from API")
+          ...
+          except Exception as e:
+              print(f"Error fetching motions from API: {e}")
+          ```
+    remediation:
+      - "Use logging.getLogger(__name__) and logger.info/warning/exception consistently."
+      - "Add a top-level logging configuration for Streamlit and scripts."
+
+  - id: no_lockfile
+    description: "No lockfile present -> unreproducible installs and CI unpredictability."
+    remediation:
+      - "Add a lockfile (poetry.lock, or a requirements.txt produced by pip-tools) and pin versions in CI."
+      - "Make CI use the lockfile for reproducible builds."
+
+  - id: declared_but_unused_dependency
+    description: "Dependency declared but unused (openai in pyproject)."
+    remediation:
+      - "Either remove the dependency or add clear adapter code/tests that exercise it. Keep pyproject tidy."
+
+  - id: brittle_identity_heuristics
+    description: "Heuristics for MP identity (comma-based parsing) are brittle."
+    remediation:
+      - "Add robust parsing rules and unit tests; prefer canonical identifiers (persoon_id) where available."
diff --git a/.mindmodel/architecture.yaml b/.mindmodel/architecture.yaml
new file mode 100644
index 0000000..a40abb6
--- /dev/null
+++ b/.mindmodel/architecture.yaml
@@ -0,0 +1,35 @@
+# Architecture overview and confidence levels
+
+layers:
+  - name: ui
+    description: "Streamlit pages and app entrypoints (Home.py, pages/*)."
+    confidence: high
+  - name: ingestion
+    description: "API client and scrapers (api_client.py, scraper.py)."
+    confidence: high
+  - name: processing
+    description: "Pipelines for embeddings, SVD, fusion (pipeline/*, similarity/*)."
+    confidence: high
+  - name: storage
+    description: "DuckDB primary store; JSON fallback used in tests when duckdb is missing."
+    confidence: high
+  - name: ai_provider
+    description: "Lightweight HTTP wrapper around OpenRouter/OpenAI-style backends in ai_provider.py."
+    confidence: medium
+  - name: orchestration
+    description: "Script-based orchestration (scripts/*.py), rerun_embeddings, scheduler."
+    confidence: medium
+
+organization:
+  - Keep UI code separated from heavy compute: Streamlit runs should avoid heavy compute inline (use a subprocess or a schedule).
+  - Pipelines are implemented as re-entrant functions returning summary dicts to facilitate testing and subprocess usage (seen in svd_pipeline.compute_svd_for_window); a sketch of this shape follows below.
+  - DB access is centralized via the MotionDatabase helper (database.py) with convenience methods (store_fused_embedding, append_audit_event).
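+
+# Hedged sketch (not repo code) of the re-entrant pipeline shape described above.
+# The function name mirrors svd_pipeline.compute_svd_for_window, but the
+# parameters, the COUNT(*) query, and the summary keys are illustrative assumptions.
+pipeline_shape_sketch: |
+  import duckdb
+
+  def compute_svd_for_window(db_path: str, window_id: str) -> dict:
+      # Self-contained: opens its own scoped read-only connection so the
+      # function can run in a test, a script, or a subprocess unchanged.
+      with duckdb.connect(db_path, read_only=True) as conn:
+          n_rows = conn.execute("SELECT COUNT(*) FROM mp_votes").fetchone()[0]
+      # ... SVD computation elided ...
+      return {"window_id": window_id, "input_rows": n_rows, "status": "ok"}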
+
+design_decisions:
+  - Use DuckDB for local, fast analytics storage; read_only connections are used in compute stages to allow parallel workers.
+  - Embeddings and the similarity cache are stored as JSON in DuckDB tables (vector columns).
+  - The ai_provider uses requests with retry/backoff rather than a heavy SDK, to keep testing simple.
+
+confidence_summary:
+  overall_confidence: high
+  notes: "Phase 1 inspected files across the whole repo; the design mapping is consistent with the code samples."
diff --git a/.mindmodel/constraints/db_connection.yaml b/.mindmodel/constraints/db_connection.yaml
new file mode 100644
index 0000000..52ed6a5
--- /dev/null
+++ b/.mindmodel/constraints/db_connection.yaml
@@ -0,0 +1,29 @@
+# DB connection handling constraints
+
+rules:
+  - name: use_context_managers_for_connections
+    rule: "Prefer using 'with duckdb.connect(path, read_only=...) as conn' for scoped DB interactions where possible."
+    rationale: "Ensures proper resource cleanup and avoids connection leaks."
+
+  - name: read_only_for_compute
+    rule: "Use read_only=True for compute steps that only read data (SVD, similarity compute)."
+    rationale: "Allows safe parallel workers and reduces write contention."
+
+  - name: short_lived_writes
+    rule: "When performing database writes, open short-lived connections, commit quickly, and close."
+    rationale: "Avoids long-lived transactions and reduces lock windows."
+
+examples:
+  - path: pipeline/svd_pipeline.py
+    snippet: |
+      conn = duckdb.connect(db_path, read_only=True)
+      try:
+          rows = conn.execute(...).fetchall()
+      finally:
+          conn.close()
+
+anti_patterns_and_remediations:
+  - bad: "Creating a global connection at import time that performs migrations."
+    remediation: "Move migrations to an explicit init function that runs at deployment/upgrade time."
+  - bad: "Not closing connections on exceptions."
+    remediation: "Wrap connections in `with` blocks, or close them in finally: blocks."
diff --git a/.mindmodel/constraints/error_handling.yaml b/.mindmodel/constraints/error_handling.yaml
new file mode 100644
index 0000000..2f95936
--- /dev/null
+++ b/.mindmodel/constraints/error_handling.yaml
@@ -0,0 +1,36 @@
+# Error handling style rules (YAML constraint example)
+
+rules:
+  - name: explicit_exceptions
+    rule: "Raise explicit exceptions (ValueError, ProviderError) for known error conditions rather than returning magic values."
+    examples:
+      - good: |
+          if not isinstance(text, str):
+              raise ProviderError('text must be a string')
+      - bad: |
+          if not isinstance(text, str):
+              return []
+
+  - name: avoid_broad_except
+    rule: "Avoid 'except Exception:' clauses that swallow errors. If a broad except is used for best-effort work, log the exception with logger.exception and re-raise or convert it."
+    examples:
+      - bad: |
+          try:
+              do_work()
+          except Exception:
+              return []
+      - remediation: |
+          try:
+              do_work()
+          except SpecificError as exc:
+              logger.warning('Handled error: %s', exc)
+              raise
+
+  - name: logging_over_print
+    rule: "Prefer logger.* over print() for messages and errors (see the sketch at the end of this file)."
+    examples:
+      - bad: "print('Error fetching motions from API: %s' % e)"
+      - good: "logger.exception('Error fetching motions from API')"
+
+enforcement_examples:
+  - "Add a static code check that flags 'print(' in modules (except in simple scripts) and 'except Exception:' usages without logger.exception."
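+
+# Hedged reference sketch (not repo code) combining the rules above: a module
+# logger, a narrow exception, logger.exception, and conversion to a typed
+# domain error. IngestionError and fetch_motions are illustrative names only.
+reference_sketch: |
+  import logging
+
+  logger = logging.getLogger(__name__)
+
+  class IngestionError(Exception):
+      """Typed domain error; the name is an assumption for this sketch."""
+
+  def fetch_motions(client):
+      try:
+          return client.get_motions()
+      except TimeoutError as exc:
+          # Log with stack trace, then convert to a typed domain error.
+          logger.exception("Timeout while fetching motions")
+          raise IngestionError("motion fetch timed out") from exc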
diff --git a/.mindmodel/constraints/imports.yaml b/.mindmodel/constraints/imports.yaml
new file mode 100644
index 0000000..453b4dc
--- /dev/null
+++ b/.mindmodel/constraints/imports.yaml
@@ -0,0 +1,24 @@
+# Import grouping and ordering constraints
+
+rules:
+  - name: grouping
+    rule: "Group imports in three sections separated by a single blank line: stdlib, third-party, local."
+    examples:
+      - good: |
+          import json
+          import logging
+
+          import requests
+          import duckdb
+
+          from .pipeline import text_pipeline
+      - bad: |
+          import duckdb
+          import json
+          from pipeline import text_pipeline
+
+  - name: from_imports
+    rule: "Prefer 'from x import y' only when it improves clarity or avoids a circular import; otherwise import the module and reference attributes."
+
+enforcement_examples:
+  - "Run isort, or ruff's import sorting, in pre-commit or CI to enforce ordering."
diff --git a/.mindmodel/constraints/naming.yaml b/.mindmodel/constraints/naming.yaml
new file mode 100644
index 0000000..c1c6d3e
--- /dev/null
+++ b/.mindmodel/constraints/naming.yaml
@@ -0,0 +1,30 @@
+# Naming constraint rules (example constraint file)
+
+rules:
+  - name: module_file_names
+    rule: "Use snake_case for Python module filenames (e.g., text_pipeline.py, ai_provider.py)."
+    examples:
+      - good: "text_pipeline.py"
+      - bad: "TextPipeline.py"
+
+  - name: function_names
+    rule: "Use snake_case for functions and methods."
+    examples:
+      - good: "def compute_similarities(...):"
+      - bad: "def ComputeSimilarities(...):"
+
+  - name: class_names
+    rule: "Use PascalCase for classes."
+    examples:
+      - good: "class MotionDatabase:"
+      - bad: "class motion_database:"
+
+  - name: constants
+    rule: "Constants use UPPER_SNAKE_CASE."
+    examples:
+      - good: "VOTE_MAP = { ... }"
+      - bad: "vote_map = { ... }"
+
+enforcement_examples:
+  - "Add a linter rule in CI: ruff or a flake8 naming plugin to detect violations."
+  - "Run `python -m pip install ruff` and `ruff check` as part of CI."
diff --git a/.mindmodel/constraints/testing.yaml b/.mindmodel/constraints/testing.yaml
new file mode 100644
index 0000000..f6095d9
--- /dev/null
+++ b/.mindmodel/constraints/testing.yaml
@@ -0,0 +1,26 @@
+# Testing conventions constraint (YAML)
+
+rules:
+  - name: test_naming
+    rule: "Use pytest; name test files test_*.py and test functions test_*."
+    examples:
+      - good: "tests/test_text_pipeline.py"
+      - bad: "tests/text_pipeline_test.py"
+
+  - name: fixtures_and_conftest
+    rule: "Place shared fixtures in tests/conftest.py or tests/fixtures/ for reuse (see the sketch at the end of this file)."
+    examples:
+      - good: "use fixtures declared in tests/conftest.py"
+
+  - name: assert_raises
+    rule: "Explicitly assert expected exceptions with pytest.raises for invalid input."
+    examples:
+      - good: |
+          import pytest
+
+          def test_invalid_input():
+              with pytest.raises(ValueError):
+                  function_under_test('bad')
+
+enforcement_examples:
+  - "Run pytest in CI; fail the build if tests don't run or if there are regressions."
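+
+# Hedged sketch (not repo code) of the shared-fixture rule: a conftest.py
+# fixture giving each test an isolated MotionDatabase. The MotionDatabase
+# db_path keyword is real; the tmp_path-based path and the fixture name are
+# assumptions.
+fixture_sketch: |
+  # tests/conftest.py
+  import pytest
+
+  from database import MotionDatabase
+
+  @pytest.fixture
+  def db(tmp_path):
+      # Fresh, isolated database file per test run.
+      return MotionDatabase(db_path=str(tmp_path / "test.db"))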
diff --git a/.mindmodel/conventions.yaml b/.mindmodel/conventions.yaml
new file mode 100644
index 0000000..01dfb80
--- /dev/null
+++ b/.mindmodel/conventions.yaml
@@ -0,0 +1,32 @@
+# Coding conventions cheat-sheet (extracted from Phase 1)
+
+naming:
+  module_files: snake_case (e.g., text_pipeline.py, ai_provider.py)
+  functions: snake_case
+  classes: PascalCase
+  constants: UPPER_SNAKE_CASE
+  module_singletons: module-level instances, named lower_snake (e.g., db = MotionDatabase())
+
+imports:
+  order:
+    - stdlib
+    - third-party
+    - local application imports
+  style:
+    - group imports with a blank line between groups
+    - prefer "from x import y" only when it improves clarity or avoids a circular import
+
+types_and_dataclasses:
+  - Use type hints broadly (functions, public APIs)
+  - config should be a dataclass in config.py
+  - Module-level singletons are allowed (but follow the lifecycle rules in the db_connection constraints)
+
+tests:
+  - pytest
+  - tests/ directory, files named test_*.py
+  - Use fixtures in tests/fixtures and conftest.py
+  - Tests assert pytest.raises(...) for invalid input (e.g., ValueError, ProviderError)
+
+error_handling:
+  - Prefer explicit exceptions (ValueError, ProviderError)
+  - Avoid overly broad except: clauses (see anti-patterns)
diff --git a/.mindmodel/dependencies.yaml b/.mindmodel/dependencies.yaml
new file mode 100644
index 0000000..4bccd9c
--- /dev/null
+++ b/.mindmodel/dependencies.yaml
@@ -0,0 +1,55 @@
+# Dependencies map and recommended extras (Phase 1 authoritative)
+declared:
+  - streamlit
+  - duckdb
+  - ibis-framework[duckdb]
+  - plotly
+  - scikit-learn
+  - scipy
+  - umap-learn
+  - openai  # note: declared but not observed imported; review usage
+  - requests
+
+observed:
+  - requests
+  - duckdb (used, but the import is sometimes guarded)
+  - numpy
+  - pytest
+
+grouped:
+  core:
+    - python >=3.13
+    - streamlit
+    - duckdb
+    - ibis-framework[duckdb]
+    - requests
+  ml:
+    - scikit-learn
+    - scipy
+    - umap-learn
+    - numpy
+  viz:
+    - plotly
+  testing:
+    - pytest
+
+recommended_extras:
+  reproducibility:
+    - poetry (poetry.lock) or pip-tools (requirements.txt + requirements.in)
+    - pipx or virtualenv usage documented
+  linting_and_formatting:
+    - black
+    - ruff
+    - isort
+    - mypy
+  logging_and_monitoring:
+    - structlog (optional)
+  containerization:
+    - docker (already used)
+  heavy_analytics:  # optional
+    - pandas
+    - altair
+    - dash (if more interactive dashboards are needed)
+notes:
+  - Because no lockfile was present during Phase 1, adding one is a high priority for reproducible CI builds.
+  - openai is declared but not imported anywhere in the Phase 1 files; either remove it or add explicit adapter usage plus tests.
diff --git a/.mindmodel/domain-glossary.yaml b/.mindmodel/domain-glossary.yaml
new file mode 100644
index 0000000..69df450
--- /dev/null
+++ b/.mindmodel/domain-glossary.yaml
@@ -0,0 +1,37 @@
+# Domain glossary (core concepts from Phase 1)
+
+terms:
+  Motion:
+    short: "A parliamentary motion/decision"
+    keys: [id, title, description, date, body_text, url]
+  motie:
+    short: "Dutch for motion (motie); equivalent to Motion in code comments and the UI."
+  MP:
+    short: "Member of Parliament (kamerlid)"
+    keys: [mp_name, party, van, tot_en_met, persoon_id]
+  mp_votes:
+    short: "Raw voting rows: motion_id, mp_name, vote, date"
+  mp_metadata:
+    short: "Per-MP metadata table and fields"
+  user_sessions:
+    short: "Streamlit user quiz session state (session_id, user_votes, completed_motions...)"
+  embeddings:
+    short: "Raw text embeddings stored per motion (embeddings table)"
+  svd_vectors:
+    short: "SVD-derived vectors from the vote matrix (svd_vectors table)"
+  fused_embeddings:
+    short: "Concatenation of SVD and text embeddings (fused_embeddings table)"
+  similarity_cache:
+    short: "Precomputed nearest neighbors for each motion"
+  window_id:
+    short: "Processing window identifier used for SVD/fusion runs"
+  controversy_score:
+    short: "Numeric measure stored in the motions table"
+  winning_margin:
+    short: "Numeric field indicating the margin of victory in a vote"
+  Politiek_Kompas:
+    short: "Political compass; also appears in UI features"
+  MP_quiz:
+    short: "Interactive quiz derived from motions and mp_votes"
+notes:
+  - Use these canonical terms in docs, tests, variable names, and DB schemas.
diff --git a/.mindmodel/examples/pattern-examples.md b/.mindmodel/examples/pattern-examples.md
new file mode 100644
index 0000000..aecdef6
--- /dev/null
+++ b/.mindmodel/examples/pattern-examples.md
@@ -0,0 +1,116 @@
+# Extracted pattern examples (representative snippets)
+
+Note: snippets are extracted from repository files (Phase 1). Paths shown.
+
+## DuckDB connect + schema init (database.py)
+```python
+conn = duckdb.connect(self.db_path)
+
+# Create sequence for auto-incrementing IDs
+try:
+    conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
+except:
+    pass
+
+# Create tables with proper ID handling
+conn.execute("""
+    CREATE TABLE IF NOT EXISTS motions (
+        id INTEGER DEFAULT nextval('motions_id_seq'),
+        title TEXT NOT NULL,
+        description TEXT,
+        date DATE,
+        policy_area TEXT,
+        voting_results JSON,
+        winning_margin FLOAT,
+        controversy_score FLOAT,
+        layman_explanation TEXT,
+        externe_identifier TEXT,
+        body_text TEXT,
+        url TEXT UNIQUE,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        PRIMARY KEY (id)
+    )
+""")
+conn.close()
+```
+
+## Read-only compute worker (svd_pipeline.py)
+```python
+conn = duckdb.connect(db_path, read_only=True)
+try:
+    rows = conn.execute(
+        "SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?",
+        (start_date, end_date),
+    ).fetchall()
+finally:
+    conn.close()
+```
+
+## Requests with retry/backoff (ai_provider.py)
+```python
+resp = requests.post(url, json=json, headers=headers, timeout=10)
+...
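+# (The continuation below honors Retry-After on HTTP 429, parsing it either as
+# integer seconds or as an HTTP date, before sleeping and retrying.)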
+if getattr(resp, "status_code", 0) == 429:
+    if attempt == retries:
+        raise ProviderError(f"Provider returned HTTP {resp.status_code}")
+    retry_after = None
+    raw = resp.headers.get("Retry-After") if getattr(resp, "headers", None) else None
+    if raw:
+        try:
+            retry_after = int(raw)
+        except Exception:
+            try:
+                dt = parsedate_to_datetime(raw)
+                now = datetime.now(tz=dt.tzinfo or timezone.utc)
+                secs = (dt - now).total_seconds()
+                retry_after = max(0, int(secs))
+            except Exception:
+                retry_after = None
+
+    if retry_after is not None:
+        time.sleep(retry_after)
+        continue
+```
+
+## Embedding batch + per-item fallback (pipeline/ai_provider_wrapper.py)
+```python
+for start in range(0, len(texts), batch_size):
+    end = min(start + batch_size, len(texts))
+    chunk = texts[start:end]
+    emb_chunk, emb_exc = _attempt_batch(chunk, start)
+    if emb_chunk is not None:
+        for j, emb in enumerate(emb_chunk):
+            results[start + j] = emb
+        continue
+
+    # batch failed -> fall back to per-item attempts
+    for j in range(start, end):
+        t = texts[j]
+        single, single_exc = _attempt_batch([t], j)
+        if single:
+            results[j] = single[0]
+            continue
+        results[j] = None
+```
+
+## Similarity compute (similarity/compute.py)
+```python
+# Ensure consistent dimensionality: pad shorter vectors with zeros
+lengths = [len(v) for v in vecs]
+max_dim = max(lengths)
+if len(set(lengths)) != 1:
+    logger.warning(
+        "Inconsistent vector dimensions detected (max=%d). Padding shorter vectors with zeros.",
+        max_dim,
+    )
+
+matrix = np.zeros((len(vecs), max_dim), dtype=np.float32)
+for i, v in enumerate(vecs):
+    matrix[i, : len(v)] = v
+
+# Normalize rows and compute cosine similarity
+norms = np.linalg.norm(matrix, axis=1, keepdims=True)
+norms[norms == 0] = 1.0
+normalized = matrix / norms
+sim = normalized @ normalized.T
+```
diff --git a/.mindmodel/manifest.yaml b/.mindmodel/manifest.yaml
new file mode 100644
index 0000000..071febb
--- /dev/null
+++ b/.mindmodel/manifest.yaml
@@ -0,0 +1,60 @@
+name: stemwijzer
+version: 2
+categories:
+  - path: stack.yaml
+    description: Project technology stack (languages, frameworks, runtime)
+    group: stack
+  - path: dependencies.yaml
+    description: Declared and recommended dependencies grouped by purpose
+    group: stack
+  - path: system.md
+    description: System overview and high-level architecture notes
+    group: architecture
+  - path: architecture.yaml
+    description: Architectural layers, organization, and confidence levels
+    group: architecture
+  - path: conventions.yaml
+    description: Coding conventions cheat-sheet (naming, imports, types)
+    group: style
+  - path: domain-glossary.yaml
+    description: Business domain glossary for the project
+    group: domain
+  - path: patterns/duckdb_access.yaml
+    description: DuckDB access patterns, examples, and anti-patterns
+    group: patterns
+  - path: patterns/requests_http.yaml
+    description: Requests/HTTP client usage and retry best practices
+    group: patterns
+  - path: patterns/embeddings_similarity.yaml
+    description: Embedding, SVD, fusion, and similarity pipeline patterns
+    group: patterns
+  - path: patterns/error_handling.yaml
+    description: Error handling patterns and rules
+    group: patterns
+  - path: patterns/validation.yaml
+    description: Input/domain validation patterns and examples
+    group: patterns
+  - path: patterns/module_singletons.yaml
+    description: Module-level singletons and lifecycle patterns
+    group: patterns
+  - path: anti-patterns.yaml
+    description: Known anti-patterns and remediation steps
+    group: patterns
+  - path: examples/pattern-examples.md
+    description: Consolidated extracted code examples across patterns
+    group: patterns
+  - path: constraints/naming.yaml
+    description: Enforce naming rules (snake_case, PascalCase, constants)
+    group: constraints
+  - path: constraints/imports.yaml
+    description: Enforce import grouping and ordering
+    group: constraints
+  - path: constraints/db_connection.yaml
+    description: Rules for opening/closing DB connections and read-only usage
+    group: constraints
+  - path: constraints/error_handling.yaml
+    description: Error handling style and allowed exception scopes
+    group: constraints
+  - path: constraints/testing.yaml
+    description: Test conventions (pytest, test naming, fixtures)
+    group: constraints
diff --git a/.mindmodel/patterns/duckdb_access.yaml b/.mindmodel/patterns/duckdb_access.yaml
new file mode 100644
index 0000000..63204a5
--- /dev/null
+++ b/.mindmodel/patterns/duckdb_access.yaml
@@ -0,0 +1,70 @@
+name: duckdb_access
+
+rules:
+  - Prefer read_only=True for compute-only subprocesses (e.g., SVD compute) to allow concurrent readers.
+  - Prefer "with duckdb.connect(db_path, read_only=True) as conn" for scoped connections so conn.close() is automatic.
+  - If a long-lived connection is created at module level, provide an explicit close() or ensure the operation is safe for Streamlit's lifecycle.
+  - Parameterize db_path in pipelines and create connections locally (avoid global connections that cross threads).
+
+examples:
+  - path: database.py
+    excerpt: |
+      ```python
+      conn = duckdb.connect(self.db_path)
+      ...
+      conn.execute("""
+          CREATE TABLE IF NOT EXISTS fused_embeddings (
+              id INTEGER DEFAULT nextval('fused_embeddings_id_seq'),
+              motion_id INTEGER NOT NULL,
+              window_id TEXT NOT NULL,
+              vector JSON NOT NULL,
+              svd_dims INTEGER NOT NULL,
+              text_dims INTEGER NOT NULL,
+              created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+              PRIMARY KEY (id)
+          )
+      """)
+      conn.close()
+      ```
+    note: explicit connect/close used when initializing the schema
+
+  - path: pipeline/svd_pipeline.py
+    excerpt: |
+      ```python
+      conn = duckdb.connect(db_path, read_only=True)
+      try:
+          rows = conn.execute(
+              "SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?",
+              (start_date, end_date),
+          ).fetchall()
+      finally:
+          conn.close()
+      ```
+    note: read_only connection used for a compute-heavy worker
+
+  - path: similarity/compute.py
+    excerpt: |
+      ```python
+      try:
+          import duckdb
+      except Exception:
+          logger.exception("duckdb import failed; cannot load vectors")
+          return 0
+
+      with duckdb.connect(db.db_path) as conn:
+          rows = conn.execute(query, params).fetchall()
+      ```
+    note: preferred 'with' context for automatic close
+
+anti_patterns:
+  - Bad: creating a connection without a guaranteed close in a long-running process
+    remediation: use a "with" context or ensure conn.close() in a finally block
+    example: |
+      ```python
+      # BAD: the connection may leak if an exception occurs before the explicit close
+      conn = duckdb.connect(db_path)
+      rows = conn.execute("SELECT ...").fetchall()
+      # missing finally/close
+      ```
+  - Bad: opening write connections from many parallel workers without coordination
+    remediation: open read_only connections for compute processes and centralize writes via short-lived connections or a single writer worker (see the combined sketch below).
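+
+# Hedged sketch (not repo code) combining the rules above: a scoped read-only
+# connection for compute reads and a short-lived connection for writes. The
+# helper names are assumptions; table and column names follow the schema excerpts.
+reference_sketch: |
+  import duckdb
+
+  def load_votes(db_path, start_date, end_date):
+      # Scoped read-only connection: safe for parallel compute workers.
+      with duckdb.connect(db_path, read_only=True) as conn:
+          return conn.execute(
+              "SELECT motion_id, mp_name, vote FROM mp_votes "
+              "WHERE date BETWEEN ? AND ?",
+              (start_date, end_date),
+          ).fetchall()
+
+  def store_score(db_path, motion_id, score):
+      # Short-lived write connection: open, write, close.
+      with duckdb.connect(db_path) as conn:
+          conn.execute(
+              "UPDATE motions SET controversy_score = ? WHERE id = ?",
+              (score, motion_id),
+          )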
diff --git a/.mindmodel/patterns/embeddings_similarity.yaml b/.mindmodel/patterns/embeddings_similarity.yaml
new file mode 100644
index 0000000..40a3149
--- /dev/null
+++ b/.mindmodel/patterns/embeddings_similarity.yaml
@@ -0,0 +1,63 @@
+name: embeddings_similarity_pipeline
+
+rules:
+  - Keep embedding calls batched where possible; fall back to per-item attempts on persistent batch failure.
+  - Store raw embeddings, SVD vectors, and fused embeddings separately; fused_embeddings are typically the concatenation [svd + text].
+  - Compute similarity as normalized cosine on padded vectors; record the top-k neighbors in similarity_cache.
+  - Use read_only DuckDB connections in compute workers to allow parallel runs.
+
+examples:
+  - path: pipeline/ai_provider_wrapper.py
+    excerpt: |
+      ```python
+      for start in range(0, len(texts), batch_size):
+          end = min(start + batch_size, len(texts))
+          chunk = texts[start:end]
+          resp = _post_with_retries("/embeddings", json={"model": model, "input": chunk})
+          ...
+          for j in range(start, end):
+              t = texts[j]
+              single, single_exc = _attempt_batch([t], j)
+              if single:
+                  results[j] = single[0]
+      ```
+    note: batched embed with per-item fallback retry
+
+  - path: pipeline/fusion.py
+    excerpt: |
+      ```python
+      try:
+          svd_vec = json.loads(svd_json)
+      except Exception:
+          _logger.exception("Invalid SVD vector JSON for entity %s", entity_id)
+          skipped_missing_svd += 1
+          continue
+      ...
+      fused = list(svd_vec) + list(text_vec)
+      res = db.store_fused_embedding(
+          int(entity_id),
+          window_id,
+          fused,
+          svd_dims=len(svd_vec),
+          text_dims=len(text_vec),
+      )
+      ```
+    note: concatenation of vectors and storage via MotionDatabase
+
+  - path: similarity/compute.py
+    excerpt: |
+      ```python
+      # Normalize rows
+      norms = np.linalg.norm(matrix, axis=1, keepdims=True)
+      norms[norms == 0] = 1.0
+      normalized = matrix / norms
+      sim = normalized @ normalized.T
+      ...
+      # pick top-k neighbors and write to similarity_cache
+      ```
+    note: numeric pipeline with padding to consistent dimensionality
+
+anti_patterns:
+  - Bad: Assuming consistent vector length without checks (leads to shape errors).
+    remediation: Detect inconsistent lengths, pad with zeros, and log a warning (as seen in compute.py).
+  - Bad: Recomputing heavy pipelines inline in UI requests.
+    remediation: Schedule heavy work in scripts/subprocesses and read precomputed results in the UI.
diff --git a/.mindmodel/patterns/error_handling.yaml b/.mindmodel/patterns/error_handling.yaml
new file mode 100644
index 0000000..d6344cc
--- /dev/null
+++ b/.mindmodel/patterns/error_handling.yaml
@@ -0,0 +1,54 @@
+name: error_handling
+
+rules:
+  - Use explicit exceptions for domain/error classification (e.g., ProviderError, ValueError).
+  - Prefer logging.exception when catching an exception where a stack trace is useful.
+  - Avoid broad except: clauses that swallow exceptions; if a broad except is used for a "best-effort" fallback, log at warning and include the original exception context.
+  - For public library-like functions, prefer raising typed exceptions instead of returning magic values ([], False); only return safe defaults where documented.
+
+examples:
+  - path: ai_provider.py
+    excerpt: |
+      ```python
+      except requests.ConnectionError as exc:
+          if attempt == retries:
+              raise ProviderError(
+                  f"Connection error when calling provider: {exc}"
+              ) from exc
+      ...
+ ``` + note: mapping network error to ProviderError with re-raise chaining + + - path: pipeline/ai_provider_wrapper.py + excerpt: | + ```python + except Exception: + _logger.exception("Failed to append audit event for embedding failure") + results[j] = None + ``` + note: logs and assigns None for failure; fallback behavior documented earlier in wrapper rule + + - path: similarity/compute.py + excerpt: | + ```python + try: + import duckdb + except Exception: + logger.exception("duckdb import failed; cannot load vectors") + return 0 + ``` + note: defensive import handling and early return on failure + +anti_patterns: + - Bad: Broad except without logging and without re-raising (silently hides bugs) + remediation: Narrow exception types or at minimum log.exception() and re-raise or convert to a domain error if truly handled. + example: | + ```python + try: + do_work() + except Exception: + return [] + # BAD: hides the root cause and returns an ambiguous default + ``` + - Bad: Mixing print() and logging for errors + remediation: Replace print() calls with logger.* calls; use structured logging configuration. diff --git a/.mindmodel/patterns/module_singletons.yaml b/.mindmodel/patterns/module_singletons.yaml new file mode 100644 index 0000000..7ce7d96 --- /dev/null +++ b/.mindmodel/patterns/module_singletons.yaml @@ -0,0 +1,33 @@ +name: module_singletons + +rules: + - Module-level singletons (e.g., db = MotionDatabase()) are acceptable but should be created carefully: + - Avoid expensive initialization at import time. + - Provide a way to construct with a test DB path or to reinitialize in tests. + - If a singleton holds resources (DB connections, sessions), ensure safe shutdown on program exit. + +examples: + - path: database.py + excerpt: | + ```python + class MotionDatabase: + def __init__(self, db_path: str = config.DATABASE_PATH): + self.db_path = db_path + # If duckdb is not available, operate in lightweight file-backed mode + self._file_mode = duckdb is None + self._init_database() + ``` + note: class is safe to instantiate and creates DB at init; consider lazy init if heavy + + - path: similarity/lookup.py + excerpt: | + ```python + db = MotionDatabase(db_path=db_path) if db_path else MotionDatabase() + if hasattr(db, "get_cached_similarities"): + rows = db.get_cached_similarities(...) + ``` + note: consumers create local MotionDatabase instances, not relying on a single global + +anti_patterns: + - Bad: Creating connections and performing heavy schema migrations during import + remediation: Move heavy init to an explicit initialize() method and keep import fast. diff --git a/.mindmodel/patterns/requests_http.yaml b/.mindmodel/patterns/requests_http.yaml new file mode 100644 index 0000000..135287c --- /dev/null +++ b/.mindmodel/patterns/requests_http.yaml @@ -0,0 +1,65 @@ +name: requests_http + +rules: + - Reuse requests.Session when making multiple calls to the same host to benefit from connection pooling. + - Wrap outbound HTTP calls with retry/backoff logic and respect Retry-After on 429. + - Treat 5xx as transient and retry; surface 4xx as configuration/client errors (do not retry unless 429). + - Raise or wrap non-OK responses into domain ProviderError to make behavior consistent across the codebase. + +examples: + - path: ai_provider.py + excerpt: | + ```python + resp = requests.post(url, json=json, headers=headers, timeout=10) + ... 
+      if getattr(resp, "status_code", 0) == 429:
+          if attempt == retries:
+              raise ProviderError(f"Provider returned HTTP {resp.status_code}")
+          retry_after = None
+          raw = resp.headers.get("Retry-After") if getattr(resp, "headers", None) else None
+          if raw:
+              try:
+                  retry_after = int(raw)
+              except Exception:
+                  ...
+          if retry_after is not None:
+              time.sleep(retry_after)
+              continue
+      ```
+    note: explicit handling of 429 and Retry-After
+
+  - path: api_client.py
+    excerpt: |
+      ```python
+      response = self.session.get(
+          base_url, params=params, timeout=config.API_TIMEOUT
+      )
+      response.raise_for_status()
+      data = response.json()
+      ```
+    note: uses a session + raise_for_status() to surface HTTP errors
+
+  - path: pipeline/ai_provider_wrapper.py
+    excerpt: |
+      ```python
+      def _attempt_batch(chunk_texts, start_index):
+          backoff = 0.5
+          for attempt in range(1, retries + 1):
+              try:
+                  emb_chunk = _embedder(
+                      chunk_texts, model=model, batch_size=len(chunk_texts)
+                  )
+                  return emb_chunk, None
+              except Exception as exc:
+                  if attempt == retries:
+                      break
+                  sleep = backoff * (2 ** (attempt - 1))
+                  time.sleep(sleep)
+                  continue
+      ```
+    note: the wrapper adds retry/backoff and per-item fallback
+
+anti_patterns:
+  - Bad: Blindly catching all requests exceptions and returning an empty response
+    remediation: map network exceptions to retryable vs terminal (ProviderError) and log details.
+  - Bad: Using print() for network errors instead of structured logging (seen in api_client.py); prefer logging.
diff --git a/.mindmodel/patterns/validation.yaml b/.mindmodel/patterns/validation.yaml
new file mode 100644
index 0000000..5b68808
--- /dev/null
+++ b/.mindmodel/patterns/validation.yaml
@@ -0,0 +1,29 @@
+name: validation
+
+rules:
+  - Validate inputs early and raise ValueError or a domain-specific exception (ProviderError) for invalid contract inputs.
+  - Tests should assert that invalid inputs raise the expected exceptions (see the test sketch at the end of this file).
+  - Use explicit checks for types and shapes on public APIs (e.g., ensure text is a str before embedding).
+
+examples:
+  - path: ai_provider.py
+    excerpt: |
+      ```python
+      if not isinstance(text, str):
+          raise ProviderError("text must be a string")
+      ```
+    note: explicit type validation before the network call
+
+  - path: pipeline/ai_provider_wrapper.py
+    excerpt: |
+      ```python
+      if not texts:
+          return []
+      if motion_ids is None:
+          motion_ids = [None for _ in texts]
+      ```
+    note: defensive handling of empty inputs
+
+anti_patterns:
+  - Bad: Allowing invalid values to propagate into heavy computation (e.g., a non-string into the embedding pipeline).
+    remediation: Fail fast with a typed exception and add unit tests to cover the validations.
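+
+# Hedged test sketch (not repo code) for the rule above. ProviderError is real
+# (ai_provider.py); embed_text here is a local stand-in, not an existing API.
+test_sketch: |
+  import pytest
+
+  from ai_provider import ProviderError
+
+  def embed_text(text):
+      # Stand-in for the real provider call; validates like ai_provider.py does.
+      if not isinstance(text, str):
+          raise ProviderError("text must be a string")
+      return [0.0]
+
+  def test_non_string_input_fails_fast():
+      with pytest.raises(ProviderError):
+          embed_text(123)  # must fail before any network call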
diff --git a/.mindmodel/stack.yaml b/.mindmodel/stack.yaml
new file mode 100644
index 0000000..857d190
--- /dev/null
+++ b/.mindmodel/stack.yaml
@@ -0,0 +1,33 @@
+# Tech stack (Phase 1 authoritative)
+
+language:
+  name: python
+  version: ">=3.13"
+
+frameworks:
+  - streamlit: ">=1.48.0"  # UI: Home.py, pages/..., app.py
+
+database:
+  primary: duckdb
+  orm_or_adapter: ibis-framework[duckdb]  # used in some parts of the codebase
+
+visualization:
+  - plotly
+
+ml:
+  - scikit-learn
+  - scipy
+  - umap-learn
+
+ai:
+  declared_dependency: openai  # declared in pyproject but not observed imported; ai_provider uses requests
+  runtime_adapter: custom requests-based wrapper (ai_provider.py)
+
+container:
+  - docker: Dockerfile FROM python:3.13-slim, EXPOSE 8501, CMD streamlit run Home.py
+
+testing:
+  - pytest
+
+ci:
+  - drone: .drone.yml present
diff --git a/.mindmodel/system.md b/.mindmodel/system.md
new file mode 100644
index 0000000..c90657d
--- /dev/null
+++ b/.mindmodel/system.md
@@ -0,0 +1,18 @@
+# System overview
+
+This project is a Streamlit-based UI and data-processing pipeline that computes embeddings,
+performs SVD over MP/motion voting matrices, fuses vector representations, and precomputes
+a similarity cache for quick lookup in the UI.
+
+Key subsystems:
+- UI: Streamlit pages (Home.py, pages/*). Exposes the interactive explorer and quizzes.
+- Data ingestion: scripts plus scraper.py and api_client.py (Tweede Kamer OData).
+- Processing pipelines: pipeline/* (text embeddings, SVD, fusion).
+- Similarity layer: similarity/compute.py and similarity/lookup.py, storing precomputed neighbors.
+- Storage: DuckDB (primary), with a JSON-file fallback used in tests/environments without duckdb.
+- AI/Embedding provider: ai_provider.py (HTTP wrapper around an OpenRouter/OpenAI-compatible API).
+
+Operational notes:
+- A Dockerfile exists; Streamlit's default port 8501 is exposed.
+- Tests use pytest. CI uses Drone (.drone.yml).
+- There is no lockfile in the repository snapshot; add one (poetry.lock or a pinned requirements.txt) for reproducible installs.
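+
+As a hedged sketch (the script path and flag are assumptions based on the
+orchestration notes, not verified repo code), heavy recomputation is meant to
+run outside the Streamlit process, with the UI reading the precomputed
+similarity cache afterwards:
+
+```python
+import subprocess
+import sys
+
+# Run the heavy pipeline in a separate process so the Streamlit app stays
+# responsive; the UI then reads results from the precomputed similarity_cache.
+subprocess.run(
+    [sys.executable, "scripts/rerun_embeddings.py", "--window-id", "2024Q1"],
+    check=True,
+)
+```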