Deleted stale root-level Python files: - main.py (unused 'Hello world' script) - verify.py (unused table info script) - scraper.py (unused MotionScraper class) - scheduler.py (unused DataUpdateScheduler class) Deleted duplicate .mindmodel root YAML files (subdirectory versions are more comprehensive): - anti-patterns.yaml, architecture.yaml, conventions.yaml - dependencies.yaml, domain.yaml, domain-glossary.yaml - stack.yaml, tech-stack.yaml, workflows.yaml Added comprehensive .mindmodel subdirectories: - constraints/ (naming, db-schema, error-handling, types, etc.) - patterns/ (api, architecture, database, python, streamlit, etc.) - examples/ (code examples for each pattern) - anti-patterns/, architecture/, conventions/, dependencies/, domain/, stack/ Updated ARCHITECTURE.md to reflect current codebase: - Removed references to non-existent files - Added missing files (explorer.py, explorer_helpers.py, pipeline/) - Added directory structure documentation - Updated tech stack to include scipy, sklearn, umap Updated .gitignore: - Added patterns for generated analysis files - Added .worktrees/ pattern (was already in gitignore but dir was deleted) Removed empty .worktrees/ directory
main
parent
0308d20f12
commit
f376300804
@ -0,0 +1,184 @@ |
|||||||
|
# Error Handling Constraints |
||||||
|
|
||||||
|
## Core Rule |
||||||
|
|
||||||
|
**Catch `Exception`, return safe fallbacks (False/[]/None)** |
||||||
|
|
||||||
|
Never let exceptions propagate to user-facing code. Always provide a safe default. |
||||||
|
|
||||||
|
## Patterns |
||||||
|
|
||||||
|
### For Not-Found Operations |
||||||
|
|
||||||
|
Return `None` or falsy value when item not found: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD: Return None on not found |
||||||
|
def get_motion_by_id(self, motion_id: int) -> Optional[Dict]: |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
result = conn.execute( |
||||||
|
"SELECT * FROM motions WHERE id = ?", (motion_id,) |
||||||
|
).fetchone() |
||||||
|
conn.close() |
||||||
|
return result |
||||||
|
except Exception: |
||||||
|
if conn: |
||||||
|
conn.close() |
||||||
|
return None |
||||||
|
``` |
||||||
|
|
||||||
|
### For Collection Operations |
||||||
|
|
||||||
|
Return empty list when no results: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD: Return empty list on failure |
||||||
|
def get_filtered_motions(self, **kwargs) -> List[Dict]: |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
rows = conn.execute(query, params).fetchall() |
||||||
|
conn.close() |
||||||
|
return rows |
||||||
|
except Exception: |
||||||
|
if conn: |
||||||
|
conn.close() |
||||||
|
return [] |
||||||
|
``` |
||||||
|
|
||||||
|
### For Boolean Operations |
||||||
|
|
||||||
|
Return `False` for failed boolean checks: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD: Return False on failure |
||||||
|
def motion_exists(self, motion_id: int) -> bool: |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
count = conn.execute( |
||||||
|
"SELECT COUNT(*) FROM motions WHERE id = ?", (motion_id,) |
||||||
|
).fetchone()[0] |
||||||
|
conn.close() |
||||||
|
return count > 0 |
||||||
|
except Exception: |
||||||
|
if conn: |
||||||
|
conn.close() |
||||||
|
return False |
||||||
|
``` |
||||||
|
|
||||||
|
### For Creation Operations |
||||||
|
|
||||||
|
Return `False` or empty string on failure: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD: Return empty string on failure |
||||||
|
def generate_summary(self, title: str, body: str) -> str: |
||||||
|
try: |
||||||
|
return ai_provider.chat_completion(messages) |
||||||
|
except ai_provider.ProviderError: |
||||||
|
logger.exception("AI provider failed") |
||||||
|
return "" |
||||||
|
``` |
||||||
|
|
||||||
|
## Anti-Patterns to Avoid |
||||||
|
|
||||||
|
### Don't Catch Specific Exceptions Only |
||||||
|
```python |
||||||
|
# BAD: Catches only FileNotFoundError, misses other issues |
||||||
|
try: |
||||||
|
with open(path) as f: |
||||||
|
return json.load(f) |
||||||
|
except FileNotFoundError: |
||||||
|
return None |
||||||
|
``` |
||||||
|
|
||||||
|
### Don't Re-raise Without Context |
||||||
|
```python |
||||||
|
# BAD: Loses information |
||||||
|
try: |
||||||
|
process(data) |
||||||
|
except Exception: |
||||||
|
raise # No context added |
||||||
|
``` |
||||||
|
|
||||||
|
### Don't Swallow Exceptions Silently |
||||||
|
```python |
||||||
|
# BAD: No logging, no fallback |
||||||
|
try: |
||||||
|
return risky_operation() |
||||||
|
except Exception: |
||||||
|
pass # What happened? |
||||||
|
``` |
||||||
|
|
||||||
|
## Nested Exception Handling |
||||||
|
|
||||||
|
When calling code that has its own error handling, wrap only if needed: |
||||||
|
|
||||||
|
```python |
||||||
|
# Accept result from wrapped function (it handles errors) |
||||||
|
def fetch_motions(self, start_date): |
||||||
|
# ai_provider_wrapper handles retries internally |
||||||
|
embeddings = get_embeddings_with_retry(texts) |
||||||
|
|
||||||
|
# Only wrap if wrapper doesn't handle errors |
||||||
|
if all(e is None for e in embeddings): |
||||||
|
logger.error("All embeddings failed") |
||||||
|
return [] |
||||||
|
|
||||||
|
return process(embeddings) |
||||||
|
``` |
||||||
|
|
||||||
|
## Context Managers |
||||||
|
|
||||||
|
Use `try/finally` for cleanup: |
||||||
|
|
||||||
|
```python |
||||||
|
def process_with_temp_file(self): |
||||||
|
temp = NamedTemporaryFile(delete=False) |
||||||
|
try: |
||||||
|
temp.write(data) |
||||||
|
temp.close() |
||||||
|
return process_file(temp.name) |
||||||
|
finally: |
||||||
|
temp.close() |
||||||
|
os.unlink(temp.name) |
||||||
|
``` |
||||||
|
|
||||||
|
## When to Log vs Return |
||||||
|
|
||||||
|
| Scenario | Action | |
||||||
|
|----------|--------| |
||||||
|
| User action fails | Log warning, return safe default | |
||||||
|
| Internal error (corrupt data) | Log error, return safe default | |
||||||
|
| Transient failure (network) | Log warning, retry if appropriate | |
||||||
|
| Configuration error | Log error, raise with clear message | |
||||||
|
|
||||||
|
## Exception Propagation |
||||||
|
|
||||||
|
Only raise exceptions for: |
||||||
|
1. Configuration/setup errors (missing required env vars) |
||||||
|
2. Programming errors (invalid arguments) |
||||||
|
3. Fatal system errors (database corruption) |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD: Raise for configuration errors |
||||||
|
def _get_api_key(self) -> str: |
||||||
|
key = os.environ.get("OPENROUTER_API_KEY") |
||||||
|
if not key: |
||||||
|
raise ProviderError( |
||||||
|
"OPENROUTER_API_KEY environment variable is required" |
||||||
|
) |
||||||
|
return key |
||||||
|
``` |
||||||
|
|
||||||
|
## Logging Errors |
||||||
|
|
||||||
|
Always include context: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD: Include relevant context |
||||||
|
_logger.error( |
||||||
|
"Failed to fetch motion %d: %s", |
||||||
|
motion_id, |
||||||
|
exc |
||||||
|
) |
||||||
|
|
||||||
|
# BAD: No context |
||||||
|
_logger.error("Failed to fetch") |
||||||
|
``` |
||||||
@ -1,24 +1,205 @@ |
|||||||
# Import grouping and ordering constraints |
# Import Organization Constraints |
||||||
|
|
||||||
rules: |
## Standard Order |
||||||
- name: grouping |
|
||||||
rule: "Group imports in three sections separated by a single blank line: stdlib, third-party, local." |
Organize imports in three groups with blank lines between: |
||||||
examples: |
|
||||||
- good: | |
```python |
||||||
import json |
# 1. Standard library imports (alphabetical within group) |
||||||
import logging |
import json |
||||||
|
import logging |
||||||
import requests |
import os |
||||||
import duckdb |
from datetime import datetime, timedelta |
||||||
|
from typing import Dict, List, Optional, Tuple |
||||||
from .pipeline import text_pipeline |
|
||||||
- bad: | |
# 2. Third-party packages (alphabetical within group) |
||||||
import duckdb |
import duckdb |
||||||
import json |
import requests |
||||||
from pipeline import text_pipeline |
|
||||||
|
|
||||||
- name: from_imports |
# 3. Local application modules (can use relative imports) |
||||||
rule: "Prefer 'from x import y' only when it improves clarity or avoids circular import; otherwise import module and reference attributes." |
from config import config |
||||||
|
from database import db |
||||||
|
from summarizer import summarizer |
||||||
enforcement_examples: |
``` |
||||||
- "Run isort or ruff- import sorting in pre-commit or CI to enforce ordering." |
|
||||||
|
## Alphabetical Ordering |
||||||
|
|
||||||
|
Within each group, sort imports alphabetically: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD - alphabetical |
||||||
|
import json |
||||||
|
import logging |
||||||
|
from datetime import datetime |
||||||
|
from typing import Dict, List, Optional |
||||||
|
|
||||||
|
# BAD - random order |
||||||
|
from typing import Optional |
||||||
|
import json |
||||||
|
from datetime import datetime |
||||||
|
import logging |
||||||
|
from typing import Dict, List |
||||||
|
``` |
||||||
|
|
||||||
|
## Grouping Rules |
||||||
|
|
||||||
|
### Standard Library |
||||||
|
- `json`, `logging`, `os`, `sys`, `time` |
||||||
|
- `datetime`, `timedelta` from `datetime` |
||||||
|
- `Dict`, `List`, `Optional`, etc. from `typing` |
||||||
|
- `argparse`, `pathlib`, `re`, `uuid` |
||||||
|
|
||||||
|
### Third-Party |
||||||
|
- `duckdb`, `requests`, `streamlit` |
||||||
|
- `numpy`, `scipy`, `sklearn` |
||||||
|
- `plotly`, `beautifulsoup4` |
||||||
|
- `pytest` |
||||||
|
|
||||||
|
### Local Application |
||||||
|
- Modules from same package |
||||||
|
- Relative imports when appropriate |
||||||
|
|
||||||
|
## When to Use `from X import Y` |
||||||
|
|
||||||
|
### Prefer `from module import specific_items` for: |
||||||
|
- Constants and config |
||||||
|
- Single classes or functions used frequently |
||||||
|
- Type annotations |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD - clear about what we're using |
||||||
|
from config import config |
||||||
|
from database import db |
||||||
|
|
||||||
|
# GOOD - type hints |
||||||
|
from typing import Dict, List, Optional |
||||||
|
``` |
||||||
|
|
||||||
|
### Use `import module` when: |
||||||
|
- You need multiple items from the module |
||||||
|
- Using module.namespace is clearer |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD - duckdb used for types and module access |
||||||
|
import duckdb |
||||||
|
|
||||||
|
conn = duckdb.connect(...) |
||||||
|
result = conn.execute(...) |
||||||
|
|
||||||
|
# Also acceptable for types |
||||||
|
from typing import Dict |
||||||
|
``` |
||||||
|
|
||||||
|
## Relative Imports |
||||||
|
|
||||||
|
In package modules, prefer relative imports: |
||||||
|
|
||||||
|
```python |
||||||
|
# pipeline/svd_pipeline.py |
||||||
|
from ..database import MotionDatabase # relative import |
||||||
|
from .text_pipeline import process_text # relative import |
||||||
|
``` |
||||||
|
|
||||||
|
## Circular Imports |
||||||
|
|
||||||
|
Avoid circular imports by: |
||||||
|
1. Moving shared code to a third module |
||||||
|
2. Using TYPE_CHECKING for type hints only |
||||||
|
|
||||||
|
```python |
||||||
|
# types.py - shared type definitions |
||||||
|
from typing import TypedDict |
||||||
|
|
||||||
|
class MotionDict(TypedDict): |
||||||
|
id: int |
||||||
|
title: str |
||||||
|
... |
||||||
|
|
||||||
|
# module_a.py |
||||||
|
from .types import MotionDict |
||||||
|
|
||||||
|
# module_b.py - if needed here too |
||||||
|
from .types import MotionDict |
||||||
|
``` |
||||||
|
|
||||||
|
## Import Patterns to Avoid |
||||||
|
|
||||||
|
### Wildcard Imports |
||||||
|
```python |
||||||
|
# BAD |
||||||
|
from database import * |
||||||
|
|
||||||
|
# GOOD |
||||||
|
from database import db, MotionDatabase |
||||||
|
``` |
||||||
|
|
||||||
|
### Import in Function Scope (unless necessary) |
||||||
|
```python |
||||||
|
# AVOID - delays import, makes dependencies unclear |
||||||
|
def some_function(): |
||||||
|
import pandas as pd # Late import |
||||||
|
return pd.DataFrame(...) |
||||||
|
|
||||||
|
# PREFER - import at module level |
||||||
|
import pandas as pd |
||||||
|
|
||||||
|
def some_function(): |
||||||
|
return pd.DataFrame(...) |
||||||
|
``` |
||||||
|
|
||||||
|
### Reassigning Imported Names |
||||||
|
```python |
||||||
|
# BAD - confusing |
||||||
|
from module import process |
||||||
|
process = something_else # Reassigning |
||||||
|
|
||||||
|
# GOOD - clear naming |
||||||
|
from module import process as process_data |
||||||
|
``` |
||||||
|
|
||||||
|
## Type Checking Imports |
||||||
|
|
||||||
|
For type hints only, use TYPE_CHECKING: |
||||||
|
|
||||||
|
```python |
||||||
|
from typing import TYPE_CHECKING |
||||||
|
|
||||||
|
if TYPE_CHECKING: |
||||||
|
from .models import Motion |
||||||
|
|
||||||
|
def get_motion(motion_id: int) -> "Motion": # String quote for forward ref |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Optional Dependency Imports |
||||||
|
|
||||||
|
Handle optional dependencies gracefully: |
||||||
|
|
||||||
|
```python |
||||||
|
try: |
||||||
|
import duckdb |
||||||
|
except Exception: |
||||||
|
duckdb = None # Will be checked later |
||||||
|
|
||||||
|
class MotionDatabase: |
||||||
|
def __init__(self): |
||||||
|
if duckdb is None: |
||||||
|
self._file_mode = True # Fallback mode |
||||||
|
``` |
||||||
|
|
||||||
|
## Example: Complete Import Block |
||||||
|
|
||||||
|
```python |
||||||
|
# Complete example from database.py |
||||||
|
import json |
||||||
|
import logging |
||||||
|
import uuid |
||||||
|
from datetime import datetime, timedelta |
||||||
|
from typing import Dict, List, Optional, Tuple |
||||||
|
|
||||||
|
import duckdb |
||||||
|
|
||||||
|
from config import config |
||||||
|
|
||||||
|
from database import db |
||||||
|
``` |
||||||
|
|||||||
@ -0,0 +1,167 @@ |
|||||||
|
# Logging Constraints |
||||||
|
|
||||||
|
## Core Rule |
||||||
|
|
||||||
|
**Use `logging.getLogger(__name__)` - never use `print()`** |
||||||
|
|
||||||
|
## Logger Initialization |
||||||
|
|
||||||
|
Get logger at module level: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD: Use logging.getLogger(__name__) |
||||||
|
import logging |
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
def some_function(): |
||||||
|
_logger.info("Processing started") |
||||||
|
_logger.debug("Detail: %s", detail) |
||||||
|
``` |
||||||
|
|
||||||
|
## Logger Naming |
||||||
|
|
||||||
|
Use `__name__` for automatic module path: |
||||||
|
|
||||||
|
```python |
||||||
|
# In database.py - logger will be "database" |
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
# In pipeline/svd_pipeline.py - logger will be "pipeline.svd_pipeline" |
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
``` |
||||||
|
|
||||||
|
## Log Levels |
||||||
|
|
||||||
|
| Level | When to Use | |
||||||
|
|-------|-------------| |
||||||
|
| DEBUG | Detailed diagnostic info (dev only) | |
||||||
|
| INFO | Normal operation milestones | |
||||||
|
| WARNING | Unexpected but handled (fallbacks) | |
||||||
|
| ERROR | Operation failed, may need attention | |
||||||
|
| CRITICAL | Fatal error, program may crash | |
||||||
|
|
||||||
|
## Examples |
||||||
|
|
||||||
|
### Good Logging Practice |
||||||
|
```python |
||||||
|
_logger.info("Pipeline run: %s → %s (%s windows)", start, end, count) |
||||||
|
_logger.debug("Batch embedding attempt %d failed: %s", attempt, exc) |
||||||
|
_logger.warning("Fallback used for motion %d: %s", motion_id, reason) |
||||||
|
_logger.error("Query failed: %s", exc) |
||||||
|
``` |
||||||
|
|
||||||
|
### Bad: Using print() |
||||||
|
```python |
||||||
|
# BAD - don't use print |
||||||
|
print(f"Fetched {len(voting_records)} voting records from API") |
||||||
|
print(f"Error fetching motions from API: {e}") |
||||||
|
``` |
||||||
|
|
||||||
|
### Good: Using logger |
||||||
|
```python |
||||||
|
# GOOD - use logger |
||||||
|
_logger.info("Fetched %d voting records from API", len(voting_records)) |
||||||
|
_logger.error("Error fetching motions from API: %s", e) |
||||||
|
``` |
||||||
|
|
||||||
|
## Exception Logging |
||||||
|
|
||||||
|
Use `_logger.exception()` for caught exceptions (includes traceback): |
||||||
|
|
||||||
|
```python |
||||||
|
try: |
||||||
|
result = risky_operation() |
||||||
|
except Exception as exc: |
||||||
|
_logger.exception("Operation failed: %s", exc) |
||||||
|
return fallback_value |
||||||
|
``` |
||||||
|
|
||||||
|
Use `_logger.error()` with explicit exception for controlled errors: |
||||||
|
|
||||||
|
```python |
||||||
|
try: |
||||||
|
result = risky_operation() |
||||||
|
except Exception as exc: |
||||||
|
_logger.error("Operation failed: %s", exc) |
||||||
|
return fallback_value |
||||||
|
``` |
||||||
|
|
||||||
|
## Configuration |
||||||
|
|
||||||
|
Ensure logging is configured in entry points: |
||||||
|
|
||||||
|
```python |
||||||
|
# pipeline/run_pipeline.py |
||||||
|
def run(args): |
||||||
|
logging.basicConfig( |
||||||
|
level=logging.INFO, |
||||||
|
format="%(asctime)s %(levelname)s %(name)s: %(message)s", |
||||||
|
) |
||||||
|
# ... rest of pipeline |
||||||
|
``` |
||||||
|
|
||||||
|
## Anti-Patterns |
||||||
|
|
||||||
|
### Debug Prints in Production Code |
||||||
|
```python |
||||||
|
# BAD |
||||||
|
print(f"[TRAJ DEBUG] processing window {wid}") |
||||||
|
|
||||||
|
# GOOD |
||||||
|
_logger.debug("Processing window %s", wid) |
||||||
|
``` |
||||||
|
|
||||||
|
### Inconsistent Logger Names |
||||||
|
```python |
||||||
|
# BAD - mixing _logger and logger |
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
logger = logging.getLogger("other") # Inconsistent |
||||||
|
|
||||||
|
# GOOD - use single consistent pattern |
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
``` |
||||||
|
|
||||||
|
### Missing Logger Initialization |
||||||
|
```python |
||||||
|
# BAD - no logger defined |
||||||
|
def some_function(): |
||||||
|
logging.getLogger(__name__).info("...") # Redundant calls |
||||||
|
|
||||||
|
# GOOD - define once at module level |
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
def some_function(): |
||||||
|
_logger.info("...") |
||||||
|
``` |
||||||
|
|
||||||
|
## Sensitive Data |
||||||
|
|
||||||
|
Never log sensitive information: |
||||||
|
- API keys |
||||||
|
- User votes |
||||||
|
- Session IDs (if tied to user data) |
||||||
|
- Personal information |
||||||
|
|
||||||
|
```python |
||||||
|
# BAD |
||||||
|
_logger.info("User %s voted %s", user_id, vote) |
||||||
|
|
||||||
|
# GOOD - log aggregates, not individual votes |
||||||
|
_logger.info("Vote recorded for session %s", session_id[:8]) |
||||||
|
``` |
||||||
|
|
||||||
|
## Structured Logging |
||||||
|
|
||||||
|
For complex data, use structured logging: |
||||||
|
|
||||||
|
```python |
||||||
|
_logger.info( |
||||||
|
"Motion processed", |
||||||
|
extra={ |
||||||
|
"motion_id": motion_id, |
||||||
|
"policy_area": policy_area, |
||||||
|
"processing_time_ms": elapsed_ms, |
||||||
|
} |
||||||
|
) |
||||||
|
``` |
||||||
@ -1,30 +1,141 @@ |
|||||||
# Naming constraint rules (example constraint file) |
# Naming Constraints |
||||||
|
|
||||||
rules: |
## File Names |
||||||
- name: module_file_names |
|
||||||
rule: "Use snake_case for Python module filenames (e.g., text_pipeline.py, ai_provider.py)." |
### Python Modules |
||||||
examples: |
- **Convention**: `snake_case.py` |
||||||
- good: "text_pipeline.py" |
- **Examples**: `motion_database.py`, `api_client.py`, `text_pipeline.py` |
||||||
- bad: "TextPipeline.py" |
|
||||||
|
### Test Files |
||||||
- name: function_names |
- **Convention**: `test_<module_name>.py` |
||||||
rule: "Use snake_case for functions and methods." |
- **Examples**: `test_database.py`, `test_api_client.py` |
||||||
examples: |
|
||||||
- good: "def compute_similarities(...):" |
### Config Files |
||||||
- bad: "def ComputeSimilarities(...):" |
- **Convention**: `snake_case` |
||||||
|
- **Examples**: `config.py`, `.env.example`, `pyproject.toml` |
||||||
- name: class_names |
|
||||||
rule: "Use PascalCase for classes." |
### Directories |
||||||
examples: |
- **Convention**: `snake_case/` |
||||||
- good: "class MotionDatabase:" |
- **Examples**: `pipeline/`, `tests/integration/`, `src/validators/` |
||||||
- bad: "class motion_database:" |
|
||||||
|
## Class Names |
||||||
- name: constants |
|
||||||
rule: "Constants use UPPER_SNAKE_CASE." |
- **Convention**: `PascalCase` |
||||||
examples: |
- **Examples**: `MotionDatabase`, `TweedeKamerAPI`, `MotionSummarizer` |
||||||
- good: "VOTE_MAP = { ... }" |
|
||||||
- bad: "vote_map = { ... }" |
### Naming Patterns |
||||||
|
| Pattern | Example | |
||||||
enforcement_examples: |
|---------|---------| |
||||||
- "Add a linter rule in CI: ruff or flake8 naming plugin to detect violations." |
| Database wrapper | `MotionDatabase` | |
||||||
- "Run `python -m pip install ruff` and `ruff check` as part of CI." |
| API client | `TweedeKamerAPI` | |
||||||
|
| Service/Helpers | `MotionScraper`, `MotionAnalyzer` | |
||||||
|
| Exceptions | `ProviderError` | |
||||||
|
|
||||||
|
## Function Names |
||||||
|
|
||||||
|
- **Convention**: `snake_case` |
||||||
|
- **Examples**: `get_motions`, `compute_similarity`, `process_voting_records` |
||||||
|
|
||||||
|
### Private Methods |
||||||
|
- **Convention**: `_snake_case` (single underscore prefix) |
||||||
|
- **Examples**: `_get_voting_records`, `_parse_response` |
||||||
|
|
||||||
|
## Variable Names |
||||||
|
|
||||||
|
### Regular Variables |
||||||
|
- **Convention**: `snake_case` |
||||||
|
- **Examples**: `motion_id`, `party_name`, `voting_results` |
||||||
|
|
||||||
|
### Constants (Module-Level) |
||||||
|
- **Convention**: `UPPER_SNAKE_CASE` |
||||||
|
- **Examples**: `DATABASE_PATH`, `API_TIMEOUT`, `MAX_RETRIES` |
||||||
|
|
||||||
|
### Config Variables (in dataclass) |
||||||
|
- **Convention**: `UPPER_SNAKE_CASE` |
||||||
|
- **Examples**: `QWEN_MODEL`, `POLICY_AREAS` |
||||||
|
|
||||||
|
### Booleans |
||||||
|
- **Convention**: `is_`, `has_`, `can_` prefixes or `_flag` suffix |
||||||
|
- **Examples**: `is_active`, `has_votes`, `skip_extract` |
||||||
|
|
||||||
|
### Private Variables |
||||||
|
- **Convention**: `_underscore_prefix` |
||||||
|
- **Examples**: `_conn`, `_cache`, `_session` |
||||||
|
|
||||||
|
## Singleton Instances |
||||||
|
|
||||||
|
- **Convention**: `lower_snake_case` at module level |
||||||
|
- **Examples**: `db = MotionDatabase()`, `summarizer = MotionSummarizer()` |
||||||
|
|
||||||
|
```python |
||||||
|
# database.py |
||||||
|
class MotionDatabase: |
||||||
|
... |
||||||
|
|
||||||
|
# Singleton instance |
||||||
|
db = MotionDatabase() |
||||||
|
|
||||||
|
# Usage |
||||||
|
from database import db |
||||||
|
motions = db.get_motions() |
||||||
|
``` |
||||||
|
|
||||||
|
## Type Variables |
||||||
|
|
||||||
|
- **Convention**: `PascalCase` |
||||||
|
- **Examples**: `T = TypeVar('T')`, `MotionDict = Dict[str, Any]` |
||||||
|
|
||||||
|
## Anti-Patterns |
||||||
|
|
||||||
|
### Inconsistent Naming |
||||||
|
```python |
||||||
|
# BAD - mixing styles |
||||||
|
get_motions() # snake_case |
||||||
|
GetMotionById() # PascalCase |
||||||
|
processData() # camelCase |
||||||
|
|
||||||
|
# GOOD - consistent snake_case |
||||||
|
get_motions() |
||||||
|
get_motion_by_id() |
||||||
|
process_voting_data() |
||||||
|
``` |
||||||
|
|
||||||
|
### Abbreviations |
||||||
|
```python |
||||||
|
# AVOID - unclear abbreviations |
||||||
|
calc_similarity() # calculate_* |
||||||
|
proc_votes() # process_* |
||||||
|
get_mp_data() # get_mp_metadata() |
||||||
|
|
||||||
|
# PREFER - full words |
||||||
|
calculate_similarity() |
||||||
|
process_votes() |
||||||
|
get_mp_metadata() |
||||||
|
``` |
||||||
|
|
||||||
|
### Hungarian Notation |
||||||
|
```python |
||||||
|
# BAD - Hungarian notation |
||||||
|
str_title = "..." |
||||||
|
int_count = 0 |
||||||
|
b_is_active = True |
||||||
|
|
||||||
|
# GOOD - clear types via naming |
||||||
|
title = "..." |
||||||
|
count = 0 |
||||||
|
is_active = True |
||||||
|
``` |
||||||
|
|
||||||
|
## Special Cases |
||||||
|
|
||||||
|
### Window IDs |
||||||
|
- **Format**: `"YYYY-QN"` or `"YYYY"` |
||||||
|
- **Examples**: `"2024-Q1"`, `"2024-Q2"`, `"2024"` |
||||||
|
|
||||||
|
### Policy Areas |
||||||
|
- **Convention**: PascalCase with spaces |
||||||
|
- **Examples**: `"Economie"`, `"Sociale Zaken"`, `"Klimaat"` |
||||||
|
|
||||||
|
### Vote Values |
||||||
|
- **Convention**: PascalCase Dutch terms |
||||||
|
- **Values**: `"Voor"`, `"Tegen"`, `"Onthouden"`, `"Geen stem"`, `"Afwezig"` |
||||||
|
|||||||
@ -0,0 +1,233 @@ |
|||||||
|
# Type Hint Constraints |
||||||
|
|
||||||
|
## Core Rule |
||||||
|
|
||||||
|
**Use type hints on all public functions and methods** |
||||||
|
|
||||||
|
## Function Type Hints |
||||||
|
|
||||||
|
### Required on Public APIs |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD - complete type hints |
||||||
|
def get_motion(self, motion_id: int) -> Optional[Dict]: |
||||||
|
... |
||||||
|
|
||||||
|
def get_filtered_motions( |
||||||
|
self, |
||||||
|
policy_area: str = "Alle", |
||||||
|
limit: int = 10 |
||||||
|
) -> List[Dict]: |
||||||
|
... |
||||||
|
|
||||||
|
def calculate_similarity(self, motion_a: int, motion_b: int) -> float: |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
### Optional Parameters |
||||||
|
|
||||||
|
Use `Optional[X]` or `X | None`: |
||||||
|
|
||||||
|
```python |
||||||
|
# Both forms are acceptable |
||||||
|
def get_motion(self, motion_id: Optional[int] = None) -> Optional[Dict]: |
||||||
|
... |
||||||
|
|
||||||
|
def get_motion(self, motion_id: int | None = None) -> dict | None: |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
### Multiple Return Types |
||||||
|
|
||||||
|
Use `Union[X, Y]` or `|` operator: |
||||||
|
|
||||||
|
```python |
||||||
|
# Acceptable forms |
||||||
|
def parse_value(self, value: str) -> Union[bool, str, None]: |
||||||
|
... |
||||||
|
|
||||||
|
def parse_value(self, value: str) -> bool | str | None: |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
### Generic Types |
||||||
|
|
||||||
|
Use `List[X]`, `Dict[K, V]`, `Tuple[X, Y]`: |
||||||
|
|
||||||
|
```python |
||||||
|
from typing import Dict, List, Optional, Tuple |
||||||
|
|
||||||
|
def get_motions(self, ids: List[int]) -> Dict[int, Dict]: |
||||||
|
"""Map motion_id -> motion data.""" |
||||||
|
... |
||||||
|
|
||||||
|
def process_batch(self, items: List[str]) -> Tuple[List[str], List[str]]: |
||||||
|
"""Returns (successes, failures).""" |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Collection Types |
||||||
|
|
||||||
|
Prefer specific types over bare `list`/`dict`: |
||||||
|
|
||||||
|
```python |
||||||
|
# GOOD - specific types |
||||||
|
def get_votes(self) -> List[str]: |
||||||
|
... |
||||||
|
|
||||||
|
def get_metadata(self) -> Dict[str, Any]: |
||||||
|
... |
||||||
|
|
||||||
|
# ACCEPTABLE - for truly generic collections |
||||||
|
def merge_dicts(*dicts: dict) -> dict: |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## DuckDB Result Types |
||||||
|
|
||||||
|
DuckDB returns tuples/lists - document expected structure: |
||||||
|
|
||||||
|
```python |
||||||
|
def get_motion(self, motion_id: int) -> Optional[Tuple]: |
||||||
|
"""Returns (id, title, description, date, ...) or None.""" |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
result = conn.execute( |
||||||
|
"SELECT * FROM motions WHERE id = ?", (motion_id,) |
||||||
|
).fetchone() |
||||||
|
return result |
||||||
|
finally: |
||||||
|
conn.close() |
||||||
|
|
||||||
|
# Or use Dict for clarity |
||||||
|
def get_motion_as_dict(self, motion_id: int) -> Optional[Dict]: |
||||||
|
"""Returns motion dict or None.""" |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
row = conn.execute( |
||||||
|
"SELECT * FROM motions WHERE id = ?", (motion_id,) |
||||||
|
).fetchone() |
||||||
|
if row: |
||||||
|
return { |
||||||
|
"id": row[0], |
||||||
|
"title": row[1], |
||||||
|
"description": row[2], |
||||||
|
... |
||||||
|
} |
||||||
|
return None |
||||||
|
finally: |
||||||
|
conn.close() |
||||||
|
``` |
||||||
|
|
||||||
|
## Class/Instance Types |
||||||
|
|
||||||
|
Use `Self` for methods returning instance type: |
||||||
|
|
||||||
|
```python |
||||||
|
from typing import Self |
||||||
|
|
||||||
|
class MotionDatabase: |
||||||
|
def with_connection(self, path: str) -> Self: |
||||||
|
"""Return new instance with different path.""" |
||||||
|
return MotionDatabase(db_path=path) |
||||||
|
``` |
||||||
|
|
||||||
|
## Callback/Function Types |
||||||
|
|
||||||
|
Use `Callable` for function parameters: |
||||||
|
|
||||||
|
```python |
||||||
|
from typing import Callable |
||||||
|
|
||||||
|
def process_motions( |
||||||
|
motions: List[Dict], |
||||||
|
processor: Callable[[Dict], Any] |
||||||
|
) -> List[Any]: |
||||||
|
return [processor(m) for m in motions] |
||||||
|
``` |
||||||
|
|
||||||
|
## Type Aliases |
||||||
|
|
||||||
|
Define clear type aliases for domain concepts: |
||||||
|
|
||||||
|
```python |
||||||
|
from typing import Dict, List, TypedDict, Literal |
||||||
|
|
||||||
|
# Vote values |
||||||
|
VoteValue = Literal["Voor", "Tegen", "Onthouden", "Geen stem", "Afwezig"] |
||||||
|
|
||||||
|
# Policy areas |
||||||
|
PolicyArea = Literal["Alle", "Economie", "Klimaat", "Immigratie", ...] |
||||||
|
|
||||||
|
# Motion dict |
||||||
|
class MotionDict(TypedDict): |
||||||
|
id: int |
||||||
|
title: str |
||||||
|
description: Optional[str] |
||||||
|
date: Optional[str] |
||||||
|
policy_area: Optional[str] |
||||||
|
voting_results: Optional[str] # JSON string |
||||||
|
winning_margin: Optional[float] |
||||||
|
|
||||||
|
def get_motion(self, motion_id: int) -> Optional[MotionDict]: |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Avoid `Any` |
||||||
|
|
||||||
|
Use `Any` sparingly - prefer specific types: |
||||||
|
|
||||||
|
```python |
||||||
|
# AVOID - too vague |
||||||
|
def process(data: Any) -> Any: |
||||||
|
... |
||||||
|
|
||||||
|
# PREFER - specific types |
||||||
|
def process(motion: MotionDict) -> Optional[SimilarityResult]: |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Inline Type Hints |
||||||
|
|
||||||
|
For simple cases, inline hints are fine: |
||||||
|
|
||||||
|
```python |
||||||
|
def get_count(self) -> int: |
||||||
|
... |
||||||
|
|
||||||
|
def is_empty(self) -> bool: |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Docstring Type Hints |
||||||
|
|
||||||
|
For complex types, include in docstrings: |
||||||
|
|
||||||
|
```python |
||||||
|
def get_party_positions(self, window_id: str) -> Dict[str, List[float]]: |
||||||
|
"""Get party positions in political space. |
||||||
|
|
||||||
|
Args: |
||||||
|
window_id: Time window (e.g., "2024-Q1") |
||||||
|
|
||||||
|
Returns: |
||||||
|
Dict mapping party_name -> [x, y] coordinates |
||||||
|
|
||||||
|
Example: |
||||||
|
>>> positions = db.get_party_positions("2024-Q1") |
||||||
|
>>> positions["VVD"] |
||||||
|
[0.5, -0.3] |
||||||
|
""" |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Type Checking |
||||||
|
|
||||||
|
For runtime type checking, use runtime checks: |
||||||
|
|
||||||
|
```python |
||||||
|
def set_count(self, count: int) -> None: |
||||||
|
if not isinstance(count, int): |
||||||
|
raise TypeError(f"Expected int, got {type(count).__name__}") |
||||||
|
self._count = count |
||||||
|
``` |
||||||
@ -0,0 +1,124 @@ |
|||||||
|
# Naming Conventions |
||||||
|
|
||||||
|
## Files |
||||||
|
- **snake_case** for all Python files: `database.py`, `explorer_helpers.py`, `motion_cache.py` |
||||||
|
- **PascalCase** NOT used for files |
||||||
|
|
||||||
|
## Functions |
||||||
|
- **snake_case**: `get_svd_vectors()`, `compute_party_coords()`, `build_scatter_trace()` |
||||||
|
- Private helpers prefixed with `_`: `_get_window_data()` |
||||||
|
|
||||||
|
## Classes |
||||||
|
- **PascalCase**: `MotionDatabase`, `Config` |
||||||
|
- **Dataclass pattern** for Config: `@dataclass` decorator with typed fields |
||||||
|
|
||||||
|
## Variables |
||||||
|
- **snake_case**: `party_map`, `mp_name`, `svd_vectors`, `party_centroids` |
||||||
|
- **CONSTANT_SNAKE_CASE** for module-level constants: `PARTY_COLOURS`, `DEFAULT_WINDOW` |
||||||
|
|
||||||
|
## Module-Level Exports |
||||||
|
- **Singleton instance**: `db = MotionDatabase()` at module bottom (not class-level) |
||||||
|
- **Config instance**: `config = Config(...)` at module bottom |
||||||
|
- **Dicts**: `PARTY_COLOURS` exported from `config.py` |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
# Error Handling |
||||||
|
|
||||||
|
## Known Patterns |
||||||
|
1. **Bare except with pass** (ANTI-PATTERN - see anti-patterns.yaml) |
||||||
|
```python |
||||||
|
except: |
||||||
|
pass # database.py:47 |
||||||
|
``` |
||||||
|
|
||||||
|
2. **Graceful degradation**: catch specific exceptions, fall back to default |
||||||
|
```python |
||||||
|
try: |
||||||
|
result = compute_svd() |
||||||
|
except ImportError: |
||||||
|
result = DEFAULT_SVD |
||||||
|
``` |
||||||
|
|
||||||
|
3. **Optional dependency fallbacks**: |
||||||
|
```python |
||||||
|
try: |
||||||
|
import umap |
||||||
|
use_umap = True |
||||||
|
except ImportError: |
||||||
|
use_umap = False |
||||||
|
``` |
||||||
|
|
||||||
|
4. **Nested exception handling** (ANTI-PATTERN - see anti-patterns.yaml): |
||||||
|
```python |
||||||
|
try: |
||||||
|
... |
||||||
|
except Exception: |
||||||
|
try: |
||||||
|
... |
||||||
|
except Exception: |
||||||
|
pass |
||||||
|
``` |
||||||
|
|
||||||
|
## Rules |
||||||
|
- Never use bare `except:` — always specify exception type |
||||||
|
- Never swallow exceptions silently — log or return a sensible default |
||||||
|
- For optional deps, use `ImportError` or `ModuleNotFoundError` explicitly |
||||||
|
- Avoid nested try/except blocks |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
# Code Organization |
||||||
|
|
||||||
|
## Singleton Pattern |
||||||
|
Each module owns one shared instance: |
||||||
|
```python |
||||||
|
# database.py |
||||||
|
db = MotionDatabase() |
||||||
|
|
||||||
|
# config.py |
||||||
|
config = Config(...) |
||||||
|
PARTY_COLOURS = {...} |
||||||
|
``` |
||||||
|
|
||||||
|
## Pure Functions in Helpers |
||||||
|
`explorer_helpers.py` contains only pure functions (no IO, no Streamlit calls): |
||||||
|
```python |
||||||
|
def compute_party_coords(svd_vectors, party_map): |
||||||
|
"""Pure: no side effects, no imports from this module""" |
||||||
|
... |
||||||
|
|
||||||
|
def build_scatter_trace(df, color_col): |
||||||
|
"""Pure: returns Plotly trace dict""" |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Cached Data Loaders |
||||||
|
Use `@st.cache_data` for expensive data loading: |
||||||
|
```python |
||||||
|
@st.cache_data |
||||||
|
def load_svd_vectors(window: str) -> pd.DataFrame: |
||||||
|
return db.get_svd_vectors(window) |
||||||
|
``` |
||||||
|
|
||||||
|
## Dataclass Config |
||||||
|
```python |
||||||
|
@dataclass |
||||||
|
class Config: |
||||||
|
db_path: str = "data/stemwijzer.duckdb" |
||||||
|
default_window: str = "2023" |
||||||
|
party_colours: dict = field(default_factory=lambda: PARTY_COLOURS) |
||||||
|
``` |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
# Imports |
||||||
|
|
||||||
|
## Ordering (convention) |
||||||
|
1. Standard library |
||||||
|
2. Third-party (streamlit, ibis, plotly, sklearn, umap) |
||||||
|
3. Local/relative imports |
||||||
|
|
||||||
|
## Avoid |
||||||
|
- Wildcard imports (`from module import *`) |
||||||
|
- Circular imports (ensure dependency direction: helpers → database → config) |
||||||
@ -0,0 +1,78 @@ |
|||||||
|
# Dependencies |
||||||
|
|
||||||
|
## Core Library Wiring |
||||||
|
|
||||||
|
### Database Layer |
||||||
|
``` |
||||||
|
ibis → DuckDB → MotionDatabase singleton (database.py) |
||||||
|
↑ |
||||||
|
sqlglot (ibis dependency) |
||||||
|
``` |
||||||
|
|
||||||
|
### Data Processing |
||||||
|
``` |
||||||
|
pandas → (used throughout for DataFrame operations) |
||||||
|
numpy → (used by sklearn, scipy, umap) |
||||||
|
scipy → spatial.procrustes for window alignment |
||||||
|
``` |
||||||
|
|
||||||
|
### ML Pipeline |
||||||
|
``` |
||||||
|
sklearn.cluster → KMeans (Procrustes alignment comes from scipy.spatial, listed above) |
||||||
|
sklearn.preprocessing → StandardScaler |
||||||
|
umap → UMAP (optional, graceful fallback) |
||||||
|
``` |
||||||
|
|
||||||
|
### Visualization |
||||||
|
``` |
||||||
|
plotly → explorer_helpers.py chart builders |
||||||
|
st.plotly_chart → explorer.py rendering |
||||||
|
``` |
||||||
|
|
||||||
|
### Streamlit |
||||||
|
``` |
||||||
|
streamlit → all pages, @st.cache_data decorators |
||||||
|
``` |
||||||
|
|
||||||
|
## Optional Dependencies |
||||||
|
| Package | Required | Fallback | |
||||||
|
|---------|----------|----------| |
||||||
|
| `umap` | No | Use raw SVD vectors (first 2 dims) | |
||||||
|
| `plotly` | Yes | Raises ImportError | |
||||||
|
| `duckdb` | Yes | — | |
||||||
|
| `ibis` | Yes | — | |
||||||
|
| `sklearn` | Yes | — | |
||||||
|
|
||||||
|
## Singleton Instances |
||||||
|
| Module | Instance | Type | |
||||||
|
|--------|----------|------| |
||||||
|
| `database.py` | `db` | `MotionDatabase` | |
||||||
|
| `config.py` | `config` | `Config` (dataclass) | |
||||||
|
| `config.py` | `PARTY_COLOURS` | `dict[str, str]` | |
||||||
|
|
||||||
|
## Key Imports by File |
||||||
|
``` |
||||||
|
explorer.py: |
||||||
|
- import streamlit as st |
||||||
|
- from database import db |
||||||
|
- from explorer_helpers import * |
||||||
|
|
||||||
|
explorer_helpers.py: |
||||||
|
- import pandas as pd |
||||||
|
- import plotly.graph_objects as go |
||||||
|
- from database import db (optional, for type hints) |
||||||
|
|
||||||
|
database.py: |
||||||
|
- import ibis |
||||||
|
- import duckdb |
||||||
|
- from config import config, PARTY_COLOURS |
||||||
|
|
||||||
|
config.py: |
||||||
|
- from dataclasses import dataclass, field |
||||||
|
- import streamlit as st (optional, for warnings) |
||||||
|
``` |
||||||
|
|
||||||
|
## Environment |
||||||
|
- Python ≥3.13 |
||||||
|
- Environment variables via `.env` (DB path, API keys) |
||||||
|
- No `.env` values in constraint files (security) |
||||||
@ -0,0 +1,107 @@ |
|||||||
|
# Domain Glossary - Dutch Political Terms |
||||||
|
|
||||||
|
## Core Entities |
||||||
|
|
||||||
|
### Motion / Motie |
||||||
|
- Parliamentary motion submitted by MPs |
||||||
|
- Fields: `id`, `title`, `date`, `category` |
||||||
|
- MPs vote: **For** (+1), **Against** (-1), **Abstain** (0), **Absent** |
||||||
|
|
||||||
|
### MP / Kamerlid |
||||||
|
- Member of Parliament (Tweede Kamerlid) |
||||||
|
- Identified by full name (e.g., "Van Dijk, I.") |
||||||
|
- Has voting record, party affiliation, SVD position vector |
||||||
|
- Historical: `mp_party_history` tracks party changes over time |
||||||
|
|
||||||
|
### Party / Fractie |
||||||
|
- Political party (e.g., "GroenLinks-PvdA", "PVV", "VVD") |
||||||
|
- Party centroids: average SVD position of all MPs in party |
||||||
|
- Aliases: multiple spelling variants exist (see anti-patterns.yaml) |
||||||
|
|
||||||
|
### Vote / Stemming |
||||||
|
- Individual MP's vote on a motion: +1, 0, -1 |
||||||
|
- Aggregated to compute SVD vectors |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Time & Analysis Concepts |
||||||
|
|
||||||
|
### Window / Tijdsvenster |
||||||
|
- Time period for analysis (annual or quarterly) |
||||||
|
- Values: "2023", "2023-Q1", "2024", etc. |
||||||
|
- SVD vectors computed per window |
||||||
|
- Windows can be aligned across time using Procrustes |
||||||
|
|
||||||
|
### Trajectory |
||||||
|
- MP's position change across multiple windows |
||||||
|
- Computed from `svd_vectors` + window ordering |
||||||
|
- Used for trend analysis in Evolution tab |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Mathematical / Algorithmic Terms |
||||||
|
|
||||||
|
### SVD Vector |
||||||
|
- 2D vector from Singular Value Decomposition of MP × Motion vote matrix |
||||||
|
- Represents MP's position in political space |
||||||
|
- `entity_id` in `svd_vectors`: either MP name (when individual MPs) or party name (when party-level) |
||||||
|
|
||||||
|
### Political Compass |
||||||
|
- 2D visualization: X-axis = Left↔Right, Y-axis = Progressive↔Conservative |
||||||
|
- SVD vectors mapped to compass quadrants |
||||||
|
- UMAP used for projection |
||||||
|
|
||||||
|
### Procrustes Alignment |
||||||
|
- Algorithm to align SVD vectors across time windows |
||||||
|
- Ensures comparable positions across years/quarters |
||||||
|
- Implemented via `scipy.spatial.procrustes` or scikit-learn |
||||||
|
|
||||||
|
### Centroid |
||||||
|
- Geometric center of a set of points |
||||||
|
- Party centroid = average SVD position of all MPs in that party |
||||||
|
- Computed from `svd_vectors` filtered by party |
||||||
|
|
||||||
|
### UMAP |
||||||
|
- Uniform Manifold Approximation and Projection |
||||||
|
- Dimensionality reduction for visualization |
||||||
|
- Optional dependency — graceful fallback if unavailable |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Visualization |
||||||
|
|
||||||
|
### PARTY_COLOURS |
||||||
|
- Dict mapping party names to hex color codes |
||||||
|
- Used in all Plotly charts for consistent party coloring |
||||||
|
- Source: `config.py` → `PARTY_COLOURS` constant |
||||||
|
- **Issue**: 3 separate alias dictionaries exist (no single source of truth) |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Application Pages |
||||||
|
|
||||||
|
### Home |
||||||
|
- Landing page with app overview |
||||||
|
|
||||||
|
### Stemwijzer (Quiz) |
||||||
|
- User answers questions → matched to parties |
||||||
|
- Thin wrapper around quiz module |
||||||
|
|
||||||
|
### Explorer (4 tabs) |
||||||
|
- **Motion tab**: SVD positions colored by vote on selected motion |
||||||
|
- **MP tab**: Individual MP trajectories across windows |
||||||
|
- **Party tab**: Party centroids with members as scatter |
||||||
|
- **Evolution tab**: How positions change over time |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Database Table Reference |
||||||
|
| Table | Key Fields | |
||||||
|
|-------|-----------| |
||||||
|
| `motions` | id, title, date, category | |
||||||
|
| `mp_votes` | mp_id, motion_id, vote | |
||||||
|
| `svd_vectors` | entity_id, window, vector_2d (list[2]) | |
||||||
|
| `party_centroids` | party, window, centroid_2d | |
||||||
|
| `mp_party_history` | mp_id, party, start_date, end_date | |
||||||
|
| `windows` | window_id, start_date, end_date, period_type | |
||||||
|
| `mp_trajectories` | mp_id, window, trajectory_vector | |
||||||
@ -0,0 +1,196 @@ |
|||||||
|
"""Example: TweedeKamerAPI usage - from api_client.py and actual codebase.""" |
||||||
|
|
||||||
|
from datetime import datetime, timedelta |
||||||
|
from typing import Dict, List |
||||||
|
|
||||||
|
# Import the API client |
||||||
|
from api_client import TweedeKamerAPI |
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 1: Basic API usage |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_fetch_motions():
    """Fetch recent parliamentary motions from TweedeKamer API.

    Queries the last 30 days of motions (capped at 100), prints a short
    preview of the first five titles, and always releases the client.

    Returns:
        List of motion dicts from ``TweedeKamerAPI.get_motions``.
    """
    client = TweedeKamerAPI()
    thirty_days_ago = datetime.now() - timedelta(days=30)
    try:
        recent = client.get_motions(start_date=thirty_days_ago, limit=100)
        print(f"Fetched {len(recent)} motions")
        # Preview only the first five entries to keep output short.
        for item in recent[:5]:
            print(f"  - {item.get('title', 'N/A')}")
        return recent
    finally:
        # Release the HTTP session even if the request above raised.
        client.close()
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 2: Fetching with date range |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_date_range():
    """Fetch motions from a specific date range.

    Pulls up to 500 Q1-2024 motions and prints a per-policy-area count,
    sorted alphabetically by area name.

    Returns:
        List of motion dicts for the requested range.
    """
    client = TweedeKamerAPI()
    q1_start = datetime(2024, 1, 1)
    q1_end = datetime(2024, 3, 31)  # Q1 2024
    try:
        fetched = client.get_motions(start_date=q1_start, end_date=q1_end, limit=500)

        # Bucket motions by policy area; unknown areas fall under "Onbekend".
        grouped = {}
        for item in fetched:
            grouped.setdefault(item.get("policy_area", "Onbekend"), []).append(item)

        for area, bucket in sorted(grouped.items()):
            print(f"{area}: {len(bucket)} motions")

        return fetched
    finally:
        client.close()
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 3: Context manager usage |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_context_manager():
    """Use API client as context manager.

    The ``with`` block guarantees the session is closed, so no explicit
    ``try``/``finally`` is needed here.

    Returns:
        List of motion dicts from the last 7 days.
    """
    one_week_ago = datetime.now() - timedelta(days=7)
    with TweedeKamerAPI() as client:
        weekly = client.get_motions(start_date=one_week_ago, limit=50)
        print(f"Fetched {len(weekly)} motions this week")
        return weekly
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 4: Processing voting records |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_process_votes():
    """Process individual voting records from API.

    Fetches a week of raw voting records, tallies votes per party, and
    prints a per-party summary line.

    NOTE(review): this deliberately calls the private
    ``TweedeKamerAPI._get_voting_records`` to demonstrate the layer below
    ``get_motions``; production code should prefer the public API.

    Returns:
        List of raw voting-record dicts.
    """
    api = TweedeKamerAPI()

    start_date = datetime.now() - timedelta(days=7)

    try:
        # Get voting records directly (private helper, see note above).
        voting_records, besluit_meta = api._get_voting_records(
            start_date=start_date, limit=1000
        )

        print(f"Fetched {len(voting_records)} voting records")
        print(f"From {len(besluit_meta)} unique decisions")

        # Count votes by party: party -> {vote kind -> count}.
        # setdefault already returns the inner dict, so one lookup per
        # record suffices (the original chained a redundant second
        # party_votes.get(party, {}).get(vote, 0) lookup).
        party_votes = {}
        for record in voting_records:
            party = record.get("Fractie", "Onbekend")
            vote = record.get("Soort", "Onbekend")
            counts = party_votes.setdefault(party, {})
            counts[vote] = counts.get(vote, 0) + 1

        for party, votes in sorted(party_votes.items()):
            total = sum(votes.values())
            voor = votes.get("Voor", 0)
            print(f"{party}: {total} votes ({voor} voor)")

        return voting_records
    finally:
        api.close()
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 5: Safe API call with fallback |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_safe_call():
    """Make API call with safe fallback on failure.

    ``get_motions`` already returns ``[]`` on any error, so an empty
    result doubles as the failure signal; in that case we fall back to
    locally cached motions from the database singleton.

    Returns:
        List of motion dicts (remote when available, cached otherwise).
    """
    client = TweedeKamerAPI()
    try:
        window_start = datetime.now() - timedelta(days=30)
        remote = client.get_motions(start_date=window_start, limit=100)
        if remote:
            return remote

        print("No motions returned - using cached data")
        # Local import keeps the fallback dependency off the happy path.
        from database import db

        return db.get_filtered_motions(limit=10)
    finally:
        client.close()
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 6: Pagination handling |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_pagination():
    """Understand how pagination works in the API.

    ``get_motions`` normally pages internally; this walks the pages by
    hand via the private ``_fetch_page`` to show the skip/top mechanics.
    The original version never closed the client; a ``finally`` now
    guarantees the session is released.

    Returns:
        List of motion dicts accumulated across pages.
    """
    api = TweedeKamerAPI()

    start_date = datetime.now() - timedelta(days=365)

    # Simulate pagination
    page_size = 250
    total_limit = 500

    all_motions = []
    skip = 0

    try:
        while len(all_motions) < total_limit:
            print(f"Fetching page with skip={skip}...")

            # In real usage, get_motions handles pagination internally.
            # This demonstrates what's happening under the hood.
            page_motions = api._fetch_page(start_date=start_date, skip=skip, top=page_size)

            if not page_motions:
                break

            all_motions.extend(page_motions)
            skip += page_size

            if len(page_motions) < page_size:
                break  # Last page

        print(f"Total fetched: {len(all_motions)} motions")
        return all_motions
    finally:
        api.close()
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Small demo driver: run the two examples that hit the live API.
    for heading, demo in (
        ("=== Basic Fetch ===", example_fetch_motions),
        ("\n=== Process Votes ===", example_process_votes),
    ):
        print(heading)
        demo()
||||||
@ -0,0 +1,191 @@ |
|||||||
|
"""Example: MotionDatabase usage - from database.py and actual codebase.""" |
||||||
|
|
||||||
|
from typing import Dict, List, Optional |
||||||
|
import duckdb |
||||||
|
import json |
||||||
|
from config import config |
||||||
|
|
||||||
|
# Import the singleton instance |
||||||
|
from database import db |
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 1: Getting filtered motions |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_get_filtered_motions():
    """Get controversial motions from a specific policy area.

    A small winning margin (here at most 30%) marks a motion as
    controversial; we list up to ten such motions about "Klimaat".

    Returns:
        List of motion dicts from ``db.get_filtered_motions``.
    """
    controversial = db.get_filtered_motions(
        policy_area="Klimaat",
        min_margin=0.0,
        max_margin=0.3,  # Controversial: close margin
        limit=10,
    )

    for entry in controversial:
        print(f"{entry['title']}: {entry['winning_margin']:.1%} margin")

    return controversial
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 2: Creating a voting session |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_voting_session():
    """Create a new user session and record votes.

    Walks the full quiz flow: open a ten-motion session, vote "Voor" on
    each motion, then print per-party agreement from highest to lowest.

    Returns:
        Dict mapping party name to its result record.
    """
    # Create session for 10 motions
    session_id = db.create_session(total_motions=10)
    print(f"Created session: {session_id}")

    # Get motions for the session
    ballot = db.get_filtered_motions(policy_area="Alle", limit=10)

    # In the real app the user would choose each vote; the example
    # always votes "Voor".
    for entry in ballot:
        db.record_vote(session_id=session_id, motion_id=entry["id"], vote="Voor")

    # Get results, ranked by descending agreement.
    results = db.get_party_results(session_id)
    ranked = sorted(results.items(), key=lambda kv: -kv[1]["agreement"])
    for party, outcome in ranked:
        print(f"{party}: {outcome['agreement']:.1%} agreement")

    return results
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 3: Working with DuckDB connections directly |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_direct_duckdb():
    """Example of proper DuckDB connection handling.

    Opens a raw connection, reads one motion (id 123) together with a
    JSON-extracted vote total, and closes the connection in ``finally``.

    Returns:
        The fetched row tuple, or None when motion 123 does not exist.
    """
    conn = duckdb.connect(config.DATABASE_PATH)
    try:
        # Get motion with votes (parameterized query, not string-built).
        row = conn.execute(
            """
            SELECT m.*,
                   JSON_EXTRACT(voting_results, '$.total_votes') as total_votes
            FROM motions m
            WHERE m.id = ?
            """,
            (123,),
        ).fetchone()

        if row is not None:
            print(f"Motion: {row[1]}")  # title is index 1

        return row
    finally:
        conn.close()
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 4: Bulk operations |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_bulk_insert():
    """Example of bulk inserting motions.

    Builds two sample motion rows and inserts them with a parameterized
    INSERT. The original closed the connection separately on the success
    and error paths; ``finally`` now guarantees exactly one close.
    """
    # Sample data
    motions = [
        {
            "title": "Motion about climate policy",
            "description": "Proposal to reduce emissions",
            "date": "2024-01-15",
            "policy_area": "Klimaat",
            "voting_results": json.dumps({"Voor": 75, "Tegen": 65}),
            "winning_margin": 0.07,
            "controversy_score": 0.85,
        },
        {
            "title": "Motion about healthcare",
            "description": "Increase healthcare budget",
            "date": "2024-01-20",
            "policy_area": "Zorg",
            "voting_results": json.dumps({"Voor": 90, "Tegen": 50}),
            "winning_margin": 0.29,
            "controversy_score": 0.42,
        },
    ]

    conn = duckdb.connect(config.DATABASE_PATH)
    try:
        for motion in motions:
            conn.execute(
                """
                INSERT INTO motions
                (title, description, date, policy_area, voting_results,
                 winning_margin, controversy_score)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    motion["title"],
                    motion["description"],
                    motion["date"],
                    motion["policy_area"],
                    motion["voting_results"],
                    motion["winning_margin"],
                    motion["controversy_score"],
                ),
            )
        print(f"Inserted {len(motions)} motions")
    except Exception as e:
        # Best-effort demo: report the failure rather than crash.
        print(f"Error inserting motions: {e}")
    finally:
        conn.close()
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 5: Query with aggregation |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_aggregation():
    """Example of aggregate queries.

    Computes per-policy-area motion counts plus average winning margin
    and controversy score, printing one summary line per area.

    The original both duplicated ``conn.close()`` on the success and
    error paths and swallowed the exception silently (unused ``e``);
    per the project's error-handling rules the failure is now logged
    before returning the safe ``[]`` fallback, and the connection is
    closed exactly once in ``finally``.

    Returns:
        List of result rows, or [] when the query fails.
    """
    conn = duckdb.connect(config.DATABASE_PATH)
    try:
        # Get statistics by policy area
        results = conn.execute("""
            SELECT
                policy_area,
                COUNT(*) as motion_count,
                AVG(winning_margin) as avg_margin,
                AVG(controversy_score) as avg_controversy
            FROM motions
            WHERE policy_area IS NOT NULL
            GROUP BY policy_area
            ORDER BY motion_count DESC
        """).fetchall()

        for row in results:
            print(
                f"{row[0]}: {row[1]} motions, "
                f"avg margin {row[2]:.1%}, "
                f"controversy {row[3]:.2f}"
            )

        return results
    except Exception as e:
        # Safe fallback per project convention, but never silently.
        print(f"Error aggregating motions: {e}")
        return []
    finally:
        conn.close()
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Demo driver: run the two read-only database examples.
    for heading, demo in (
        ("=== Filtered Motions ===", example_get_filtered_motions),
        ("\n=== Aggregation ===", example_aggregation),
    ):
        print(heading)
        demo()
||||||
@ -0,0 +1,217 @@ |
|||||||
|
"""Example: Pipeline phase execution - from pipeline/run_pipeline.py and actual codebase.""" |
||||||
|
|
||||||
|
import argparse |
||||||
|
from datetime import date, timedelta |
||||||
|
from typing import List, Tuple |
||||||
|
|
||||||
|
# Import pipeline modules |
||||||
|
from pipeline.fetch_mp_metadata import fetch_mp_metadata |
||||||
|
from pipeline.extract_mp_votes import extract_mp_votes |
||||||
|
from pipeline.svd_pipeline import run_svd_pipeline |
||||||
|
from pipeline.text_pipeline import run_text_pipeline |
||||||
|
from pipeline.fusion import run_fusion |
||||||
|
|
||||||
|
from database import MotionDatabase |
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 1: Running full pipeline |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_full_pipeline():
    """Run the complete data ingestion pipeline end-to-end.

    Mirrors the CLI entry point: build the argument parser, parse an
    empty argv so the defaults apply, resolve the date range (defaults
    to roughly the last two years), then execute the printed phases in
    order: MP metadata, vote extraction, windowed SVD, text embeddings,
    and fusion.

    NOTE(review): the printed banners count phases 1-5 (window
    generation and SVD share the "Phase 3" banner); the comments below
    follow the printed numbering.
    """

    # Build the same parser the CLI uses; parse_args([]) applies defaults
    # only (this is an example, not a real command line).
    parser = argparse.ArgumentParser(description="Pipeline runner")
    parser.add_argument("--db-path", default="data/motions.db")
    parser.add_argument("--start-date", default=None)
    parser.add_argument("--end-date", default=None)
    parser.add_argument(
        "--window-size", choices=["quarterly", "annual"], default="quarterly"
    )
    parser.add_argument("--svd-k", type=int, default=50)

    args = parser.parse_args([])

    # Resolve dates: end defaults to today, start to ~2 years earlier.
    end_date = date.fromisoformat(args.end_date) if args.end_date else date.today()
    start_date = (
        date.fromisoformat(args.start_date)
        if args.start_date
        else end_date - timedelta(days=730)
    )

    print(f"Running pipeline: {start_date} → {end_date}")
    print(f"Window size: {args.window_size}")
    print(f"DB path: {args.db_path}")

    # Initialize database handle shared by the SVD phase below.
    db = MotionDatabase(args.db_path)

    # Phase 1: fetch MP metadata
    print("\n=== Phase 1: MP Metadata ===")
    n_mp = fetch_mp_metadata(db_path=args.db_path)
    print(f"Processed {n_mp} MPs")

    # Phase 2: extract MP votes
    print("\n=== Phase 2: Extract Votes ===")
    n_votes = extract_mp_votes(db_path=args.db_path)
    print(f"Extracted {n_votes} vote records")

    # Phase 3 (part 1): generate time windows for the requested range
    print("\n=== Phase 3: SVD Pipeline ===")
    windows = generate_windows(start_date, end_date, args.window_size)
    print(f"Generated {len(windows)} windows: {windows}")

    # Phase 3 (part 2): SVD per window
    run_svd_pipeline(db, windows, args.svd_k)
    print(f"Computed SVD for {len(windows)} windows")

    # Phase 4: text embeddings
    print("\n=== Phase 4: Text Embeddings ===")
    run_text_pipeline(args.db_path, batch_size=50)
    print("Text embeddings completed")

    # Phase 5: fusion
    print("\n=== Phase 5: Fusion ===")
    run_fusion(args.db_path, windows)
    print("Fusion completed")

    print("\n=== Pipeline Complete ===")
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 2: Generate time windows |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def generate_windows(
    start: date, end: date, granularity: str
) -> List[Tuple[str, str, str]]:
    """Generate time windows for pipeline processing.

    Args:
        start: First day of the range; snapped back to the start of its
            containing year ("annual") or quarter ("quarterly").
        end: Last day of the range (inclusive); the final window is
            truncated to this date.
        granularity: "annual" or "quarterly"; any value other than
            "annual" falls through to quarterly, matching the CLI
            ``choices`` in the pipeline runner.

    Returns:
        List of (window_id, start_iso, end_iso) tuples, e.g.
        ("2024-Q1", "2024-01-01", "2024-03-31") or
        ("2024", "2024-01-01", "2024-12-31").
    """
    # Hoisted out of the quarterly loop — the original re-ran
    # ``import calendar`` on every iteration.
    import calendar

    windows: List[Tuple[str, str, str]] = []

    if granularity == "annual":
        cursor = date(start.year, 1, 1)
        while cursor <= end:
            w_end = min(date(cursor.year, 12, 31), end)
            windows.append((str(cursor.year), cursor.isoformat(), w_end.isoformat()))
            cursor = date(cursor.year + 1, 1, 1)
        return windows

    # Quarterly: snap the cursor back to the first day of start's quarter.
    quarter_starts = {1: 1, 2: 4, 3: 7, 4: 10}
    quarter_ends = {1: 3, 2: 6, 3: 9, 4: 12}

    q = (start.month - 1) // 3 + 1
    cursor = date(start.year, quarter_starts[q], 1)

    while cursor <= end:
        q = (cursor.month - 1) // 3 + 1
        q_end_month = quarter_ends[q]
        # monthrange handles quarter lengths (and leap years) for us.
        last_day = calendar.monthrange(cursor.year, q_end_month)[1]
        q_end = date(cursor.year, q_end_month, last_day)
        w_end = min(q_end, end)
        windows.append((f"{cursor.year}-Q{q}", cursor.isoformat(), w_end.isoformat()))
        cursor = q_end + timedelta(days=1)

    return windows
||||||
|
|
||||||
|
|
||||||
|
def example_window_generation():
    """Example of window generation.

    Prints the quarterly and then the annual windows covering
    January 2023 through June 2024.
    """
    range_start = date(2023, 1, 1)
    range_end = date(2024, 6, 30)

    print("Quarterly windows:")
    for window_id, w_start, w_end in generate_windows(range_start, range_end, "quarterly"):
        print(f"  {window_id}: {w_start} to {w_end}")

    print("\nAnnual windows:")
    for window_id, w_start, w_end in generate_windows(range_start, range_end, "annual"):
        print(f"  {window_id}: {w_start} to {w_end}")
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 3: Running individual phases |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_individual_phases():
    """Run pipeline phases individually for debugging.

    Executes each phase in isolation against the same database so a
    single failing phase can be reproduced without rerunning the rest.
    """
    db_path = "data/motions.db"
    db = MotionDatabase(db_path)

    # Only run MP metadata fetch
    print("Fetching MP metadata...")
    mp_count = fetch_mp_metadata(db_path=db_path)
    print(f"  {mp_count} MPs processed")

    # Only run vote extraction
    print("Extracting votes...")
    vote_count = extract_mp_votes(db_path=db_path)
    print(f"  {vote_count} votes extracted")

    # Only run SVD, restricted to one known window to keep it short.
    print("Computing SVD...")
    run_svd_pipeline(db, [("2024-Q1", "2024-01-01", "2024-03-31")], k=50)
    print("  SVD computed")

    # Only run text embeddings
    print("Computing embeddings...")
    run_text_pipeline(db_path, batch_size=25)  # Smaller batch for testing
    print("  Embeddings computed")
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================= |
||||||
|
# Example 4: Dry run |
||||||
|
# ============================================================================= |
||||||
|
|
||||||
|
|
||||||
|
def example_dry_run(): |
||||||
|
"""Show what pipeline would do without making changes.""" |
||||||
|
|
||||||
|
print("DRY RUN - no writes will be made") |
||||||
|
|
||||||
|
start_date = date(2024, 1, 1) |
||||||
|
end_date = date(2024, 6, 30) |
||||||
|
|
||||||
|
# Generate and show windows |
||||||
|
windows = generate_windows(start_date, end_date, "quarterly") |
||||||
|
|
||||||
|
print(f"Would process {len(windows)} windows:") |
||||||
|
for wid, s, e in windows: |
||||||
|
print(f" {wid}: {s} to {e}") |
||||||
|
|
||||||
|
print("\nWould run phases:") |
||||||
|
print(" 1. fetch_mp_metadata") |
||||||
|
print(" 2. extract_mp_votes") |
||||||
|
print(" 3. svd_pipeline") |
||||||
|
print(" 4. text_pipeline") |
||||||
|
print(" 5. fusion") |
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": |
||||||
|
import logging |
||||||
|
|
||||||
|
logging.basicConfig( |
||||||
|
level=logging.INFO, |
||||||
|
format="%(asctime)s %(levelname)s %(name)s: %(message)s", |
||||||
|
) |
||||||
|
|
||||||
|
print("=== Window Generation ===") |
||||||
|
example_window_generation() |
||||||
|
|
||||||
|
print("\n=== Dry Run ===") |
||||||
|
example_dry_run() |
||||||
@ -0,0 +1,265 @@ |
|||||||
|
# API Client Patterns |
||||||
|
|
||||||
|
## Base API Client Pattern |
||||||
|
|
||||||
|
Using requests.Session for connection pooling: |
||||||
|
|
||||||
|
```python |
||||||
|
# api_client.py |
||||||
|
import requests |
||||||
|
from typing import Dict, List, Optional |
||||||
|
from config import config |
||||||
|
|
||||||
|
class TweedeKamerAPI: |
||||||
|
def __init__(self): |
||||||
|
self.odata_base_url = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0" |
||||||
|
self.session = requests.Session() |
||||||
|
self.session.headers.update({ |
||||||
|
"Accept": "application/json", |
||||||
|
"User-Agent": "Dutch-Political-Compass-Tool/1.0", |
||||||
|
}) |
||||||
|
|
||||||
|
def get_motions( |
||||||
|
self, |
||||||
|
start_date: datetime = None, |
||||||
|
end_date: datetime = None, |
||||||
|
limit: int = 500, |
||||||
|
) -> List[Dict]: |
||||||
|
"""Get motions with voting results using OData API.""" |
||||||
|
if not start_date: |
||||||
|
start_date = datetime.now() - timedelta(days=730) |
||||||
|
|
||||||
|
try: |
||||||
|
voting_records, besluit_meta = self._get_voting_records( |
||||||
|
start_date, end_date, limit |
||||||
|
) |
||||||
|
return self._process_voting_records(voting_records, besluit_meta) |
||||||
|
except Exception as e: |
||||||
|
print(f"Error fetching motions from API: {e}") |
||||||
|
return [] |
||||||
|
``` |
||||||
|
|
||||||
|
## OData Pagination Pattern |
||||||
|
|
||||||
|
Handle server-side pagination with $skip: |
||||||
|
|
||||||
|
```python |
||||||
|
def _get_voting_records( |
||||||
|
self, |
||||||
|
start_date: datetime, |
||||||
|
end_date: datetime = None, |
||||||
|
limit: int = 50000 |
||||||
|
) -> tuple: |
||||||
|
"""Fetch with automatic pagination.""" |
||||||
|
|
||||||
|
filter_query = ( |
||||||
|
f"GewijzigdOp ge {start_date.strftime('%Y-%m-%d')}T00:00:00Z" |
||||||
|
" and StemmingsSoort ne null" |
||||||
|
" and Verwijderd eq false" |
||||||
|
) |
||||||
|
|
||||||
|
page_size = 250 # API caps $top at 250 |
||||||
|
base_url = f"{self.odata_base_url}/Besluit" |
||||||
|
base_params = { |
||||||
|
"$filter": filter_query, |
||||||
|
"$top": page_size, |
||||||
|
"$expand": "Stemming", |
||||||
|
"$orderby": "GewijzigdOp desc", |
||||||
|
} |
||||||
|
|
||||||
|
all_records = [] |
||||||
|
skip = 0 |
||||||
|
|
||||||
|
while len(all_records) < limit: |
||||||
|
params = {**base_params, "$skip": skip} |
||||||
|
response = self.session.get( |
||||||
|
base_url, |
||||||
|
params=params, |
||||||
|
timeout=config.API_TIMEOUT |
||||||
|
) |
||||||
|
response.raise_for_status() |
||||||
|
data = response.json() |
||||||
|
|
||||||
|
besluit_page = data.get("value", []) |
||||||
|
if not besluit_page: |
||||||
|
break |
||||||
|
|
||||||
|
# Process page |
||||||
|
for besluit in besluit_page: |
||||||
|
all_records.extend(self._extract_votes(besluit)) |
||||||
|
|
||||||
|
skip += page_size |
||||||
|
|
||||||
|
return all_records |
||||||
|
``` |
||||||
|
|
||||||
|
## Retry with Backoff Pattern |
||||||
|
|
||||||
|
For transient failures: |
||||||
|
|
||||||
|
```python |
||||||
|
# ai_provider.py |
||||||
|
import time |
||||||
|
import random |
||||||
|
from requests.exceptions import ConnectionError |
||||||
|
|
||||||
|
def _post_with_retries( |
||||||
|
path: str, |
||||||
|
json: dict, |
||||||
|
retries: int = 3 |
||||||
|
) -> requests.Response: |
||||||
|
"""POST with exponential backoff retry.""" |
||||||
|
|
||||||
|
backoff = 0.5 |
||||||
|
for attempt in range(1, retries + 1): |
||||||
|
try: |
||||||
|
resp = requests.post(url, json=json, headers=headers, timeout=10) |
||||||
|
|
||||||
|
# Handle rate limiting |
||||||
|
if resp.status_code == 429: |
||||||
|
if attempt == retries: |
||||||
|
raise ProviderError("Rate limited") |
||||||
|
|
||||||
|
retry_after = resp.headers.get("Retry-After") |
||||||
|
if retry_after: |
||||||
|
time.sleep(int(retry_after)) |
||||||
|
else: |
||||||
|
sleep = backoff * (2 ** (attempt - 1)) |
||||||
|
sleep += random.uniform(0, sleep * 0.1) |
||||||
|
time.sleep(sleep) |
||||||
|
continue |
||||||
|
|
||||||
|
# Handle server errors |
||||||
|
if 500 <= resp.status_code < 600: |
||||||
|
if attempt == retries: |
||||||
|
raise ProviderError(f"Server error: {resp.status_code}") |
||||||
|
time.sleep(backoff * (2 ** (attempt - 1))) |
||||||
|
continue |
||||||
|
|
||||||
|
return resp |
||||||
|
|
||||||
|
except ConnectionError as exc: |
||||||
|
if attempt == retries: |
||||||
|
raise ProviderError(f"Connection error: {exc}") |
||||||
|
time.sleep(backoff * (2 ** (attempt - 1))) |
||||||
|
|
||||||
|
raise ProviderError("Failed after retries") |
||||||
|
``` |
||||||
|
|
||||||
|
## Batch Processing Pattern |
||||||
|
|
||||||
|
Process items in batches to manage API limits: |
||||||
|
|
||||||
|
```python |
||||||
|
def get_embeddings_with_retry( |
||||||
|
texts: List[str], |
||||||
|
batch_size: int = 50, |
||||||
|
retries: int = 3, |
||||||
|
) -> List[Optional[List[float]]]: |
||||||
|
"""Process embeddings in batches with fallback to single items.""" |
||||||
|
|
||||||
|
results = [None] * len(texts) |
||||||
|
|
||||||
|
i = 0 |
||||||
|
while i < len(texts): |
||||||
|
end = min(len(texts), i + batch_size) |
||||||
|
chunk = texts[i:end] |
||||||
|
|
||||||
|
# Try batch first |
||||||
|
try: |
||||||
|
emb_chunk = get_embeddings_batch(chunk) |
||||||
|
for j, emb in enumerate(emb_chunk): |
||||||
|
results[i + j] = emb |
||||||
|
i = end |
||||||
|
continue |
||||||
|
except Exception: |
||||||
|
pass |
||||||
|
|
||||||
|
# Fallback: single items |
||||||
|
for j, text in enumerate(chunk): |
||||||
|
try: |
||||||
|
results[i + j] = get_embedding(text) |
||||||
|
except Exception: |
||||||
|
results[i + j] = None |
||||||
|
|
||||||
|
i = end |
||||||
|
|
||||||
|
return results |
||||||
|
``` |
||||||
|
|
||||||
|
## Response Validation Pattern |
||||||
|
|
||||||
|
Validate API responses before processing: |
||||||
|
|
||||||
|
```python |
||||||
|
def _process_response(self, response: requests.Response) -> Dict: |
||||||
|
"""Validate and parse API response.""" |
||||||
|
|
||||||
|
response.raise_for_status() |
||||||
|
data = response.json() |
||||||
|
|
||||||
|
if "value" not in data: |
||||||
|
raise ValueError("Unexpected response format: missing 'value' key") |
||||||
|
|
||||||
|
return data |
||||||
|
|
||||||
|
def _validate_besluit(self, besluit: Dict) -> bool: |
||||||
|
"""Check required fields exist.""" |
||||||
|
required = ["Id", "GewijzigdOp"] |
||||||
|
return all(field in besluit for field in required) |
||||||
|
``` |
||||||
|
|
||||||
|
## Error Handling Patterns |
||||||
|
|
||||||
|
Always provide safe fallbacks: |
||||||
|
|
||||||
|
```python |
||||||
|
def safe_api_call(self, endpoint: str, params: Dict = None) -> List[Dict]: |
||||||
|
"""Call API with error handling and fallback.""" |
||||||
|
try: |
||||||
|
response = self.session.get( |
||||||
|
endpoint, |
||||||
|
params=params, |
||||||
|
timeout=config.API_TIMEOUT |
||||||
|
) |
||||||
|
response.raise_for_status() |
||||||
|
data = response.json() |
||||||
|
return data.get("value", []) |
||||||
|
except requests.Timeout: |
||||||
|
_logger.warning(f"API timeout for {endpoint}") |
||||||
|
return [] |
||||||
|
except requests.HTTPError as e: |
||||||
|
_logger.error(f"HTTP error: {e}") |
||||||
|
return [] |
||||||
|
except Exception as e: |
||||||
|
_logger.error(f"API call failed: {e}") |
||||||
|
return [] |
||||||
|
``` |
||||||
|
|
||||||
|
## Session Management |
||||||
|
|
||||||
|
Reuse session for connection pooling: |
||||||
|
|
||||||
|
```python |
||||||
|
class TweedeKamerAPI: |
||||||
|
def __init__(self): |
||||||
|
self.session = requests.Session() |
||||||
|
self.session.headers.update({ |
||||||
|
"Accept": "application/json", |
||||||
|
"User-Agent": "Dutch-Political-Compass-Tool/1.0", |
||||||
|
}) |
||||||
|
|
||||||
|
def close(self): |
||||||
|
"""Clean up session when done.""" |
||||||
|
self.session.close() |
||||||
|
|
||||||
|
def __enter__(self): |
||||||
|
return self |
||||||
|
|
||||||
|
def __exit__(self, *args): |
||||||
|
self.close() |
||||||
|
|
||||||
|
# Usage |
||||||
|
with TweedeKamerAPI() as api: |
||||||
|
motions = api.get_motions(start_date) |
||||||
|
``` |
||||||
@ -0,0 +1,230 @@ |
|||||||
|
# Architectural Patterns |
||||||
|
|
||||||
|
## Repository Pattern |
||||||
|
|
||||||
|
The `MotionDatabase` class acts as a repository, encapsulating all database operations behind a clean interface. |
||||||
|
|
||||||
|
```python |
||||||
|
# database.py |
||||||
|
class MotionDatabase: |
||||||
|
def __init__(self, db_path: str = config.DATABASE_PATH): |
||||||
|
self.db_path = db_path |
||||||
|
self._init_database() |
||||||
|
|
||||||
|
def get_motion(self, motion_id: int) -> Optional[Dict]: |
||||||
|
"""Get a single motion by ID.""" |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
result = conn.execute( |
||||||
|
"SELECT * FROM motions WHERE id = ?", (motion_id,) |
||||||
|
).fetchone() |
||||||
|
return result |
||||||
|
finally: |
||||||
|
conn.close() |
||||||
|
|
||||||
|
def get_filtered_motions( |
||||||
|
self, |
||||||
|
policy_area: str = "Alle", |
||||||
|
min_margin: float = 0.0, |
||||||
|
max_margin: float = 1.0, |
||||||
|
limit: int = 10 |
||||||
|
) -> List[Dict]: |
||||||
|
"""Get filtered list of motions.""" |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
**Usage**: Import the singleton instance for all DB operations. |
||||||
|
```python |
||||||
|
from database import db |
||||||
|
|
||||||
|
motions = db.get_filtered_motions(policy_area="Klimaat", limit=20) |
||||||
|
``` |
||||||
|
|
||||||
|
## Facade Pattern |
||||||
|
|
||||||
|
Simplified interfaces over complex subsystems. |
||||||
|
|
||||||
|
### MotionDatabase Facade |
||||||
|
```python |
||||||
|
# Single entry point for all database operations |
||||||
|
db = MotionDatabase() # Singleton instance |
||||||
|
|
||||||
|
# Operations are abstracted: |
||||||
|
db.create_session(total_motions) |
||||||
|
db.record_vote(session_id, motion_id, vote) |
||||||
|
db.get_party_results(session_id) |
||||||
|
``` |
||||||
|
|
||||||
|
### API Client Facade |
||||||
|
```python |
||||||
|
# api_client.py |
||||||
|
class TweedeKamerAPI: |
||||||
|
def __init__(self): |
||||||
|
self.session = requests.Session() # Connection pooling |
||||||
|
|
||||||
|
def get_motions(self, start_date, end_date) -> List[Dict]: |
||||||
|
"""Simple interface hiding OData pagination details.""" |
||||||
|
voting_records, besluit_meta = self._get_voting_records(start_date, end_date) |
||||||
|
return self._process_voting_records(voting_records, besluit_meta) |
||||||
|
``` |
||||||
|
|
||||||
|
### MotionScraper Facade |
||||||
|
```python |
||||||
|
# scraper.py (if used) |
||||||
|
class MotionScraper: |
||||||
|
def get_motion_content(self, url: str) -> Optional[str]: |
||||||
|
"""Extract body text from official website.""" |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Pipeline Pattern |
||||||
|
|
||||||
|
Sequential phases with explicit dependencies: |
||||||
|
|
||||||
|
``` |
||||||
|
pipeline/run_pipeline.py |
||||||
|
├── Phase 1: fetch_mp_metadata |
||||||
|
│ └── pipeline/fetch_mp_metadata.py |
||||||
|
├── Phase 2: extract_mp_votes |
||||||
|
│ └── pipeline/extract_mp_votes.py |
||||||
|
├── Phase 3: svd_pipeline |
||||||
|
│ └── pipeline/svd_pipeline.py |
||||||
|
├── Phase 4: text_pipeline (gap-fill) |
||||||
|
│ └── pipeline/text_pipeline.py |
||||||
|
└── Phase 5: fusion (combine SVD + text) |
||||||
|
└── pipeline/fusion.py |
||||||
|
``` |
||||||
|
|
||||||
|
### Phase Orchestration |
||||||
|
```python |
||||||
|
# pipeline/run_pipeline.py |
||||||
|
def run(args: argparse.Namespace) -> int: |
||||||
|
db = MotionDatabase(args.db_path) |
||||||
|
|
||||||
|
# Phase 1: MP metadata |
||||||
|
if not args.skip_metadata: |
||||||
|
from pipeline.fetch_mp_metadata import fetch_mp_metadata |
||||||
|
fetch_mp_metadata(db_path=db.db_path) |
||||||
|
|
||||||
|
# Phase 2: Extract votes |
||||||
|
if not args.skip_extract: |
||||||
|
from pipeline.extract_mp_votes import extract_mp_votes |
||||||
|
extract_mp_votes(db_path=db.db_path) |
||||||
|
|
||||||
|
# Phase 3: SVD per window |
||||||
|
if not args.skip_svd: |
||||||
|
from pipeline.svd_pipeline import run_svd_pipeline |
||||||
|
run_svd_pipeline(db, windows, args.svd_k) |
||||||
|
|
||||||
|
# ... additional phases |
||||||
|
``` |
||||||
|
|
||||||
|
## Strategy Pattern |
||||||
|
|
||||||
|
Interchangeable algorithms for axis computation: |
||||||
|
|
||||||
|
```python |
||||||
|
# analysis/political_axis.py |
||||||
|
def compute_political_axis( |
||||||
|
vectors: Dict[str, np.ndarray], |
||||||
|
method: str = "pca" # or "anchor" |
||||||
|
) -> Tuple[np.ndarray, np.ndarray]: |
||||||
|
"""Compute political axis using specified method. |
||||||
|
|
||||||
|
Methods: |
||||||
|
- 'pca': Use first principal component |
||||||
|
- 'anchor': Use predefined anchor motions |
||||||
|
""" |
||||||
|
if method == "pca": |
||||||
|
return _compute_pca_axis(vectors) |
||||||
|
elif method == "anchor": |
||||||
|
return _compute_anchor_axis(vectors) |
||||||
|
``` |
||||||
|
|
||||||
|
## Visitor Pattern |
||||||
|
|
||||||
|
External operations on data structures: |
||||||
|
|
||||||
|
```python |
||||||
|
# analysis/trajectory.py |
||||||
|
def _procrustes_align_windows( |
||||||
|
window_vecs: Dict[str, Dict[str, np.ndarray]], |
||||||
|
min_overlap: int = 5, |
||||||
|
) -> Dict[str, Dict[str, np.ndarray]]: |
||||||
|
"""Align SVD vectors across windows using Procrustes rotations. |
||||||
|
|
||||||
|
Takes the first window as reference and aligns each subsequent window |
||||||
|
to it via orthogonal Procrustes on the set of common entities. |
||||||
|
""" |
||||||
|
``` |
||||||
|
|
||||||
|
## Builder Pattern |
||||||
|
|
||||||
|
Configuration via method chaining: |
||||||
|
|
||||||
|
```python |
||||||
|
# CLI argument parsing |
||||||
|
parser = argparse.ArgumentParser(description="Pipeline runner") |
||||||
|
parser.add_argument("--db-path", default="data/motions.db") |
||||||
|
parser.add_argument("--start-date", default=None) |
||||||
|
parser.add_argument("--end-date", default=None) |
||||||
|
parser.add_argument("--window-size", choices=["quarterly", "annual"], default="quarterly") |
||||||
|
parser.add_argument("--svd-k", type=int, default=50) |
||||||
|
``` |
||||||
|
|
||||||
|
## Decorator Pattern |
||||||
|
|
||||||
|
Retry logic for transient failures: |
||||||
|
|
||||||
|
```python |
||||||
|
# pipeline/ai_provider_wrapper.py |
||||||
|
def get_embeddings_with_retry( |
||||||
|
texts: List[str], |
||||||
|
retries: int = 3, |
||||||
|
batch_size: int = 50, |
||||||
|
) -> List[Optional[List[float]]]: |
||||||
|
"""Return embeddings with automatic retry on failure.""" |
||||||
|
for attempt in range(1, retries + 1): |
||||||
|
try: |
||||||
|
return _embedder(texts, batch_size=len(texts)) |
||||||
|
except Exception as exc: |
||||||
|
if attempt == retries: |
||||||
|
break |
||||||
|
time.sleep(backoff * (2 ** (attempt - 1))) |
||||||
|
return [None] * len(texts) # Safe fallback |
||||||
|
``` |
||||||
|
|
||||||
|
## Data Patterns |
||||||
|
|
||||||
|
### Batch Processing |
||||||
|
Process items in chunks to manage memory and API limits: |
||||||
|
```python |
||||||
|
for i in range(0, len(items), batch_size): |
||||||
|
chunk = items[i:i + batch_size] |
||||||
|
process_batch(chunk) |
||||||
|
``` |
||||||
|
|
||||||
|
### Caching |
||||||
|
Pre-compute and store expensive results: |
||||||
|
```python |
||||||
|
# SimilarityCache table stores computed similarities |
||||||
|
db.get_similarity(motion_a, motion_b) |
||||||
|
``` |
||||||
|
|
||||||
|
### Lazy Loading |
||||||
|
Load data only when needed: |
||||||
|
```python |
||||||
|
class MotionDatabase: |
||||||
|
@property |
||||||
|
def _connection(self): |
||||||
|
if self._conn is None: |
||||||
|
self._conn = duckdb.connect(self.db_path) |
||||||
|
return self._conn |
||||||
|
``` |
||||||
|
|
||||||
|
### Vectorization |
||||||
|
Use numpy for batch operations: |
||||||
|
```python |
||||||
|
vectors = np.array([v for v in entity_vectors.values()]) |
||||||
|
normalized = vectors / np.linalg.norm(vectors, axis=1, keepdims=True) |
||||||
|
``` |
||||||
@ -0,0 +1,239 @@ |
|||||||
|
# DuckDB Database Patterns |
||||||
|
|
||||||
|
## Connection Management |
||||||
|
|
||||||
|
### Pattern 1: Short-lived per Method (Most Common) |
||||||
|
|
||||||
|
Always create a new connection, use try/finally for cleanup: |
||||||
|
|
||||||
|
```python |
||||||
|
# database.py |
||||||
|
class MotionDatabase: |
||||||
|
def get_motion(self, motion_id: int) -> Optional[Dict]: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
result = conn.execute( |
||||||
|
"SELECT * FROM motions WHERE id = ?", |
||||||
|
(motion_id,) |
||||||
|
).fetchone() |
||||||
|
conn.close() |
||||||
|
return result |
||||||
|
except Exception: |
||||||
|
conn.close() |
||||||
|
return None |
||||||
|
|
||||||
|
def get_filtered_motions( |
||||||
|
self, |
||||||
|
policy_area: str = "Alle", |
||||||
|
min_margin: float = 0.0, |
||||||
|
max_margin: float = 1.0, |
||||||
|
limit: int = 10 |
||||||
|
) -> List[Dict]: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
query = """ |
||||||
|
SELECT * FROM motions |
||||||
|
WHERE (? = 'Alle' OR policy_area = ?) |
||||||
|
AND winning_margin BETWEEN ? AND ? |
||||||
|
ORDER BY RANDOM() |
||||||
|
LIMIT ? |
||||||
|
""" |
||||||
|
rows = conn.execute(query, (policy_area, policy_area, min_margin, max_margin, limit)).fetchall() |
||||||
|
conn.close() |
||||||
|
return rows |
||||||
|
except Exception: |
||||||
|
conn.close() |
||||||
|
return [] |
||||||
|
``` |
||||||
|
|
||||||
|
### Pattern 2: With Statement (Cleaner) |
||||||
|
|
||||||
|
```python |
||||||
|
def execute_query(self, query: str, params: tuple = ()): |
||||||
|
with duckdb.connect(self.db_path) as conn: |
||||||
|
return conn.execute(query, params).fetchall() |
||||||
|
``` |
||||||
|
|
||||||
|
### Pattern 3: Lazy Connection Caching |
||||||
|
|
||||||
|
For frequently accessed connections: |
||||||
|
|
||||||
|
```python |
||||||
|
class MotionDatabase: |
||||||
|
def __init__(self, db_path: str = config.DATABASE_PATH): |
||||||
|
self.db_path = db_path |
||||||
|
self._conn = None |
||||||
|
|
||||||
|
@property |
||||||
|
def connection(self): |
||||||
|
if self._conn is None: |
||||||
|
self._conn = duckdb.connect(self.db_path) |
||||||
|
return self._conn |
||||||
|
|
||||||
|
def close(self): |
||||||
|
if self._conn: |
||||||
|
self._conn.close() |
||||||
|
self._conn = None |
||||||
|
``` |
||||||
|
|
||||||
|
## Table Initialization |
||||||
|
|
||||||
|
Create tables with proper constraints and sequences: |
||||||
|
|
||||||
|
```python |
||||||
|
def _init_database(self): |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
|
||||||
|
# Create sequence for auto-incrementing IDs |
||||||
|
try: |
||||||
|
conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1") |
||||||
|
except: |
||||||
|
pass |
||||||
|
|
||||||
|
# Create tables |
||||||
|
conn.execute(""" |
||||||
|
CREATE TABLE IF NOT EXISTS motions ( |
||||||
|
id INTEGER DEFAULT nextval('motions_id_seq'), |
||||||
|
title TEXT NOT NULL, |
||||||
|
description TEXT, |
||||||
|
date DATE, |
||||||
|
policy_area TEXT, |
||||||
|
voting_results JSON, |
||||||
|
winning_margin FLOAT, |
||||||
|
controversy_score FLOAT, |
||||||
|
layman_explanation TEXT, |
||||||
|
externe_identifier TEXT, |
||||||
|
body_text TEXT, |
||||||
|
url TEXT UNIQUE, |
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, |
||||||
|
PRIMARY KEY (id) |
||||||
|
) |
||||||
|
""") |
||||||
|
|
||||||
|
# Add columns to existing tables safely |
||||||
|
try: |
||||||
|
conn.execute("ALTER TABLE motions ADD COLUMN IF NOT EXISTS body_text TEXT") |
||||||
|
except Exception: |
||||||
|
pass # Column may already exist |
||||||
|
|
||||||
|
conn.close() |
||||||
|
``` |
||||||
|
|
||||||
|
## JSON Column Handling |
||||||
|
|
||||||
|
Store and retrieve JSON data: |
||||||
|
|
||||||
|
```python |
||||||
|
# Insert JSON |
||||||
|
def store_motion(self, motion: Dict): |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
conn.execute( |
||||||
|
"INSERT INTO motions (title, voting_results) VALUES (?, ?)", |
||||||
|
(motion["title"], json.dumps(motion["voting_results"])) |
||||||
|
) |
||||||
|
conn.close() |
||||||
|
except Exception: |
||||||
|
conn.close() |
||||||
|
|
||||||
|
# Query JSON |
||||||
|
def get_motions_with_votes(self, party: str) -> List[Dict]: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
rows = conn.execute(""" |
||||||
|
SELECT title, voting_results |
||||||
|
FROM motions |
||||||
|
WHERE JSON_EXTRACT(voting_results, '$.party') = ? |
||||||
|
""", (party,)).fetchall() |
||||||
|
conn.close() |
||||||
|
return rows |
||||||
|
except Exception: |
||||||
|
conn.close() |
||||||
|
return [] |
||||||
|
``` |
||||||
|
|
||||||
|
## Query Patterns |
||||||
|
|
||||||
|
### Parameterized Queries (Always!) |
||||||
|
```python |
||||||
|
# SAFE - uses parameterized query |
||||||
|
conn.execute("SELECT * FROM motions WHERE id = ?", (motion_id,)) |
||||||
|
|
||||||
|
# AVOID - SQL injection risk |
||||||
|
# conn.execute(f"SELECT * FROM motions WHERE id = {motion_id}") # BAD! |
||||||
|
``` |
||||||
|
|
||||||
|
### Batch Inserts |
||||||
|
```python |
||||||
|
def bulk_insert_motions(self, motions: List[Dict]): |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
for motion in motions: |
||||||
|
conn.execute( |
||||||
|
"""INSERT OR IGNORE INTO motions |
||||||
|
(title, date, policy_area) VALUES (?, ?, ?)""", |
||||||
|
(motion["title"], motion["date"], motion["policy_area"]) |
||||||
|
) |
||||||
|
conn.close() |
||||||
|
except Exception: |
||||||
|
conn.close() |
||||||
|
``` |
||||||
|
|
||||||
|
### Aggregation Queries |
||||||
|
```python |
||||||
|
def get_party_vote_stats(self, party: str) -> Dict: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
result = conn.execute(""" |
||||||
|
SELECT |
||||||
|
COUNT(*) as total_votes, |
||||||
|
SUM(CASE WHEN vote = 'Voor' THEN 1 ELSE 0 END) as voor, |
||||||
|
SUM(CASE WHEN vote = 'Tegen' THEN 1 ELSE 0 END) as tegen |
||||||
|
FROM mp_votes |
||||||
|
WHERE party = ? |
||||||
|
""", (party,)).fetchone() |
||||||
|
conn.close() |
||||||
|
return {"total": result[0], "voor": result[1], "tegen": result[2]} |
||||||
|
except Exception: |
||||||
|
conn.close() |
||||||
|
return {"total": 0, "voor": 0, "tegen": 0} |
||||||
|
``` |
||||||
|
|
||||||
|
## Error Handling |
||||||
|
|
||||||
|
Always close connections in finally block or with context manager: |
||||||
|
|
||||||
|
```python |
||||||
|
def safe_query(self, query: str, params: tuple = ()): |
||||||
|
conn = None |
||||||
|
try: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
result = conn.execute(query, params).fetchall() |
||||||
|
return result |
||||||
|
except Exception as e: |
||||||
|
_logger.error(f"Query failed: {e}") |
||||||
|
return [] |
||||||
|
finally: |
||||||
|
if conn: |
||||||
|
conn.close() |
||||||
|
``` |
||||||
|
|
||||||
|
## Testing with Mock |
||||||
|
|
||||||
|
For unit tests without DuckDB: |
||||||
|
|
||||||
|
```python |
||||||
|
# In MotionDatabase.__init__ |
||||||
|
def __init__(self, db_path: str = config.DATABASE_PATH): |
||||||
|
self.db_path = db_path |
||||||
|
self._file_mode = duckdb is None |
||||||
|
|
||||||
|
if duckdb is None: |
||||||
|
# Create JSON fallback files |
||||||
|
for p in (f"{db_path}.embeddings.json", f"{db_path}.similarity_cache.json"): |
||||||
|
if not os.path.exists(p): |
||||||
|
with open(p, "w") as fh: |
||||||
|
fh.write("[]") |
||||||
|
else: |
||||||
|
self._init_database() |
||||||
|
``` |
||||||
@ -0,0 +1,196 @@ |
|||||||
|
# Python-Specific Patterns |
||||||
|
|
||||||
|
## Singleton Pattern |
||||||
|
|
||||||
|
Use module-level instances for shared resources: |
||||||
|
|
||||||
|
```python |
||||||
|
# database.py |
||||||
|
class MotionDatabase: |
||||||
|
def __init__(self, db_path: str = config.DATABASE_PATH): |
||||||
|
self.db_path = db_path |
||||||
|
self._init_database() |
||||||
|
|
||||||
|
def _init_database(self): |
||||||
|
# Initialize tables on first instantiation |
||||||
|
... |
||||||
|
|
||||||
|
# Bottom of file - the singleton |
||||||
|
db = MotionDatabase() |
||||||
|
``` |
||||||
|
|
||||||
|
**Usage across the codebase:** |
||||||
|
```python |
||||||
|
# In other modules |
||||||
|
from database import db |
||||||
|
|
||||||
|
def some_function(): |
||||||
|
motions = db.get_filtered_motions(limit=10) |
||||||
|
return motions |
||||||
|
``` |
||||||
|
|
||||||
|
Similarly for other singletons: |
||||||
|
```python |
||||||
|
# summarizer.py |
||||||
|
class MotionSummarizer: |
||||||
|
def __init__(self): |
||||||
|
pass # Stateless |
||||||
|
|
||||||
|
def generate_layman_explanation(self, title: str, body: str) -> str: |
||||||
|
... |
||||||
|
|
||||||
|
summarizer = MotionSummarizer() |
||||||
|
``` |
||||||
|
|
||||||
|
## Dataclass Config Pattern |
||||||
|
|
||||||
|
Use dataclass for configuration with environment variable support: |
||||||
|
|
||||||
|
```python |
||||||
|
# config.py |
||||||
|
from dataclasses import dataclass |
||||||
|
from typing import List |
||||||
|
import os |
||||||
|
|
||||||
|
@dataclass |
||||||
|
class Config: |
||||||
|
# Database settings |
||||||
|
DATABASE_PATH = "data/motions.db" |
||||||
|
|
||||||
|
# API settings |
||||||
|
TWEEDE_KAMER_ODATA_API = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0" |
||||||
|
API_TIMEOUT = 30 |
||||||
|
API_BATCH_SIZE = 250 |
||||||
|
|
||||||
|
# AI settings |
||||||
|
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") |
||||||
|
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" |
||||||
|
QWEN_MODEL = "qwen/qwen-2.5-72b-instruct" |
||||||
|
|
||||||
|
# App settings |
||||||
|
DEFAULT_MOTION_COUNT = 10 |
||||||
|
SESSION_TIMEOUT_DAYS = 30 |
||||||
|
|
||||||
|
# Policy areas |
||||||
|
POLICY_AREAS: List[str] = None |
||||||
|
def __post_init__(self): |
||||||
|
self.POLICY_AREAS = [ |
||||||
|
"Alle", "Economie", "Klimaat", "Immigratie", |
||||||
|
"Zorg", "Onderwijs", "Defensie", "Sociale Zaken", "Algemeen" |
||||||
|
] |
||||||
|
|
||||||
|
config = Config() |
||||||
|
``` |
||||||
|
|
||||||
|
**Usage:** |
||||||
|
```python |
||||||
|
from config import config |
||||||
|
|
||||||
|
# Access as attributes |
||||||
|
timeout = config.API_TIMEOUT |
||||||
|
areas = config.POLICY_AREAS |
||||||
|
``` |
||||||
|
|
||||||
|
## DuckDB Connection Pattern |
||||||
|
|
||||||
|
Short-lived connections with explicit cleanup: |
||||||
|
|
||||||
|
```python |
||||||
|
class MotionDatabase: |
||||||
|
def get_motion(self, motion_id: int) -> Optional[Dict]: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
result = conn.execute( |
||||||
|
"SELECT * FROM motions WHERE id = ?", |
||||||
|
(motion_id,) |
||||||
|
).fetchone() |
||||||
|
return result |
||||||
|
finally: |
||||||
|
conn.close() |
||||||
|
|
||||||
|
def get_filtered_motions(self, **kwargs) -> List[Dict]: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
try: |
||||||
|
rows = conn.execute(query, params).fetchall() |
||||||
|
return rows |
||||||
|
except Exception: |
||||||
|
return [] # Safe fallback |
||||||
|
finally: |
||||||
|
conn.close() |
||||||
|
``` |
||||||
|
|
||||||
|
**Context manager alternative (preferred when applicable):** |
||||||
|
```python |
||||||
|
def some_operation(self): |
||||||
|
with duckdb.connect(self.db_path) as conn: |
||||||
|
result = conn.execute("SELECT ...").fetchall() |
||||||
|
return result |
||||||
|
``` |
||||||
|
|
||||||
|
## Try/Except with Fallback Pattern |
||||||
|
|
||||||
|
Always provide safe fallbacks: |
||||||
|
|
||||||
|
```python |
||||||
|
def get_motion_or_default(self, motion_id: int) -> Dict: |
||||||
|
try: |
||||||
|
conn = duckdb.connect(self.db_path) |
||||||
|
result = conn.execute("SELECT * FROM motions WHERE id = ?", (motion_id,)).fetchone() |
||||||
|
conn.close() |
||||||
|
return result if result else {} |
||||||
|
except Exception: |
||||||
|
return {} |
||||||
|
``` |
||||||
|
|
||||||
|
## Optional Import Pattern |
||||||
|
|
||||||
|
Handle optional dependencies gracefully: |
||||||
|
|
||||||
|
```python |
||||||
|
try: |
||||||
|
import duckdb |
||||||
|
except Exception: # pragma: no cover |
||||||
|
duckdb = None |
||||||
|
|
||||||
|
class MotionDatabase: |
||||||
|
def __init__(self, db_path: str = config.DATABASE_PATH): |
||||||
|
self._file_mode = duckdb is None |
||||||
|
... |
||||||
|
``` |
||||||
|
|
||||||
|
## Property Pattern |
||||||
|
|
||||||
|
Lazy initialization of expensive resources: |
||||||
|
|
||||||
|
```python |
||||||
|
class MotionDatabase: |
||||||
|
def __init__(self, db_path: str = config.DATABASE_PATH): |
||||||
|
self.db_path = db_path |
||||||
|
self._session_cache = None |
||||||
|
|
||||||
|
@property |
||||||
|
def session(self): |
||||||
|
"""Lazy-load expensive resources.""" |
||||||
|
if self._session_cache is None: |
||||||
|
self._session_cache = self._create_session() |
||||||
|
return self._session_cache |
||||||
|
``` |
||||||
|
|
||||||
|
## Type Annotation Patterns |
||||||
|
|
||||||
|
```python |
||||||
|
from typing import Dict, List, Optional, Tuple, Any |
||||||
|
|
||||||
|
# Optional with None default |
||||||
|
def get_motion(self, motion_id: Optional[int] = None) -> Optional[Dict]: |
||||||
|
... |
||||||
|
|
||||||
|
# Multiple return types |
||||||
|
def parse_vote(self, vote_str: str) -> Tuple[bool, str]: |
||||||
|
"""Returns (success, error_message)""" |
||||||
|
... |
||||||
|
|
||||||
|
# Generic types |
||||||
|
def get_batch(self, ids: List[int]) -> Dict[str, Any]: |
||||||
|
... |
||||||
|
``` |
||||||
Loading…
Reference in new issue