You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
230 lines
6.1 KiB
230 lines
6.1 KiB
# Architectural Patterns
|
|
|
|
## Repository Pattern
|
|
|
|
The `MotionDatabase` class acts as a repository, encapsulating all database operations behind a clean interface.
|
|
|
|
```python
|
|
# database.py
|
|
class MotionDatabase:
|
|
def __init__(self, db_path: str = config.DATABASE_PATH):
|
|
self.db_path = db_path
|
|
self._init_database()
|
|
|
|
def get_motion(self, motion_id: int) -> Optional[Dict]:
|
|
"""Get a single motion by ID."""
|
|
conn = duckdb.connect(self.db_path)
|
|
try:
|
|
result = conn.execute(
|
|
"SELECT * FROM motions WHERE id = ?", (motion_id,)
|
|
).fetchone()
|
|
return result
|
|
finally:
|
|
conn.close()
|
|
|
|
def get_filtered_motions(
|
|
self,
|
|
policy_area: str = "Alle",
|
|
min_margin: float = 0.0,
|
|
max_margin: float = 1.0,
|
|
limit: int = 10
|
|
) -> List[Dict]:
|
|
"""Get filtered list of motions."""
|
|
...
|
|
```
|
|
|
|
**Usage**: Import the shared module-level `db` instance (used as a de facto singleton) for all database operations.
|
|
```python
|
|
from database import db
|
|
|
|
motions = db.get_filtered_motions(policy_area="Klimaat", limit=20)
|
|
```
|
|
|
|
## Facade Pattern
|
|
|
|
Simplified interfaces over complex subsystems.
|
|
|
|
### MotionDatabase Facade
|
|
```python
|
|
# Single entry point for all database operations
|
|
db = MotionDatabase() # Singleton instance
|
|
|
|
# Operations are abstracted:
|
|
db.create_session(total_motions)
|
|
db.record_vote(session_id, motion_id, vote)
|
|
db.get_party_results(session_id)
|
|
```
|
|
|
|
### API Client Facade
|
|
```python
|
|
# api_client.py
|
|
class TweedeKamerAPI:
|
|
def __init__(self):
|
|
self.session = requests.Session() # Connection pooling
|
|
|
|
def get_motions(self, start_date, end_date) -> List[Dict]:
|
|
"""Simple interface hiding OData pagination details."""
|
|
voting_records, besluit_meta = self._get_voting_records(start_date, end_date)
|
|
return self._process_voting_records(voting_records, besluit_meta)
|
|
```
|
|
|
|
### MotionScraper Facade
|
|
```python
|
|
# scraper.py (if used)
|
|
class MotionScraper:
|
|
def get_motion_content(self, url: str) -> Optional[str]:
|
|
"""Extract body text from official website."""
|
|
...
|
|
```
|
|
|
|
## Pipeline Pattern
|
|
|
|
Sequential phases with explicit dependencies:
|
|
|
|
```
|
|
pipeline/run_pipeline.py
|
|
├── Phase 1: fetch_mp_metadata
|
|
│ └── pipeline/fetch_mp_metadata.py
|
|
├── Phase 2: extract_mp_votes
|
|
│ └── pipeline/extract_mp_votes.py
|
|
├── Phase 3: svd_pipeline
|
|
│ └── pipeline/svd_pipeline.py
|
|
├── Phase 4: text_pipeline (gap-fill)
|
|
│ └── pipeline/text_pipeline.py
|
|
└── Phase 5: fusion (combine SVD + text)
|
|
└── pipeline/fusion.py
|
|
```
|
|
|
|
### Phase Orchestration
|
|
```python
|
|
# pipeline/run_pipeline.py
|
|
def run(args: argparse.Namespace) -> int:
|
|
db = MotionDatabase(args.db_path)
|
|
|
|
# Phase 1: MP metadata
|
|
if not args.skip_metadata:
|
|
from pipeline.fetch_mp_metadata import fetch_mp_metadata
|
|
fetch_mp_metadata(db_path=db.db_path)
|
|
|
|
# Phase 2: Extract votes
|
|
if not args.skip_extract:
|
|
from pipeline.extract_mp_votes import extract_mp_votes
|
|
extract_mp_votes(db_path=db.db_path)
|
|
|
|
# Phase 3: SVD per window
|
|
if not args.skip_svd:
|
|
from pipeline.svd_pipeline import run_svd_pipeline
|
|
run_svd_pipeline(db, windows, args.svd_k)
|
|
|
|
# ... additional phases
|
|
```
|
|
|
|
## Strategy Pattern
|
|
|
|
Interchangeable algorithms for axis computation:
|
|
|
|
```python
|
|
# analysis/political_axis.py
|
|
def compute_political_axis(
|
|
vectors: Dict[str, np.ndarray],
|
|
method: str = "pca" # or "anchor"
|
|
) -> Tuple[np.ndarray, np.ndarray]:
|
|
"""Compute political axis using specified method.
|
|
|
|
Methods:
|
|
- 'pca': Use first principal component
|
|
- 'anchor': Use predefined anchor motions
|
|
"""
|
|
if method == "pca":
|
|
return _compute_pca_axis(vectors)
|
|
elif method == "anchor":
|
|
return _compute_anchor_axis(vectors)
|
|
```
|
|
|
|
## Visitor Pattern
|
|
|
|
Operations defined outside the data structures they act on (here, a standalone function over plain window-vector dictionaries):
|
|
|
|
```python
|
|
# analysis/trajectory.py
|
|
def _procrustes_align_windows(
|
|
window_vecs: Dict[str, Dict[str, np.ndarray]],
|
|
min_overlap: int = 5,
|
|
) -> Dict[str, Dict[str, np.ndarray]]:
|
|
"""Align SVD vectors across windows using Procrustes rotations.
|
|
|
|
Takes the first window as reference and aligns each subsequent window
|
|
to it via orthogonal Procrustes on the set of common entities.
|
|
"""
|
|
```
|
|
|
|
## Builder Pattern
|
|
|
|
Configuration built up incrementally through successive `add_argument` calls on a single parser object:
|
|
|
|
```python
|
|
# CLI argument parsing
|
|
parser = argparse.ArgumentParser(description="Pipeline runner")
|
|
parser.add_argument("--db-path", default="data/motions.db")
|
|
parser.add_argument("--start-date", default=None)
|
|
parser.add_argument("--end-date", default=None)
|
|
parser.add_argument("--window-size", choices=["quarterly", "annual"], default="quarterly")
|
|
parser.add_argument("--svd-k", type=int, default=50)
|
|
```
|
|
|
|
## Decorator Pattern
|
|
|
|
Retry logic with exponential backoff, wrapped around the underlying embedding call to absorb transient failures:
|
|
|
|
```python
|
|
# pipeline/ai_provider_wrapper.py
|
|
def get_embeddings_with_retry(
|
|
texts: List[str],
|
|
retries: int = 3,
|
|
batch_size: int = 50,
|
|
) -> List[Optional[List[float]]]:
|
|
"""Return embeddings with automatic retry on failure."""
|
|
for attempt in range(1, retries + 1):
|
|
try:
|
|
return _embedder(texts, batch_size=len(texts))
|
|
except Exception as exc:
|
|
if attempt == retries:
|
|
break
|
|
time.sleep(backoff * (2 ** (attempt - 1)))
|
|
return [None] * len(texts) # Safe fallback
|
|
```
|
|
|
|
## Data Patterns
|
|
|
|
### Batch Processing
|
|
Process items in chunks to manage memory and API limits:
|
|
```python
|
|
for i in range(0, len(items), batch_size):
|
|
chunk = items[i:i + batch_size]
|
|
process_batch(chunk)
|
|
```
|
|
|
|
### Caching
|
|
Pre-compute and store expensive results:
|
|
```python
|
|
# SimilarityCache table stores computed similarities
|
|
db.get_similarity(motion_a, motion_b)
|
|
```
|
|
|
|
### Lazy Loading
|
|
Load data only when needed:
|
|
```python
|
|
class MotionDatabase:
|
|
@property
|
|
def _connection(self):
|
|
if self._conn is None:
|
|
self._conn = duckdb.connect(self.db_path)
|
|
return self._conn
|
|
```
|
|
|
|
### Vectorization
|
|
Use NumPy to operate on whole arrays at once instead of looping per element:
|
|
```python
|
|
vectors = np.array([v for v in entity_vectors.values()])
|
|
normalized = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
|
|
```
|
|
|