You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
116 lines
3.3 KiB
# Extracted pattern examples (representative snippets)
|
|
|
|
Note: the snippets below are verbatim extracts from repository files (Phase 1); the originating file path is shown in each section heading.
|
|
|
|
## DuckDB connect + schema init (database.py)
|
|
```python
# Open the DuckDB database file and initialize the schema.
conn = duckdb.connect(self.db_path)
try:
    # Create sequence for auto-incrementing IDs.
    try:
        conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    except Exception:
        # Best-effort: tolerate DuckDB versions/states where the sequence
        # cannot be (re)created.  Narrowed from a bare `except:`, which
        # would also swallow KeyboardInterrupt/SystemExit.
        pass

    # Create tables with proper ID handling
    conn.execute("""
        CREATE TABLE IF NOT EXISTS motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """)
finally:
    # Guarantee the handle is released even if the DDL fails; the original
    # closed the connection only on the success path.
    conn.close()
```
|
|
|
|
## Read-only compute worker (svd_pipeline.py)
|
|
```python
# Open the database read-only so this compute worker can never mutate state.
conn = duckdb.connect(db_path, read_only=True)
try:
    # Parameterized range query: placeholders keep dates out of the SQL text.
    vote_query = (
        "SELECT motion_id, mp_name, vote "
        "FROM mp_votes WHERE date BETWEEN ? AND ?"
    )
    cursor = conn.execute(vote_query, (start_date, end_date))
    rows = cursor.fetchall()
finally:
    # Release the connection whether or not the query succeeded.
    conn.close()
```
|
|
|
|
## Requests with retry/backoff (ai_provider.py)
|
|
```python
# NOTE(review): `json=json` passes a local variable that shadows the stdlib
# module name at this call site — presumably the request payload; verify
# against the enclosing function.
resp = requests.post(url, json=json, headers=headers, timeout=10)
...
# Rate-limit handling: HTTP 429 means back off and retry.
# getattr guards allow stubbed/mock response objects without a status_code.
if getattr(resp, "status_code", 0) == 429:
    # Out of attempts — surface the failure to the caller.
    if attempt == retries:
        raise ProviderError(f"Provider returned HTTP {resp.status_code}")
    retry_after = None
    # Retry-After may be absent; `resp.headers` may itself be missing on
    # mock responses, hence the extra getattr guard.
    raw = resp.headers.get("Retry-After") if getattr(resp, "headers", None) else None
    if raw:
        # Retry-After is either an integer number of seconds ...
        try:
            retry_after = int(raw)
        except Exception:
            # ... or an HTTP-date; convert it to a non-negative delay.
            try:
                dt = parsedate_to_datetime(raw)
                # Compare in the header's own timezone, defaulting to UTC
                # when the parsed datetime is naive.
                now = datetime.now(tz=dt.tzinfo or timezone.utc)
                secs = (dt - now).total_seconds()
                retry_after = max(0, int(secs))
            except Exception:
                # Unparseable header — fall through without a delay.
                retry_after = None

    # Sleep only when a usable delay was derived, then retry the request
    # (continue belongs to the surrounding attempt loop, not shown here).
    if retry_after is not None:
        time.sleep(retry_after)
        continue
```
|
|
|
|
## Embedding batch + per-item fallback (pipeline/ai_provider_wrapper.py)
|
|
```python
# Embed `texts` in batches; on a batch failure, retry each item individually.
for start in range(0, len(texts), batch_size):
    # Derive the batch window from the loop variable.  (The original
    # snippet read undefined names `i`/`end`; `i = start` was implied.)
    i = start
    end = min(start + batch_size, len(texts))
    chunk = texts[i:end]
    emb_chunk, emb_exc = _attempt_batch(chunk, i)
    if emb_chunk is not None:
        # Whole batch succeeded: copy embeddings into their slots.
        for j, emb in enumerate(emb_chunk):
            results[i + j] = emb
        continue

    # batch failed -> fallback to per-item attempts
    for j in range(i, end):
        t = texts[j]
        single, single_exc = _attempt_batch([t], j)
        if single:
            results[j] = single[0]
            continue
        # Per-item attempt failed too: record the gap explicitly.
        results[j] = None
```
|
|
|
|
## Similarity compute (similarity/compute.py)
|
|
```python
# Ensure consistent dimensionality: pad shorter vectors with zeros
lengths = [len(v) for v in vecs]
# `default=0` keeps an empty `vecs` from raising ValueError in max();
# the result is then an empty (0, 0) similarity matrix.
max_dim = max(lengths, default=0)
# Warn only when there genuinely are mixed dimensions (more than one
# distinct length); the original `!= 1` test would also fire on empty input.
if len(set(lengths)) > 1:
    logger.warning(
        "Inconsistent vector dimensions detected (max=%d). Padding shorter vectors with zeros.",
        max_dim,
    )

# The zero-initialized matrix doubles as the padding: copying each vector
# into its row leaves the trailing columns at 0.
matrix = np.zeros((len(vecs), max_dim), dtype=np.float32)
for i, v in enumerate(vecs):
    matrix[i, : len(v)] = v

# Normalize rows and compute cosine similarity
norms = np.linalg.norm(matrix, axis=1, keepdims=True)
# Avoid division by zero for all-zero rows; their similarity stays 0.
norms[norms == 0] = 1.0
normalized = matrix / norms
sim = normalized @ normalized.T
```
|
|
|