You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3.3 KiB
3.3 KiB
Extracted pattern examples (representative snippets)
Note: snippets are verbatim extracts from repository files (Phase 1). Paths shown.
DuckDB connect + schema init (database.py)
# Initialize the DuckDB schema: an id sequence plus the `motions` table.
# Idempotent — every statement uses IF NOT EXISTS, so this is safe to run
# on every startup.
conn = duckdb.connect(self.db_path)
try:
    # Create sequence for auto-incrementing IDs.
    # FIX: the original used a bare `except:`, which also swallows
    # KeyboardInterrupt/SystemExit; catch Exception only, and keep the
    # best-effort semantics (some engine versions lack IF NOT EXISTS for
    # sequences).
    try:
        conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    except Exception:
        pass
    # Create tables with proper ID handling
    conn.execute("""
        CREATE TABLE IF NOT EXISTS motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """)
finally:
    # FIX: close the connection even when an execute() raises; the original
    # leaked the handle on any schema error.
    conn.close()
Read-only compute worker (svd_pipeline.py)
# Open a read-only connection for this compute pass; the try/finally
# guarantees the handle is released even if the query fails.
conn = duckdb.connect(db_path, read_only=True)
try:
    query = "SELECT motion_id, mp_name, vote FROM mp_votes WHERE date BETWEEN ? AND ?"
    params = (start_date, end_date)
    rows = conn.execute(query, params).fetchall()
finally:
    conn.close()
Requests with retry/backoff (ai_provider.py)
resp = requests.post(url, json=json, headers=headers, timeout=10)
...
if getattr(resp, "status_code", 0) == 429:
if attempt == retries:
raise ProviderError(f"Provider returned HTTP {resp.status_code}")
retry_after = None
raw = resp.headers.get("Retry-After") if getattr(resp, "headers", None) else None
if raw:
try:
retry_after = int(raw)
except Exception:
try:
dt = parsedate_to_datetime(raw)
now = datetime.now(tz=dt.tzinfo or timezone.utc)
secs = (dt - now).total_seconds()
retry_after = max(0, int(secs))
except Exception:
retry_after = None
if retry_after is not None:
time.sleep(retry_after)
continue
Embedding batch + per-item fallback (pipeline/ai_provider_wrapper.py)
# Embed texts in batches; if a whole batch fails, fall back to per-item
# calls so one bad text cannot poison the rest of the chunk.
# FIX: the original iterated `start` but the body read undefined (or
# stale) names `i` and `end` — a NameError / wrong-slice bug. Derive the
# slice bounds from `start` directly.
for start in range(0, len(texts), batch_size):
    end = min(start + batch_size, len(texts))
    chunk = texts[start:end]
    emb_chunk, emb_exc = _attempt_batch(chunk, start)
    if emb_chunk is not None:
        # Whole batch succeeded: place each embedding at its global index.
        for offset, emb in enumerate(emb_chunk):
            results[start + offset] = emb
        continue
    # batch failed -> fallback to per-item attempts
    for idx in range(start, end):
        single, single_exc = _attempt_batch([texts[idx]], idx)
        # A successful single-item attempt yields a non-empty list;
        # otherwise record None so the slot is explicitly marked failed.
        results[idx] = single[0] if single else None
Similarity compute (similarity/compute.py)
# Ensure consistent dimensionality: pad shorter vectors with zeros.
lengths = [len(v) for v in vecs]
# FIX: `default=0` keeps an empty `vecs` from raising ValueError in max();
# with zero vectors the result is simply an empty (0, 0) matrix.
max_dim = max(lengths, default=0)
# FIX: `> 1` instead of `!= 1` — an empty input has zero distinct lengths
# and must not trigger the inconsistency warning.
if len(set(lengths)) > 1:
    logger.warning(
        "Inconsistent vector dimensions detected (max=%d). Padding shorter vectors with zeros.",
        max_dim,
    )
matrix = np.zeros((len(vecs), max_dim), dtype=np.float32)
for i, v in enumerate(vecs):
    matrix[i, : len(v)] = v
# Normalize rows and compute cosine similarity; all-zero rows get norm 1.0
# so the division is a no-op instead of producing NaNs.
norms = np.linalg.norm(matrix, axis=1, keepdims=True)
norms[norms == 0] = 1.0
normalized = matrix / norms
sim = normalized @ normalized.T