Change summary (branch: main):
- ai_provider_wrapper: retry/fallback with exponential backoff; None sentinel for failed items
- text_pipeline: use wrapper; return 5-tuple (stored, skipped_existing, skipped_no_text, errors, failed_ids)
- similarity/compute: filter trivial 1.0 matches on identical short titles (<12 chars)
- rerun_embeddings: --retry-missing mode, calls ensure_text_embeddings_for_ids on failed ids
- sync_motion_content: per-ext_id retries, HTTPAdapter pool, --max-body-workers CLI flag, audit on failure
- qa_similarity script: samples motions, writes JSON ledger to thoughts/ledgers/
- All tests green: 61 passed, 2 skipped
parent
aef7c45074
commit
b09e580f65
@ -0,0 +1,150 @@ |
|||||||
|
"""Quick QA script that samples motions and checks similarity cache quality. |
||||||
|
|
||||||
|
Writes a short JSON summary into thoughts/ledgers/qa_similarity_{ts}.json |
||||||
|
""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
|
||||||
|
import argparse |
||||||
|
import json |
||||||
|
import logging |
||||||
|
import os |
||||||
|
import random |
||||||
|
from datetime import datetime |
||||||
|
from typing import List |
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
|
||||||
|
def sample_motion_ids(sample_size: int) -> List[int]:
    """Return up to *sample_size* randomly chosen motion ids.

    Resolution order:
      1. A dynamically imported ``database.db`` object exposing
         ``sample_motions`` (tests inject a fake module via ``sys.modules``).
      2. A direct duckdb query against ``database.db.db_path``.
      3. An empty list when neither source is available.
    """
    # Prefer any dynamically-provided database object from the 'database'
    # module so tests can inject a fake via sys.modules.
    db_obj = None
    try:
        database_mod = __import__("database")
        db_obj = getattr(database_mod, "db", None)
        if db_obj and hasattr(db_obj, "sample_motions"):
            return db_obj.sample_motions(sample_size)
    except Exception:
        db_obj = None

    # Fall back to querying duckdb directly.  BUGFIX: the previous code
    # referenced an undefined module-level ``db`` name (``db.db_path``),
    # which always raised NameError and left this path dead; resolve the
    # path from the database object instead.
    conn = None
    try:
        db_file = getattr(db_obj, "db_path", None)
        if db_file:
            conn = __import__("duckdb").connect(db_file)
    except Exception:
        conn = None

    if conn is None:
        # fallback: read from motions.json if present (file-backed mode)
        # Not implemented: return empty
        return []

    try:
        rows = conn.execute("SELECT id FROM motions").fetchall()
        conn.close()
        ids = [r[0] for r in rows]
        if not ids:
            return []
        return random.sample(ids, min(sample_size, len(ids)))
    except Exception:
        if conn:
            try:
                conn.close()
            except Exception:
                pass
        return []
||||||
|
|
||||||
|
|
||||||
|
def run_qa(db_path: str, sample_size: int = 50, top_k: int = 5) -> dict:
    """Sample motions and inspect their cached similarity rows.

    For every sampled motion id the cached top-*top_k* similarities are
    fetched and scanned for "suspicious" entries: a score above 0.99999
    against a *different* motion.  Returns a summary dict with one entry
    per sampled motion under ``results``.
    """
    summary: dict = {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "sample_size": sample_size,
        "top_k": top_k,
        "results": [],
    }

    motion_ids = sample_motion_ids(sample_size)
    if not motion_ids:
        summary["error"] = "no motion ids available"
        return summary

    # Resolve db at runtime so tests can substitute a fake module.
    try:
        db_obj = getattr(__import__("database"), "db", None)
    except Exception:
        db_obj = None

    for motion_id in motion_ids:
        if db_obj and hasattr(db_obj, "get_cached_similarities"):
            rows = db_obj.get_cached_similarities(motion_id, top_k=top_k)
        else:
            # fallback: attempt to call module-level db if present
            try:
                from database import db as fallback_db

                rows = fallback_db.get_cached_similarities(
                    motion_id, vector_type="fused", top_k=top_k
                )
            except Exception:
                rows = []

        # Heuristic: count top_k rows whose score exceeds 0.99999 while
        # pointing at a different motion id.
        suspicious = 0
        for row in rows:
            try:
                score = float(row.get("score", 0.0))
                target = row.get("target_motion_id")
                if target is None:
                    target = row.get("id")
                if score > 0.99999 and int(target) != int(motion_id):
                    suspicious += 1
            except Exception:
                # Be tolerant of unexpected structures in similarity rows
                continue

        summary["results"].append(
            {"motion_id": motion_id, "top_k": len(rows), "suspicious": suspicious}
        )

    return summary
||||||
|
|
||||||
|
|
||||||
|
def main(db_path: str | None = None, sample_size: int = 50, top_k: int = 5) -> dict:
    """Wrapper used by CLI and tests.

    When called with no args, this behaves like the prior CLI entrypoint and
    will parse command-line args and write a ledger file. Tests call main()
    directly with explicit parameters and expect a dict summary to be
    returned (and a ledger to be written). To maintain compatibility we
    support both usage patterns.
    """

    def _default_db_path() -> str | None:
        # Resolve the default path from database.db lazily.  BUGFIX: the
        # previous implementation referenced an undefined module-level
        # ``db`` name here, raising NameError whenever --db-path was
        # omitted or db_path was falsy.
        try:
            return getattr(getattr(__import__("database"), "db", None), "db_path", None)
        except Exception:
            return None

    # If invoked as CLI, db_path will be None and we should parse args and
    # write the ledger file as before.
    if db_path is None:
        logging.basicConfig(level=logging.INFO)
        parser = argparse.ArgumentParser(description="QA similarity cache sampler")
        parser.add_argument("--db-path", required=False, help="Path to motions.db")
        parser.add_argument("--sample-size", type=int, default=50)
        parser.add_argument("--top-k", type=int, default=5)
        args = parser.parse_args()
        db_path = args.db_path or _default_db_path()
        sample_size = args.sample_size
        top_k = args.top_k

    summary = run_qa(db_path or _default_db_path(), sample_size=sample_size, top_k=top_k)
    # Provide a convenience mapping of motion_id -> result for easier consumption
    # by callers/tests which expect a `motions` mapping.
    summary["motions"] = {r["motion_id"]: r for r in summary.get("results", [])}
    ledger_dir = os.path.join("thoughts", "ledgers")
    os.makedirs(ledger_dir, exist_ok=True)
    ts = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
    path = os.path.join(ledger_dir, f"qa_similarity_{ts}.json")
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(summary, fh, ensure_ascii=False, indent=2)
    print(f"Wrote QA summary to {path}")
    return {"ledger_path": path, **summary}
||||||
|
|
||||||
|
|
||||||
|
# CLI entry point: parse args, run the QA sampler, write the ledger.
if __name__ == "__main__":
    main()
||||||
@ -0,0 +1,220 @@ |
|||||||
|
"""Re-run text embeddings, fusion, and similarity for all windows. |
||||||
|
|
||||||
|
Clears stale embeddings, re-embeds all motions with available text, |
||||||
|
then fuses SVD + text vectors and rebuilds similarity cache for every |
||||||
|
window that has SVD vectors in the database. |
||||||
|
|
||||||
|
Usage: |
||||||
|
.venv/bin/python scripts/rerun_embeddings.py --db-path data/motions.db |
||||||
|
""" |
||||||
|
|
||||||
|
import argparse |
||||||
|
import logging |
||||||
|
|
||||||
|
try: |
||||||
|
import duckdb |
||||||
|
except Exception: |
||||||
|
duckdb = None |
||||||
|
|
||||||
|
from pipeline import text_pipeline |
||||||
|
import importlib |
||||||
|
|
||||||
|
# If duckdb is not present at import time (test environments), avoid hard failure |
||||||
|
try: |
||||||
|
importlib.import_module("duckdb") |
||||||
|
except Exception: |
||||||
|
pass |
||||||
|
from pipeline import fusion as fusion_pipeline |
||||||
|
from similarity import compute as similarity_compute |
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
|
||||||
|
def _get_all_windows(db_path: str):
    """Return all distinct window_ids that have SVD vectors.

    Returns an empty list on any connection or query failure (logged).
    """
    try:
        connection = duckdb.connect(db_path, read_only=True)
    except Exception:
        _logger.exception(
            "Unable to connect to duckdb for _get_all_windows(%s)", db_path
        )
        return []

    try:
        fetched = connection.execute(
            "SELECT DISTINCT window_id FROM svd_vectors ORDER BY window_id"
        ).fetchall()
    except Exception:
        _logger.exception("Error querying windows from %s", db_path)
        return []
    else:
        return [row[0] for row in fetched]
    finally:
        try:
            connection.close()
        except Exception:
            pass
||||||
|
|
||||||
|
|
||||||
|
def _clear_embeddings(db_path: str) -> int:
    """Delete all rows from embeddings, fused_embeddings, and similarity_cache.

    Returns the total number of rows removed, or 0 on any failure (logged).
    """
    try:
        connection = duckdb.connect(db_path)
    except Exception:
        _logger.exception(
            "Unable to connect to duckdb for _clear_embeddings(%s)", db_path
        )
        return 0

    try:
        emb = connection.execute("DELETE FROM embeddings").rowcount or 0
        fused = connection.execute("DELETE FROM fused_embeddings").rowcount or 0
        sim = connection.execute("DELETE FROM similarity_cache").rowcount or 0
        connection.commit()
    except Exception:
        _logger.exception("Error clearing embeddings in %s", db_path)
        return 0
    else:
        _logger.info(
            "Cleared: %d embeddings, %d fused_embeddings, %d similarity_cache rows",
            emb,
            fused,
            sim,
        )
        return emb + fused + sim
    finally:
        try:
            connection.close()
        except Exception:
            pass
||||||
|
|
||||||
|
|
||||||
|
def rerun_embeddings(
    db_path: str, model: str | None = None, retry_missing: bool = False
) -> dict:
    """Full rerun: clear → embed → fuse → similarity for all windows.

    Args:
        db_path: Path to the duckdb motions database.
        model: Embedding model name forwarded to the text pipeline
            (None lets the pipeline pick its default).
        retry_missing: When True, retry ids that failed embedding with a
            smaller batch size after the main pass.

    Returns a summary dict (cleared rows, embedding counts, failed ids,
    and per-window fusion/similarity results).
    """
    _logger.info("Starting rerun_embeddings for %s", db_path)

    # 1. Clear stale data
    cleared = _clear_embeddings(db_path)

    # 2. Re-embed all motions
    _logger.info("Running text embeddings ...")
    # Call ensure_text_embeddings which historically returned either a 4-tuple
    # (stored, skipped_existing, skipped_no_text, errors) or a 5-tuple that
    # includes failed_ids as the fifth element. Support both shapes for
    # backward-compatibility.
    result = text_pipeline.ensure_text_embeddings(db_path=db_path, model=model)
    if isinstance(result, tuple) and len(result) == 5:
        stored, skipped_existing, skipped_no_text, emb_errors, failed_ids = result
    elif isinstance(result, tuple) and len(result) == 4:
        stored, skipped_existing, skipped_no_text, emb_errors = result
        failed_ids = []
    else:
        # Fallback: try to unpack defensively
        try:
            stored, skipped_existing, skipped_no_text, emb_errors, failed_ids = result
        except Exception:
            _logger.error(
                "Unexpected return shape from ensure_text_embeddings: %s", result
            )
            # Zero out all counters so the summary below is still well-formed.
            stored = skipped_existing = skipped_no_text = emb_errors = 0
            failed_ids = []
    # Optionally retry missing failed ids with smaller batch sizes
    if retry_missing and failed_ids:
        try:
            _logger.info(
                "Retrying %d failed embeddings with smaller batches", len(failed_ids)
            )
            # prefer a helper that can process only specific ids if available
            if hasattr(text_pipeline, "ensure_text_embeddings_for_ids"):
                text_pipeline.ensure_text_embeddings_for_ids(
                    db_path=db_path, ids=failed_ids, model=model, batch_size=max(1, 20)
                )
            else:
                # best-effort: call ensure_text_embeddings and let implementation handle limiting
                text_pipeline.ensure_text_embeddings(
                    db_path=db_path, model=model, batch_size=max(1, 20)
                )
        except Exception:
            # Retry failures are non-fatal: the first-pass counters still stand.
            _logger.exception("Retrying missing embeddings failed")
    _logger.info(
        "Text embeddings: stored=%d, skipped_existing=%d, skipped_no_text=%d, errors=%d",
        stored,
        skipped_existing,
        skipped_no_text,
        emb_errors,
    )

    # 3. Get all windows with SVD vectors
    windows = _get_all_windows(db_path)
    _logger.info("Found %d windows with SVD vectors: %s", len(windows), windows)

    fusion_summary = {}
    similarity_summary = {}

    for window_id in windows:
        _logger.info("Processing window %s ...", window_id)

        # 3a. Fuse — a failed fuse is recorded but does not stop the loop.
        try:
            result = fusion_pipeline.fuse_for_window(window_id, db_path=db_path)
            fusion_summary[window_id] = result
            _logger.info(" fuse_for_window(%s) -> %s", window_id, result)
        except Exception:
            _logger.exception(" fuse_for_window failed for %s", window_id)
            fusion_summary[window_id] = {"error": True}

        # 3b. Compute similarities — -1 marks a failed window in the summary.
        try:
            inserted = similarity_compute.compute_similarities(
                vector_type="fused",
                window_id=window_id,
                db_path=db_path,
            )
            similarity_summary[window_id] = inserted
            _logger.info(" compute_similarities(%s) -> %d rows", window_id, inserted)
        except Exception:
            _logger.exception(" compute_similarities failed for %s", window_id)
            similarity_summary[window_id] = -1

    _logger.info("Finished rerun_embeddings for %s", db_path)

    return {
        "cleared_rows": cleared,
        "embeddings_stored": stored,
        "embeddings_skipped_no_text": skipped_no_text,
        "embeddings_errors": emb_errors,
        "embeddings_failed_ids": failed_ids,
        "windows_processed": len(windows),
        "fusion_summary": fusion_summary,
        "similarity_summary": similarity_summary,
    }
||||||
|
|
||||||
|
|
||||||
|
def _main():
    """CLI entry point: parse args and run the full rerun pipeline."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    parser = argparse.ArgumentParser(
        description="Re-run embeddings, fusion, similarity"
    )
    parser.add_argument("--db-path", required=True, help="Path to motions.db")
    parser.add_argument(
        "--model",
        default=None,
        help="Embedding model name (default: text_pipeline default)",
    )
    # Expose rerun_embeddings()' retry_missing mode on the CLI; the flag was
    # previously missing, making the retry mode unreachable from here.
    parser.add_argument(
        "--retry-missing",
        action="store_true",
        help="Retry failed embeddings with smaller batches after the main pass",
    )
    args = parser.parse_args()
    summary = rerun_embeddings(
        args.db_path, model=args.model, retry_missing=args.retry_missing
    )
    print(f"cleared_rows: {summary['cleared_rows']}")
    print(f"embeddings_stored: {summary['embeddings_stored']}")
    print(f"embeddings_skipped_no_text: {summary['embeddings_skipped_no_text']}")
    print(f"embeddings_errors: {summary['embeddings_errors']}")
    print(f"windows_processed: {summary['windows_processed']}")
||||||
|
|
||||||
|
|
||||||
|
# CLI entry point.
if __name__ == "__main__":
    _main()
||||||
@ -0,0 +1,614 @@ |
|||||||
|
"""SyncFeed-based motion content enrichment. |
||||||
|
|
||||||
|
Walks four SyncFeed entity types (Besluit, Zaak, Document, DocumentVersie), |
||||||
|
joins them in memory to map each motion's besluit_id to a Zaak.Onderwerp title |
||||||
|
and an ExterneIdentifier, then fetches body text from officielebekendmakingen.nl |
||||||
|
and updates the motions table. |
||||||
|
|
||||||
|
Usage: |
||||||
|
.venv/bin/python scripts/sync_motion_content.py --db-path data/motions.db |
||||||
|
""" |
||||||
|
|
||||||
|
import argparse |
||||||
|
import logging |
||||||
|
import xml.etree.ElementTree as ET |
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed |
||||||
|
from typing import Dict, Iterator, List, Optional, Tuple |
||||||
|
|
||||||
|
try: |
||||||
|
import duckdb |
||||||
|
except Exception: # pragma: no cover - environment may not have duckdb installed |
||||||
|
duckdb = None |
||||||
|
import requests |
||||||
|
import time |
||||||
|
import re |
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__) |
||||||
|
|
||||||
|
# Namespaces |
||||||
|
ATOM_NS = "http://www.w3.org/2005/Atom" |
||||||
|
NS_TK = "http://www.tweedekamer.nl/xsd/tkData/v1-0" |
||||||
|
|
||||||
|
SYNCFEED_BASE = "https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Feed" |
||||||
|
BODY_TEXT_BASE = "https://zoek.officielebekendmakingen.nl/{ext_id}.html" |
||||||
|
|
||||||
|
# Default number of concurrent body fetch workers |
||||||
|
MAX_BODY_WORKERS = 10 |
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# Helpers |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def _local(tag: str) -> str: |
||||||
|
"""Strip XML namespace from a tag name.""" |
||||||
|
return tag.split("}", 1)[1] if tag.startswith("{") else tag |
||||||
|
|
||||||
|
|
||||||
|
def _is_deleted(element: ET.Element) -> bool:
    # SyncFeed marks tombstoned entities with a tkData-namespaced
    # ``verwijderd`` ("deleted") attribute; absence means not deleted.
    return element.attrib.get(f"{{{NS_TK}}}verwijderd", "false").lower() == "true"
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# Parsers (accept ET.Element; public API also accepts XML string for tests) |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def parse_besluit(element) -> Dict:
    """Parse a Besluit element (ET.Element or XML string).

    Returns dict with: id, verwijderd, zaak_refs (list of uuid strings).
    """
    if isinstance(element, str):
        element = ET.fromstring(element)
    zaak_refs = []
    for child in element:
        if _local(child.tag).lower() == "zaak" and "ref" in child.attrib:
            zaak_refs.append(child.attrib["ref"])
    return {
        "id": element.attrib.get("id"),
        "verwijderd": _is_deleted(element),
        "zaak_refs": zaak_refs,
    }
||||||
|
|
||||||
|
|
||||||
|
def parse_zaak(element) -> Dict:
    """Parse a Zaak element (ET.Element or XML string).

    Returns dict with: id, verwijderd, onderwerp, soort.
    """
    if isinstance(element, str):
        element = ET.fromstring(element)
    fields = {}
    for child in element:
        fields[_local(child.tag).lower()] = (child.text or "").strip()
    return {
        "id": element.attrib.get("id"),
        "verwijderd": _is_deleted(element),
        "onderwerp": fields.get("onderwerp"),
        "soort": fields.get("soort"),
    }
||||||
|
|
||||||
|
|
||||||
|
def parse_document(element) -> Dict:
    """Parse a Document element (ET.Element or XML string).

    Returns dict with: id, verwijderd, zaak_refs.
    """
    if isinstance(element, str):
        element = ET.fromstring(element)
    refs = [
        node.attrib["ref"]
        for node in element
        if "ref" in node.attrib and _local(node.tag).lower() == "zaak"
    ]
    return {
        "id": element.attrib.get("id"),
        "verwijderd": _is_deleted(element),
        "zaak_refs": refs,
    }
||||||
|
|
||||||
|
|
||||||
|
def parse_documentversie(element) -> Dict:
    """Parse a DocumentVersie element (ET.Element or XML string).

    Returns dict with: id, verwijderd, document_id, externe_identifier, extensie.
    """
    if isinstance(element, str):
        element = ET.fromstring(element)
    nodes = {_local(child.tag).lower(): child for child in element}

    def _stripped_text(key):
        # None when the child element is absent; stripped text otherwise.
        node = nodes.get(key)
        if node is None:
            return None
        return (node.text or "").strip()

    doc_node = nodes.get("document")
    return {
        "id": element.attrib.get("id"),
        "verwijderd": _is_deleted(element),
        "document_id": doc_node.attrib.get("ref") if doc_node is not None else None,
        "externe_identifier": _stripped_text("externeidentifier"),
        "extensie": _stripped_text("extensie"),
    }
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# Join builders (pure in-memory; tested without HTTP) |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def build_title_map(
    besluit_index: Dict[str, Dict],
    zaak_index: Dict[str, Dict],
) -> Dict[str, str]:
    """Map besluit_id -> Zaak.onderwerp, preferring soort == 'Motie'.

    For each besluit its linked zaken are scanned: the first zaak whose
    ``soort`` is 'Motie' wins; otherwise the last linked zaak found in
    *zaak_index* is used as a fallback.  Besluiten with no usable zaak
    (or whose zaak has no onderwerp) are omitted from the result.
    """
    out: Dict[str, str] = {}
    for besluit_id, besluit in besluit_index.items():
        chosen = None
        fallback = None  # last existing non-Motie zaak for THIS besluit
        for zaak_id in besluit.get("zaak_refs", []):
            zaak = zaak_index.get(zaak_id)
            if not zaak:
                continue
            # ``soort`` may be present but None (missing XML element) —
            # guard before lower() to avoid AttributeError.
            if (zaak.get("soort") or "").lower() == "motie":
                chosen = zaak
                break
            fallback = zaak
        if chosen is None:
            # BUGFIX: the previous code reused the inner loop variable here,
            # leaking the zaak examined for the PREVIOUS besluit whenever
            # this besluit had no zaak_refs (or raising NameError on the
            # first one).
            chosen = fallback
        if chosen and chosen.get("onderwerp"):
            out[besluit_id] = chosen["onderwerp"]
    return out
||||||
|
|
||||||
|
|
||||||
|
def build_ext_id_map(
    besluit_index: Dict[str, Dict],
    zaak_index: Dict[str, Dict],
    doc_index: Dict[str, Dict],
    docversie_index: Dict[str, Dict],
) -> Dict[str, str]:
    """Map besluit_id -> externe_identifier by following document → zaak links.

    ``zaak_index`` is accepted for signature compatibility; the join only
    needs the zaak ids already referenced by besluiten and documenten.
    """
    # document_id -> externe_identifier (prefer html extension)
    doc_to_ext: Dict[str, str] = {}
    for versie in docversie_index.values():
        ext = versie.get("externe_identifier")
        doc_id = versie.get("document_id")
        if not (ext and doc_id):
            continue
        # prefer html over pdf when both exist.  BUGFIX: ``extensie`` may be
        # None (missing XML element); guard before lower() to avoid
        # AttributeError when an entry already exists for this document.
        is_html = (versie.get("extensie") or "").lower() == "html"
        if doc_id not in doc_to_ext or is_html:
            doc_to_ext[doc_id] = ext

    # Build zaak_id -> list of doc_ids
    zaak_to_docs: Dict[str, List[str]] = {}
    for doc in doc_index.values():
        for zaak_id in doc.get("zaak_refs", []):
            zaak_to_docs.setdefault(zaak_id, []).append(doc["id"])

    out: Dict[str, str] = {}
    for besluit_id, besluit in besluit_index.items():
        found: Optional[str] = None
        for zaak_id in besluit.get("zaak_refs", []):
            for doc_id in zaak_to_docs.get(zaak_id, []):
                ext = doc_to_ext.get(doc_id)
                if ext:
                    found = ext
                    break
            if found:
                break
        if found:
            out[besluit_id] = found
    return out
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# HTTP walker |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def walk_syncfeed(
    category: str,
    session: requests.Session,
    start_skip_token: Optional[int] = None,
) -> Iterator[ET.Element]:
    """Yield entity ET.Element objects by walking a SyncFeed category.

    Follows Atom ``rel="next"`` links until exhausted; stops early (with a
    logged error) on request failures or unparseable XML.
    """
    next_url: Optional[str] = SYNCFEED_BASE + f"?category={category}"
    if start_skip_token:
        next_url += f"&skiptoken={start_skip_token}"

    page_count = 0
    while next_url:
        try:
            response = session.get(next_url, timeout=30)
            response.raise_for_status()
        except Exception as exc:
            _logger.error("SyncFeed request failed (%s): %s", next_url, exc)
            break

        try:
            root = ET.fromstring(response.text)
        except ET.ParseError as exc:
            _logger.error("XML parse error for %s: %s", next_url, exc)
            break

        for entry in root.findall(f"{{{ATOM_NS}}}entry"):
            content = entry.find(f"{{{ATOM_NS}}}content")
            if content is None:
                continue
            yield from content

        link = root.find(f".//{{{ATOM_NS}}}link[@rel='next']")
        next_url = None if link is None else link.attrib.get("href")
        page_count += 1
        if page_count % 50 == 0:
            _logger.info(" walked %d pages for category=%s", page_count, category)

    _logger.info("Done walking category=%s (%d pages)", category, page_count)
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# Body text fetcher |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def _fetch_body_text(
    ext_id: str, session: requests.Session, retries: int = 3
) -> Optional[str]:
    """Fetch plain text body from officielebekendmakingen.nl for ext_id.

    Retries on network errors and on HTTP 5xx or 429 responses using
    exponential backoff starting at 0.5s. On permanent failure returns None
    and records an audit event via database.db.append_audit_event(...).
    """
    # Function-local imports: presumably kept here so tests can substitute
    # the 'database' module before this runs — TODO confirm.
    import time
    import re
    from requests import exceptions as req_exceptions
    import database

    url = BODY_TEXT_BASE.format(ext_id=ext_id)
    attempt = 0
    backoff = 0.5  # seconds; doubled after each transient failure
    last_exc = None
    while attempt < retries:
        attempt += 1
        try:
            resp = session.get(url, timeout=30)
            # treat 5xx and 429 as transient
            status = getattr(resp, "status_code", None)
            if status == 429 or (status is not None and 500 <= status < 600):
                last_exc = Exception(f"HTTP {status}")
                # Raise so the retry branch below handles this like any
                # other transient request failure.
                raise req_exceptions.RequestException(f"HTTP {status}")

            resp.raise_for_status()
            # Very simple text extraction: strip tags
            text = re.sub(r"<[^>]+>", " ", resp.text)
            text = re.sub(r"\s+", " ", text).strip()
            # Cap at 32k characters; empty bodies become None.
            return text[:32_000] if text else None

        except req_exceptions.RequestException as exc:
            last_exc = exc
            # retry for transient errors unless we've exhausted attempts
            if attempt < retries:
                _logger.info(
                    "Transient body fetch error for %s (attempt %d/%d): %s; retrying in %.1fs",
                    ext_id,
                    attempt,
                    retries,
                    exc,
                    backoff,
                )
                try:
                    time.sleep(backoff)
                except Exception:
                    pass
                backoff *= 2
                continue

            # exhausted retries => permanent failure: audit + return None
            _logger.warning(
                "Body text fetch permanently failed for %s: %s", ext_id, exc
            )
            metadata = {"attempts": attempt, "error": str(exc)}
            try:
                # MotionDatabase.append_audit_event signature: (actor_id, action, ...)
                database.db.append_audit_event(
                    None,
                    "body_fetch_failed",
                    target_type="document",
                    target_id=ext_id,
                    metadata=metadata,
                )
            except Exception:
                _logger.exception(
                    "Failed to write audit event for body fetch failure %s", ext_id
                )
            return None
        except Exception as exc:  # pragma: no cover - unexpected errors
            # Non-request errors are not retried; break to the fall-through
            # audit path below.
            _logger.exception(
                "Unexpected error fetching body text for %s: %s", ext_id, exc
            )
            last_exc = exc
            break
    # If we fall through here (unexpected error above, or retries <= 0),
    # ensure an audit event is still recorded before giving up.
    try:
        database.db.append_audit_event(
            None,
            "body_fetch_failed",
            target_type="document",
            target_id=ext_id,
            metadata={"attempts": retries, "error": str(last_exc)},
        )
    except Exception:
        _logger.exception(
            "Failed to write audit event for body fetch failure %s", ext_id
        )
    return None
||||||
|
|
||||||
|
|
||||||
|
def fetch_body_texts(
    ext_ids: List[str],
    session: requests.Session,
    max_workers: int = MAX_BODY_WORKERS,
) -> Dict[str, Optional[str]]:
    """Parallel-fetch body texts for a list of externe_identifiers.

    Returns a mapping ext_id -> body text (None when the fetch failed).
    """
    fetched: Dict[str, Optional[str]] = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = {
            executor.submit(_fetch_body_text, ext_id, session): ext_id
            for ext_id in ext_ids
        }
        total = len(pending)
        completed = 0
        for fut in as_completed(pending):
            ext_id = pending[fut]
            try:
                fetched[ext_id] = fut.result()
            except Exception as exc:
                _logger.warning("Body text future failed for %s: %s", ext_id, exc)
                fetched[ext_id] = None
            completed += 1
            if completed % 500 == 0:
                _logger.info(" body text: %d/%d fetched", completed, total)
    return fetched
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# DB helpers |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def _load_besluit_ids(db_path: str) -> Dict[str, int]:
    """Return {besluit_id: motion_id} for all motions with a besluit_id."""
    conn = duckdb.connect(db_path, read_only=True)
    try:
        # Check whether the motions table actually has a besluit_id column
        columns = [row[1] for row in conn.execute("PRAGMA table_info('motions')").fetchall()]
        if "besluit_id" in columns:
            pairs = conn.execute(
                "SELECT besluit_id, id FROM motions WHERE besluit_id IS NOT NULL"
            ).fetchall()
            return {besluit_id: motion_id for besluit_id, motion_id in pairs}

        # Fallback: many databases store the besluit id in the URL (last path segment).
        # Try to extract it from the motions.url column.
        mapping: Dict[str, int] = {}
        for motion_id, url in conn.execute(
            "SELECT id, url FROM motions WHERE url IS NOT NULL"
        ).fetchall():
            if not url:
                continue
            # naive extraction: last path segment
            try:
                segment = url.rstrip("/").split("/")[-1]
            except Exception:
                continue
            if not segment:
                continue
            # accept UUID-like segments (contain a dash) or reasonably long ids
            if ("-" in segment and len(segment) >= 8) or re.match(
                r"^[0-9a-fA-F]{8,}$", segment
            ):
                mapping[segment] = int(motion_id)
        return mapping
    finally:
        conn.close()
||||||
|
|
||||||
|
|
||||||
|
def _update_motions(
    db_path: str,
    updates: List[Tuple[int, Optional[str], Optional[str], Optional[str]]],
) -> int:
    """Batch-update motions with (motion_id, title, body_text, externe_identifier).

    Only non-None fields are written; tuples with nothing to set are skipped.
    Returns number of rows updated.
    """
    if not updates:
        return 0
    conn = duckdb.connect(db_path)
    try:
        count = 0
        for motion_id, title, body_text, ext_id in updates:
            # Assemble SET clauses only for the fields that were provided.
            assignments: List[str] = []
            values: List = []
            for column, value in (
                ("title", title),
                ("body_text", body_text),
                ("externe_identifier", ext_id),
            ):
                if value is not None:
                    assignments.append(f"{column} = ?")
                    values.append(value)
            if not assignments:
                continue
            values.append(motion_id)
            conn.execute(
                f"UPDATE motions SET {', '.join(assignments)} WHERE id = ?", values
            )
            count += 1
        conn.commit()
        return count
    finally:
        conn.close()
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# Main sync routine |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def sync_motion_content(db_path: str, skip_body_text: bool = False) -> Dict:
    """Full sync: walk feeds, join, fetch body texts, update DB.

    Walks the Besluit, Zaak, Document and DocumentVersie SyncFeeds, joins
    them into title/ext-id maps, optionally fetches body texts, and applies
    the resulting updates to the motions table.

    Args:
        db_path: Path to the DuckDB motions database.
        skip_body_text: When True, skip the (slow) body-text HTTP fetches.

    Returns summary dict with counts.
    """
    _logger.info("Loading motion besluit_ids from %s ...", db_path)
    besluit_to_motion = _load_besluit_ids(db_path)
    target_besluit_ids = set(besluit_to_motion.keys())
    _logger.info("Found %d motions with besluit_id", len(target_besluit_ids))

    session = requests.Session()
    session.headers["Accept"] = "application/xml"
    # Configure HTTPAdapter with a pool sized to MAX_BODY_WORKERS. Allows
    # controlling concurrency for body text fetches via --max-body-workers.
    try:
        from requests.adapters import HTTPAdapter

        adapter = HTTPAdapter(
            pool_connections=MAX_BODY_WORKERS, pool_maxsize=MAX_BODY_WORKERS
        )
        session.mount("https://", adapter)
        session.mount("http://", adapter)
    except Exception:
        # Best-effort: pooling is an optimization, not a requirement.
        _logger.debug("Could not mount HTTPAdapter for connection pooling")

    # -- Walk Besluit feed (only keep those we care about) --
    _logger.info("Walking Besluit feed ...")
    besluit_index: Dict[str, Dict] = {}
    for elem in walk_syncfeed("Besluit", session):
        b = parse_besluit(elem)
        # Keep only non-deleted records that match one of our motions.
        if b["id"] and b["id"] in target_besluit_ids and not b["verwijderd"]:
            besluit_index[b["id"]] = b
    _logger.info("Collected %d relevant Besluit records", len(besluit_index))

    # Collect all zaak_ids we need
    needed_zaak_ids: set = set()
    for b in besluit_index.values():
        needed_zaak_ids.update(b["zaak_refs"])

    # -- Walk Zaak feed --
    _logger.info("Walking Zaak feed ...")
    zaak_index: Dict[str, Dict] = {}
    for elem in walk_syncfeed("Zaak", session):
        z = parse_zaak(elem)
        if z["id"] and z["id"] in needed_zaak_ids and not z["verwijderd"]:
            zaak_index[z["id"]] = z
    _logger.info("Collected %d Zaak records", len(zaak_index))

    # -- Walk Document feed --
    _logger.info("Walking Document feed ...")
    doc_index: Dict[str, Dict] = {}
    for elem in walk_syncfeed("Document", session):
        d = parse_document(elem)
        if d["id"] and not d["verwijderd"]:
            # A document is relevant if it references any needed zaak.
            if any(zid in needed_zaak_ids for zid in d["zaak_refs"]):
                doc_index[d["id"]] = d
    needed_doc_ids = set(doc_index.keys())
    _logger.info("Collected %d Document records", len(doc_index))

    # -- Walk DocumentVersie feed --
    _logger.info("Walking DocumentVersie feed ...")
    docversie_index: Dict[str, Dict] = {}
    for elem in walk_syncfeed("DocumentVersie", session):
        dv = parse_documentversie(elem)
        if (
            dv["id"]
            and not dv["verwijderd"]
            and dv.get("document_id") in needed_doc_ids
        ):
            docversie_index[dv["id"]] = dv
    _logger.info("Collected %d DocumentVersie records", len(docversie_index))

    # -- Build maps --
    title_map = build_title_map(besluit_index, zaak_index)
    ext_id_map = build_ext_id_map(besluit_index, zaak_index, doc_index, docversie_index)
    _logger.info(
        "title_map: %d entries, ext_id_map: %d entries", len(title_map), len(ext_id_map)
    )

    # -- Fetch body texts --
    body_text_map: Dict[str, Optional[str]] = {}
    if not skip_body_text:
        # Deduplicate ext_ids so each body is fetched at most once.
        ext_ids_to_fetch = list(set(ext_id_map.values()))
        _logger.info(
            "Fetching body texts for %d unique ext_ids ...", len(ext_ids_to_fetch)
        )
        body_text_map = fetch_body_texts(ext_ids_to_fetch, session)
        _logger.info("Body text fetch complete")

    # -- Assemble updates --
    updates: List[Tuple[int, Optional[str], Optional[str], Optional[str]]] = []
    for besluit_id, motion_id in besluit_to_motion.items():
        title = title_map.get(besluit_id)
        ext_id = ext_id_map.get(besluit_id)
        body_text = body_text_map.get(ext_id) if ext_id else None
        # Only emit an update when there is at least one field to write.
        if title or ext_id or body_text:
            updates.append((motion_id, title, body_text, ext_id))

    _logger.info("Applying %d motion updates to DB ...", len(updates))
    updated = _update_motions(db_path, updates)
    _logger.info("Done. Updated %d motions.", updated)

    return {
        "motions_with_besluit_id": len(target_besluit_ids),
        "besluit_records": len(besluit_index),
        "zaak_records": len(zaak_index),
        "document_records": len(doc_index),
        "docversie_records": len(docversie_index),
        "title_map_entries": len(title_map),
        "ext_id_map_entries": len(ext_id_map),
        "body_texts_fetched": sum(1 for v in body_text_map.values() if v),
        "motions_updated": updated,
    }
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
# CLI |
||||||
|
# --------------------------------------------------------------------------- |
||||||
|
|
||||||
|
|
||||||
|
def _main():
    """CLI entry point: parse args, run the full sync, print the summary.

    Fix: the original assigned ``MAX_BODY_WORKERS`` inside this function,
    which made the name local for the *entire* function body, so reading it
    earlier (``default=MAX_BODY_WORKERS``) raised UnboundLocalError before
    the try/except could catch anything. Declaring it ``global`` restores
    the intended "CLI flag overrides module default" behavior.
    """
    global MAX_BODY_WORKERS

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    parser = argparse.ArgumentParser(description="Sync motion content from SyncFeed")
    parser.add_argument("--db-path", required=True, help="Path to motions.db")
    parser.add_argument(
        "--skip-body-text",
        action="store_true",
        help="Skip fetching body text from officielebekendmakingen.nl",
    )
    parser.add_argument(
        "--max-body-workers",
        type=int,
        default=MAX_BODY_WORKERS,
        help=f"Maximum concurrent workers for fetching body text (default: {MAX_BODY_WORKERS})",
    )
    args = parser.parse_args()
    # Override the module-level worker count when the flag was given a
    # non-zero value; argparse's type=int already guarantees an int.
    if args.max_body_workers:
        MAX_BODY_WORKERS = int(args.max_body_workers)
    summary = sync_motion_content(args.db_path, skip_body_text=args.skip_body_text)
    for k, v in summary.items():
        print(f"  {k}: {v}")


if __name__ == "__main__":
    _main()
||||||
@ -0,0 +1,51 @@ |
|||||||
|
import json |
||||||
|
from pathlib import Path |
||||||
|
|
||||||
|
|
||||||
|
def test_qa_similarity_creates_ledger(tmp_path, monkeypatch):
    """qa_similarity.main writes a JSON ledger and returns a summary dict."""

    class StubDB:
        def sample_motions(self, sample_size):
            assert sample_size == 2
            return [1, 2]

        def get_cached_similarities(self, motion_id, top_k):
            # Deterministic neighbors so the assertions below are stable.
            neighbors = []
            for i in range(top_k):
                neighbors.append({"id": motion_id * 10 + i, "score": 1.0 - i * 0.1})
            return neighbors

    import sys
    import types

    # Inject a fake 'database' module exposing .db — monkeypatch.setitem
    # keeps the override scoped to this test and auto-reverts afterwards.
    fake_db_module = types.SimpleNamespace(db=StubDB())
    monkeypatch.setitem(sys.modules, "database", fake_db_module)

    # Run from tmp_path so the ledger lands under tmp_path/thoughts/ledgers.
    (tmp_path / "thoughts" / "ledgers").mkdir(parents=True)
    monkeypatch.chdir(tmp_path)

    from scripts.qa_similarity import main

    summary = main(db_path=":memory:", sample_size=2, top_k=3)

    assert summary["sample_size"] == 2
    assert summary["top_k"] == 3
    assert 1 in summary["motions"]
    assert 2 in summary["motions"]

    ledger_path = Path(summary["ledger_path"])
    assert ledger_path.exists()

    data = json.loads(ledger_path.read_text(encoding="utf-8"))
    assert "motions" in data
    assert len(data["motions"]) == 2
||||||
@ -0,0 +1,84 @@ |
|||||||
|
"""Tests for scripts/rerun_embeddings.py. |
||||||
|
|
||||||
|
Monkeypatches pipeline functions directly on their bound module references |
||||||
|
inside rerun_embeddings. Import at module level so the real 'database' module |
||||||
|
is in sys.modules before any test-local sys.modules.setdefault calls run. |
||||||
|
""" |
||||||
|
|
||||||
|
from unittest.mock import MagicMock |
||||||
|
|
||||||
|
import scripts.rerun_embeddings as rer |
||||||
|
|
||||||
|
|
||||||
|
def test_rerun_embeddings_calls_pipeline_steps(monkeypatch, tmp_path):
    """rerun_embeddings runs ensure -> fuse -> similarity for every window."""
    db_file = str(tmp_path / "motions.db")
    fake_windows = ["2022-Q3", "2023-Q1", "2024-Q2"]

    calls = {"ensure": False, "fuse_windows": [], "sim_windows": []}

    # Stub duckdb so _clear_embeddings / _get_all_windows touch no real DB.
    fake_conn = MagicMock()
    fake_conn.execute.return_value.rowcount = 0
    fake_conn.execute.return_value.fetchall.return_value = [(w,) for w in fake_windows]
    fake_duckdb = MagicMock()
    fake_duckdb.connect.return_value = fake_conn
    monkeypatch.setattr(rer, "duckdb", fake_duckdb)

    # ensure_text_embeddings now returns a 5-tuple:
    # (stored, skipped_existing, skipped_no_text, errors, failed_ids)
    def stub_ensure(db_path=None, model=None, batch_size=50, **kwargs):
        calls["ensure"] = True
        return (5, 0, 2, 0, [])

    def stub_fuse(window_id, db_path=None):
        calls["fuse_windows"].append(window_id)
        return {
            "inserted": 1,
            "skipped_missing_text": 0,
            "skipped_missing_svd": 0,
            "errors": 0,
        }

    def stub_sim(vector_type="fused", window_id=None, db_path=None, top_k=10, **kwargs):
        calls["sim_windows"].append(window_id)
        return 10

    monkeypatch.setattr(rer.text_pipeline, "ensure_text_embeddings", stub_ensure)
    monkeypatch.setattr(rer.fusion_pipeline, "fuse_for_window", stub_fuse)
    monkeypatch.setattr(rer.similarity_compute, "compute_similarities", stub_sim)

    summary = rer.rerun_embeddings(db_file)

    assert calls["ensure"] is True
    assert calls["fuse_windows"] == fake_windows
    assert calls["sim_windows"] == fake_windows
    assert summary["windows_processed"] == len(fake_windows)
    assert summary["embeddings_stored"] == 5
    assert summary["embeddings_skipped_no_text"] == 2
    assert summary["embeddings_failed_ids"] == []
||||||
|
|
||||||
|
|
||||||
|
def test_rerun_retries_when_retry_missing_and_failed_ids(monkeypatch, tmp_path):
    """When retry_missing=True and first pass returns failed_ids, retry is triggered."""
    db_file = str(tmp_path / "motions.db")

    monkeypatch.setattr(rer, "_clear_embeddings", lambda db_path: 0)
    monkeypatch.setattr(rer, "_get_all_windows", lambda db_path: [])

    seen = {"ids": None}

    # First pass reports two permanently failed motion ids.
    def first_pass(db_path=None, model=None, batch_size=50, **kwargs):
        return (3, 0, 0, 2, [201, 202])

    # Retry pass records which ids it was asked to re-embed.
    def retry_pass(db_path=None, ids=None, model=None, batch_size=10, **kwargs):
        seen["ids"] = ids
        return (2, 0, 0, 0, [])

    monkeypatch.setattr(rer.text_pipeline, "ensure_text_embeddings", first_pass)
    monkeypatch.setattr(rer.text_pipeline, "ensure_text_embeddings_for_ids", retry_pass)

    summary = rer.rerun_embeddings(db_file, retry_missing=True)

    assert seen["ids"] is not None, "retry was not called"
    assert set(seen["ids"]) == {201, 202}
    assert summary["embeddings_failed_ids"] == [201, 202]
||||||
@ -0,0 +1,97 @@ |
|||||||
|
"""Tests for scripts/sync_motion_content.py. |
||||||
|
|
||||||
|
Tests retry logic in _fetch_body_text and permanent-failure audit recording. |
||||||
|
""" |
||||||
|
|
||||||
|
import requests |
||||||
|
from unittest.mock import MagicMock, patch |
||||||
|
|
||||||
|
import scripts.sync_motion_content as s |
||||||
|
|
||||||
|
|
||||||
|
def test_parse_besluit_simple():
    """parse_besluit extracts the id attribute from a minimal Besluit element."""
    xml = '<Besluit id="b1">\n<Zaak ref="z1"/>\n</Besluit>'
    parsed = s.parse_besluit(xml)
    assert parsed["id"] == "b1"
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_body_text_retries_on_transient_error(monkeypatch):
    """_fetch_body_text retries after a ConnectionError and returns text on success."""
    session = MagicMock()
    attempts = {"n": 0}

    def flaky_get(url, timeout=30):
        attempts["n"] += 1
        # First call fails transiently; every later call succeeds.
        if attempts["n"] == 1:
            raise requests.exceptions.ConnectionError("timeout")
        ok = MagicMock()
        ok.status_code = 200
        ok.text = "<p>body text here</p>"
        ok.raise_for_status.return_value = None
        return ok

    session.get.side_effect = flaky_get

    # Patch time.sleep so backoff adds no real delay to the test.
    monkeypatch.setattr("time.sleep", lambda s: None)

    result = s._fetch_body_text("ext123", session, retries=3)

    assert result is not None
    assert "body text here" in result
    assert attempts["n"] == 2  # failed once, succeeded on second
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_body_text_permanent_failure_records_audit(monkeypatch):
    """When all retries are exhausted, audit_event is recorded via database.db."""
    session = MagicMock()
    session.get.side_effect = requests.exceptions.ConnectionError("always fails")

    monkeypatch.setattr("time.sleep", lambda s: None)

    recorded = []

    import database

    def capture_audit(actor_id, action, **kwargs):
        recorded.append({"action": action, **kwargs})
        return True

    monkeypatch.setattr(database.db, "append_audit_event", capture_audit)

    result = s._fetch_body_text("ext_fail", session, retries=3)

    assert result is None
    assert len(recorded) >= 1
    assert recorded[0]["action"] == "body_fetch_failed"
    assert recorded[0]["target_id"] == "ext_fail"
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_body_text_retries_on_5xx(monkeypatch):
    """5xx responses are treated as transient; retried before giving up."""
    session = MagicMock()
    attempts = {"n": 0}

    def get_with_503s(url, timeout=30):
        attempts["n"] += 1
        resp = MagicMock()
        resp.raise_for_status.return_value = None
        # Two 503s, then a clean 200 response.
        if attempts["n"] < 3:
            resp.status_code = 503
        else:
            resp.status_code = 200
            resp.text = "clean text"
        return resp

    session.get.side_effect = get_with_503s
    monkeypatch.setattr("time.sleep", lambda s: None)

    result = s._fetch_body_text("ext_5xx", session, retries=3)

    assert result is not None
    assert attempts["n"] == 3
||||||
@ -0,0 +1,122 @@ |
|||||||
|
"""Tests for pipeline/text_pipeline.py retry behaviour. |
||||||
|
|
||||||
|
Uses monkeypatching to stub get_embeddings_with_retry and store_embedding |
||||||
|
so no real DB or network is needed. |
||||||
|
""" |
||||||
|
|
||||||
|
import pipeline.text_pipeline as tp |
||||||
|
import pipeline.ai_provider_wrapper as ai_wrapper |
||||||
|
|
||||||
|
|
||||||
|
def _make_fake_db(store_results=None): |
||||||
|
"""Return a minimal fake db object for text_pipeline tests.""" |
||||||
|
store_results = store_results or {} |
||||||
|
call_log = {"stored": []} |
||||||
|
|
||||||
|
class FakeDB: |
||||||
|
db_path = ":memory:" |
||||||
|
|
||||||
|
def store_embedding(self, motion_id, model, vec): |
||||||
|
call_log["stored"].append(motion_id) |
||||||
|
return store_results.get(motion_id, 1) |
||||||
|
|
||||||
|
return FakeDB(), call_log |
||||||
|
|
||||||
|
|
||||||
|
def _stub_select_text(monkeypatch, rows):
    """Patch _select_text so it yields predetermined (motion_id, text) rows."""
    fake_select = lambda db, model: rows
    monkeypatch.setattr(tp, "_select_text", fake_select)
||||||
|
|
||||||
|
|
||||||
|
def _stub_counts(monkeypatch, total=10, existing=0):
    """Patch the duckdb connection used for count queries.

    Fix: drop the unused local ``import types`` the original carried.
    The stubbed connection feeds fetchone()[0] twice: first the
    total_motions count, then the existing-embeddings count.
    """
    from unittest.mock import MagicMock

    fake_conn = MagicMock()
    fake_conn.execute.return_value.fetchone.side_effect = [(total,), (existing,)]
    fake_duckdb = MagicMock()
    fake_duckdb.connect.return_value = fake_conn
    monkeypatch.setattr(tp, "duckdb", fake_duckdb)
||||||
|
|
||||||
|
|
||||||
|
def test_all_embeddings_stored(monkeypatch):
    """When wrapper returns an embedding for every text, stored count matches."""
    rows = [(1, "tekst een"), (2, "tekst twee"), (3, "tekst drie")]
    _stub_select_text(monkeypatch, rows)
    _stub_counts(monkeypatch, total=3, existing=0)

    fake_db, call_log = _make_fake_db()

    def embed_all(texts, motion_ids=None, model=None, batch_size=50, **kwargs):
        return [[0.1, 0.2, 0.3] for _ in texts]

    monkeypatch.setattr(ai_wrapper, "get_embeddings_with_retry", embed_all)

    result = tp.ensure_text_embeddings(db=fake_db, model="test-model")
    stored, skipped_existing, skipped_no_text, errors, failed_ids = result

    assert stored == 3
    assert errors == 0
    assert failed_ids == []
    assert skipped_no_text == 0
    assert set(call_log["stored"]) == {1, 2, 3}
||||||
|
|
||||||
|
|
||||||
|
def test_partial_failure_populates_failed_ids(monkeypatch):
    """When wrapper returns None for some items, those ids appear in failed_ids."""
    rows = [(10, "text a"), (11, "text b"), (12, "text c")]
    _stub_select_text(monkeypatch, rows)
    _stub_counts(monkeypatch, total=3, existing=0)

    fake_db, call_log = _make_fake_db()

    def embed_with_one_failure(texts, motion_ids=None, model=None, batch_size=50, **kwargs):
        if len(texts) == 3:
            # Middle item (motion_id=11) fails; the others succeed.
            return [[0.1, 0.2], None, [0.3, 0.4]]
        return [[0.1] for _ in range(len(texts))]

    monkeypatch.setattr(ai_wrapper, "get_embeddings_with_retry", embed_with_one_failure)

    stored, skipped_existing, skipped_no_text, errors, failed_ids = (
        tp.ensure_text_embeddings(db=fake_db, model="test-model")
    )

    assert stored == 2
    assert errors == 1
    assert 11 in failed_ids
    assert 10 not in failed_ids
    assert 12 not in failed_ids
||||||
|
|
||||||
|
|
||||||
|
def test_no_text_motions_skipped(monkeypatch):
    """Motions with empty text are counted as skipped_no_text, not sent to wrapper."""
    rows = [(20, "has text"), (21, ""), (22, None)]
    _stub_select_text(monkeypatch, rows)
    _stub_counts(monkeypatch, total=3, existing=0)

    fake_db, call_log = _make_fake_db()
    sent = {"count": 0}

    def counting_wrapper(texts, motion_ids=None, model=None, batch_size=50, **kwargs):
        sent["count"] += len(texts)
        return [[0.1] for _ in texts]

    monkeypatch.setattr(ai_wrapper, "get_embeddings_with_retry", counting_wrapper)

    stored, _, skipped_no_text, errors, failed_ids = tp.ensure_text_embeddings(
        db=fake_db, model="test-model"
    )

    assert skipped_no_text == 2  # motions 21 and 22 have no text
    assert stored == 1  # only motion 20 was stored
    assert sent["count"] == 1  # wrapper only received 1 text
||||||
@ -0,0 +1,116 @@ |
|||||||
|
--- |
||||||
|
date: 2026-03-23 |
||||||
|
topic: "motion content enrichment - next steps" |
||||||
|
status: draft |
||||||
|
--- |
||||||
|
|
||||||
|
## Problem Statement |
||||||
|
|
||||||
|
We successfully ingested SyncFeed motion content, fetched body texts, re-embedded motions, ran fusion (SVD-based) and rebuilt the similarity cache. The pipeline ran end-to-end but showed intermittent failures (embedding provider batch failures, connection-pool warnings) and produced a small number of missing body_texts and potential spurious similarity hits. |
||||||
|
|
||||||
|
**Goal:** Stabilize and harden the motion content enrichment + embedding/fusion/similarity pipeline so it runs reliably, is testable, and produces high-quality similarity results for production use. |
||||||
|
|
||||||
|
## Constraints |
||||||
|
|
||||||
|
- **Do not modify** app.py or scheduler.py. |
||||||
|
- Use **DuckDB only** (data/motions.db) and open/close connections per method; avoid long-lived global connections. |
||||||
|
- No print() calls in library modules — use logging.getLogger(__name__). |
||||||
|
- Tests must continue to run under the existing pytest setup and monkeypatching in CI. |
||||||
|
- Avoid YAGNI features: only add monitoring/metrics that are actionable and low-effort. |
||||||
|
|
||||||
|
## Approach (chosen) |
||||||
|
|
||||||
|
I'm leaning toward an **incremental hardening** approach: small, high-impact fixes and QA steps first (low effort, immediate benefit), then follow with a short set of robustness improvements (retries, backoff, audit events) and targeted tests. This minimizes risk and gives quick confidence that the bulk import can be re-run safely. |
||||||
|
|
||||||
|
Alternatives considered: |
||||||
|
- Full rewrite of SyncFeed walker to a resilient state-machine (higher effort; unnecessary today). |
||||||
|
- Push heavy-duty observability (Prometheus + Grafana) immediately (high overhead; defer to specific metrics and logs first). |
||||||
|
|
||||||
|
I chose incremental hardening because it fixes the concrete failures we saw (provider batch errors, connection pool warnings, one 404 body) quickly and keeps the codebase small and testable. |
||||||
|
|
||||||
|
## Architecture |
||||||
|
|
||||||
|
High-level components: |
||||||
|
- **SyncFeed sync script** (scripts/sync_motion_content.py): walk feeds, build title/ext-id maps, fetch body text, update DB. |
||||||
|
- **Text embedding pipeline** (pipeline/text_pipeline.py, scripts/rerun_embeddings.py): convert selected text into embeddings, with provider retry logic. |
||||||
|
- **Fusion/SVD pipeline** (pipeline/fusion.py, pipeline/svd_pipeline.py): fuse embeddings per-window and produce fused vectors. |
||||||
|
- **Similarity compute & lookup** (similarity/compute.py, similarity/lookup.py): compute pairwise similarities and populate cache. |
||||||
|
- **DB layer** (database.py, migrations): motions table (body_text, externe_identifier), fused_embeddings, svd_vectors, similarity_cache and audit events. |
||||||
|
- **Audit & continuity** (thoughts/ledgers/*, audit_events table): record run summaries and per-window results. |
||||||
|
|
||||||
|
Responsibilities are unchanged; we add a small **ai_provider wrapper** and an **operations script** for QA and rerun orchestration. |
||||||
|
|
||||||
|
## Components & Responsibilities |
||||||
|
|
||||||
|
- **sync_motion_content.py**: keep as-is; add more granular logging and a CLI flag to limit to a subset (for QA). Responsible for idempotent updates. |
||||||
|
- **_fetch_body_text / fetch_body_texts**: reduce max_workers or add retry on transient HTTP errors; wrap requests.Session with adapters to control pool size. |
||||||
|
- **text_pipeline.ai_provider**: add a small retry/backoff wrapper that retries failed batches with exponential backoff and a fallback to smaller batch_size. |
||||||
|
- **scripts/rerun_embeddings.py**: expose a `--retry-missing` mode that detects missing embeddings and retries with smaller batches. |
||||||
|
- **similarity.compute**: keep padding logic; add a filter to avoid trivial 1.0 matches for extremely short titles (query/UI should also filter but apply DB-side filter for safety). |
||||||
|
- **migrations**: add audit_events or mark which motions failed fetch/embedding for manual review. |
||||||
|
- **tests**: add deterministic tests for retry behavior and for the QA-sample similarity checks. |
||||||
|
|
||||||
|
## Data Flow |
||||||
|
|
||||||
|
1. Walk SyncFeed (Besluit, Zaak, Document, DocumentVersie) → parse elements. |
||||||
|
2. Build **title_map** and **ext_id_map** in-memory. |
||||||
|
3. Fetch body_texts in parallel (ThreadPoolExecutor) → map ext_id -> body_text. |
||||||
|
4. Update motions table with title, externe_identifier, body_text. |
||||||
|
5. Run text embeddings for motions (COALESCE priority: layman_explanation → body_text → description → title). |
||||||
|
6. Fuse embeddings per-window (svd_vectors) → produce fused_embeddings. |
||||||
|
7. Compute similarity cache per-window and insert rows. |
||||||
|
8. QA checks and audit logs produced for runs. |
||||||
|
|
||||||
|
## Error Handling Strategy |
||||||
|
|
||||||
|
- **HTTP / body fetches:** add per-ext_id retries (3 attempts) with short exponential backoff; capture and store failures in audit_events table for manual follow-up. |
||||||
|
- **Connection pool warnings:** reduce ThreadPoolExecutor concurrency (configurable flag) and attach a requests.adapters HTTPAdapter with a limited pool size to avoid 'Connection pool is full' warnings. |
||||||
|
- **Embedding provider failures:** implement a wrapper which: |
||||||
|
- retries batches up to N times with exponential backoff, |
||||||
|
- on persistent failure, retry missing items with a smaller batch_size, |
||||||
|
- mark failed motion ids in an audit table rather than blocking the entire run. |
||||||
|
- **Similarity anomalies (1.0 scores):** filter out identity matches and very-short-text matches when building similarity cache; record these in diagnostics output. |
||||||
|
|
||||||
|
## Testing Strategy |
||||||
|
|
||||||
|
- Add unit tests for parser functions (already present) to cover edge cases seen in real SyncFeed XML. |
||||||
|
- Add a unit test for the ai_provider retry wrapper that simulates provider failures and verifies fallback to smaller batches. |
||||||
|
- Add an integration QA script (scripts/qa_similarity.py) that: |
||||||
|
- samples N motions across windows, |
||||||
|
- runs lookup.similarity and asserts results are within expected ranges (e.g., top-5 not all 1.0 unless identical text), |
||||||
|
- outputs a short summary JSON saved to thoughts/ledgers/ for each run. |
||||||
|
- CI: run the new provider-retry test and the QA script with a small dataset (mocked provider) to ensure no regressions. |
||||||
|
|
||||||
|
## Actionable Next Steps (prioritized) |
||||||
|
|
||||||
|
1. Quick QA (1 day) — sample 50 motions and inspect similarity quality. |
||||||
|
- Implement scripts/qa_similarity.py (sample + assert heuristics). |
||||||
|
- Run locally and record summary in thoughts/ledgers. |
||||||
|
|
||||||
|
2. Small robustness fixes (1–2 days) — low-risk changes with big wins. |
||||||
|
- Add ai_provider retry/backoff wrapper and unit tests. |
||||||
|
- Add `--max-body-workers` CLI flag and drop default to 10; add per-request retries. |
||||||
|
- Add `--retry-missing` mode to rerun_embeddings to retry failed batches with smaller sizes. |
||||||
|
|
||||||
|
3. Observability & audit (1 day) — make failures visible and actionable. |
||||||
|
- Add audit_events table rows when body_text fetch or embedding fails. |
||||||
|
- Write an end-of-run JSON summary (already done) and attach per-window stats to ledger. |
||||||
|
|
||||||
|
4. Safety filters & dedupe (0.5 day) |
||||||
|
- Add a small DB-side filter to skip trivial identical-title matches in similarity cache. |
||||||
|
- Audit SVD windows for duplication and dedupe if needed. |
||||||
|
|
||||||
|
5. Run full re-run (off-peak) and validate (1 day) |
||||||
|
- Re-run embeddings, fusion and similarity; run QA script and review ledgers. |
||||||
|
|
||||||
|
Estimated total: 3–5 days of focused work. |
||||||
|
|
||||||
|
## Open Questions |
||||||
|
|
||||||
|
- Do we want to persist per-item failure flags in DuckDB (audit_events) or just in ledgers? I recommend adding an **audit_events** table to speed triage. |
||||||
|
- What SLA / acceptance criteria should we use for similarity quality? E.g., maximum allowed fraction of top-1 exact-title matches for non-identical motions. |
||||||
|
- Are we comfortable reducing body fetch concurrency by default, or should we attempt a more adaptive concurrency strategy? |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
I'm proceeding to create the design doc. Interrupt if you want changes. |
||||||
@ -0,0 +1,314 @@ |
|||||||
|
# motion content enrichment — implementation plan |
||||||
|
|
||||||
|
Goal: Implement the prioritized incremental hardening from the design (2026-03-23) so the SyncFeed → embedding → fusion → similarity pipeline is more robust, observable, and testable. Break the work into small, independent micro-tasks (one file + its test per task) so many implementers can work in parallel. |
||||||
|
|
||||||
|
Design doc: thoughts/shared/designs/2026-03-23-motion-content-enrichment-next-steps-design.md |
||||||
|
|
||||||
|
Architecture summary (what I'll implement) |
||||||
|
- Add a small audit API on MotionDatabase so code can record per-item failures in a stable place (or fall back to a ledger file if DuckDB is not present). |
||||||
|
- Add a dedicated ai_provider retry/fallback wrapper that: |
||||||
|
- retries failed batches (exponential backoff), |
||||||
|
- on persistent failure retries missing items with smaller batch sizes, |
||||||
|
- returns aligned embedding results (None for failed items), |
||||||
|
- records persistent failures to audit_events (using MotionDatabase.append_audit_event). |
||||||
|
- Wire text embedding pipeline to use the wrapper and return failed ids (so rerun script can retry them). |
||||||
|
- Add a `--max-body-workers` CLI option to scripts/sync_motion_content.py, reduce default to 10 and add per-request retries. |
||||||
|
- Add `--retry-missing` to scripts/rerun_embeddings.py: rerun missing failed items with smaller batches. |
||||||
|
- Add a DB-side safety filter in similarity.compute to avoid inserting trivial 1.0 matches for very-short identical titles. |
||||||
|
- Add a small QA script scripts/qa_similarity.py that samples windows/motions and writes a short JSON ledger for manual review. |
||||||
|
- Add focused unit tests for the new behaviours (ai retry wrapper, DB audit append, sync body fetch retries, rerun retry mode, similarity filter, QA script). |
||||||
|
|
||||||
|
Decisions / gap filling (why these concrete choices) |
||||||
|
- Audit recording: implement MotionDatabase.append_audit_event that writes to audit_events table if present, else appends to thoughts/ledgers/audit_events.json. Rationale: migration SQL is a commented placeholder; making DB write optional keeps tests and CI safe; writing to ledgers is actionable and durable for triage. |
||||||
|
- ai retry backoff params: default retries=3, initial_backoff=0.5s, jitter ±10%, fallback smaller_batch_size = max(1, batch_size // 2). Rationale: conservative defaults that map to design and are implementable/testable. |
||||||
|
- fetch_body_text retries: 3 attempts per ext_id with small exponential backoff (0.5s). Use requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10) to limit pool size and avoid pool warnings. Default max workers lowered to 10. |
||||||
|
- Interface changes: ensure_text_embeddings will return an extended result with failed_ids as a 5th element: (stored, skipped_existing, skipped_no_text, errors, failed_ids). I will update rerun_embeddings and its tests accordingly. Rationale: rerun needs failed ids; propagating as return value is simplest and testable. |
||||||
|
- All new code uses logging.getLogger(__name__) (no print in library modules) to obey constraints. |
||||||
|
- Tests will use monkeypatching/mocks to avoid network/DB dependencies. |
||||||
|
|
||||||
|
Dependency graph (high level) |
||||||
|
|
||||||
|
Batch 1 (foundation, parallel): tasks 1.1–1.4 (no interdeps except where noted). |
||||||
|
Batch 2 (core, parallel): tasks 2.1–2.3 (depend on Batch 1). |
||||||
|
Batch 3 (safety & QA, parallel): task 3.1 (depends on Batch 2 and Batch 1). |
||||||
|
|
||||||
|
``` |
||||||
|
Batch 1 (parallel): 1.1, 1.2, 1.3, 1.4 |
||||||
|
Batch 2 (parallel): 2.1, 2.2, 2.3 [depends on Batch 1] |
||||||
|
Batch 3 (parallel): 3.1 [depends on Batch 2] |
||||||
|
``` |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 1: Foundation (parallel - 4 implementers) |
||||||
|
|
||||||
|
All tasks in this batch have no external dependencies; intra-batch dependencies (e.g., 1.2 and 1.4 depend on 1.1) are noted per task. |
||||||
|
|
||||||
|
### Task 1.1: MotionDatabase.append_audit_event |
||||||
|
**Owner:** implementer (author) |
||||||
|
**Estimate:** 2 hours |
||||||
|
**Depends:** none |
||||||
|
**Description:** Add an append_audit_event(...) helper to database.MotionDatabase. This method will attempt to INSERT a row into an audit_events table (if the table exists). If DuckDB is not available or the table does not exist, append the event to a JSON file under thoughts/ledgers/audit_events.json. This provides a stable place to record per-item failures without forcing a migration to run during tests/CI. |
||||||
|
|
||||||
|
**File:** `database.py` (modify: add method) |
||||||
|
**Test:** `tests/test_database_audit.py` (new) |
||||||
|
|
||||||
|
Implementation notes (decisions): |
||||||
|
- Signature: append_audit_event(actor_id: str | None, action: str, target_type: str | None = None, target_id: str | None = None, metadata: dict | None = None) -> bool |
||||||
|
- Behavior: |
||||||
|
- If duckdb is None: write (append) to thoughts/ledgers/audit_events.json as list of event objects (create file/dir as needed). |
||||||
|
- If duckdb present: run "INSERT INTO audit_events (... )" wrapped in try/except; if table missing or INSERT fails, fall back to writing to the ledger file. |
||||||
|
- Do not raise; log at appropriate levels and return True if recorded somewhere, False otherwise. |
||||||
|
- Use uuid.uuid4() for id and UTC timestamp for created_at. |
||||||
|
- Use logging.getLogger(__name__) for messages. |
||||||
|
|
||||||
|
Test (complete list): |
||||||
|
- tests/test_database_audit.py |
||||||
|
- Case A (duckdb=None emulation): monkeypatch database.duckdb = None, ensure the ledger file is created and its content contains the event. |
||||||
|
- Case B (duckdb present but table insertion raises): monkeypatch duckdb.connect to a MagicMock that raises on execute -> verify fallback to ledger file. |
||||||
|
- Verify method returns True when written to ledger, and that JSON is valid. |
||||||
|
|
||||||
|
Verify: |
||||||
|
- pytest -q tests/test_database_audit.py |
||||||
|
|
||||||
|
Commit message suggestion: |
||||||
|
- feat(db): add append_audit_event helper to MotionDatabase (ledger fallback) |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
### Task 1.2: ai provider retry/fallback wrapper |
||||||
|
**Owner:** implementer |
||||||
|
**Estimate:** 3 hours |
||||||
|
**Depends:** 1.1 (uses MotionDatabase.append_audit_event) |
||||||
|
**Description:** Add a small module that wraps ai_provider.get_embeddings_batch to provide robust retries and fallback to smaller batch sizes. The wrapper returns a list of embeddings aligned with inputs; for items that permanently fail we return None in-place and record an audit event via MotionDatabase.append_audit_event. |
||||||
|
|
||||||
|
**File:** `pipeline/ai_provider_wrapper.py` (new) |
||||||
|
**Test:** `tests/test_ai_provider_wrapper.py` (new) |
||||||
|
|
||||||
|
Implementation details: |
||||||
|
- Provide function get_embeddings_with_retry(texts: list[str], motion_ids: list[int] | None = None, model: str | None = None, batch_size: int = 50, retries: int = 3) -> list[Optional[list[float]]] |
||||||
|
- Approach: |
||||||
|
- Iterate inputs in chunks of batch_size. |
||||||
|
- For each chunk: |
||||||
|
- Try ai_provider.get_embeddings_batch(chunk, model=model, batch_size=batch_size) up to `retries` with exponential backoff (initial_backoff=0.5s, jitter). |
||||||
|
- If a chunk continuously fails, split the chunk into subchunks (smaller_batch_size = max(1, batch_size // 2)) and retry the subchunks with the same logic. |
||||||
|
- If an individual text still fails, mark the corresponding index result as None and record an audit event via MotionDatabase.append_audit_event with action='embedding_failed' and metadata including model, exception message, and attempts. |
||||||
|
- Return a results list of the same length as inputs (embedding lists or None). |
||||||
|
- Use MotionDatabase(db_path=...) only if a db_path is provided in env/config or via optional parameter — by default use database.db (existing module-level db instance) to call append_audit_event. |
||||||
|
- Keep function pure enough to be unit-tested by monkeypatching ai_provider.get_embeddings_batch and MotionDatabase.append_audit_event. |
||||||
|
|
||||||
|
Test cases: |
||||||
|
- test successful batch returns embeddings aligned to inputs |
||||||
|
- test simulated transient failure where first attempt fails and second succeeds (observed retry) |
||||||
|
- test persistent chunk failure triggers fallback to smaller chunks and eventual audit appended for failing items (verify append_audit_event called with expected metadata) |
||||||
|
- tests use monkeypatch to stub ai_provider.get_embeddings_batch behavior and MotionDatabase.append_audit_event |
||||||
|
|
||||||
|
Verify: |
||||||
|
- pytest -q tests/test_ai_provider_wrapper.py |
||||||
|
|
||||||
|
Commit: |
||||||
|
- feat(pipeline): ai provider retry/fallback wrapper |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
### Task 1.3: QA script — scripts/qa_similarity.py |
||||||
|
**Owner:** implementer |
||||||
|
**Estimate:** 2 hours |
||||||
|
**Depends:** none |
||||||
|
**Description:** Add a small script that samples N motions across windows, runs similarity lookup for each sampled motion, asserts simple heuristics (e.g., top-5 are not all score==1.0 except identical IDs), and writes a JSON summary into thoughts/ledgers/qa_similarity_{timestamp}.json. This script is meant to be run manually/CI for a quick QA check. |
||||||
|
|
||||||
|
**File:** `scripts/qa_similarity.py` (new) |
||||||
|
**Test:** `tests/test_qa_similarity.py` (new) |
||||||
|
|
||||||
|
Implementation notes: |
||||||
|
- CLI: --db-path, --sample-size (default 50), --top-k (default 5) |
||||||
|
- Implementation uses MotionDatabase to select a small set of motions and similarity.get_cached_similarities (or MotionDatabase.get_cached_similarities) to evaluate neighbors. |
||||||
|
- The script returns a dict summary which is also written to a uniquely named JSON under thoughts/ledgers/. |
||||||
|
- For tests, monkeypatch MotionDatabase to return deterministic samples and similarities; verify the script produces the expected JSON summary and returns reasonable pass/fail flags. |
||||||
|
|
||||||
|
Verify: |
||||||
|
- pytest -q tests/test_qa_similarity.py |
||||||
|
- Run manually: python scripts/qa_similarity.py --db-path data/motions.db --sample-size 10 |
||||||
|
|
||||||
|
Commit: |
||||||
|
- feat(scripts): add QA similarity sampling script and ledger writer |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
### Task 1.4: sync_motion_content — reduce concurrency, add per-ext_id retry, add CLI flag |
||||||
|
**Owner:** implementer |
||||||
|
**Estimate:** 3 hours |
||||||
|
**Depends:** 1.1 (write failures to audit via MotionDatabase.append_audit_event) |
||||||
|
**Description:** Harden the body text fetcher: |
||||||
|
- Add CLI flag `--max-body-workers` (default reduce to 10). |
||||||
|
- Use requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10) when creating the requests.Session in sync_motion_content. |
||||||
|
- Implement per-ext_id retry in _fetch_body_text: try up to 3 times with exponential backoff on network errors/5xx/429. |
||||||
|
- When a body_text fetch permanently fails, call MotionDatabase.append_audit_event(action='body_fetch_failed', target_type='document', target_id=ext_id, metadata=...) so failures are recorded. |
||||||
|
|
||||||
|
**File:** `scripts/sync_motion_content.py` (modify) |
||||||
|
**Test:** `tests/test_sync_motion_content.py` (new) |
||||||
|
|
||||||
|
Implementation details: |
||||||
|
- Add parser.add_argument("--max-body-workers", type=int, default=10, help=...) in CLI |
||||||
|
- When creating session: mount an HTTPAdapter with pool_maxsize equal to max_body_workers, i.e. session.mount("https://", requests.adapters.HTTPAdapter(pool_maxsize=...)). Use session.mount() rather than assigning directly into session.adapters, so requests keeps its prefix-length ordering intact. |
||||||
|
- Modify _fetch_body_text(ext_id, session) to attempt up to 3 tries and return None on exhaustion; log appropriately; call db.append_audit_event when permanently failing (db from database.db). |
||||||
|
- Update fetch_body_texts to pass max_workers param through as already implemented, but default constant MAX_BODY_WORKERS should be set to 10 at top of file. |
||||||
|
|
||||||
|
Test plan: |
||||||
|
- Test that _fetch_body_text retries: monkeypatch session.get to fail first (raise requests.ConnectionError) and succeed second; verify returned text is successful and that only as many attempts occurred as expected. |
||||||
|
- Test permanent failure case: monkeypatch session.get to always raise and verify MotionDatabase.append_audit_event was called (monkeypatch database.db.append_audit_event). |
||||||
|
- Test fetch_body_texts respects max_workers param by running small set and monkeypatching ThreadPoolExecutor to observe max_workers argument (or call with small size and assert function returns mapped results). |
||||||
|
|
||||||
|
Verify: |
||||||
|
- pytest -q tests/test_sync_motion_content.py |
||||||
|
- Manual run: python scripts/sync_motion_content.py --db-path data/motions.db --max-body-workers 10 |
||||||
|
|
||||||
|
Commit: |
||||||
|
- feat(sync): add per-ext_id retries and --max-body-workers flag (defaults to 10), record failures to audit |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 2: Core modules (parallel - 3 implementers) |
||||||
|
|
||||||
|
These tasks depend on Batch 1 (ai wrapper and audit method must be present). |
||||||
|
|
||||||
|
### Task 2.1: text_pipeline — use ai wrapper & return failed_ids |
||||||
|
**Owner:** implementer |
||||||
|
**Estimate:** 3 hours |
||||||
|
**Depends:** 1.2 (ai_provider_wrapper) and 1.1 (audit) |
||||||
|
**Description:** Modify pipeline/text_pipeline.py to call the new ai_provider_wrapper.get_embeddings_with_retry instead of ai_provider.get_embeddings_batch. Extend ensure_text_embeddings to collect indexes/ids of motions which failed to get embeddings and return them as a fifth element: (stored, skipped_existing, skipped_no_text, errors, failed_ids). Keep logging behavior similar but include a log line reporting failed_ids for the run. |
||||||
|
|
||||||
|
**File:** `pipeline/text_pipeline.py` (modify) |
||||||
|
**Test:** `tests/test_text_pipeline_retry.py` (new) |
||||||
|
|
||||||
|
Implementation details: |
||||||
|
- Replace the ai_provider.get_embeddings_batch(batch_texts, ...) call with wrapper.get_embeddings_with_retry(batch_texts, batch_ids, model=model, batch_size=batch_size, retries=3). |
||||||
|
- The wrapper returns list aligned with batch_texts containing either embedding list or None. For each None, increment errors and append motion_id to failed_ids. |
||||||
|
- At the end of ensure_text_embeddings, return stored, skipped_existing, skipped_no_text, errors, failed_ids. |
||||||
|
- Also ensure docstring updated. |
||||||
|
- Keep existing counting and logging; existing callers will be updated in Task 2.2. |
||||||
|
|
||||||
|
Test plan: |
||||||
|
- Unit test that ensure_text_embeddings: |
||||||
|
- when wrapper returns embeddings for all batch items, stored increments as expected. |
||||||
|
- when wrapper returns None for some items, those motion_ids included in failed_ids and errors counts reflect them. |
||||||
|
- Use monkeypatch to stub pipeline.ai_provider_wrapper.get_embeddings_with_retry and database.db.store_embedding. |
||||||
|
|
||||||
|
Verify: |
||||||
|
- pytest -q tests/test_text_pipeline_retry.py |
||||||
|
|
||||||
|
Commit: |
||||||
|
- feat(pipeline): use ai_provider wrapper for robust embeddings and return failed ids |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
### Task 2.2: rerun_embeddings — add --retry-missing mode and wire re-run |
||||||
|
**Owner:** implementer |
||||||
|
**Estimate:** 2.5 hours |
||||||
|
**Depends:** 2.1 (ensure_text_embeddings new return) |
||||||
|
**Description:** Add a CLI flag `--retry-missing` to scripts/rerun_embeddings.py. When set, after the main ensure_text_embeddings call, if the returned `failed_ids` list is non-empty, attempt to re-run embedding for just those failed motion ids using a smaller batch size (e.g., half) via a new text_pipeline helper, ensure_text_embeddings_for_ids (see implementation notes below). The script should record audit events for items that still fail after retry. |
||||||
|
|
||||||
|
**File:** `scripts/rerun_embeddings.py` (modify) |
||||||
|
**Test:** `tests/test_rerun_embeddings.py` (modify — existing test) |
||||||
|
|
||||||
|
Implementation notes: |
||||||
|
- Add parser.add_argument("--retry-missing", action="store_true", help=...). |
||||||
|
- After the first ensure_text_embeddings call, expect a 5-tuple. If retry_missing is set and failed_ids is non-empty, run a second short pass over just the failed ids with a smaller batch size: call text_pipeline.ensure_text_embeddings_for_ids(db_path=db_path, ids=failed_ids, model=model, batch_size=max(1, batch_size // 2)). Implementation choice: add this as a new helper function in text_pipeline (rather than overloading ensure_text_embeddings with an optional id-list parameter) and use it here. |
||||||
|
- Update tests/test_rerun_embeddings.py to monkeypatch the new text_pipeline helper and simulate that first call returns failed_ids and second call resolves them; assert rerun called accordingly and summary contains expected fields. |
||||||
|
|
||||||
|
Test changes: |
||||||
|
- Update tests/test_rerun_embeddings.py to reflect that text_pipeline.ensure_text_embeddings returns five values and to simulate --retry-missing behavior. |
||||||
|
- Keep the existing expectations in the test (we will extend them to include failed_ids handling). |
||||||
|
|
||||||
|
Verify: |
||||||
|
- pytest -q tests/test_rerun_embeddings.py |
||||||
|
- Manual run: python scripts/rerun_embeddings.py --db-path data/motions.db --retry-missing |
||||||
|
|
||||||
|
Commit: |
||||||
|
- feat(scripts): add --retry-missing to rerun_embeddings and retry failed items with smaller batches |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
### Task 2.3: similarity.compute — DB-side safety filter to avoid trivial 1.0 matches |
||||||
|
**Owner:** implementer |
||||||
|
**Estimate:** 3 hours |
||||||
|
**Depends:** none (reads existing DB) |
||||||
|
**Description:** Add a small DB-side filter before inserting similarity rows that filters out suspicious 1.0 matches between different motions when the titles are extremely short (heuristic: identical titles with length < 12 characters). Add diagnostic logging for filtered pairs. |
||||||
|
|
||||||
|
**File:** `similarity/compute.py` (modify) |
||||||
|
**Test:** `tests/test_similarity_compute_filter.py` (new) |
||||||
|
|
||||||
|
Implementation details: |
||||||
|
- After building rows_to_insert (list of dicts with source/target ids & score), perform: |
||||||
|
- If score == 1.0 (or very near 1.0, with a tolerance such as score > 0.999999), fetch titles for the set of involved ids (single query: SELECT id, title FROM motions WHERE id IN (...)). |
||||||
|
- For each candidate row with perfect/near-perfect score, if motion titles are equal and len(title.strip()) < 12, skip insertion and log debug/info that pair was filtered due to trivial short identical title. |
||||||
|
- The threshold 12 chosen conservatively (document in commit). |
||||||
|
- Keep inserted count and return behavior unchanged. |
||||||
|
- Make sure DB connections are opened/closed per method. |
||||||
|
|
||||||
|
Test plan: |
||||||
|
- Construct a minimal in-memory or duckdb-mocked scenario where two different motion ids have identical short title and their vectors produce 1.0 similarity. Monkeypatch duckdb.connect to return rows such that compute_similarities will produce rows_to_insert including a 1.0. Verify store_similarity_batch is not called for that row (monkeypatch MotionDatabase.store_similarity_batch or spy on db.store_similarity_batch calls). |
||||||
|
|
||||||
|
Verify: |
||||||
|
- pytest -q tests/test_similarity_compute_filter.py |
||||||
|
|
||||||
|
Commit: |
||||||
|
- fix(similarity): filter trivial 1.0 matches for very-short identical titles |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Batch 3: Observability / Integration (parallel - 1-2 implementers) |
||||||
|
|
||||||
|
These are small finishing tasks (audit/ledgers, small extras). |
||||||
|
|
||||||
|
### Task 3.1: Tests & CI adjustments, docs, ledger examples |
||||||
|
**Owner:** reviewer (PR reviewer) |
||||||
|
**Estimate:** 2 hours |
||||||
|
**Depends:** all tasks above (1.1–2.3) |
||||||
|
**Description:** After the code is in, run full test suite, fix any flaky tests, add short README note in thoughts/ledgers/ about how to run QA script and how audit_events fallback works. Add a small example ledger created by QA script if helpful. |
||||||
|
|
||||||
|
**Files:** (changes/additions) |
||||||
|
- `thoughts/shared/plans/2026-03-23-motion-content-enrichment-plan.md` (this plan — created) |
||||||
|
- `thoughts/ledgers/README_motion_enrichment.md` (new, optional) |
||||||
|
- No dedicated unit test for this task; it's a reviewer/integration task. |
||||||
|
|
||||||
|
Verification: |
||||||
|
- Run full tests: pytest |
||||||
|
- Run QA script locally: python scripts/qa_similarity.py --db-path data/motions.db --sample-size 10 |
||||||
|
- Inspect thoughts/ledgers/qa_similarity_*.json and audit_events ledger file. |
||||||
|
|
||||||
|
Commit: |
||||||
|
- docs(ledgers): document QA and audit fallback behavior |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
## Test / Verification summary (per-task commands) |
||||||
|
|
||||||
|
- Task 1.1 |
||||||
|
- pytest -q tests/test_database_audit.py |
||||||
|
- Task 1.2 |
||||||
|
- pytest -q tests/test_ai_provider_wrapper.py |
||||||
|
- Task 1.3 |
||||||
|
- pytest -q tests/test_qa_similarity.py |
||||||
|
- python scripts/qa_similarity.py --db-path data/motions.db --sample-size 10 |
||||||
|
- Task 1.4 |
||||||
|
- pytest -q tests/test_sync_motion_content.py |
||||||
|
- python scripts/sync_motion_content.py --db-path data/motions.db --max-body-workers 10 --skip-body-text (dry run) |
||||||
|
- Task 2.1 |
||||||
|
- pytest -q tests/test_text_pipeline_retry.py |
||||||
|
- Task 2.2 |
||||||
|
- pytest -q tests/test_rerun_embeddings.py |
||||||
|
- python scripts/rerun_embeddings.py --db-path data/motions.db --retry-missing |
||||||
|
- Task 2.3 |
||||||
|
- pytest -q tests/test_similarity_compute_filter.py |
||||||
|
|
||||||
|
Full suite verification: |
||||||
|
- pytest -q |
||||||
|
|
||||||
|
--- |
||||||
|
|
||||||
|
If you want I can now: |
||||||
|
- generate the apply_patch to create the files and tests described (one patch containing all files), or |
||||||
|
- create the plan file only (this document was requested) — I have it ready at: thoughts/shared/plans/2026-03-23-motion-content-enrichment-plan.md |
||||||
|
|
||||||
|
Which would you like next? |
||||||
Loading…
Reference in new issue