# database.py (final working version)
"""DuckDB-backed persistence layer for motions, MP votes, embeddings and caches.

When the ``duckdb`` package is not installed, the embedding / similarity-cache
methods fall back to JSON files stored next to ``db_path`` so tests can run
without the dependency.
"""

try:
    import duckdb
except Exception:  # pragma: no cover - environment may not have duckdb installed
    duckdb = None

import json
import logging
import math
import os
import uuid
from datetime import datetime, timedelta  # noqa: F401 - kept; file may be partially viewed
from typing import Dict, List, Optional, Tuple

from config import config

_logger = logging.getLogger(__name__)


class MotionDatabase:
    """Data-access layer over a single DuckDB database file.

    Each method opens its own short-lived connection; there is no shared
    connection state, so instances are safe to use from simple scripts.
    """

    def __init__(self, db_path: str = config.DATABASE_PATH):
        self.db_path = db_path
        # If duckdb is not available, operate in lightweight file-backed mode.
        self._file_mode = duckdb is None
        self._init_database()

    def _init_database(self):
        """Initialize the database with required tables (idempotent)."""
        # Create the parent directory only if the path actually has one;
        # os.makedirs("") would raise FileNotFoundError for a bare filename.
        parent = os.path.dirname(self.db_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # If duckdb isn't available in this environment, create lightweight
        # JSON-backed files to allow tests to run without the duckdb dependency.
        if duckdb is None:
            emb_file = f"{self.db_path}.embeddings.json"
            sim_file = f"{self.db_path}.similarity_cache.json"
            for p in (emb_file, sim_file):
                if not os.path.exists(p):
                    with open(p, "w", encoding="utf-8") as fh:
                        fh.write("[]")
            return

        conn = duckdb.connect(self.db_path)

        # Sequence for auto-incrementing motion IDs.
        try:
            conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
        except Exception:
            # Older DuckDB versions may not support IF NOT EXISTS on sequences.
            pass

        # Main motions table. NOTE: externe_identifier and body_text are part
        # of the schema because insert_motion()/batch_insert_motions() write
        # them (the previous schema omitted them, breaking inserts on fresh DBs).
        conn.execute("""
            CREATE TABLE IF NOT EXISTS motions (
                id INTEGER DEFAULT nextval('motions_id_seq'),
                title TEXT NOT NULL,
                description TEXT,
                date DATE,
                policy_area TEXT,
                voting_results JSON,
                winning_margin FLOAT,
                controversy_score FLOAT,
                layman_explanation TEXT,
                url TEXT UNIQUE,
                externe_identifier TEXT,
                body_text TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)

        conn.execute("""
            CREATE TABLE IF NOT EXISTS user_sessions (
                session_id TEXT PRIMARY KEY,
                user_votes JSON,
                completed_motions INTEGER DEFAULT 0,
                total_motions INTEGER DEFAULT 10,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        conn.execute("""
            CREATE TABLE IF NOT EXISTS party_results (
                session_id TEXT,
                party_name TEXT,
                agreement_percentage FLOAT,
                agreed_motions JSON,
                disagreed_motions JSON,
                PRIMARY KEY (session_id, party_name)
            )
        """)

        # Pipeline tables: per-MP vote rows.
        conn.execute("CREATE SEQUENCE IF NOT EXISTS mp_votes_id_seq START 1")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS mp_votes (
                id INTEGER DEFAULT nextval('mp_votes_id_seq'),
                motion_id INTEGER NOT NULL,
                mp_name TEXT NOT NULL,
                party TEXT,
                vote TEXT NOT NULL,
                date DATE,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)

        conn.execute("""
            CREATE TABLE IF NOT EXISTS mp_metadata (
                mp_name TEXT PRIMARY KEY,
                party TEXT,
                van DATE,
                tot_en_met DATE,
                persoon_id TEXT
            )
        """)

        # SVD vectors per rolling window.
        conn.execute("CREATE SEQUENCE IF NOT EXISTS svd_vectors_id_seq START 1")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS svd_vectors (
                id INTEGER DEFAULT nextval('svd_vectors_id_seq'),
                window_id TEXT NOT NULL,
                entity_type TEXT NOT NULL,
                entity_id TEXT NOT NULL,
                vector JSON NOT NULL,
                model TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)

        # Fused (SVD + text) embeddings per motion and window.
        conn.execute("CREATE SEQUENCE IF NOT EXISTS fused_embeddings_id_seq START 1")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS fused_embeddings (
                id INTEGER DEFAULT nextval('fused_embeddings_id_seq'),
                motion_id INTEGER NOT NULL,
                window_id TEXT NOT NULL,
                vector JSON NOT NULL,
                svd_dims INTEGER NOT NULL,
                text_dims INTEGER NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)

        # Raw text embeddings (vectors stored as JSON).
        # NOTE: previously this table and similarity_cache were created twice
        # with inconsistent column definitions; a single definition is kept.
        conn.execute("CREATE SEQUENCE IF NOT EXISTS embeddings_id_seq START 1")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                id INTEGER DEFAULT nextval('embeddings_id_seq'),
                motion_id INTEGER NOT NULL,
                model TEXT,
                vector JSON NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)

        # Similarity cache: precomputed neighbor scores (ids + score only).
        conn.execute("CREATE SEQUENCE IF NOT EXISTS similarity_cache_id_seq START 1")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS similarity_cache (
                id INTEGER DEFAULT nextval('similarity_cache_id_seq'),
                source_motion_id INTEGER NOT NULL,
                target_motion_id INTEGER NOT NULL,
                score FLOAT NOT NULL,
                vector_type TEXT NOT NULL,
                window_id TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)

        conn.close()

    def reset_database(self):
        """Development helper: drop known tables and re-run initialization.

        WARNING: intended for dev/test only. This removes the session/motion
        tables and recreates the schema.
        """
        if duckdb is None:
            # File-backed mode: recreate the JSON files from scratch.
            for suffix in (".embeddings.json", ".similarity_cache.json"):
                try:
                    os.remove(f"{self.db_path}{suffix}")
                except OSError:
                    pass
            self._init_database()
            return

        conn = duckdb.connect(self.db_path)
        try:
            for t in ("party_results", "user_sessions", "motions"):
                try:
                    conn.execute(f"DROP TABLE IF EXISTS {t}")
                except Exception:
                    pass
        finally:
            conn.close()
        self._init_database()

    def insert_motion(self, motion_data: Dict) -> bool:
        """Insert a new motion; returns False if it already exists (by URL)."""
        try:
            conn = duckdb.connect(self.db_path)
            # Check if motion already exists by URL to avoid duplicates.
            existing = conn.execute(
                "SELECT COUNT(*) FROM motions WHERE url = ?",
                (motion_data["url"],),
            ).fetchone()
            if existing and existing[0] > 0:
                conn.close()
                return False  # Motion already exists

            # Insert motion - id will be auto-generated by sequence.
            conn.execute(
                """
                INSERT INTO motions (title, description, date, policy_area,
                                     voting_results, winning_margin,
                                     controversy_score, url,
                                     externe_identifier, body_text, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """,
                (
                    motion_data["title"],
                    motion_data["description"] or "",
                    motion_data["date"],
                    motion_data["policy_area"],
                    json.dumps(motion_data["voting_results"]),
                    motion_data["winning_margin"],
                    1 - motion_data["winning_margin"],  # controversy score
                    motion_data["url"],
                    motion_data.get("externe_identifier"),
                    motion_data.get("body_text"),
                ),
            )
            conn.close()

            # Also insert mp_vote rows for individual MPs if party data is
            # available. This only runs for brand-new motions (existing motions
            # are rejected above), so there is no risk of duplicates.
            mp_vote_parties = motion_data.get("mp_vote_parties", {})
            voting_results_raw = motion_data.get("voting_results", {})
            if mp_vote_parties:
                conn2 = duckdb.connect(self.db_path)
                row = conn2.execute(
                    "SELECT id FROM motions WHERE url = ? LIMIT 1",
                    (motion_data["url"],),
                ).fetchone()
                conn2.close()
                motion_id = row[0] if row else None
                if motion_id is not None:
                    motion_date = motion_data.get("date", "")
                    for mp_name, party in mp_vote_parties.items():
                        vote = voting_results_raw.get(mp_name, "afwezig")
                        self.insert_mp_vote(
                            motion_id=motion_id,
                            mp_name=mp_name,
                            party=party,
                            vote=vote,
                            date=motion_date,
                        )
            return True
        except Exception as e:
            _logger.error(f"Error inserting motion: {e}")
            if "conn" in locals():
                conn.close()
            return False

    def batch_insert_motions(self, motions_data: List[Dict]) -> Tuple[int, int]:
        """Batch-insert motions and their mp_votes using a single connection.

        Returns (inserted_count, duplicate_count).
        """
        if not motions_data:
            return 0, 0
        try:
            conn = duckdb.connect(self.db_path)

            # 1. Find which URLs already exist — single query.
            urls = [m["url"] for m in motions_data]
            placeholders = ", ".join("?" * len(urls))
            existing_urls = set(
                row[0]
                for row in conn.execute(
                    f"SELECT url FROM motions WHERE url IN ({placeholders})", urls
                ).fetchall()
            )
            new_motions = [m for m in motions_data if m["url"] not in existing_urls]
            duplicates = len(motions_data) - len(new_motions)
            if not new_motions:
                conn.close()
                return 0, duplicates

            # 2. Bulk-insert motions.
            motion_rows = [
                (
                    m["title"],
                    m["description"] or "",
                    m["date"],
                    m["policy_area"],
                    json.dumps(m["voting_results"]),
                    m["winning_margin"],
                    1 - m["winning_margin"],
                    m["url"],
                    m.get("externe_identifier"),
                    m.get("body_text"),
                )
                for m in new_motions
            ]
            conn.executemany(
                """
                INSERT INTO motions (title, description, date, policy_area,
                                     voting_results, winning_margin,
                                     controversy_score, url,
                                     externe_identifier, body_text, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """,
                motion_rows,
            )

            # 3. Fetch the newly-assigned IDs in one query.
            new_urls = [m["url"] for m in new_motions]
            np = ", ".join("?" * len(new_urls))
            url_to_id = {
                row[1]: row[0]
                for row in conn.execute(
                    f"SELECT id, url FROM motions WHERE url IN ({np})", new_urls
                ).fetchall()
            }

            # 4. Bulk-insert mp_votes.
            vote_rows = []
            for m in new_motions:
                motion_id = url_to_id.get(m["url"])
                if motion_id is None:
                    continue
                mp_vote_parties = m.get("mp_vote_parties", {})
                voting_results_raw = m.get("voting_results", {})
                motion_date = m.get("date", "")
                for mp_name, party in mp_vote_parties.items():
                    vote = voting_results_raw.get(mp_name, "afwezig")
                    vote_rows.append((motion_id, mp_name, party, vote, motion_date))
            if vote_rows:
                conn.executemany(
                    """
                    INSERT INTO mp_votes (motion_id, mp_name, party, vote, date, created_at)
                    VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                    """,
                    vote_rows,
                )

            conn.close()
            return len(new_motions), duplicates
        except Exception as e:
            _logger.error(f"Error in batch_insert_motions: {e}")
            try:
                conn.close()
            except Exception:
                pass
            raise

    def get_filtered_motions(
        self,
        policy_area: str = "Alle",
        min_margin: float = 0.2,
        max_margin: float = 0.8,
        limit: int = 100,
    ) -> List[Dict]:
        """Get motions filtered by margin / policy area, most controversial first."""
        conn = duckdb.connect(self.db_path)
        query = """
            SELECT * FROM motions
            WHERE winning_margin BETWEEN ? AND ?
            AND layman_explanation IS NOT NULL
            AND layman_explanation != ''
        """
        params = [min_margin, max_margin]
        if policy_area != "Alle":
            query += " AND policy_area = ?"
            params.append(policy_area)
        query += " ORDER BY controversy_score DESC LIMIT ?"
        params.append(limit)
        try:
            result = conn.execute(query, params).fetchall()
            columns = [desc[0] for desc in conn.description]
            conn.close()
            return [dict(zip(columns, row)) for row in result]
        except Exception as e:
            _logger.error(f"Error querying motions: {e}")
            conn.close()
            return []

    def create_session(self, total_motions: int = 10) -> str:
        """Create a new user session; returns its UUID string."""
        session_id = str(uuid.uuid4())
        conn = duckdb.connect(self.db_path)
        conn.execute(
            """
            INSERT INTO user_sessions (session_id, user_votes, total_motions)
            VALUES (?, '{}', ?)
            """,
            (session_id, total_motions),
        )
        conn.close()
        return session_id

    def update_user_vote(self, session_id: str, motion_id: int, vote: str):
        """Record/overwrite the user's vote for a motion in their session."""
        conn = duckdb.connect(self.db_path)
        current_votes = conn.execute(
            "SELECT user_votes FROM user_sessions WHERE session_id = ?",
            (session_id,),
        ).fetchone()
        if current_votes:
            votes_dict = json.loads(current_votes[0])
            votes_dict[str(motion_id)] = vote
            conn.execute(
                """
                UPDATE user_sessions
                SET user_votes = ?, completed_motions = ?, last_updated = CURRENT_TIMESTAMP
                WHERE session_id = ?
                """,
                (json.dumps(votes_dict), len(votes_dict), session_id),
            )
        conn.close()

    def calculate_party_matches(self, session_id: str) -> List[Dict]:
        """Calculate per-party agreement percentages for a session's votes."""
        conn = duckdb.connect(self.db_path)
        user_data = conn.execute(
            "SELECT user_votes FROM user_sessions WHERE session_id = ?",
            (session_id,),
        ).fetchone()
        if not user_data:
            conn.close()  # previously leaked on this early return
            return []
        user_votes = json.loads(user_data[0])
        motion_ids = list(user_votes.keys())
        if not motion_ids:
            conn.close()  # previously leaked on this early return
            return []

        # Get motion voting results.
        placeholders = ",".join(["?" for _ in motion_ids])
        motions = conn.execute(
            f"SELECT id, voting_results FROM motions WHERE id IN ({placeholders})",
            motion_ids,
        ).fetchall()
        conn.close()

        # Calculate agreements.
        party_scores = {}
        for motion_id, voting_results_json in motions:
            voting_results = json.loads(voting_results_json)
            user_vote = user_votes[str(motion_id)]
            if user_vote == "Geen stem":  # Skip abstentions
                continue
            for party, party_vote in voting_results.items():
                # Skip individual MP names (contain comma, e.g. "Yesilgöz-Zegerius, D.")
                # Party/fractie names never contain a comma.
                if "," in party:
                    continue
                if party not in party_scores:
                    party_scores[party] = {"agreed": 0, "total": 0}
                party_scores[party]["total"] += 1
                # Check agreement.
                if (user_vote == "Voor" and party_vote == "voor") or (
                    user_vote == "Tegen" and party_vote == "tegen"
                ):
                    party_scores[party]["agreed"] += 1

        # Convert to percentages and sort.
        results = []
        for party, scores in party_scores.items():
            if scores["total"] > 0:
                agreement_pct = (scores["agreed"] / scores["total"]) * 100
                results.append(
                    {
                        "party": party,
                        "agreement_percentage": round(agreement_pct, 1),
                        "agreed_motions": scores["agreed"],
                        "total_motions": scores["total"],
                    }
                )
        return sorted(results, key=lambda x: x["agreement_percentage"], reverse=True)

    def store_embedding(self, motion_id: int, model: str, vector: List[float]) -> int:
        """Store an embedding for a motion. Returns inserted row id or -1 on failure."""
        try:
            conn = duckdb.connect(self.db_path)
            # Use explicit nextval for id since older tables may lack DEFAULT.
            conn.execute(
                "INSERT INTO embeddings (id, motion_id, model, vector, created_at) "
                "VALUES (nextval('embeddings_id_seq'), ?, ?, ?, CURRENT_TIMESTAMP)",
                (motion_id, model, json.dumps(vector)),
            )
            row = conn.execute("SELECT currval('embeddings_id_seq')").fetchone()
            conn.close()
            if row and row[0] is not None:
                return int(row[0])
            return -1
        except Exception as e:
            _logger.error("Error storing embedding: %s", e)
            try:
                conn.close()
            except Exception:
                pass
            return -1

    def search_similar(
        self, query_vector: List[float], top_k: int = 5, model: Optional[str] = None
    ) -> List[Dict]:
        """Naive in-Python cosine similarity search over stored embeddings.

        Returns list of dicts with keys: id, motion_id, model, score, created_at
        """
        try:
            conn = duckdb.connect(self.db_path)
            if model:
                rows = conn.execute(
                    "SELECT id, motion_id, model, vector, created_at FROM embeddings WHERE model = ?",
                    (model,),
                ).fetchall()
            else:
                rows = conn.execute(
                    "SELECT id, motion_id, model, vector, created_at FROM embeddings"
                ).fetchall()
            conn.close()

            results = []
            for r in rows:
                id_, motion_id, mdl, vector_json, created_at = r
                try:
                    vec = json.loads(vector_json)
                except Exception:
                    continue  # skip unparseable vectors rather than failing the search
                # Cosine similarity; degenerate (zero-norm) vectors score 0.0.
                try:
                    dot = sum(float(a) * float(b) for a, b in zip(query_vector, vec))
                    na = math.sqrt(sum(float(a) * float(a) for a in query_vector))
                    nb = math.sqrt(sum(float(b) * float(b) for b in vec))
                    score = dot / (na * nb) if na and nb else 0.0
                except Exception:
                    score = 0.0
                results.append(
                    {
                        "id": id_,
                        "motion_id": motion_id,
                        "model": mdl,
                        "score": score,
                        "created_at": created_at,
                    }
                )
            results.sort(key=lambda x: x["score"], reverse=True)
            return results[:top_k]
        except Exception as e:
            _logger.error(f"Error searching embeddings: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return []

    def mp_votes_exists_for_motion(self, motion_id: int) -> bool:
        """Return True if any mp_votes rows exist for the given motion."""
        try:
            conn = duckdb.connect(self.db_path)
            row = conn.execute(
                "SELECT COUNT(*) FROM mp_votes WHERE motion_id = ?",
                (motion_id,),
            ).fetchone()
            conn.close()
            return bool(row and row[0] > 0)
        except Exception as e:
            _logger.error(f"Error checking mp_votes existence: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return False

    def insert_mp_vote(
        self,
        motion_id: int,
        mp_name: str,
        vote: str,
        date: Optional[str] = None,
        party: Optional[str] = None,
    ) -> int:
        """Insert one MP vote row. Returns new row id, or -1 on failure."""
        try:
            conn = duckdb.connect(self.db_path)
            conn.execute(
                """
                INSERT INTO mp_votes (motion_id, mp_name, party, vote, date, created_at)
                VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """,
                (motion_id, mp_name, party, vote, date),
            )
            # NOTE: max(id) is racy under concurrent writers — acceptable for
            # the single-writer pipeline this is used in.
            row = conn.execute("SELECT max(id) FROM mp_votes").fetchone()
            conn.close()
            if row and row[0] is not None:
                return int(row[0])
            return -1
        except Exception as e:
            _logger.error(f"Error inserting mp_vote: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return -1

    def upsert_mp_metadata(
        self,
        mp_name: str,
        party: Optional[str],
        van: Optional[str],
        tot_en_met: Optional[str],
        persoon_id: Optional[str],
    ) -> None:
        """Insert or conditionally update an MP's membership metadata.

        Updates only when the incoming record is "newer": an active membership
        (tot_en_met IS NULL) beats an ended one; otherwise a later Van date wins.
        """
        try:
            conn = duckdb.connect(self.db_path)
            exists = conn.execute(
                "SELECT COUNT(*) FROM mp_metadata WHERE mp_name = ?", (mp_name,)
            ).fetchone()
            if exists and exists[0] > 0:
                conn.execute(
                    """
                    UPDATE mp_metadata
                    SET party = ?, van = ?, tot_en_met = ?, persoon_id = ?
                    WHERE mp_name = ?
                    AND (
                        -- prefer active over ended
                        (? IS NULL AND tot_en_met IS NOT NULL)
                        -- or same active status but newer start date
                        OR (? IS NULL AND tot_en_met IS NULL AND CAST(? AS DATE) > CAST(van AS DATE))
                        OR (? IS NOT NULL AND tot_en_met IS NOT NULL AND CAST(? AS DATE) > CAST(van AS DATE))
                    )
                    """,
                    (
                        party,
                        van,
                        tot_en_met,
                        persoon_id,
                        mp_name,
                        tot_en_met,  # prefer active
                        tot_en_met,
                        van,  # both active, newer
                        tot_en_met,
                        van,  # both ended, newer
                    ),
                )
            else:
                conn.execute(
                    """
                    INSERT INTO mp_metadata (mp_name, party, van, tot_en_met, persoon_id)
                    VALUES (?, ?, ?, ?, ?)
                    """,
                    (mp_name, party, van, tot_en_met, persoon_id),
                )
            conn.close()
        except Exception as e:
            _logger.error(f"Error upserting mp_metadata: {e}")
            try:
                conn.close()
            except Exception:
                pass

    def store_svd_vector(
        self,
        window_id: str,
        entity_type: str,
        entity_id: str,
        vector: List[float],
        model: Optional[str] = None,
    ) -> int:
        """Store a single SVD vector row. Returns new row id, or -1 on failure."""
        try:
            conn = duckdb.connect(self.db_path)
            conn.execute(
                """
                INSERT INTO svd_vectors (window_id, entity_type, entity_id, vector, model, created_at)
                VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """,
                (window_id, entity_type, entity_id, json.dumps(vector), model),
            )
            row = conn.execute("SELECT max(id) FROM svd_vectors").fetchone()
            conn.close()
            if row and row[0] is not None:
                return int(row[0])
            return -1
        except Exception as e:
            _logger.error(f"Error storing svd_vector: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return -1

    def batch_store_svd_vectors(
        self,
        window_id: str,
        rows: List[Tuple],  # each: (entity_type, entity_id, vector_list, model_or_None)
    ) -> int:
        """Batch-upsert SVD vectors for a window using a single connection.

        Deletes all existing rows for the window first, then inserts the new
        batch. Returns number of rows inserted.
        """
        if not rows:
            return 0
        try:
            conn = duckdb.connect(self.db_path)
            conn.execute("DELETE FROM svd_vectors WHERE window_id = ?", (window_id,))
            insert_rows = [
                (window_id, entity_type, entity_id, json.dumps(vector), model)
                for entity_type, entity_id, vector, model in rows
            ]
            conn.executemany(
                """
                INSERT INTO svd_vectors (window_id, entity_type, entity_id, vector, model, created_at)
                VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """,
                insert_rows,
            )
            conn.close()
            return len(insert_rows)
        except Exception as e:
            _logger.error(f"Error in batch_store_svd_vectors: {e}")
            try:
                conn.close()
            except Exception:
                pass
            raise

    def store_fused_embedding(
        self,
        motion_id: int,
        window_id: str,
        vector: List[float],
        svd_dims: int,
        text_dims: int,
    ) -> int:
        """Upsert the fused embedding for (motion_id, window_id). Returns row id or -1."""
        try:
            conn = duckdb.connect(self.db_path)
            # Delete any existing row for this (motion_id, window_id) to prevent duplicates.
            conn.execute(
                "DELETE FROM fused_embeddings WHERE motion_id = ? AND window_id = ?",
                (motion_id, window_id),
            )
            conn.execute(
                """
                INSERT INTO fused_embeddings (motion_id, window_id, vector, svd_dims, text_dims, created_at)
                VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """,
                (motion_id, window_id, json.dumps(vector), svd_dims, text_dims),
            )
            row = conn.execute("SELECT max(id) FROM fused_embeddings").fetchone()
            conn.close()
            if row and row[0] is not None:
                return int(row[0])
            return -1
        except Exception as e:
            _logger.error(f"Error storing fused_embedding: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return -1

    def store_similarity_batch(self, rows: List[Dict]) -> int:
        """Insert multiple similarity_cache rows. Returns number inserted."""
        if not rows:
            return 0
        inserted = 0

        # File-backed fallback when duckdb is not available.
        if duckdb is None:
            sim_file = f"{self.db_path}.similarity_cache.json"
            try:
                with open(sim_file, "r+", encoding="utf-8") as fh:
                    data = json.load(fh)
                    # Assign incremental ids continuing from the current maximum.
                    max_id = max((item.get("id", 0) for item in data), default=0)
                    for r in rows:
                        max_id += 1
                        entry = {
                            "id": max_id,
                            "source_motion_id": int(r["source_motion_id"]),
                            "target_motion_id": int(r["target_motion_id"]),
                            "score": float(r["score"]),
                            "vector_type": r["vector_type"],
                            "window_id": r.get("window_id"),
                        }
                        data.append(entry)
                        inserted += 1
                    fh.seek(0)
                    json.dump(data, fh)
                    fh.truncate()
                return inserted
            except Exception as e:
                _logger.error(f"Error writing similarity cache file: {e}")
                return inserted

        try:
            conn = duckdb.connect(self.db_path)
            for r in rows:
                try:
                    conn.execute(
                        """
                        INSERT INTO similarity_cache (source_motion_id, target_motion_id, score, vector_type, window_id, created_at)
                        VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                        """,
                        (
                            r["source_motion_id"],
                            r["target_motion_id"],
                            float(r["score"]),
                            r["vector_type"],
                            r.get("window_id"),
                        ),
                    )
                    inserted += 1
                except Exception as e:
                    # Best-effort: skip bad rows, keep inserting the rest.
                    _logger.error(f"Error inserting similarity row {r}: {e}")
            conn.close()
            return inserted
        except Exception as e:
            _logger.error(f"Error in store_similarity_batch: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return inserted

    def get_cached_similarities(
        self,
        source_motion_id: int,
        vector_type: str,
        window_id: Optional[str] = None,
        top_k: int = 10,
    ) -> List[Dict]:
        """Retrieve cached similarities for a source motion.

        Returns list of dicts with keys: target_motion_id, score, created_at, id
        """
        # File-backed fallback.
        if duckdb is None:
            sim_file = f"{self.db_path}.similarity_cache.json"
            try:
                with open(sim_file, "r", encoding="utf-8") as fh:
                    data = json.load(fh)
                rows = [
                    r
                    for r in data
                    if int(r.get("source_motion_id")) == int(source_motion_id)
                    and r.get("vector_type") == vector_type
                    and (window_id is None or r.get("window_id") == window_id)
                ]
                rows.sort(key=lambda x: float(x.get("score", 0)), reverse=True)
                return rows[:top_k]
            except Exception as e:
                _logger.error(f"Error reading similarity cache file: {e}")
                return []

        try:
            conn = duckdb.connect(self.db_path)
            params = [source_motion_id, vector_type]
            query = (
                "SELECT id, target_motion_id, score, created_at FROM similarity_cache"
                " WHERE source_motion_id = ? AND vector_type = ?"
            )
            if window_id is not None:
                query += " AND window_id = ?"
                params.append(window_id)
            query += " ORDER BY score DESC LIMIT ?"
            params.append(top_k)
            rows = conn.execute(query, params).fetchall()
            columns = [desc[0] for desc in conn.description]
            conn.close()
            return [dict(zip(columns, row)) for row in rows]
        except Exception as e:
            _logger.error(f"Error fetching cached similarities: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return []

    def clear_similarity_cache(
        self, vector_type: str, window_id: Optional[str] = None
    ) -> int:
        """Delete cached similarity rows matching vector_type and optional window_id.

        Returns count deleted.
        """
        try:
            # File-backed fallback.
            if duckdb is None:
                sim_file = f"{self.db_path}.similarity_cache.json"
                try:
                    with open(sim_file, "r+", encoding="utf-8") as fh:
                        data = json.load(fh)
                        before = len(data)
                        data = [
                            r
                            for r in data
                            if not (
                                r.get("vector_type") == vector_type
                                and (window_id is None or r.get("window_id") == window_id)
                            )
                        ]
                        deleted = before - len(data)
                        fh.seek(0)
                        json.dump(data, fh)
                        fh.truncate()
                        return deleted
                except Exception as e:
                    _logger.error(f"Error clearing similarity cache file: {e}")
                    return 0

            conn = duckdb.connect(self.db_path)
            params = [vector_type]
            count_q = "SELECT COUNT(*) FROM similarity_cache WHERE vector_type = ?"
            del_q = "DELETE FROM similarity_cache WHERE vector_type = ?"
            if window_id is not None:
                count_q += " AND window_id = ?"
                del_q += " AND window_id = ?"
                params.append(window_id)
            row = conn.execute(count_q, params).fetchone()
            to_delete = int(row[0]) if row and row[0] is not None else 0
            if to_delete > 0:
                conn.execute(del_q, params)
            conn.close()
            return to_delete
        except Exception as e:
            _logger.error(f"Error clearing similarity_cache: {e}")
            try:
                conn.close()
            except Exception:
                pass
            return 0


db = MotionDatabase()