From 88110b0aaa0db182d79451f26cee8b3806c748a7 Mon Sep 17 00:00:00 2001 From: Sven Geboers Date: Sun, 29 Mar 2026 23:28:40 +0200 Subject: [PATCH] Fix update_existing_motions: single write connection and module-level duckdb import Use one DuckDB write connection for the entire update loop instead of opening/closing per row, wrapped in try/finally for proper cleanup. Move 'import duckdb' to module level with other imports. --- scripts/download_past_year.py | 106 ++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/scripts/download_past_year.py b/scripts/download_past_year.py index 50bf060..a1f3e69 100644 --- a/scripts/download_past_year.py +++ b/scripts/download_past_year.py @@ -18,6 +18,8 @@ from datetime import datetime, timedelta from typing import Optional, Tuple from urllib.parse import urlparse +import duckdb + sys.path.insert(0, ".") # run from project root from api_client import TweedeKamerAPI @@ -71,8 +73,6 @@ def update_existing_motions( Returns: (updated_count, skipped_count) tuple. """ - import duckdb - # Read motions with missing body_text conn_read = duckdb.connect(db_path, read_only=True) rows = conn_read.execute( @@ -89,56 +89,62 @@ def update_existing_motions( updated = 0 skipped = 0 - for row in rows: - motion_id, url, title, description = row - - besluit_id = extract_besluit_id(url or "") - if not besluit_id: - print(f" Skipping motion {motion_id}: cannot extract besluit_id from URL") - skipped += 1 - continue - - print(f" Fetching details for motion {motion_id} (besluit_id={besluit_id})...") - details = api._get_motion_details(besluit_id) - - if not details or not details.get("body_text"): - print(f" Skipping motion {motion_id}: no body_text returned") - skipped += 1 - continue - - # Build update: always set body_text; also update title/description if - # they were placeholder values (e.g. "Motion abc12345" or "No description available") - new_body = details["body_text"] - new_title = title - new_desc = description - - if title and (title.startswith("Motion ") or title.startswith("Besluit ")): - new_title = details.get("title") or title - - if description in ( - None, - "", - "No description available", - "Geen beschrijving beschikbaar", - ): - new_desc = details.get("description") or description - - conn_write = duckdb.connect(db_path, read_only=False) - conn_write.execute( - """ - UPDATE motions - SET body_text = ?, title = ?, description = ? - WHERE id = ? - """, - (new_body, new_title, new_desc, motion_id), - ) - conn_write.close() + conn_write = duckdb.connect(db_path, read_only=False) + try: + for row in rows: + motion_id, url, title, description = row + + besluit_id = extract_besluit_id(url or "") + if not besluit_id: + print( + f" Skipping motion {motion_id}: cannot extract besluit_id from URL" + ) + skipped += 1 + continue + + print( + f" Fetching details for motion {motion_id} (besluit_id={besluit_id})..." + ) + details = api._get_motion_details(besluit_id) + + if not details or not details.get("body_text"): + print(f" Skipping motion {motion_id}: no body_text returned") + skipped += 1 + continue + + # Build update: always set body_text; also update title/description if + # they were placeholder values (e.g. "Motion abc12345" or "No description available") + new_body = details["body_text"] + new_title = title + new_desc = description + + if title and (title.startswith("Motion ") or title.startswith("Besluit ")): + new_title = details.get("title") or title + + if description in ( + None, + "", + "No description available", + "Geen beschrijving beschikbaar", + ): + new_desc = details.get("description") or description + + conn_write.execute( + """ + UPDATE motions + SET body_text = ?, title = ?, description = ? + WHERE id = ? + """, + (new_body, new_title, new_desc, motion_id), + ) - updated += 1 - print(f" Updated motion {motion_id}") + updated += 1 + print(f" Updated motion {motion_id}") - if delay > 0 and updated + skipped < len(rows): - time.sleep(delay) + if delay > 0 and updated + skipped < len(rows): + time.sleep(delay) + finally: + conn_write.close() return updated, skipped