Fix update_existing_motions: single write connection and module-level duckdb import

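Use one DuckDB write connection for the entire update loop instead of
opening and closing a new connection for every row. The loop is wrapped in
try/finally so the connection is always closed, even if an update fails.
Move 'import duckdb' from inside the function to module level with the
other imports.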
main
Sven Geboers 1 month ago
parent be8887f6f8
commit 88110b0aaa
  1. 18
      scripts/download_past_year.py

@ -18,6 +18,8 @@ from datetime import datetime, timedelta
from typing import Optional, Tuple from typing import Optional, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
import duckdb
sys.path.insert(0, ".") # run from project root sys.path.insert(0, ".") # run from project root
from api_client import TweedeKamerAPI from api_client import TweedeKamerAPI
@ -71,8 +73,6 @@ def update_existing_motions(
Returns: Returns:
(updated_count, skipped_count) tuple. (updated_count, skipped_count) tuple.
""" """
import duckdb
# Read motions with missing body_text # Read motions with missing body_text
conn_read = duckdb.connect(db_path, read_only=True) conn_read = duckdb.connect(db_path, read_only=True)
rows = conn_read.execute( rows = conn_read.execute(
@ -89,16 +89,22 @@ def update_existing_motions(
updated = 0 updated = 0
skipped = 0 skipped = 0
conn_write = duckdb.connect(db_path, read_only=False)
try:
for row in rows: for row in rows:
motion_id, url, title, description = row motion_id, url, title, description = row
besluit_id = extract_besluit_id(url or "") besluit_id = extract_besluit_id(url or "")
if not besluit_id: if not besluit_id:
print(f" Skipping motion {motion_id}: cannot extract besluit_id from URL") print(
f" Skipping motion {motion_id}: cannot extract besluit_id from URL"
)
skipped += 1 skipped += 1
continue continue
print(f" Fetching details for motion {motion_id} (besluit_id={besluit_id})...") print(
f" Fetching details for motion {motion_id} (besluit_id={besluit_id})..."
)
details = api._get_motion_details(besluit_id) details = api._get_motion_details(besluit_id)
if not details or not details.get("body_text"): if not details or not details.get("body_text"):
@ -123,7 +129,6 @@ def update_existing_motions(
): ):
new_desc = details.get("description") or description new_desc = details.get("description") or description
conn_write = duckdb.connect(db_path, read_only=False)
conn_write.execute( conn_write.execute(
""" """
UPDATE motions UPDATE motions
@ -132,13 +137,14 @@ def update_existing_motions(
""", """,
(new_body, new_title, new_desc, motion_id), (new_body, new_title, new_desc, motion_id),
) )
conn_write.close()
updated += 1 updated += 1
print(f" Updated motion {motion_id}") print(f" Updated motion {motion_id}")
if delay > 0 and updated + skipped < len(rows): if delay > 0 and updated + skipped < len(rows):
time.sleep(delay) time.sleep(delay)
finally:
conn_write.close()
return updated, skipped return updated, skipped

Loading…
Cancel
Save