Fix update_existing_motions: single write connection and module-level duckdb import

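Use one DuckDB write connection for the entire update loop instead of
opening and closing a connection per row. The loop is wrapped in
try/finally so the connection is always closed, even if an iteration raises.
Move 'import duckdb' from inside the function to module level with the other imports.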
main
Sven Geboers 1 month ago
parent be8887f6f8
commit 88110b0aaa
  1. 106
      scripts/download_past_year.py

@@ -18,6 +18,8 @@ from datetime import datetime, timedelta
from typing import Optional, Tuple from typing import Optional, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
import duckdb
sys.path.insert(0, ".") # run from project root sys.path.insert(0, ".") # run from project root
from api_client import TweedeKamerAPI from api_client import TweedeKamerAPI
@@ -71,8 +73,6 @@ def update_existing_motions(
Returns: Returns:
(updated_count, skipped_count) tuple. (updated_count, skipped_count) tuple.
""" """
import duckdb
# Read motions with missing body_text # Read motions with missing body_text
conn_read = duckdb.connect(db_path, read_only=True) conn_read = duckdb.connect(db_path, read_only=True)
rows = conn_read.execute( rows = conn_read.execute(
@@ -89,56 +89,62 @@ def update_existing_motions(
updated = 0 updated = 0
skipped = 0 skipped = 0
for row in rows: conn_write = duckdb.connect(db_path, read_only=False)
motion_id, url, title, description = row try:
for row in rows:
besluit_id = extract_besluit_id(url or "") motion_id, url, title, description = row
if not besluit_id:
print(f" Skipping motion {motion_id}: cannot extract besluit_id from URL") besluit_id = extract_besluit_id(url or "")
skipped += 1 if not besluit_id:
continue print(
f" Skipping motion {motion_id}: cannot extract besluit_id from URL"
print(f" Fetching details for motion {motion_id} (besluit_id={besluit_id})...") )
details = api._get_motion_details(besluit_id) skipped += 1
continue
if not details or not details.get("body_text"):
print(f" Skipping motion {motion_id}: no body_text returned") print(
skipped += 1 f" Fetching details for motion {motion_id} (besluit_id={besluit_id})..."
continue )
details = api._get_motion_details(besluit_id)
# Build update: always set body_text; also update title/description if
# they were placeholder values (e.g. "Motion abc12345" or "No description available") if not details or not details.get("body_text"):
new_body = details["body_text"] print(f" Skipping motion {motion_id}: no body_text returned")
new_title = title skipped += 1
new_desc = description continue
if title and (title.startswith("Motion ") or title.startswith("Besluit ")): # Build update: always set body_text; also update title/description if
new_title = details.get("title") or title # they were placeholder values (e.g. "Motion abc12345" or "No description available")
new_body = details["body_text"]
if description in ( new_title = title
None, new_desc = description
"",
"No description available", if title and (title.startswith("Motion ") or title.startswith("Besluit ")):
"Geen beschrijving beschikbaar", new_title = details.get("title") or title
):
new_desc = details.get("description") or description if description in (
None,
conn_write = duckdb.connect(db_path, read_only=False) "",
conn_write.execute( "No description available",
""" "Geen beschrijving beschikbaar",
UPDATE motions ):
SET body_text = ?, title = ?, description = ? new_desc = details.get("description") or description
WHERE id = ?
""", conn_write.execute(
(new_body, new_title, new_desc, motion_id), """
) UPDATE motions
conn_write.close() SET body_text = ?, title = ?, description = ?
WHERE id = ?
""",
(new_body, new_title, new_desc, motion_id),
)
updated += 1 updated += 1
print(f" Updated motion {motion_id}") print(f" Updated motion {motion_id}")
if delay > 0 and updated + skipped < len(rows): if delay > 0 and updated + skipped < len(rows):
time.sleep(delay) time.sleep(delay)
finally:
conn_write.close()
return updated, skipped return updated, skipped

Loading…
Cancel
Save