Fix update_existing_motions: single write connection and module-level duckdb import

Use one DuckDB write connection for the entire update loop instead of
opening and closing a new connection for every row. The loop is wrapped in
try/finally so the connection is always closed, even if an update raises.
Move 'import duckdb' out of the function body to the module-level imports.
main
Sven Geboers 1 month ago
parent be8887f6f8
commit 88110b0aaa
  1. 106
      scripts/download_past_year.py

@ -18,6 +18,8 @@ from datetime import datetime, timedelta
from typing import Optional, Tuple
from urllib.parse import urlparse
import duckdb
sys.path.insert(0, ".") # run from project root
from api_client import TweedeKamerAPI
@ -71,8 +73,6 @@ def update_existing_motions(
Returns:
(updated_count, skipped_count) tuple.
"""
import duckdb
# Read motions with missing body_text
conn_read = duckdb.connect(db_path, read_only=True)
rows = conn_read.execute(
@ -89,56 +89,62 @@ def update_existing_motions(
updated = 0
skipped = 0
for row in rows:
motion_id, url, title, description = row
besluit_id = extract_besluit_id(url or "")
if not besluit_id:
print(f" Skipping motion {motion_id}: cannot extract besluit_id from URL")
skipped += 1
continue
print(f" Fetching details for motion {motion_id} (besluit_id={besluit_id})...")
details = api._get_motion_details(besluit_id)
if not details or not details.get("body_text"):
print(f" Skipping motion {motion_id}: no body_text returned")
skipped += 1
continue
# Build update: always set body_text; also update title/description if
# they were placeholder values (e.g. "Motion abc12345" or "No description available")
new_body = details["body_text"]
new_title = title
new_desc = description
if title and (title.startswith("Motion ") or title.startswith("Besluit ")):
new_title = details.get("title") or title
if description in (
None,
"",
"No description available",
"Geen beschrijving beschikbaar",
):
new_desc = details.get("description") or description
conn_write = duckdb.connect(db_path, read_only=False)
conn_write.execute(
"""
UPDATE motions
SET body_text = ?, title = ?, description = ?
WHERE id = ?
""",
(new_body, new_title, new_desc, motion_id),
)
conn_write.close()
conn_write = duckdb.connect(db_path, read_only=False)
try:
for row in rows:
motion_id, url, title, description = row
besluit_id = extract_besluit_id(url or "")
if not besluit_id:
print(
f" Skipping motion {motion_id}: cannot extract besluit_id from URL"
)
skipped += 1
continue
print(
f" Fetching details for motion {motion_id} (besluit_id={besluit_id})..."
)
details = api._get_motion_details(besluit_id)
if not details or not details.get("body_text"):
print(f" Skipping motion {motion_id}: no body_text returned")
skipped += 1
continue
# Build update: always set body_text; also update title/description if
# they were placeholder values (e.g. "Motion abc12345" or "No description available")
new_body = details["body_text"]
new_title = title
new_desc = description
if title and (title.startswith("Motion ") or title.startswith("Besluit ")):
new_title = details.get("title") or title
if description in (
None,
"",
"No description available",
"Geen beschrijving beschikbaar",
):
new_desc = details.get("description") or description
conn_write.execute(
"""
UPDATE motions
SET body_text = ?, title = ?, description = ?
WHERE id = ?
""",
(new_body, new_title, new_desc, motion_id),
)
updated += 1
print(f" Updated motion {motion_id}")
updated += 1
print(f" Updated motion {motion_id}")
if delay > 0 and updated + skipped < len(rows):
time.sleep(delay)
if delay > 0 and updated + skipped < len(rows):
time.sleep(delay)
finally:
conn_write.close()
return updated, skipped

Loading…
Cancel
Save