Add --skip-details, --update-existing flags to download_past_year.py with tests

Enable backfilling body_text for existing motions that lack it (2016-2018 data).
New extract_besluit_id() and update_existing_motions() helpers support the
--update-existing mode, while --no-skip-details enables detail fetching during
normal downloads. Includes 7 tests covering URL parsing, DB update flow, and
argparse wiring.
main
Sven Geboers 1 month ago
parent 72a8dd2721
commit be8887f6f8
  1. 162
      scripts/download_past_year.py
  2. 182
      tests/test_download_script.py

@ -8,12 +8,15 @@ Skips AI summarisation — this is a raw data fetch for the embedding pipeline.
Usage: Usage:
uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365] uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365]
uv run python scripts/download_past_year.py --start-date 2019-01-01 --end-date 2022-01-01 uv run python scripts/download_past_year.py --start-date 2019-01-01 --end-date 2022-01-01
uv run python scripts/download_past_year.py --update-existing --start-date 2016-01-01 --end-date 2018-12-31
""" """
import argparse import argparse
import sys import sys
import time import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Optional, Tuple
from urllib.parse import urlparse
sys.path.insert(0, ".") # run from project root sys.path.insert(0, ".") # run from project root
@ -21,7 +24,127 @@ from api_client import TweedeKamerAPI
from database import MotionDatabase from database import MotionDatabase
def main(): _STEMMINGSUITSLAGEN_PREFIX = "/kamerstukken/stemmingsuitslagen/"
def extract_besluit_id(url: str) -> Optional[str]:
    """Extract the besluit_id (last path segment) from a motion URL.

    Expected format:
        https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/{besluit_id}

    A trailing slash on the URL is ignored.

    Args:
        url: Motion URL to parse.

    Returns:
        The last path segment following the stemmingsuitslagen prefix, or
        None when the URL is empty, is not a string, cannot be parsed, or has
        no segment after the prefix.
    """
    if not url or not isinstance(url, str):
        return None
    try:
        path = urlparse(url).path.rstrip("/")
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. an unbalanced
        # IPv6 bracket); treat those as "not a motion URL".
        return None

    # Match on the prefix *including* its trailing slash so that a bare
    # ".../stemmingsuitslagen" URL (no id) or a look-alike segment such as
    # ".../stemmingsuitslagenX/..." is not mistaken for a motion URL.
    prefix = _STEMMINGSUITSLAGEN_PREFIX.rstrip("/") + "/"
    _, found, tail = path.partition(prefix)
    if not found or not tail:
        return None

    # The besluit_id is the final path segment.
    return tail.rsplit("/", 1)[-1] or None
def update_existing_motions(
    db_path: str,
    api: "TweedeKamerAPI",
    start_date: str,
    end_date: str,
    delay: float = 1.0,
) -> Tuple[int, int]:
    """Backfill body_text for motions that are missing it.

    Queries for motions with NULL/empty body_text in the given date range,
    extracts besluit_id from the URL, fetches details via the API, and updates
    the row. Placeholder titles/descriptions are replaced with the fetched
    values when the API provides them; real values are left untouched.

    Args:
        db_path: Path to the DuckDB database file.
        api: TweedeKamerAPI instance (uses api._get_motion_details).
        start_date: Start date string (YYYY-MM-DD), inclusive.
        end_date: End date string (YYYY-MM-DD), inclusive.
        delay: Seconds to wait between consecutive API calls. Values <= 0
            disable throttling.

    Returns:
        (updated_count, skipped_count) tuple. A motion counts as skipped when
        no besluit_id can be extracted from its URL or when the API returns
        no body_text for it.
    """
    import duckdb

    placeholder_title_prefixes = ("Motion ", "Besluit ")
    placeholder_descriptions = (
        None,
        "",
        "No description available",
        "Geen beschrijving beschikbaar",
    )

    # Read motions with missing body_text; close the connection even if the
    # query fails so the database file is not left open.
    conn_read = duckdb.connect(db_path, read_only=True)
    try:
        rows = conn_read.execute(
            """
            SELECT id, url, title, description
            FROM motions
            WHERE date BETWEEN ? AND ?
              AND (body_text IS NULL OR body_text = '')
            """,
            (start_date, end_date),
        ).fetchall()
    finally:
        conn_read.close()

    updated = 0
    skipped = 0
    api_calls = 0

    for motion_id, url, title, description in rows:
        besluit_id = extract_besluit_id(url or "")
        if not besluit_id:
            print(f"  Skipping motion {motion_id}: cannot extract besluit_id from URL")
            skipped += 1
            continue

        # Throttle before every API call except the first. Sleeping here
        # (rather than after a successful update) also spaces out calls that
        # come back without body_text.
        if delay > 0 and api_calls:
            time.sleep(delay)
        api_calls += 1

        print(f"  Fetching details for motion {motion_id} (besluit_id={besluit_id})...")
        details = api._get_motion_details(besluit_id)
        if not details or not details.get("body_text"):
            print(f"  Skipping motion {motion_id}: no body_text returned")
            skipped += 1
            continue

        # Build update: always set body_text; also update title/description if
        # they were placeholder values (e.g. "Motion abc12345" or
        # "No description available").
        new_body = details["body_text"]
        new_title = title
        new_desc = description
        if title and title.startswith(placeholder_title_prefixes):
            new_title = details.get("title") or title
        if description in placeholder_descriptions:
            new_desc = details.get("description") or description

        # A short-lived write connection per row, as in the original flow;
        # try/finally guarantees it is released if the UPDATE raises.
        conn_write = duckdb.connect(db_path, read_only=False)
        try:
            conn_write.execute(
                """
                UPDATE motions
                SET body_text = ?, title = ?, description = ?
                WHERE id = ?
                """,
                (new_body, new_title, new_desc, motion_id),
            )
        finally:
            conn_write.close()

        updated += 1
        print(f"  Updated motion {motion_id}")

    return updated, skipped
def build_parser() -> argparse.ArgumentParser:
"""Build and return the argument parser for the download script."""
parser = argparse.ArgumentParser(description="Download motions for a date range") parser = argparse.ArgumentParser(description="Download motions for a date range")
parser.add_argument("--db-path", default="data/motions.db") parser.add_argument("--db-path", default="data/motions.db")
parser.add_argument( parser.add_argument(
@ -52,6 +175,23 @@ def main():
parser.add_argument( parser.add_argument(
"--delay", type=float, default=2.0, help="Seconds between chunks" "--delay", type=float, default=2.0, help="Seconds between chunks"
) )
parser.add_argument(
"--skip-details",
action=argparse.BooleanOptionalAction,
default=True,
help="Skip fetching per-motion details (default: True). Use --no-skip-details to fetch body text.",
)
parser.add_argument(
"--update-existing",
action="store_true",
default=False,
help="Backfill body_text for existing motions with missing text. Skips normal download.",
)
return parser
def main():
parser = build_parser()
args = parser.parse_args() args = parser.parse_args()
api = TweedeKamerAPI() api = TweedeKamerAPI()
@ -73,6 +213,24 @@ def main():
print(f"DB: {args.db_path}") print(f"DB: {args.db_path}")
print() print()
# --- Update-existing mode: backfill body_text, then exit ---
if args.update_existing:
print("Mode: update-existing (backfilling body_text for existing motions)\n")
updated, skipped = update_existing_motions(
db_path=args.db_path,
api=api,
start_date=start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
delay=args.delay,
)
print()
print("=" * 50)
print(f"Done. Updated: {updated} | Skipped: {skipped}")
print("=" * 50)
return
# --- Normal download mode ---
# Test connectivity first # Test connectivity first
test_url = f"{api.odata_base_url}/Stemming" test_url = f"{api.odata_base_url}/Stemming"
r = api.session.get(test_url, params={"$top": 1}, timeout=10) r = api.session.get(test_url, params={"$top": 1}, timeout=10)
@ -98,7 +256,7 @@ def main():
start_date=chunk_start, start_date=chunk_start,
end_date=chunk_end, end_date=chunk_end,
limit=args.limit_per_chunk, limit=args.limit_per_chunk,
skip_details=True, skip_details=args.skip_details,
) )
print(f" Fetched {len(motions)} motions") print(f" Fetched {len(motions)} motions")
total_fetched += len(motions) total_fetched += len(motions)

@ -0,0 +1,182 @@
"""Tests for scripts/download_past_year.py enhancements.
Tests extract_besluit_id helper, update_existing_motions function,
and --skip-details flag wiring.
"""
import sys
import argparse
from unittest.mock import MagicMock, patch, call
import pytest
sys.path.insert(0, ".")
from scripts.download_past_year import extract_besluit_id
# --- extract_besluit_id tests ---
def test_extract_besluit_id_valid():
    """A well-formed stemmingsuitslagen URL yields its final path segment."""
    motion_url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789"
    expected_id = "abc123-def456-ghi789"

    result = extract_besluit_id(motion_url)

    assert result == expected_id
def test_extract_besluit_id_trailing_slash():
    """A trailing slash on the URL does not change the extracted id."""
    motion_url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789/"
    expected_id = "abc123-def456-ghi789"

    result = extract_besluit_id(motion_url)

    assert result == expected_id
def test_extract_besluit_id_invalid():
    """A URL outside the stemmingsuitslagen path produces no id."""
    unrelated_url = "https://example.com/not-a-motion-url"

    result = extract_besluit_id(unrelated_url)

    assert result is None
def test_extract_besluit_id_empty():
    """An empty string produces no id."""
    result = extract_besluit_id("")

    assert result is None
# --- update_existing_motions tests ---
def _create_motions_db(tmp_path, *, title, url):
    """Create a DuckDB file holding one motion with NULL body_text.

    The motion is dated 2017-06-01 and has description 'desc'. Returns the
    database path as a string.
    """
    import duckdb

    db_path = str(tmp_path / "test.db")
    conn = duckdb.connect(db_path)
    try:
        conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
        conn.execute("""
            CREATE TABLE motions (
                id INTEGER DEFAULT nextval('motions_id_seq'),
                title TEXT NOT NULL,
                description TEXT,
                date DATE,
                policy_area TEXT,
                voting_results JSON,
                winning_margin FLOAT,
                controversy_score FLOAT,
                layman_explanation TEXT,
                externe_identifier TEXT,
                body_text TEXT,
                url TEXT UNIQUE,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)
        conn.execute(
            """
            INSERT INTO motions (title, description, date, url, body_text)
            VALUES (?, 'desc', '2017-06-01', ?, NULL)
            """,
            (title, url),
        )
    finally:
        # Release the file so update_existing_motions can open it.
        conn.close()
    return db_path


def test_update_existing_motions_updates_body_text(tmp_path):
    """Real DuckDB file + mock API: body_text is written, real title/description kept."""
    import duckdb
    from scripts.download_past_year import update_existing_motions

    db_path = _create_motions_db(
        tmp_path,
        title="Test Motion",
        url="https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/besluit-id-123",
    )

    # Mock API that returns body_text
    mock_api = MagicMock()
    mock_api._get_motion_details.return_value = {
        "title": "Real Title",
        "description": "Real Description",
        "date": "2017-06-01",
        "externe_identifier": "kst-12345",
        "body_text": "constaterende dat de motie gaat over iets belangrijks",
    }

    updated, skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert updated == 1
    assert skipped == 0

    # Verify the body_text was actually written to the DB
    conn = duckdb.connect(db_path, read_only=True)
    try:
        row = conn.execute(
            "SELECT body_text, title, description FROM motions WHERE id = 1"
        ).fetchone()
    finally:
        conn.close()
    assert row[0] == "constaterende dat de motie gaat over iets belangrijks"
    # 'Test Motion' / 'desc' are not placeholder values, so they must survive.
    assert row[1] == "Test Motion"
    assert row[2] == "desc"

    # Verify the API was called with the correct besluit_id
    mock_api._get_motion_details.assert_called_once_with("besluit-id-123")


def test_update_existing_motions_skips_when_no_besluit_id(tmp_path):
    """URL without valid besluit_id is skipped."""
    from scripts.download_past_year import update_existing_motions

    # A motion whose URL won't parse to a besluit_id
    db_path = _create_motions_db(
        tmp_path,
        title="Bad URL Motion",
        url="https://example.com/not-a-valid-url",
    )

    mock_api = MagicMock()

    updated, skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert updated == 0
    assert skipped == 1
    # API should never have been called
    mock_api._get_motion_details.assert_not_called()
def test_skip_details_flag_passed_to_api():
    """Check the --skip-details / --update-existing flags as parsed by build_parser."""
    from scripts.download_past_year import build_parser

    cli = build_parser()

    # With no arguments, details are skipped
    no_args = cli.parse_args([])
    assert no_args.skip_details is True

    # --no-skip-details turns detail fetching on
    with_details = cli.parse_args(["--no-skip-details"])
    assert with_details.skip_details is False

    # --update-existing switches on backfill mode
    backfill = cli.parse_args(["--update-existing"])
    assert backfill.update_existing is True

    # With no arguments, backfill mode is off
    no_args_again = cli.parse_args([])
    assert no_args_again.update_existing is False
Loading…
Cancel
Save