Add --skip-details, --update-existing flags to download_past_year.py with tests

Enable backfilling body_text for existing motions that lack it (2016-2018 data).
New extract_besluit_id() and update_existing_motions() helpers support the
--update-existing mode, while --no-skip-details enables detail fetching during
normal downloads. Includes 7 tests covering URL parsing, DB update flow, and
argparse wiring.
main
Sven Geboers 1 month ago
parent 72a8dd2721
commit be8887f6f8
  1. 162
      scripts/download_past_year.py
  2. 182
      tests/test_download_script.py

@ -8,12 +8,15 @@ Skips AI summarisation — this is a raw data fetch for the embedding pipeline.
Usage:
uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365]
uv run python scripts/download_past_year.py --start-date 2019-01-01 --end-date 2022-01-01
uv run python scripts/download_past_year.py --update-existing --start-date 2016-01-01 --end-date 2018-12-31
"""
import argparse
import sys
import time
from datetime import datetime, timedelta
from typing import Optional, Tuple
from urllib.parse import urlparse
sys.path.insert(0, ".") # run from project root
@ -21,7 +24,127 @@ from api_client import TweedeKamerAPI
from database import MotionDatabase
def main():
_STEMMINGSUITSLAGEN_PREFIX = "/kamerstukken/stemmingsuitslagen/"


def extract_besluit_id(url: str) -> Optional[str]:
    """Extract the besluit_id (last path segment) from a motion URL.

    Expected format:
        https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/{besluit_id}

    A trailing slash after the id is tolerated.

    Returns:
        The besluit_id string, or None when the URL is empty, does not
        contain the stemmingsuitslagen path, or has no id segment after
        the prefix (e.g. the bare listing page).
    """
    if not url:
        return None
    try:
        path = urlparse(url).path.rstrip("/")
    except Exception:
        # urlparse rarely raises, but a malformed URL must not crash a
        # long-running backfill run; treat it as "no id found".
        return None
    prefix = _STEMMINGSUITSLAGEN_PREFIX.rstrip("/")
    if prefix not in path:
        return None
    # Bug fix: a URL ending at the prefix itself (the listing page,
    # e.g. .../stemmingsuitslagen/) previously returned the literal
    # segment "stemmingsuitslagen" as if it were a besluit_id.
    if path.endswith(prefix):
        return None
    # Last path segment is the besluit_id.
    besluit_id = path.split("/")[-1]
    return besluit_id or None
def update_existing_motions(
    db_path: str,
    api: "TweedeKamerAPI",
    start_date: str,
    end_date: str,
    delay: float = 1.0,
) -> Tuple[int, int]:
    """Backfill body_text for motions that are missing it.

    Queries for motions with NULL/empty body_text in the given date range,
    extracts besluit_id from the URL, fetches details via the API, and
    updates the row. Placeholder titles ("Motion ..." / "Besluit ...") and
    placeholder descriptions are replaced with real values when the API
    provides them.

    Args:
        db_path: Path to the DuckDB database file.
        api: TweedeKamerAPI instance (uses api._get_motion_details).
        start_date: Start date string (YYYY-MM-DD).
        end_date: End date string (YYYY-MM-DD).
        delay: Seconds to wait between API calls.

    Returns:
        (updated_count, skipped_count) tuple.
    """
    import duckdb  # local import keeps the module importable without duckdb

    # Read candidates with a short-lived read-only connection so the file
    # is not held open while we talk to the API.
    conn_read = duckdb.connect(db_path, read_only=True)
    try:
        rows = conn_read.execute(
            """
            SELECT id, url, title, description
            FROM motions
            WHERE date BETWEEN ? AND ?
              AND (body_text IS NULL OR body_text = '')
            """,
            (start_date, end_date),
        ).fetchall()
    finally:
        conn_read.close()

    updated = 0
    skipped = 0
    if not rows:
        return updated, skipped

    # Fix: the original opened and closed a fresh write connection for every
    # updated row; one connection for the whole batch avoids the repeated
    # connect/lock overhead, and try/finally guarantees it is closed even if
    # an API call raises.
    conn_write = duckdb.connect(db_path, read_only=False)
    try:
        for index, (motion_id, url, title, description) in enumerate(rows):
            besluit_id = extract_besluit_id(url or "")
            if not besluit_id:
                print(f"  Skipping motion {motion_id}: cannot extract besluit_id from URL")
                skipped += 1
                # No API call was made, so no rate-limit sleep is needed.
                continue

            print(f"  Fetching details for motion {motion_id} (besluit_id={besluit_id})...")
            details = api._get_motion_details(besluit_id)
            if not details or not details.get("body_text"):
                print(f"  Skipping motion {motion_id}: no body_text returned")
                skipped += 1
            else:
                # Build update: always set body_text; also update title and
                # description if they were placeholder values (e.g.
                # "Motion abc12345" or "No description available").
                new_body = details["body_text"]
                new_title = title
                new_desc = description
                if title and (title.startswith("Motion ") or title.startswith("Besluit ")):
                    new_title = details.get("title") or title
                if description in (
                    None,
                    "",
                    "No description available",
                    "Geen beschrijving beschikbaar",
                ):
                    new_desc = details.get("description") or description
                conn_write.execute(
                    """
                    UPDATE motions
                    SET body_text = ?, title = ?, description = ?
                    WHERE id = ?
                    """,
                    (new_body, new_title, new_desc, motion_id),
                )
                updated += 1
                print(f"  Updated motion {motion_id}")

            # Rate-limit actual API traffic only; skip the pointless final
            # sleep after the last row.
            if delay > 0 and index + 1 < len(rows):
                time.sleep(delay)
    finally:
        conn_write.close()

    return updated, skipped
def build_parser() -> argparse.ArgumentParser:
"""Build and return the argument parser for the download script."""
parser = argparse.ArgumentParser(description="Download motions for a date range")
parser.add_argument("--db-path", default="data/motions.db")
parser.add_argument(
@ -52,6 +175,23 @@ def main():
parser.add_argument(
"--delay", type=float, default=2.0, help="Seconds between chunks"
)
parser.add_argument(
"--skip-details",
action=argparse.BooleanOptionalAction,
default=True,
help="Skip fetching per-motion details (default: True). Use --no-skip-details to fetch body text.",
)
parser.add_argument(
"--update-existing",
action="store_true",
default=False,
help="Backfill body_text for existing motions with missing text. Skips normal download.",
)
return parser
def main():
parser = build_parser()
args = parser.parse_args()
api = TweedeKamerAPI()
@ -73,6 +213,24 @@ def main():
print(f"DB: {args.db_path}")
print()
# --- Update-existing mode: backfill body_text, then exit ---
if args.update_existing:
print("Mode: update-existing (backfilling body_text for existing motions)\n")
updated, skipped = update_existing_motions(
db_path=args.db_path,
api=api,
start_date=start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
delay=args.delay,
)
print()
print("=" * 50)
print(f"Done. Updated: {updated} | Skipped: {skipped}")
print("=" * 50)
return
# --- Normal download mode ---
# Test connectivity first
test_url = f"{api.odata_base_url}/Stemming"
r = api.session.get(test_url, params={"$top": 1}, timeout=10)
@ -98,7 +256,7 @@ def main():
start_date=chunk_start,
end_date=chunk_end,
limit=args.limit_per_chunk,
skip_details=True,
skip_details=args.skip_details,
)
print(f" Fetched {len(motions)} motions")
total_fetched += len(motions)

@ -0,0 +1,182 @@
"""Tests for scripts/download_past_year.py enhancements.
Tests extract_besluit_id helper, update_existing_motions function,
and --skip-details flag wiring.
"""
import sys
import argparse
from unittest.mock import MagicMock, patch, call
import pytest
sys.path.insert(0, ".")
from scripts.download_past_year import extract_besluit_id
# --- extract_besluit_id tests ---
def test_extract_besluit_id_valid():
    """A canonical stemmingsuitslagen URL yields its last path segment."""
    motion_url = (
        "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789"
    )
    result = extract_besluit_id(motion_url)
    assert result == "abc123-def456-ghi789"
def test_extract_besluit_id_trailing_slash():
    """A trailing slash is stripped before the id is extracted."""
    motion_url = (
        "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789/"
    )
    assert extract_besluit_id(motion_url) == "abc123-def456-ghi789"
def test_extract_besluit_id_invalid():
    """A URL outside the stemmingsuitslagen path yields None."""
    unrelated_url = "https://example.com/not-a-motion-url"
    assert extract_besluit_id(unrelated_url) is None
def test_extract_besluit_id_empty():
    """An empty string yields None instead of raising."""
    assert extract_besluit_id("") is None
# --- update_existing_motions tests ---
def test_update_existing_motions_updates_body_text(tmp_path):
    """Mock DuckDB + mock API, verify UPDATE is called with correct body_text."""
    import duckdb
    from scripts.download_past_year import update_existing_motions

    db_file = str(tmp_path / "test.db")
    expected_body = "constaterende dat de motie gaat over iets belangrijks"

    # Build a real DuckDB database holding one motion without body_text.
    setup_conn = duckdb.connect(db_file)
    setup_conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    setup_conn.execute("""
        CREATE TABLE motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """)
    setup_conn.execute("""
        INSERT INTO motions (title, description, date, url, body_text)
        VALUES ('Test Motion', 'desc', '2017-06-01',
                'https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/besluit-id-123',
                NULL)
    """)
    setup_conn.close()

    # API stub that returns complete details, including body text.
    fake_api = MagicMock()
    fake_api._get_motion_details.return_value = {
        "title": "Real Title",
        "description": "Real Description",
        "date": "2017-06-01",
        "externe_identifier": "kst-12345",
        "body_text": expected_body,
    }

    updated, skipped = update_existing_motions(
        db_path=db_file,
        api=fake_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert (updated, skipped) == (1, 0)

    # The row must now contain the fetched body text.
    check_conn = duckdb.connect(db_file, read_only=True)
    stored = check_conn.execute("SELECT body_text FROM motions WHERE id = 1").fetchone()
    check_conn.close()
    assert stored[0] == expected_body

    # The besluit_id parsed from the URL must have driven the API call.
    fake_api._get_motion_details.assert_called_once_with("besluit-id-123")
def test_update_existing_motions_skips_when_no_besluit_id(tmp_path):
    """URL without valid besluit_id is skipped."""
    import duckdb
    from scripts.download_past_year import update_existing_motions

    db_file = str(tmp_path / "test.db")

    setup_conn = duckdb.connect(db_file)
    setup_conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    setup_conn.execute("""
        CREATE TABLE motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """)
    # This URL does not contain the stemmingsuitslagen path, so no
    # besluit_id can be derived from it.
    setup_conn.execute("""
        INSERT INTO motions (title, description, date, url, body_text)
        VALUES ('Bad URL Motion', 'desc', '2017-06-01',
                'https://example.com/not-a-valid-url',
                NULL)
    """)
    setup_conn.close()

    fake_api = MagicMock()
    updated, skipped = update_existing_motions(
        db_path=db_file,
        api=fake_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert (updated, skipped) == (0, 1)
    # Without a besluit_id the API must never be contacted.
    fake_api._get_motion_details.assert_not_called()
def test_skip_details_flag_passed_to_api():
    """Verify the argparse flag is wired correctly by parsing args."""
    from scripts.download_past_year import build_parser

    parser = build_parser()

    # Defaults: details are skipped, backfill mode is off.
    defaults = parser.parse_args([])
    assert defaults.skip_details is True
    assert defaults.update_existing is False

    # --no-skip-details turns detail fetching back on.
    assert parser.parse_args(["--no-skip-details"]).skip_details is False

    # --update-existing switches to backfill mode.
    assert parser.parse_args(["--update-existing"]).update_existing is True
Loading…
Cancel
Save