# Path prefix that precedes the besluit_id in a motion URL.
_STEMMINGSUITSLAGEN_PREFIX = "/kamerstukken/stemmingsuitslagen/"

# Stored values that are treated as placeholders and may be overwritten
# with the title/description returned by the API.
_PLACEHOLDER_TITLE_PREFIXES = ("Motion ", "Besluit ")
_PLACEHOLDER_DESCRIPTIONS = (
    None,
    "",
    "No description available",
    "Geen beschrijving beschikbaar",
)


def extract_besluit_id(url: str) -> Optional[str]:
    """Extract the besluit_id (last path segment) from a motion URL.

    Expected format:
        https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/{besluit_id}

    A trailing slash is tolerated. The bare listing URL (no id after the
    prefix) is rejected instead of yielding "stemmingsuitslagen" as an id.

    Args:
        url: The motion URL stored in the database.

    Returns:
        The besluit_id, or None if the URL is empty, not a string, malformed,
        or does not contain the expected path prefix followed by an id.
    """
    if not url or not isinstance(url, str):
        return None

    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unterminated IPv6 host).
        return None

    # Match the prefix *including* its trailing slash so that at least one
    # character must follow it; the rstrip makes a trailing "/" harmless.
    _, found, tail = path.rstrip("/").partition(_STEMMINGSUITSLAGEN_PREFIX)
    if not found:
        return None

    besluit_id = tail.split("/")[-1]
    return besluit_id or None


def _fill_placeholders(
    title: Optional[str],
    description: Optional[str],
    details: dict,
) -> Tuple[Optional[str], Optional[str]]:
    """Return (title, description) with placeholder values replaced from details."""
    new_title = title
    if title and title.startswith(_PLACEHOLDER_TITLE_PREFIXES):
        new_title = details.get("title") or title

    new_desc = description
    if description in _PLACEHOLDER_DESCRIPTIONS:
        new_desc = details.get("description") or description

    return new_title, new_desc


def update_existing_motions(
    db_path: str,
    api: "TweedeKamerAPI",
    start_date: str,
    end_date: str,
    delay: float = 1.0,
) -> Tuple[int, int]:
    """Backfill body_text for motions that are missing it.

    Queries for motions with NULL/empty body_text in the given date range,
    extracts besluit_id from the URL, fetches details via the API, and updates
    the row. Title and description are only overwritten when the stored value
    is a placeholder.

    Exceptions raised by the API client or by DuckDB are not caught here and
    propagate to the caller.

    Args:
        db_path: Path to the DuckDB database file.
        api: TweedeKamerAPI instance (uses api._get_motion_details).
        start_date: Start date string (YYYY-MM-DD).
        end_date: End date string (YYYY-MM-DD).
        delay: Seconds to wait between API calls.

    Returns:
        (updated_count, skipped_count) tuple.
    """
    import duckdb

    # Read motions with missing body_text; close the handle even on failure.
    conn_read = duckdb.connect(db_path, read_only=True)
    try:
        rows = conn_read.execute(
            """
            SELECT id, url, title, description
            FROM motions
            WHERE date BETWEEN ? AND ?
              AND (body_text IS NULL OR body_text = '')
            """,
            (start_date, end_date),
        ).fetchall()
    finally:
        conn_read.close()

    updated = 0
    skipped = 0
    api_calls = 0

    for motion_id, url, title, description in rows:
        besluit_id = extract_besluit_id(url or "")
        if not besluit_id:
            print(f" Skipping motion {motion_id}: cannot extract besluit_id from URL")
            skipped += 1
            continue

        # Throttle before every API call except the first, so the delay also
        # applies after a row that was skipped for having no body_text, and
        # no time is wasted sleeping after the final call.
        if delay > 0 and api_calls:
            time.sleep(delay)
        api_calls += 1

        print(f" Fetching details for motion {motion_id} (besluit_id={besluit_id})...")
        details = api._get_motion_details(besluit_id)

        if not details or not details.get("body_text"):
            print(f" Skipping motion {motion_id}: no body_text returned")
            skipped += 1
            continue

        new_title, new_desc = _fill_placeholders(title, description, details)

        # A write connection is opened per row (as before) rather than held
        # across API calls and sleeps; it is always closed, even on error.
        conn_write = duckdb.connect(db_path, read_only=False)
        try:
            conn_write.execute(
                """
                UPDATE motions
                SET body_text = ?, title = ?, description = ?
                WHERE id = ?
                """,
                (details["body_text"], new_title, new_desc, motion_id),
            )
        finally:
            conn_write.close()

        updated += 1
        print(f" Updated motion {motion_id}")

    return updated, skipped
"""Tests for scripts/download_past_year.py enhancements.

Tests extract_besluit_id helper, update_existing_motions function,
and --skip-details flag wiring.
"""

import sys
import argparse
from unittest.mock import MagicMock, patch, call

import pytest

sys.path.insert(0, ".")

from scripts.download_past_year import extract_besluit_id


# Schema of the motions table used by the update_existing_motions tests.
_MOTIONS_DDL = """
    CREATE TABLE motions (
        id INTEGER DEFAULT nextval('motions_id_seq'),
        title TEXT NOT NULL,
        description TEXT,
        date DATE,
        policy_area TEXT,
        voting_results JSON,
        winning_margin FLOAT,
        controversy_score FLOAT,
        layman_explanation TEXT,
        externe_identifier TEXT,
        body_text TEXT,
        url TEXT UNIQUE,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY (id)
    )
"""


def _create_motions_db(tmp_path, *, title, url):
    """Create a DuckDB file holding one motion dated 2017-06-01 with NULL body_text.

    Returns the database path as a string.
    """
    import duckdb

    db_path = str(tmp_path / "test.db")
    conn = duckdb.connect(db_path)
    try:
        conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
        conn.execute(_MOTIONS_DDL)
        conn.execute(
            """
            INSERT INTO motions (title, description, date, url, body_text)
            VALUES (?, 'desc', '2017-06-01', ?, NULL)
            """,
            (title, url),
        )
    finally:
        # Release the file so update_existing_motions can open it itself.
        conn.close()
    return db_path


# --- extract_besluit_id tests ---


def test_extract_besluit_id_valid():
    url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789"
    assert extract_besluit_id(url) == "abc123-def456-ghi789"


def test_extract_besluit_id_trailing_slash():
    url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789/"
    assert extract_besluit_id(url) == "abc123-def456-ghi789"


def test_extract_besluit_id_invalid():
    url = "https://example.com/not-a-motion-url"
    assert extract_besluit_id(url) is None


def test_extract_besluit_id_empty():
    assert extract_besluit_id("") is None


# --- update_existing_motions tests ---


def test_update_existing_motions_updates_body_text(tmp_path):
    """Real DuckDB file + mock API: body_text returned by the API is written to the row."""
    import duckdb
    from scripts.download_past_year import update_existing_motions

    url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/besluit-id-123"
    db_path = _create_motions_db(tmp_path, title="Test Motion", url=url)

    # Mock API that returns body_text
    mock_api = MagicMock()
    mock_api._get_motion_details.return_value = {
        "title": "Real Title",
        "description": "Real Description",
        "date": "2017-06-01",
        "externe_identifier": "kst-12345",
        "body_text": "constaterende dat de motie gaat over iets belangrijks",
    }

    updated, skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert updated == 1
    assert skipped == 0

    # Look the row up by URL rather than a hardcoded id, so the check does
    # not depend on the value the sequence happened to hand out.
    conn = duckdb.connect(db_path, read_only=True)
    try:
        row = conn.execute(
            "SELECT body_text, title, description FROM motions WHERE url = ?",
            (url,),
        ).fetchone()
    finally:
        conn.close()

    assert row[0] == "constaterende dat de motie gaat over iets belangrijks"
    # "Test Motion" / "desc" are not placeholder values, so they stay as stored.
    assert row[1] == "Test Motion"
    assert row[2] == "desc"

    # Verify the API was called with the correct besluit_id
    mock_api._get_motion_details.assert_called_once_with("besluit-id-123")


def test_update_existing_motions_skips_when_no_besluit_id(tmp_path):
    """URL without valid besluit_id is skipped."""
    from scripts.download_past_year import update_existing_motions

    # A URL that won't parse to a besluit_id
    db_path = _create_motions_db(
        tmp_path,
        title="Bad URL Motion",
        url="https://example.com/not-a-valid-url",
    )

    mock_api = MagicMock()

    updated, skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert updated == 0
    assert skipped == 1
    # API should never have been called
    mock_api._get_motion_details.assert_not_called()


def test_skip_details_flag_passed_to_api():
    """Verify the argparse flags are wired correctly by parsing args."""
    from scripts.download_past_year import build_parser

    parser = build_parser()

    # Defaults: skip_details is True, update_existing is False
    defaults = parser.parse_args([])
    assert defaults.skip_details is True
    assert defaults.update_existing is False

    # Explicitly set to false via --no-skip-details
    args = parser.parse_args(["--no-skip-details"])
    assert args.skip_details is False

    # Explicitly set --update-existing
    args = parser.parse_args(["--update-existing"])
    assert args.update_existing is True