Add --skip-details, --update-existing flags to download_past_year.py with tests

Enable backfilling body_text for existing motions that lack it (2016-2018 data).
New extract_besluit_id() and update_existing_motions() helpers support the
--update-existing mode, while --no-skip-details enables detail fetching during
normal downloads. Includes 7 tests covering URL parsing, DB update flow, and
argparse wiring.
main
Sven Geboers 1 month ago
parent 72a8dd2721
commit be8887f6f8
  1. 162
      scripts/download_past_year.py
  2. 182
      tests/test_download_script.py

@ -8,12 +8,15 @@ Skips AI summarisation — this is a raw data fetch for the embedding pipeline.
Usage:
uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365]
uv run python scripts/download_past_year.py --start-date 2019-01-01 --end-date 2022-01-01
uv run python scripts/download_past_year.py --update-existing --start-date 2016-01-01 --end-date 2018-12-31
"""
import argparse
import sys
import time
from datetime import datetime, timedelta
from typing import Optional, Tuple
from urllib.parse import urlparse
sys.path.insert(0, ".") # run from project root
@ -21,7 +24,127 @@ from api_client import TweedeKamerAPI
from database import MotionDatabase
def main():
_STEMMINGSUITSLAGEN_PREFIX = "/kamerstukken/stemmingsuitslagen/"


def extract_besluit_id(url: str) -> Optional[str]:
    """Extract the besluit_id (last path segment) from a motion URL.

    Expected format:
        https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/{besluit_id}

    A trailing slash after the id is tolerated.

    Returns:
        The besluit_id string, or None when the URL is empty, does not
        contain the stemmingsuitslagen path, or has no id segment after
        the prefix (e.g. the bare listing page).
    """
    if not url:
        return None
    try:
        path = urlparse(url).path.rstrip("/")
    except Exception:
        # urlparse rarely raises, but a malformed URL must not crash a
        # long-running backfill run; treat it as "no id found".
        return None
    prefix = _STEMMINGSUITSLAGEN_PREFIX.rstrip("/")
    if prefix not in path:
        return None
    # Bug fix: a URL ending at the prefix itself (the listing page,
    # e.g. .../stemmingsuitslagen/) previously returned the literal
    # segment "stemmingsuitslagen" as if it were a besluit_id.
    if path.endswith(prefix):
        return None
    # Last path segment is the besluit_id.
    besluit_id = path.split("/")[-1]
    return besluit_id or None
def update_existing_motions(
    db_path: str,
    api: "TweedeKamerAPI",
    start_date: str,
    end_date: str,
    delay: float = 1.0,
) -> Tuple[int, int]:
    """Backfill body_text for motions that are missing it.

    Queries for motions with NULL/empty body_text in the given date range,
    extracts besluit_id from the URL, fetches details via the API, and
    updates the row. Placeholder titles ("Motion ..." / "Besluit ...") and
    placeholder descriptions are replaced with real values when the API
    provides them.

    Args:
        db_path: Path to the DuckDB database file.
        api: TweedeKamerAPI instance (uses api._get_motion_details).
        start_date: Start date string (YYYY-MM-DD).
        end_date: End date string (YYYY-MM-DD).
        delay: Seconds to wait between API calls.

    Returns:
        (updated_count, skipped_count) tuple.
    """
    import duckdb  # local import keeps the module importable without duckdb

    # Read candidates with a short-lived read-only connection so the file
    # is not held open while we talk to the API.
    conn_read = duckdb.connect(db_path, read_only=True)
    try:
        rows = conn_read.execute(
            """
            SELECT id, url, title, description
            FROM motions
            WHERE date BETWEEN ? AND ?
              AND (body_text IS NULL OR body_text = '')
            """,
            (start_date, end_date),
        ).fetchall()
    finally:
        conn_read.close()

    updated = 0
    skipped = 0
    if not rows:
        return updated, skipped

    # Fix: the original opened and closed a fresh write connection for every
    # updated row; one connection for the whole batch avoids the repeated
    # connect/lock overhead, and try/finally guarantees it is closed even if
    # an API call raises.
    conn_write = duckdb.connect(db_path, read_only=False)
    try:
        for index, (motion_id, url, title, description) in enumerate(rows):
            besluit_id = extract_besluit_id(url or "")
            if not besluit_id:
                print(f"  Skipping motion {motion_id}: cannot extract besluit_id from URL")
                skipped += 1
                # No API call was made, so no rate-limit sleep is needed.
                continue

            print(f"  Fetching details for motion {motion_id} (besluit_id={besluit_id})...")
            details = api._get_motion_details(besluit_id)
            if not details or not details.get("body_text"):
                print(f"  Skipping motion {motion_id}: no body_text returned")
                skipped += 1
            else:
                # Build update: always set body_text; also update title and
                # description if they were placeholder values (e.g.
                # "Motion abc12345" or "No description available").
                new_body = details["body_text"]
                new_title = title
                new_desc = description
                if title and (title.startswith("Motion ") or title.startswith("Besluit ")):
                    new_title = details.get("title") or title
                if description in (
                    None,
                    "",
                    "No description available",
                    "Geen beschrijving beschikbaar",
                ):
                    new_desc = details.get("description") or description
                conn_write.execute(
                    """
                    UPDATE motions
                    SET body_text = ?, title = ?, description = ?
                    WHERE id = ?
                    """,
                    (new_body, new_title, new_desc, motion_id),
                )
                updated += 1
                print(f"  Updated motion {motion_id}")

            # Rate-limit actual API traffic only; skip the pointless final
            # sleep after the last row.
            if delay > 0 and index + 1 < len(rows):
                time.sleep(delay)
    finally:
        conn_write.close()

    return updated, skipped
def build_parser() -> argparse.ArgumentParser:
"""Build and return the argument parser for the download script."""
parser = argparse.ArgumentParser(description="Download motions for a date range")
parser.add_argument("--db-path", default="data/motions.db")
parser.add_argument(
@ -52,6 +175,23 @@ def main():
parser.add_argument(
"--delay", type=float, default=2.0, help="Seconds between chunks"
)
parser.add_argument(
"--skip-details",
action=argparse.BooleanOptionalAction,
default=True,
help="Skip fetching per-motion details (default: True). Use --no-skip-details to fetch body text.",
)
parser.add_argument(
"--update-existing",
action="store_true",
default=False,
help="Backfill body_text for existing motions with missing text. Skips normal download.",
)
return parser
def main():
parser = build_parser()
args = parser.parse_args()
api = TweedeKamerAPI()
@ -73,6 +213,24 @@ def main():
print(f"DB: {args.db_path}")
print()
# --- Update-existing mode: backfill body_text, then exit ---
if args.update_existing:
print("Mode: update-existing (backfilling body_text for existing motions)\n")
updated, skipped = update_existing_motions(
db_path=args.db_path,
api=api,
start_date=start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
delay=args.delay,
)
print()
print("=" * 50)
print(f"Done. Updated: {updated} | Skipped: {skipped}")
print("=" * 50)
return
# --- Normal download mode ---
# Test connectivity first
test_url = f"{api.odata_base_url}/Stemming"
r = api.session.get(test_url, params={"$top": 1}, timeout=10)
@ -98,7 +256,7 @@ def main():
start_date=chunk_start,
end_date=chunk_end,
limit=args.limit_per_chunk,
skip_details=True,
skip_details=args.skip_details,
)
print(f" Fetched {len(motions)} motions")
total_fetched += len(motions)

@ -0,0 +1,182 @@
"""Tests for scripts/download_past_year.py enhancements.
Tests extract_besluit_id helper, update_existing_motions function,
and --skip-details flag wiring.
"""
import sys
import argparse
from unittest.mock import MagicMock, patch, call
import pytest
sys.path.insert(0, ".")
from scripts.download_past_year import extract_besluit_id
# --- extract_besluit_id tests ---
def test_extract_besluit_id_valid():
    """A canonical stemmingsuitslagen URL yields its last path segment."""
    motion_url = (
        "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789"
    )
    result = extract_besluit_id(motion_url)
    assert result == "abc123-def456-ghi789"
def test_extract_besluit_id_trailing_slash():
    """A trailing slash is stripped before the id is extracted."""
    motion_url = (
        "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789/"
    )
    assert extract_besluit_id(motion_url) == "abc123-def456-ghi789"
def test_extract_besluit_id_invalid():
    """A URL outside the stemmingsuitslagen path yields None."""
    unrelated_url = "https://example.com/not-a-motion-url"
    assert extract_besluit_id(unrelated_url) is None
def test_extract_besluit_id_empty():
    """An empty string yields None instead of raising."""
    assert extract_besluit_id("") is None
# --- update_existing_motions tests ---
def test_update_existing_motions_updates_body_text(tmp_path):
    """Mock DuckDB + mock API, verify UPDATE is called with correct body_text."""
    import duckdb
    from scripts.download_past_year import update_existing_motions

    db_file = str(tmp_path / "test.db")
    expected_body = "constaterende dat de motie gaat over iets belangrijks"

    # Build a real DuckDB database holding one motion without body_text.
    setup_conn = duckdb.connect(db_file)
    setup_conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    setup_conn.execute("""
        CREATE TABLE motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """)
    setup_conn.execute("""
        INSERT INTO motions (title, description, date, url, body_text)
        VALUES ('Test Motion', 'desc', '2017-06-01',
                'https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/besluit-id-123',
                NULL)
    """)
    setup_conn.close()

    # API stub that returns complete details, including body text.
    fake_api = MagicMock()
    fake_api._get_motion_details.return_value = {
        "title": "Real Title",
        "description": "Real Description",
        "date": "2017-06-01",
        "externe_identifier": "kst-12345",
        "body_text": expected_body,
    }

    updated, skipped = update_existing_motions(
        db_path=db_file,
        api=fake_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert (updated, skipped) == (1, 0)

    # The row must now contain the fetched body text.
    check_conn = duckdb.connect(db_file, read_only=True)
    stored = check_conn.execute("SELECT body_text FROM motions WHERE id = 1").fetchone()
    check_conn.close()
    assert stored[0] == expected_body

    # The besluit_id parsed from the URL must have driven the API call.
    fake_api._get_motion_details.assert_called_once_with("besluit-id-123")
def test_update_existing_motions_skips_when_no_besluit_id(tmp_path):
    """URL without valid besluit_id is skipped."""
    import duckdb
    from scripts.download_past_year import update_existing_motions

    db_file = str(tmp_path / "test.db")

    setup_conn = duckdb.connect(db_file)
    setup_conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    setup_conn.execute("""
        CREATE TABLE motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """)
    # This URL does not contain the stemmingsuitslagen path, so no
    # besluit_id can be derived from it.
    setup_conn.execute("""
        INSERT INTO motions (title, description, date, url, body_text)
        VALUES ('Bad URL Motion', 'desc', '2017-06-01',
                'https://example.com/not-a-valid-url',
                NULL)
    """)
    setup_conn.close()

    fake_api = MagicMock()
    updated, skipped = update_existing_motions(
        db_path=db_file,
        api=fake_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert (updated, skipped) == (0, 1)
    # Without a besluit_id the API must never be contacted.
    fake_api._get_motion_details.assert_not_called()
def test_skip_details_flag_passed_to_api():
    """Verify the argparse flag is wired correctly by parsing args."""
    from scripts.download_past_year import build_parser

    parser = build_parser()

    # Defaults: details are skipped, backfill mode is off.
    defaults = parser.parse_args([])
    assert defaults.skip_details is True
    assert defaults.update_existing is False

    # --no-skip-details turns detail fetching back on.
    assert parser.parse_args(["--no-skip-details"]).skip_details is False

    # --update-existing switches to backfill mode.
    assert parser.parse_args(["--update-existing"]).update_existing is True
Loading…
Cancel
Save