Enable backfilling body_text for existing motions that lack it (2016-2018 data). New extract_besluit_id() and update_existing_motions() helpers support the --update-existing mode, while --no-skip-details enables detail fetching during normal downloads. Includes 7 tests covering URL parsing, DB update flow, and argparse wiring.
main
parent
72a8dd2721
commit
be8887f6f8
@ -0,0 +1,182 @@ |
||||
"""Tests for scripts/download_past_year.py enhancements. |
||||
|
||||
Tests extract_besluit_id helper, update_existing_motions function, |
||||
and --skip-details flag wiring. |
||||
""" |
||||
|
||||
import sys |
||||
import argparse |
||||
from unittest.mock import MagicMock, patch, call |
||||
|
||||
import pytest |
||||
|
||||
sys.path.insert(0, ".") |
||||
|
||||
from scripts.download_past_year import extract_besluit_id |
||||
|
||||
|
||||
# --- extract_besluit_id tests --- |
||||
|
||||
|
||||
def test_extract_besluit_id_valid():
    """The final path segment of a stemmingsuitslagen URL is returned as the ID."""
    expected_id = "abc123-def456-ghi789"
    motion_url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789"

    assert extract_besluit_id(motion_url) == expected_id
||||
|
||||
|
||||
def test_extract_besluit_id_trailing_slash():
    """A trailing slash after the ID does not end up in the extracted value."""
    expected_id = "abc123-def456-ghi789"
    motion_url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789/"

    assert extract_besluit_id(motion_url) == expected_id
||||
|
||||
|
||||
def test_extract_besluit_id_invalid():
    """A URL outside the stemmingsuitslagen path yields ``None``."""
    unrelated_url = "https://example.com/not-a-motion-url"
    result = extract_besluit_id(unrelated_url)

    assert result is None
||||
|
||||
|
||||
def test_extract_besluit_id_empty():
    """An empty string yields ``None``."""
    result = extract_besluit_id("")

    assert result is None
||||
|
||||
|
||||
# --- update_existing_motions tests --- |
||||
|
||||
|
||||
def test_update_existing_motions_updates_body_text(tmp_path):
    """Backfill ``body_text`` for a stored motion and verify the row and API call.

    A real on-disk DuckDB database (created under ``tmp_path``) is seeded with
    one motion whose ``body_text`` is NULL. The API is a ``MagicMock`` whose
    ``_get_motion_details`` returns a payload containing ``body_text``. The
    test then checks the ``(updated, skipped)`` counts, the value persisted in
    the database, and the argument the API was called with.
    """
    import duckdb

    from scripts.download_past_year import update_existing_motions

    db_path = str(tmp_path / "test.db")

    # Set up a real DuckDB database with the motions table. The connection is
    # closed in ``finally`` so a failing setup statement cannot leave the file
    # open (presumably update_existing_motions opens db_path itself - confirm).
    conn = duckdb.connect(db_path)
    try:
        conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
        conn.execute("""
            CREATE TABLE motions (
                id INTEGER DEFAULT nextval('motions_id_seq'),
                title TEXT NOT NULL,
                description TEXT,
                date DATE,
                policy_area TEXT,
                voting_results JSON,
                winning_margin FLOAT,
                controversy_score FLOAT,
                layman_explanation TEXT,
                externe_identifier TEXT,
                body_text TEXT,
                url TEXT UNIQUE,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)
        # Insert a motion with missing body_text
        conn.execute("""
            INSERT INTO motions (title, description, date, url, body_text)
            VALUES ('Test Motion', 'desc', '2017-06-01',
                    'https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/besluit-id-123',
                    NULL)
        """)
    finally:
        conn.close()

    # Mock API that returns body_text
    mock_api = MagicMock()
    mock_api._get_motion_details.return_value = {
        "title": "Real Title",
        "description": "Real Description",
        "date": "2017-06-01",
        "externe_identifier": "kst-12345",
        "body_text": "constaterende dat de motie gaat over iets belangrijks",
    }

    updated, skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert updated == 1
    assert skipped == 0

    # Verify the body_text was actually written to the DB
    conn = duckdb.connect(db_path, read_only=True)
    try:
        row = conn.execute("SELECT body_text FROM motions WHERE id = 1").fetchone()
    finally:
        conn.close()
    # Fail with a clear message instead of a TypeError on row[0].
    assert row is not None, "seeded motion row (id = 1) not found"
    assert row[0] == "constaterende dat de motie gaat over iets belangrijks"

    # Verify the API was called with the correct besluit_id
    mock_api._get_motion_details.assert_called_once_with("besluit-id-123")
||||
|
||||
|
||||
def test_update_existing_motions_skips_when_no_besluit_id(tmp_path):
    """A motion whose URL has no valid besluit_id is counted as skipped.

    A real on-disk DuckDB database (created under ``tmp_path``) is seeded with
    one motion whose URL points at ``example.com``. The test checks that
    ``update_existing_motions`` reports ``(0, 1)`` and never calls
    ``_get_motion_details`` on the mocked API.
    """
    import duckdb

    from scripts.download_past_year import update_existing_motions

    db_path = str(tmp_path / "test.db")

    # Close the setup connection in ``finally`` so a failing statement cannot
    # leave the database file open for the rest of the test.
    conn = duckdb.connect(db_path)
    try:
        conn.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
        conn.execute("""
            CREATE TABLE motions (
                id INTEGER DEFAULT nextval('motions_id_seq'),
                title TEXT NOT NULL,
                description TEXT,
                date DATE,
                policy_area TEXT,
                voting_results JSON,
                winning_margin FLOAT,
                controversy_score FLOAT,
                layman_explanation TEXT,
                externe_identifier TEXT,
                body_text TEXT,
                url TEXT UNIQUE,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (id)
            )
        """)
        # Insert a motion with a URL that won't parse to a besluit_id
        conn.execute("""
            INSERT INTO motions (title, description, date, url, body_text)
            VALUES ('Bad URL Motion', 'desc', '2017-06-01',
                    'https://example.com/not-a-valid-url',
                    NULL)
        """)
    finally:
        conn.close()

    mock_api = MagicMock()

    updated, skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert updated == 0
    assert skipped == 1
    # API should never have been called
    mock_api._get_motion_details.assert_not_called()
||||
|
||||
|
||||
def test_skip_details_flag_passed_to_api():
    """Check the defaults and explicit values of the detail/update CLI flags.

    Only the parser returned by ``build_parser`` is exercised: with no
    arguments ``skip_details`` is True and ``update_existing`` is False;
    ``--no-skip-details`` and ``--update-existing`` flip them respectively.
    """
    from scripts.download_past_year import build_parser

    cli = build_parser()

    # No arguments: details are skipped by default.
    default_ns = cli.parse_args([])
    assert default_ns.skip_details is True

    # --no-skip-details turns detail fetching on.
    no_skip_ns = cli.parse_args(["--no-skip-details"])
    assert no_skip_ns.skip_details is False

    # --update-existing enables the backfill mode.
    update_ns = cli.parse_args(["--update-existing"])
    assert update_ns.update_existing is True

    # No arguments: backfill mode is off by default.
    default_again_ns = cli.parse_args([])
    assert default_again_ns.update_existing is False
||||
Loading…
Reference in new issue