Enable backfilling body_text for existing motions that lack it (2016-2018 data). The new extract_besluit_id() and update_existing_motions() helpers support the --update-existing mode, while --no-skip-details enables detail fetching during normal downloads. Includes 7 tests covering URL parsing, the DB update flow, and argparse wiring.
main
parent
72a8dd2721
commit
be8887f6f8
@ -0,0 +1,182 @@ |
|||||||
|
"""Tests for scripts/download_past_year.py enhancements. |
||||||
|
|
||||||
|
Tests extract_besluit_id helper, update_existing_motions function, |
||||||
|
and --skip-details flag wiring. |
||||||
|
""" |
||||||
|
|
||||||
|
import sys |
||||||
|
import argparse |
||||||
|
from unittest.mock import MagicMock, patch, call |
||||||
|
|
||||||
|
import pytest |
||||||
|
|
||||||
|
sys.path.insert(0, ".") |
||||||
|
|
||||||
|
from scripts.download_past_year import extract_besluit_id |
||||||
|
|
||||||
|
|
||||||
|
# --- extract_besluit_id tests --- |
||||||
|
|
||||||
|
|
||||||
|
def test_extract_besluit_id_valid():
    """A stemmingsuitslagen URL yields the id found in its last path segment."""
    motion_url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789"
    parsed_id = extract_besluit_id(motion_url)
    assert parsed_id == "abc123-def456-ghi789"
||||||
|
|
||||||
|
|
||||||
|
def test_extract_besluit_id_trailing_slash():
    """A trailing slash on the URL must not end up in the extracted id."""
    motion_url = "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/abc123-def456-ghi789/"
    parsed_id = extract_besluit_id(motion_url)
    assert parsed_id == "abc123-def456-ghi789"
||||||
|
|
||||||
|
|
||||||
|
def test_extract_besluit_id_invalid():
    """A URL that is not a stemmingsuitslagen link produces None."""
    unrelated_url = "https://example.com/not-a-motion-url"
    parsed_id = extract_besluit_id(unrelated_url)
    assert parsed_id is None
||||||
|
|
||||||
|
|
||||||
|
def test_extract_besluit_id_empty():
    """An empty string produces None."""
    parsed_id = extract_besluit_id("")
    assert parsed_id is None
||||||
|
|
||||||
|
|
||||||
|
# --- update_existing_motions tests --- |
||||||
|
|
||||||
|
|
||||||
|
def test_update_existing_motions_updates_body_text(tmp_path):
    """Backfill body_text for a motion row that has none.

    Uses a real DuckDB file under ``tmp_path`` and a mocked API object, then
    checks the returned (updated, skipped) counts, the value stored in the
    database, and the argument the API mock was called with.
    """
    import duckdb
    from scripts.download_past_year import update_existing_motions

    db_path = str(tmp_path / "test.db")
    expected_body = "constaterende dat de motie gaat over iets belangrijks"

    create_table_sql = """
        CREATE TABLE motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """
    # The seeded row has body_text NULL and a URL ending in the besluit id.
    insert_motion_sql = """
        INSERT INTO motions (title, description, date, url, body_text)
        VALUES ('Test Motion', 'desc', '2017-06-01',
                'https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen/besluit-id-123',
                NULL)
    """

    # Build the database file that the function under test will open.
    writer = duckdb.connect(db_path)
    writer.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    writer.execute(create_table_sql)
    writer.execute(insert_motion_sql)
    writer.close()

    # API stand-in whose detail lookup returns a payload carrying body_text.
    motion_details = {
        "title": "Real Title",
        "description": "Real Description",
        "date": "2017-06-01",
        "externe_identifier": "kst-12345",
        "body_text": expected_body,
    }
    mock_api = MagicMock()
    mock_api._get_motion_details.return_value = motion_details

    n_updated, n_skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert n_updated == 1
    assert n_skipped == 0

    # Re-open read-only to confirm the text was persisted to the file.
    reader = duckdb.connect(db_path, read_only=True)
    stored_row = reader.execute("SELECT body_text FROM motions WHERE id = 1").fetchone()
    reader.close()
    assert stored_row[0] == expected_body

    # The lookup must have used the id taken from the motion URL.
    mock_api._get_motion_details.assert_called_once_with("besluit-id-123")
||||||
|
|
||||||
|
|
||||||
|
def test_update_existing_motions_skips_when_no_besluit_id(tmp_path):
    """A motion whose URL yields no besluit id is counted as skipped.

    Seeds a real DuckDB file with one row pointing at a non-matching URL and
    checks that nothing is updated and the API mock is never called.
    """
    import duckdb
    from scripts.download_past_year import update_existing_motions

    db_path = str(tmp_path / "test.db")

    create_table_sql = """
        CREATE TABLE motions (
            id INTEGER DEFAULT nextval('motions_id_seq'),
            title TEXT NOT NULL,
            description TEXT,
            date DATE,
            policy_area TEXT,
            voting_results JSON,
            winning_margin FLOAT,
            controversy_score FLOAT,
            layman_explanation TEXT,
            externe_identifier TEXT,
            body_text TEXT,
            url TEXT UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (id)
        )
    """
    # This row's URL is not a stemmingsuitslagen link.
    insert_motion_sql = """
        INSERT INTO motions (title, description, date, url, body_text)
        VALUES ('Bad URL Motion', 'desc', '2017-06-01',
                'https://example.com/not-a-valid-url',
                NULL)
    """

    writer = duckdb.connect(db_path)
    writer.execute("CREATE SEQUENCE IF NOT EXISTS motions_id_seq START 1")
    writer.execute(create_table_sql)
    writer.execute(insert_motion_sql)
    writer.close()

    mock_api = MagicMock()

    n_updated, n_skipped = update_existing_motions(
        db_path=db_path,
        api=mock_api,
        start_date="2017-01-01",
        end_date="2017-12-31",
        delay=0.0,
    )

    assert n_updated == 0
    assert n_skipped == 1

    # With no besluit id available, no detail lookup should be attempted.
    mock_api._get_motion_details.assert_not_called()
||||||
|
|
||||||
|
|
||||||
|
def test_skip_details_flag_passed_to_api():
    """Check the parser defaults and the two command-line switches.

    Only argument parsing is exercised: ``skip_details`` defaults to True and
    is cleared by ``--no-skip-details``; ``update_existing`` defaults to False
    and is set by ``--update-existing``.
    """
    from scripts.download_past_year import build_parser

    parser = build_parser()

    # No arguments: both options take their default values.
    defaults = parser.parse_args([])
    assert defaults.skip_details is True
    assert defaults.update_existing is False

    # --no-skip-details turns detail skipping off.
    with_details = parser.parse_args(["--no-skip-details"])
    assert with_details.skip_details is False

    # --update-existing switches on the backfill mode.
    backfill = parser.parse_args(["--update-existing"])
    assert backfill.update_existing is True
||||||
Loading…
Reference in new issue