From aa2f66ac9f699f15bcd7b877808d93a8430a51b7 Mon Sep 17 00:00:00 2001 From: Sven Geboers Date: Sat, 21 Mar 2026 23:33:47 +0100 Subject: [PATCH] feat(analysis): fetch real MP metadata, fix anchor axis for party-level actors - fetch_mp_metadata: use real OData URL with pagination (1200 records, 5 pages) uses Fractie.Afkorting not NaamNL for abbreviation matching skips Verwijderd=true records - upsert_mp_metadata: keep most recent membership (prefer active over ended, then higher Van date) so current party affiliations are not overwritten by historical - compute_anchor_axis: anchor directly on party-level SVD entities (GroenLinks-PvdA etc) before falling back to mp_metadata individual MP lookup - test_fetch_mp_metadata: fix mock for timeout kwarg + pagination + Afkorting field - Generated anchor axis HTML for 2025-Q2 through 2026-Q1 in outputs/ --- analysis/political_axis.py | 28 ++++---- database.py | 22 +++++- outputs/anchor_axis_2025_Q2.html | 7 ++ outputs/anchor_axis_2025_Q3.html | 7 ++ outputs/anchor_axis_2025_Q4.html | 7 ++ outputs/anchor_axis_2026_Q1.html | 7 ++ pipeline/fetch_mp_metadata.py | 112 +++++++++++++++++++------------ tests/test_fetch_mp_metadata.py | 35 +++++++--- 8 files changed, 157 insertions(+), 68 deletions(-) create mode 100644 outputs/anchor_axis_2025_Q2.html create mode 100644 outputs/anchor_axis_2025_Q3.html create mode 100644 outputs/anchor_axis_2025_Q4.html create mode 100644 outputs/anchor_axis_2026_Q1.html diff --git a/analysis/political_axis.py b/analysis/political_axis.py index b90bf56..8b0bb4b 100644 --- a/analysis/political_axis.py +++ b/analysis/political_axis.py @@ -87,22 +87,24 @@ def compute_anchor_axis( if not mp_vecs: return {} - # Load party affiliation for this window from mp_metadata + left_set = set(left_parties) + right_set = set(right_parties) + + # 1. Party-level actors whose entity_id IS a party name (e.g. "GroenLinks-PvdA") + left_vecs = [mp_vecs[p] for p in left_set if p in mp_vecs] + right_vecs = [mp_vecs[p] for p in right_set if p in mp_vecs] + + # 2. Individual MPs via mp_metadata party affiliation conn = duckdb.connect(db_path) rows = conn.execute("SELECT mp_name, party FROM mp_metadata").fetchall() conn.close() - party_of = {mp: party for mp, party in rows} - - left_vecs = [ - mp_vecs[mp] - for mp, party in party_of.items() - if party in left_parties and mp in mp_vecs - ] - right_vecs = [ - mp_vecs[mp] - for mp, party in party_of.items() - if party in right_parties and mp in mp_vecs - ] + for mp_name, party in rows: + if mp_name not in mp_vecs: + continue + if party in left_set and mp_name not in left_set: + left_vecs.append(mp_vecs[mp_name]) + elif party in right_set and mp_name not in right_set: + right_vecs.append(mp_vecs[mp_name]) if not left_vecs or not right_vecs: _logger.warning( diff --git a/database.py b/database.py index 0490411..0163e2b 100644 --- a/database.py +++ b/database.py @@ -522,12 +522,32 @@ class MotionDatabase: "SELECT COUNT(*) FROM mp_metadata WHERE mp_name = ?", (mp_name,) ).fetchone() if exists and exists[0] > 0: + # Only update if this record is newer (higher Van date) than the stored one, + # preferring active memberships (TotEnMet IS NULL) over ended ones. conn.execute( """ UPDATE mp_metadata SET party = ?, van = ?, tot_en_met = ?, persoon_id = ? WHERE mp_name = ? + AND ( + -- prefer active over ended + (? IS NULL AND tot_en_met IS NOT NULL) + -- or same active status but newer start date + OR (? IS NULL AND tot_en_met IS NULL AND CAST(? AS DATE) > CAST(van AS DATE)) + OR (? IS NOT NULL AND tot_en_met IS NOT NULL AND CAST(? AS DATE) > CAST(van AS DATE)) + ) """, - (party, van, tot_en_met, persoon_id, mp_name), + ( + party, + van, + tot_en_met, + persoon_id, + mp_name, + tot_en_met, # prefer active + tot_en_met, + van, # both active, newer + tot_en_met, + van, + ), # both ended, newer ) else: conn.execute( diff --git a/outputs/anchor_axis_2025_Q2.html b/outputs/anchor_axis_2025_Q2.html new file mode 100644 index 0000000..4c88290 --- /dev/null +++ b/outputs/anchor_axis_2025_Q2.html @@ -0,0 +1,7 @@ + + + +
+
+ + \ No newline at end of file diff --git a/outputs/anchor_axis_2025_Q3.html b/outputs/anchor_axis_2025_Q3.html new file mode 100644 index 0000000..c8c7378 --- /dev/null +++ b/outputs/anchor_axis_2025_Q3.html @@ -0,0 +1,7 @@ + + + +
+
+ + \ No newline at end of file diff --git a/outputs/anchor_axis_2025_Q4.html b/outputs/anchor_axis_2025_Q4.html new file mode 100644 index 0000000..ad1c776 --- /dev/null +++ b/outputs/anchor_axis_2025_Q4.html @@ -0,0 +1,7 @@ + + + +
+
+ + \ No newline at end of file diff --git a/outputs/anchor_axis_2026_Q1.html b/outputs/anchor_axis_2026_Q1.html new file mode 100644 index 0000000..8075e46 --- /dev/null +++ b/outputs/anchor_axis_2026_Q1.html @@ -0,0 +1,7 @@ + + + +
+
+ + \ No newline at end of file diff --git a/pipeline/fetch_mp_metadata.py b/pipeline/fetch_mp_metadata.py index ac31861..d677e45 100644 --- a/pipeline/fetch_mp_metadata.py +++ b/pipeline/fetch_mp_metadata.py @@ -34,61 +34,85 @@ def normalize_mp_name( return name +_ODATA_BASE = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0" +_PAGE_SIZE = 250 + + def fetch_mp_metadata( - db_path: str, odata_url: str = "https://odata.example/FractieZetelPersoon" + db_path: str, + odata_url: str = f"{_ODATA_BASE}/FractieZetelPersoon", ) -> int: """Fetch MP party membership and tenure from OData and upsert into DB. + Paginates through all records using $skip. Uses Fractie.Afkorting as + the party name so it matches the abbreviations used in mp_votes. + Returns the number of records processed (inserted or updated). """ + expand = "$expand=FractieZetel($expand=Fractie),Persoon" session = requests.Session() - try: - resp = session.get(odata_url) - resp.raise_for_status() - data = resp.json() - except Exception as e: - logger.error("Failed to fetch MP metadata: %s", e) - raise - - values = data.get("value") if isinstance(data, dict) else None - if values is None: - logger.error("Unexpected OData payload; missing 'value' list") - return 0 - db = MotionDatabase(db_path) processed = 0 + skip = 0 - for item in values: + while True: + url = f"{odata_url}?{expand}&$top={_PAGE_SIZE}&$skip={skip}" try: - persoon = item.get("Persoon") or {} - fractiezetel = item.get("FractieZetel") or {} - fractie = fractiezetel.get("Fractie") or {} - - achternaam = persoon.get("Achternaam") - initialen = persoon.get("Initialen") - tussenvoegsel = persoon.get("Tussenvoegsel") - persoon_id = persoon.get("Id") - - party = fractie.get("NaamNL") - van = item.get("Van") - tot_en_met = item.get("TotEnMet") - - if not achternaam: - logger.debug("Skipping record without achternaam: %s", item) - continue - - mp_name = normalize_mp_name(achternaam, initialen, tussenvoegsel) - - db.upsert_mp_metadata( - mp_name=mp_name, - party=party, - van=van, - tot_en_met=tot_en_met, - persoon_id=persoon_id, - ) - processed += 1 - except Exception: - logger.exception("Error processing OData item: %s", item) + resp = session.get(url, timeout=30) + resp.raise_for_status() + data = resp.json() + except Exception as e: + logger.error("Failed to fetch MP metadata (skip=%d): %s", skip, e) + raise + + values = data.get("value") if isinstance(data, dict) else None + if values is None: + logger.error("Unexpected OData payload at skip=%d; missing 'value'", skip) + break + + if not values: + break # no more pages + + for item in values: + try: + if item.get("Verwijderd"): + continue + + persoon = item.get("Persoon") or {} + fractiezetel = item.get("FractieZetel") or {} + fractie = fractiezetel.get("Fractie") or {} + + achternaam = persoon.get("Achternaam") + initialen = persoon.get("Initialen") + tussenvoegsel = persoon.get("Tussenvoegsel") + persoon_id = persoon.get("Id") + + # Use Afkorting (e.g. "VVD", "GroenLinks-PvdA") to match mp_votes party column + party = fractie.get("Afkorting") or fractie.get("NaamNL") + van = item.get("Van") + tot_en_met = item.get("TotEnMet") + + if not achternaam: + logger.debug("Skipping record without achternaam: %s", item) + continue + + mp_name = normalize_mp_name(achternaam, initialen, tussenvoegsel) + + db.upsert_mp_metadata( + mp_name=mp_name, + party=party, + van=van, + tot_en_met=tot_en_met, + persoon_id=persoon_id, + ) + processed += 1 + except Exception: + logger.exception("Error processing OData item: %s", item) + + logger.debug("Fetched page skip=%d, got %d records", skip, len(values)) + if len(values) < _PAGE_SIZE: + break # last page + skip += _PAGE_SIZE logger.info("Processed %d MP metadata records", processed) return processed diff --git a/tests/test_fetch_mp_metadata.py b/tests/test_fetch_mp_metadata.py index 9c99e1c..d7171e3 100644 --- a/tests/test_fetch_mp_metadata.py +++ b/tests/test_fetch_mp_metadata.py @@ -28,44 +28,58 @@ class MockResponse: class MockSession: - def __init__(self, response): - self._response = response + """Session mock that returns a data page on first call and empty page on second.""" - def get(self, url): - return self._response + def __init__(self, data_page): + self._pages = [data_page, {"value": []}] + self._call = 0 + + def get(self, url, **kwargs): + resp = MockResponse(self._pages[min(self._call, len(self._pages) - 1)]) + self._call += 1 + return resp def test_fetch_mp_metadata_idempotent(tmp_path, monkeypatch): - # Prepare canned OData response with two FractieZetelPersoon records + # Prepare canned OData response with two FractieZetelPersoon records. + # Use Afkorting (not NaamNL) because fetch_mp_metadata prefers Afkorting. data = { "value": [ { + "Verwijderd": False, "Persoon": { "Achternaam": "Yesilgöz-Zegerius", "Initialen": "D.", "Tussenvoegsel": None, "Id": "guid-1", }, - "FractieZetel": {"Fractie": {"NaamNL": "VVD"}}, + "FractieZetel": { + "Fractie": { + "Afkorting": "VVD", + "NaamNL": "Volkspartij voor Vrijheid en Democratie", + } + }, "Van": "2023-01-01", "TotEnMet": None, }, { + "Verwijderd": False, "Persoon": { "Achternaam": "Plas", "Initialen": "C.", "Tussenvoegsel": "van der", "Id": "guid-2", }, - "FractieZetel": {"Fractie": {"NaamNL": "BBB"}}, + "FractieZetel": { + "Fractie": {"Afkorting": "BBB", "NaamNL": "BoerBurgerBeweging"} + }, "Van": "2023-06-01", "TotEnMet": "2024-01-01", }, ] } - mock_resp = MockResponse(data) - mock_session = MockSession(mock_resp) + mock_session = MockSession(data) # Patch requests.Session to return our mock session monkeypatch.setattr(requests, "Session", lambda: mock_session) @@ -98,6 +112,7 @@ def test_fetch_mp_metadata_idempotent(tmp_path, monkeypatch): assert rows[1][3] == None assert rows[1][4] == "guid-1" - # Run again to assert idempotence (no exception and same count processed) + # Run again to assert idempotence: same records processed, DB unchanged + monkeypatch.setattr(requests, "Session", lambda: MockSession(data)) count2 = fetch_mp_metadata(db_path=db_path, odata_url="http://example/odata") assert count2 == 2