feat(analysis): fetch real MP metadata, fix anchor axis for party-level actors

- fetch_mp_metadata: use real OData URL with pagination (1200 records, 5 pages);
  use Fractie.Afkorting instead of NaamNL for abbreviation matching;
  skip records with Verwijderd=true
- upsert_mp_metadata: keep most recent membership (prefer active over ended,
  then higher Van date) so current party affiliations are not overwritten by
  historical ones
- compute_anchor_axis: anchor directly on party-level SVD entities (GroenLinks-PvdA etc)
  before falling back to mp_metadata individual MP lookup
- test_fetch_mp_metadata: fix mock for timeout kwarg + pagination + Afkorting field
- Generated anchor axis HTML for 2025-Q2 through 2026-Q1 in outputs/
main
Sven Geboers 1 month ago
parent 5ad83ef1be
commit aa2f66ac9f
  1. 28
      analysis/political_axis.py
  2. 22
      database.py
  3. 7
      outputs/anchor_axis_2025_Q2.html
  4. 7
      outputs/anchor_axis_2025_Q3.html
  5. 7
      outputs/anchor_axis_2025_Q4.html
  6. 7
      outputs/anchor_axis_2026_Q1.html
  7. 112
      pipeline/fetch_mp_metadata.py
  8. 35
      tests/test_fetch_mp_metadata.py

@ -87,22 +87,24 @@ def compute_anchor_axis(
if not mp_vecs:
return {}
# Load party affiliation for this window from mp_metadata
left_set = set(left_parties)
right_set = set(right_parties)
# 1. Party-level actors whose entity_id IS a party name (e.g. "GroenLinks-PvdA")
left_vecs = [mp_vecs[p] for p in left_set if p in mp_vecs]
right_vecs = [mp_vecs[p] for p in right_set if p in mp_vecs]
# 2. Individual MPs via mp_metadata party affiliation
conn = duckdb.connect(db_path)
rows = conn.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
conn.close()
party_of = {mp: party for mp, party in rows}
left_vecs = [
mp_vecs[mp]
for mp, party in party_of.items()
if party in left_parties and mp in mp_vecs
]
right_vecs = [
mp_vecs[mp]
for mp, party in party_of.items()
if party in right_parties and mp in mp_vecs
]
for mp_name, party in rows:
if mp_name not in mp_vecs:
continue
if party in left_set and mp_name not in left_set:
left_vecs.append(mp_vecs[mp_name])
elif party in right_set and mp_name not in right_set:
right_vecs.append(mp_vecs[mp_name])
if not left_vecs or not right_vecs:
_logger.warning(

@ -522,12 +522,32 @@ class MotionDatabase:
"SELECT COUNT(*) FROM mp_metadata WHERE mp_name = ?", (mp_name,)
).fetchone()
if exists and exists[0] > 0:
# Only update if this record is newer (higher Van date) than the stored one,
# preferring active memberships (TotEnMet IS NULL) over ended ones.
conn.execute(
"""
UPDATE mp_metadata SET party = ?, van = ?, tot_en_met = ?, persoon_id = ?
WHERE mp_name = ?
AND (
-- prefer active over ended
(? IS NULL AND tot_en_met IS NOT NULL)
-- or same active status but newer start date
OR (? IS NULL AND tot_en_met IS NULL AND CAST(? AS DATE) > CAST(van AS DATE))
OR (? IS NOT NULL AND tot_en_met IS NOT NULL AND CAST(? AS DATE) > CAST(van AS DATE))
)
""",
(party, van, tot_en_met, persoon_id, mp_name),
(
party,
van,
tot_en_met,
persoon_id,
mp_name,
tot_en_met, # prefer active
tot_en_met,
van, # both active, newer
tot_en_met,
van,
), # both ended, newer
)
else:
conn.execute(

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -34,61 +34,85 @@ def normalize_mp_name(
return name
_ODATA_BASE = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0"
_PAGE_SIZE = 250
def fetch_mp_metadata(
    db_path: str,
    odata_url: str = f"{_ODATA_BASE}/FractieZetelPersoon",
) -> int:
    """Fetch MP party membership and tenure from OData and upsert into DB.

    Paginates through all records using ``$top``/``$skip``. Uses
    ``Fractie.Afkorting`` (falling back to ``Fractie.NaamNL``) as the party
    name so it matches the abbreviations used in mp_votes. Records flagged
    ``Verwijderd`` and records without an ``Achternaam`` are skipped.

    Args:
        db_path: Path handed to ``MotionDatabase``.
        odata_url: URL of the FractieZetelPersoon entity set. It may already
            carry a query string; paging options are appended to it.

    Returns:
        The number of records processed (passed to ``upsert_mp_metadata``
        without raising).

    Raises:
        Exception: whatever the HTTP request, status check, or JSON decoding
            raised; it is logged and re-raised unchanged.
    """
    expand = "$expand=FractieZetel($expand=Fractie),Persoon"
    # Append to an existing query string rather than emitting a second "?".
    separator = "&" if "?" in odata_url else "?"
    session = requests.Session()

    db = MotionDatabase(db_path)
    processed = 0
    skip = 0

    while True:
        url = f"{odata_url}{separator}{expand}&$top={_PAGE_SIZE}&$skip={skip}"
        try:
            resp = session.get(url, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            logger.error("Failed to fetch MP metadata (skip=%d): %s", skip, e)
            raise

        values = data.get("value") if isinstance(data, dict) else None
        if values is None:
            logger.error("Unexpected OData payload at skip=%d; missing 'value'", skip)
            break
        if not values:
            break  # no more pages

        for item in values:
            # Best effort per record: one malformed item must not abort the run.
            try:
                if item.get("Verwijderd"):
                    continue

                persoon = item.get("Persoon") or {}
                fractiezetel = item.get("FractieZetel") or {}
                fractie = fractiezetel.get("Fractie") or {}

                achternaam = persoon.get("Achternaam")
                initialen = persoon.get("Initialen")
                tussenvoegsel = persoon.get("Tussenvoegsel")
                persoon_id = persoon.get("Id")
                # Use Afkorting (e.g. "VVD", "GroenLinks-PvdA") to match mp_votes party column
                party = fractie.get("Afkorting") or fractie.get("NaamNL")
                van = item.get("Van")
                tot_en_met = item.get("TotEnMet")

                if not achternaam:
                    logger.debug("Skipping record without achternaam: %s", item)
                    continue

                mp_name = normalize_mp_name(achternaam, initialen, tussenvoegsel)
                db.upsert_mp_metadata(
                    mp_name=mp_name,
                    party=party,
                    van=van,
                    tot_en_met=tot_en_met,
                    persoon_id=persoon_id,
                )
                processed += 1
            except Exception:
                logger.exception("Error processing OData item: %s", item)

        logger.debug("Fetched page skip=%d, got %d records", skip, len(values))
        if len(values) < _PAGE_SIZE:
            break  # last page
        skip += _PAGE_SIZE

    logger.info("Processed %d MP metadata records", processed)
    return processed

@ -28,44 +28,58 @@ class MockResponse:
class MockSession:
    """Session mock that returns a data page on first call and empty page on second."""

    def __init__(self, data_page):
        # Second entry is the empty page that ends a paginated fetch.
        self._pages = [data_page, {"value": []}]
        self._call = 0

    def get(self, url, **kwargs):
        # Accept and ignore keyword arguments such as ``timeout``.
        # Clamp the index so any call past the second keeps returning the
        # empty page instead of raising IndexError.
        resp = MockResponse(self._pages[min(self._call, len(self._pages) - 1)])
        self._call += 1
        return resp
def test_fetch_mp_metadata_idempotent(tmp_path, monkeypatch):
# Prepare canned OData response with two FractieZetelPersoon records
# Prepare canned OData response with two FractieZetelPersoon records.
# Use Afkorting (not NaamNL) because fetch_mp_metadata prefers Afkorting.
data = {
"value": [
{
"Verwijderd": False,
"Persoon": {
"Achternaam": "Yesilgöz-Zegerius",
"Initialen": "D.",
"Tussenvoegsel": None,
"Id": "guid-1",
},
"FractieZetel": {"Fractie": {"NaamNL": "VVD"}},
"FractieZetel": {
"Fractie": {
"Afkorting": "VVD",
"NaamNL": "Volkspartij voor Vrijheid en Democratie",
}
},
"Van": "2023-01-01",
"TotEnMet": None,
},
{
"Verwijderd": False,
"Persoon": {
"Achternaam": "Plas",
"Initialen": "C.",
"Tussenvoegsel": "van der",
"Id": "guid-2",
},
"FractieZetel": {"Fractie": {"NaamNL": "BBB"}},
"FractieZetel": {
"Fractie": {"Afkorting": "BBB", "NaamNL": "BoerBurgerBeweging"}
},
"Van": "2023-06-01",
"TotEnMet": "2024-01-01",
},
]
}
mock_resp = MockResponse(data)
mock_session = MockSession(mock_resp)
mock_session = MockSession(data)
# Patch requests.Session to return our mock session
monkeypatch.setattr(requests, "Session", lambda: mock_session)
@ -98,6 +112,7 @@ def test_fetch_mp_metadata_idempotent(tmp_path, monkeypatch):
assert rows[1][3] == None
assert rows[1][4] == "guid-1"
# Run again to assert idempotence (no exception and same count processed)
# Run again to assert idempotence: same records processed, DB unchanged
monkeypatch.setattr(requests, "Session", lambda: MockSession(data))
count2 = fetch_mp_metadata(db_path=db_path, odata_url="http://example/odata")
assert count2 == 2

Loading…
Cancel
Save