feat(analysis): fetch real MP metadata, fix anchor axis for party-level actors

- fetch_mp_metadata: use the real OData URL with pagination (1200 records, 5 pages);
  use Fractie.Afkorting instead of NaamNL for abbreviation matching;
  skip Verwijderd=true records
- upsert_mp_metadata: keep the most recent membership (prefer active over ended,
  then the later Van date) so current party affiliations are not overwritten by historical ones
- compute_anchor_axis: anchor directly on party-level SVD entities (e.g. GroenLinks-PvdA)
  before falling back to the per-MP lookup via mp_metadata
- test_fetch_mp_metadata: fix mock for timeout kwarg + pagination + Afkorting field
- Generated anchor axis HTML for 2025-Q2 through 2026-Q1 in outputs/
main
Sven Geboers 1 month ago
parent 5ad83ef1be
commit aa2f66ac9f
  1. 28
      analysis/political_axis.py
  2. 22
      database.py
  3. 7
      outputs/anchor_axis_2025_Q2.html
  4. 7
      outputs/anchor_axis_2025_Q3.html
  5. 7
      outputs/anchor_axis_2025_Q4.html
  6. 7
      outputs/anchor_axis_2026_Q1.html
  7. 112
      pipeline/fetch_mp_metadata.py
  8. 35
      tests/test_fetch_mp_metadata.py

@ -87,22 +87,24 @@ def compute_anchor_axis(
if not mp_vecs: if not mp_vecs:
return {} return {}
# Load party affiliation for this window from mp_metadata left_set = set(left_parties)
right_set = set(right_parties)
# 1. Party-level actors whose entity_id IS a party name (e.g. "GroenLinks-PvdA")
left_vecs = [mp_vecs[p] for p in left_set if p in mp_vecs]
right_vecs = [mp_vecs[p] for p in right_set if p in mp_vecs]
# 2. Individual MPs via mp_metadata party affiliation
conn = duckdb.connect(db_path) conn = duckdb.connect(db_path)
rows = conn.execute("SELECT mp_name, party FROM mp_metadata").fetchall() rows = conn.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
conn.close() conn.close()
party_of = {mp: party for mp, party in rows} for mp_name, party in rows:
if mp_name not in mp_vecs:
left_vecs = [ continue
mp_vecs[mp] if party in left_set and mp_name not in left_set:
for mp, party in party_of.items() left_vecs.append(mp_vecs[mp_name])
if party in left_parties and mp in mp_vecs elif party in right_set and mp_name not in right_set:
] right_vecs.append(mp_vecs[mp_name])
right_vecs = [
mp_vecs[mp]
for mp, party in party_of.items()
if party in right_parties and mp in mp_vecs
]
if not left_vecs or not right_vecs: if not left_vecs or not right_vecs:
_logger.warning( _logger.warning(

@ -522,12 +522,32 @@ class MotionDatabase:
"SELECT COUNT(*) FROM mp_metadata WHERE mp_name = ?", (mp_name,) "SELECT COUNT(*) FROM mp_metadata WHERE mp_name = ?", (mp_name,)
).fetchone() ).fetchone()
if exists and exists[0] > 0: if exists and exists[0] > 0:
# Only update if this record is newer (higher Van date) than the stored one,
# preferring active memberships (TotEnMet IS NULL) over ended ones.
conn.execute( conn.execute(
""" """
UPDATE mp_metadata SET party = ?, van = ?, tot_en_met = ?, persoon_id = ? UPDATE mp_metadata SET party = ?, van = ?, tot_en_met = ?, persoon_id = ?
WHERE mp_name = ? WHERE mp_name = ?
AND (
-- prefer active over ended
(? IS NULL AND tot_en_met IS NOT NULL)
-- or same active status but newer start date
OR (? IS NULL AND tot_en_met IS NULL AND CAST(? AS DATE) > CAST(van AS DATE))
OR (? IS NOT NULL AND tot_en_met IS NOT NULL AND CAST(? AS DATE) > CAST(van AS DATE))
)
""", """,
(party, van, tot_en_met, persoon_id, mp_name), (
party,
van,
tot_en_met,
persoon_id,
mp_name,
tot_en_met, # prefer active
tot_en_met,
van, # both active, newer
tot_en_met,
van,
), # both ended, newer
) )
else: else:
conn.execute( conn.execute(

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -34,61 +34,85 @@ def normalize_mp_name(
return name return name
_ODATA_BASE = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0"
_PAGE_SIZE = 250
def fetch_mp_metadata( def fetch_mp_metadata(
db_path: str, odata_url: str = "https://odata.example/FractieZetelPersoon" db_path: str,
odata_url: str = f"{_ODATA_BASE}/FractieZetelPersoon",
) -> int: ) -> int:
"""Fetch MP party membership and tenure from OData and upsert into DB. """Fetch MP party membership and tenure from OData and upsert into DB.
Paginates through all records using $skip. Uses Fractie.Afkorting as
the party name so it matches the abbreviations used in mp_votes.
Returns the number of records processed (inserted or updated). Returns the number of records processed (inserted or updated).
""" """
expand = "$expand=FractieZetel($expand=Fractie),Persoon"
session = requests.Session() session = requests.Session()
try:
resp = session.get(odata_url)
resp.raise_for_status()
data = resp.json()
except Exception as e:
logger.error("Failed to fetch MP metadata: %s", e)
raise
values = data.get("value") if isinstance(data, dict) else None
if values is None:
logger.error("Unexpected OData payload; missing 'value' list")
return 0
db = MotionDatabase(db_path) db = MotionDatabase(db_path)
processed = 0 processed = 0
skip = 0
for item in values: while True:
url = f"{odata_url}?{expand}&$top={_PAGE_SIZE}&$skip={skip}"
try: try:
persoon = item.get("Persoon") or {} resp = session.get(url, timeout=30)
fractiezetel = item.get("FractieZetel") or {} resp.raise_for_status()
fractie = fractiezetel.get("Fractie") or {} data = resp.json()
except Exception as e:
achternaam = persoon.get("Achternaam") logger.error("Failed to fetch MP metadata (skip=%d): %s", skip, e)
initialen = persoon.get("Initialen") raise
tussenvoegsel = persoon.get("Tussenvoegsel")
persoon_id = persoon.get("Id") values = data.get("value") if isinstance(data, dict) else None
if values is None:
party = fractie.get("NaamNL") logger.error("Unexpected OData payload at skip=%d; missing 'value'", skip)
van = item.get("Van") break
tot_en_met = item.get("TotEnMet")
if not values:
if not achternaam: break # no more pages
logger.debug("Skipping record without achternaam: %s", item)
continue for item in values:
try:
mp_name = normalize_mp_name(achternaam, initialen, tussenvoegsel) if item.get("Verwijderd"):
continue
db.upsert_mp_metadata(
mp_name=mp_name, persoon = item.get("Persoon") or {}
party=party, fractiezetel = item.get("FractieZetel") or {}
van=van, fractie = fractiezetel.get("Fractie") or {}
tot_en_met=tot_en_met,
persoon_id=persoon_id, achternaam = persoon.get("Achternaam")
) initialen = persoon.get("Initialen")
processed += 1 tussenvoegsel = persoon.get("Tussenvoegsel")
except Exception: persoon_id = persoon.get("Id")
logger.exception("Error processing OData item: %s", item)
# Use Afkorting (e.g. "VVD", "GroenLinks-PvdA") to match mp_votes party column
party = fractie.get("Afkorting") or fractie.get("NaamNL")
van = item.get("Van")
tot_en_met = item.get("TotEnMet")
if not achternaam:
logger.debug("Skipping record without achternaam: %s", item)
continue
mp_name = normalize_mp_name(achternaam, initialen, tussenvoegsel)
db.upsert_mp_metadata(
mp_name=mp_name,
party=party,
van=van,
tot_en_met=tot_en_met,
persoon_id=persoon_id,
)
processed += 1
except Exception:
logger.exception("Error processing OData item: %s", item)
logger.debug("Fetched page skip=%d, got %d records", skip, len(values))
if len(values) < _PAGE_SIZE:
break # last page
skip += _PAGE_SIZE
logger.info("Processed %d MP metadata records", processed) logger.info("Processed %d MP metadata records", processed)
return processed return processed

@ -28,44 +28,58 @@ class MockResponse:
class MockSession: class MockSession:
def __init__(self, response): """Session mock that returns a data page on first call and empty page on second."""
self._response = response
def get(self, url): def __init__(self, data_page):
return self._response self._pages = [data_page, {"value": []}]
self._call = 0
def get(self, url, **kwargs):
resp = MockResponse(self._pages[min(self._call, len(self._pages) - 1)])
self._call += 1
return resp
def test_fetch_mp_metadata_idempotent(tmp_path, monkeypatch): def test_fetch_mp_metadata_idempotent(tmp_path, monkeypatch):
# Prepare canned OData response with two FractieZetelPersoon records # Prepare canned OData response with two FractieZetelPersoon records.
# Use Afkorting (not NaamNL) because fetch_mp_metadata prefers Afkorting.
data = { data = {
"value": [ "value": [
{ {
"Verwijderd": False,
"Persoon": { "Persoon": {
"Achternaam": "Yesilgöz-Zegerius", "Achternaam": "Yesilgöz-Zegerius",
"Initialen": "D.", "Initialen": "D.",
"Tussenvoegsel": None, "Tussenvoegsel": None,
"Id": "guid-1", "Id": "guid-1",
}, },
"FractieZetel": {"Fractie": {"NaamNL": "VVD"}}, "FractieZetel": {
"Fractie": {
"Afkorting": "VVD",
"NaamNL": "Volkspartij voor Vrijheid en Democratie",
}
},
"Van": "2023-01-01", "Van": "2023-01-01",
"TotEnMet": None, "TotEnMet": None,
}, },
{ {
"Verwijderd": False,
"Persoon": { "Persoon": {
"Achternaam": "Plas", "Achternaam": "Plas",
"Initialen": "C.", "Initialen": "C.",
"Tussenvoegsel": "van der", "Tussenvoegsel": "van der",
"Id": "guid-2", "Id": "guid-2",
}, },
"FractieZetel": {"Fractie": {"NaamNL": "BBB"}}, "FractieZetel": {
"Fractie": {"Afkorting": "BBB", "NaamNL": "BoerBurgerBeweging"}
},
"Van": "2023-06-01", "Van": "2023-06-01",
"TotEnMet": "2024-01-01", "TotEnMet": "2024-01-01",
}, },
] ]
} }
mock_resp = MockResponse(data) mock_session = MockSession(data)
mock_session = MockSession(mock_resp)
# Patch requests.Session to return our mock session # Patch requests.Session to return our mock session
monkeypatch.setattr(requests, "Session", lambda: mock_session) monkeypatch.setattr(requests, "Session", lambda: mock_session)
@ -98,6 +112,7 @@ def test_fetch_mp_metadata_idempotent(tmp_path, monkeypatch):
assert rows[1][3] == None assert rows[1][3] == None
assert rows[1][4] == "guid-1" assert rows[1][4] == "guid-1"
# Run again to assert idempotence (no exception and same count processed) # Run again to assert idempotence: same records processed, DB unchanged
monkeypatch.setattr(requests, "Session", lambda: MockSession(data))
count2 = fetch_mp_metadata(db_path=db_path, odata_url="http://example/odata") count2 = fetch_mp_metadata(db_path=db_path, odata_url="http://example/odata")
assert count2 == 2 assert count2 == 2

Loading…
Cancel
Save