You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
motief/pipeline/extract_mp_votes.py

75 lines
2.5 KiB

import json
import logging
from typing import Optional
import duckdb
from database import MotionDatabase
_logger = logging.getLogger(__name__)
def extract_mp_votes(db_path: Optional[str] = None, limit: Optional[int] = None):
    """Copy individual MP votes from ``motions.voting_results`` into ``mp_votes``.

    All motion rows are read up front through a short-lived DuckDB connection.
    Each motion is then handled independently through ``MotionDatabase``: a
    motion that already has ``mp_votes`` rows is skipped, and a failure while
    processing one motion is logged (with traceback) without aborting the rest
    of the run.

    Args:
        db_path: Path passed to ``MotionDatabase``. When falsy,
            ``MotionDatabase`` is constructed without arguments.
        limit: When not ``None``, scan at most this many motions. The query
            has no ``ORDER BY``, so which motions are selected is left to the
            database.

    Returns:
        A dict with summary counts:

        - ``motions_scanned``: number of motions inspected (including ones
          that were skipped or failed)
        - ``mp_rows_inserted``: number of ``insert_mp_vote`` calls that
          returned a positive id
        - ``motions_skipped``: number of motions skipped because ``mp_votes``
          already existed
    """
    db = MotionDatabase(db_path=db_path) if db_path else MotionDatabase()

    # Support an optional limit so only a subset of motions is scanned.
    if limit is not None:
        query_args = (
            "SELECT id, voting_results, date FROM motions LIMIT ?",
            (limit,),
        )
    else:
        query_args = ("SELECT id, voting_results, date FROM motions",)

    # Fetch everything first and release this connection before the
    # per-motion work, which goes through ``db`` instead.
    conn = duckdb.connect(db.db_path)
    try:
        rows = conn.execute(*query_args).fetchall()
    finally:
        conn.close()

    summary = {
        "motions_scanned": 0,
        "mp_rows_inserted": 0,
        "motions_skipped": 0,
    }

    for motion_id, raw_results, date in rows:
        summary["motions_scanned"] += 1
        try:
            if db.mp_votes_exists_for_motion(motion_id):
                _logger.debug(
                    "Skipping motion %s because mp_votes already exist", motion_id
                )
                summary["motions_skipped"] += 1
                continue

            # voting_results may arrive as JSON text or as an already-decoded
            # value; only text needs parsing.
            if isinstance(raw_results, str):
                voting_results = json.loads(raw_results)
            else:
                voting_results = raw_results

            for actor, vote in (voting_results or {}).items():
                # Individual MP names contain a comma (e.g. "Last, F.");
                # entries without one are not treated as individual MPs.
                if "," not in actor:
                    continue
                inserted_id = db.insert_mp_vote(
                    motion_id=motion_id,
                    mp_name=actor,
                    vote=vote,
                    date=date,
                    party=None,
                )
                if inserted_id and inserted_id > 0:
                    summary["mp_rows_inserted"] += 1
        except Exception as e:
            # Deliberate best-effort boundary: one bad motion must not stop
            # the batch. ``exception`` keeps the traceback so the failure can
            # still be diagnosed from the log.
            _logger.exception("Error processing motion %s: %s", motion_id, e)

    return summary