- _get_voting_records returns (records, besluit_meta) tuple; paginate via Besluit?expand=Stemming (469/mo vs 8400)
- get_motions(skip_details=True) bypasses per-motion detail chain (3 HTTP calls/motion)
- extract_mp_votes rewritten: bulk DataFrame insert (80k rows in 1.9s), includes party-level actors
- run_pipeline.py fixed: pass db_path not db, handle dict/int return types
- download_past_year.py: skip_details=True default, limit-per-chunk default 50000
main
parent
f2a831dfcf
commit
847b783877
@ -0,0 +1,112 @@ |
|||||||
|
"""download_past_year.py — One-shot data download: past year of parliamentary motions.

Fetches Stemming records from the OData API in date-range chunks (90-day windows
by default; configurable with --chunk-days), stores motions into the database
(data/motions.db by default) using MotionDatabase.insert_motion().

Skips AI summarisation — this is a raw data fetch for the embedding pipeline.

Usage:
    uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365]
        [--chunk-days 90] [--limit-per-chunk 50000] [--delay 2.0]
"""
||||||
|
|
||||||
|
import argparse |
||||||
|
import sys |
||||||
|
import time |
||||||
|
from datetime import datetime, timedelta |
||||||
|
|
||||||
|
sys.path.insert(0, ".") # run from project root |
||||||
|
|
||||||
|
from api_client import TweedeKamerAPI |
||||||
|
from database import MotionDatabase |
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Download a date window of parliamentary motions into the local database.

    Parses the command-line options, checks that the OData API answers, then
    walks the range ``now - --days`` .. ``now`` in consecutive windows of
    ``--chunk-days`` days. Each window is fetched with
    ``api.get_motions(..., skip_details=True)`` and every returned motion is
    passed to ``db.insert_motion``; a falsy return value is counted as a
    duplicate.

    A failure inside one chunk is reported and the run continues with the
    next chunk. Failed chunks are listed in the final summary and make the
    script exit with status 1, so a partial download is not mistaken for a
    complete one.

    Raises:
        SystemExit: status 2 for invalid arguments (raised by argparse),
            status 1 when the API check fails or at least one chunk failed.
    """
    parser = argparse.ArgumentParser(description="Download past year of motions")
    parser.add_argument("--db-path", default="data/motions.db")
    parser.add_argument(
        "--days", type=int, default=365, help="How many days back to fetch"
    )
    parser.add_argument("--chunk-days", type=int, default=90, help="Days per API chunk")
    parser.add_argument(
        "--limit-per-chunk",
        type=int,
        default=50000,
        help="Max motions (Besluit) per chunk",
    )
    parser.add_argument(
        "--delay", type=float, default=2.0, help="Seconds between chunks"
    )
    args = parser.parse_args()

    # A non-positive window never advances chunk_start, so the loop below
    # would spin forever; reject it before touching the API or the database.
    if args.chunk_days <= 0:
        parser.error("--chunk-days must be a positive integer")
    # time.sleep() raises ValueError for negative values, which would abort
    # the run between two chunks.
    if args.delay < 0:
        parser.error("--delay must not be negative")

    api = TweedeKamerAPI()
    db = MotionDatabase(args.db_path)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=args.days)

    print(
        f"Downloading motions from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
    )
    print(f"DB: {args.db_path}")
    print()

    # Test connectivity first. The request itself may raise (DNS failure,
    # timeout, ...); treat that like a bad status code instead of crashing
    # with a traceback.
    test_url = f"{api.odata_base_url}/Stemming"
    try:
        r = api.session.get(test_url, params={"$top": 1}, timeout=10)
    except Exception as e:
        print(f"ERROR: API request failed ({e}). Aborting.")
        sys.exit(1)
    if r.status_code != 200:
        print(f"ERROR: API returned {r.status_code}. Aborting.")
        sys.exit(1)
    print("✅ API connection OK\n")

    chunk_start = start_date
    chunk_num = 0
    total_fetched = 0
    total_inserted = 0
    total_duplicates = 0
    failed_chunks = []  # labels of chunks whose fetch or insert raised

    while chunk_start < end_date:
        chunk_end = min(chunk_start + timedelta(days=args.chunk_days), end_date)
        chunk_num += 1
        label = f"{chunk_start.strftime('%Y-%m-%d')} → {chunk_end.strftime('%Y-%m-%d')}"
        print(f"[Chunk {chunk_num}] {label}")

        # Counted outside the try so rows stored before a mid-chunk failure
        # still show up in the totals.
        inserted = 0
        duplicates = 0
        try:
            motions = api.get_motions(
                start_date=chunk_start,
                end_date=chunk_end,
                limit=args.limit_per_chunk,
                skip_details=True,
            )
            print(f" Fetched {len(motions)} motions")
            total_fetched += len(motions)

            for m in motions:
                if db.insert_motion(m):
                    inserted += 1
                else:
                    duplicates += 1

            print(f" Inserted {inserted} new | {duplicates} duplicates skipped")

        except Exception as e:
            # Best effort: keep going with the remaining chunks, but remember
            # the failure so the summary and the exit status reflect it.
            print(f" ERROR: {e}")
            failed_chunks.append(label)

        total_inserted += inserted
        total_duplicates += duplicates

        chunk_start = chunk_end
        if chunk_start < end_date:
            print(f" Waiting {args.delay}s before next chunk…")
            time.sleep(args.delay)

    print()
    print("=" * 50)
    print(f"Done. Total fetched: {total_fetched}")
    print(f" Inserted: {total_inserted}")
    print(f" Duplicates: {total_duplicates}")
    if failed_chunks:
        print(f" Failed chunks: {len(failed_chunks)}")
        for failed_label in failed_chunks:
            print(f"   {failed_label}")
    print("=" * 50)

    if failed_chunks:
        sys.exit(1)
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    raise SystemExit(main())
||||||
Loading…
Reference in new issue