- _get_voting_records returns a (records, besluit_meta) tuple; paginate via Besluit?expand=Stemming (469/mo vs 8400)
- get_motions(skip_details=True) bypasses the per-motion detail chain (3 HTTP calls/motion)
- extract_mp_votes rewritten: bulk DataFrame insert (80k rows in 1.9s), includes party-level actors
- run_pipeline.py fixed: pass db_path not db, handle dict/int return types
- download_past_year.py: skip_details=True default, limit-per-chunk default 50000
main
parent
f2a831dfcf
commit
847b783877
@ -0,0 +1,112 @@ |
||||
"""download_past_year.py — One-shot data download: past year of parliamentary motions. |
||||
|
||||
Fetches Stemming records from the OData API in quarterly chunks (90-day windows), |
||||
stores motions into data/motions.db using MotionDatabase.insert_motion(). |
||||
|
||||
Skips AI summarisation — this is a raw data fetch for the embedding pipeline. |
||||
|
||||
Usage: |
||||
uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365] |
||||
""" |
||||
|
||||
import argparse |
||||
import sys |
||||
import time |
||||
from datetime import datetime, timedelta |
||||
|
||||
sys.path.insert(0, ".") # run from project root |
||||
|
||||
from api_client import TweedeKamerAPI |
||||
from database import MotionDatabase |
||||
|
||||
|
||||
def main(argv=None):
    """Download a date window of motions into the local motions database.

    The window [now - --days, now] is walked in consecutive chunks of
    --chunk-days days. Each chunk is fetched with
    ``api.get_motions(..., skip_details=True)`` and every returned motion is
    passed to ``db.insert_motion``. A falsy return from ``insert_motion`` is
    counted as a duplicate. A chunk that raises is reported and skipped so the
    remaining chunks are still attempted; skipped chunks are listed in the
    final summary.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default), argparse falls back to ``sys.argv[1:]``.

    Raises:
        SystemExit: With code 2 on invalid arguments, or code 1 when the
            initial API connectivity probe fails.
    """
    parser = argparse.ArgumentParser(description="Download past year of motions")
    parser.add_argument("--db-path", default="data/motions.db")
    parser.add_argument(
        "--days", type=int, default=365, help="How many days back to fetch"
    )
    parser.add_argument("--chunk-days", type=int, default=90, help="Days per API chunk")
    parser.add_argument(
        "--limit-per-chunk",
        type=int,
        default=50000,
        help="Max motions (Besluit) per chunk",
    )
    parser.add_argument(
        "--delay", type=float, default=2.0, help="Seconds between chunks"
    )
    args = parser.parse_args(argv)

    # A non-positive chunk size would leave chunk_start unchanged on every
    # iteration, so the download loop below would never terminate.
    if args.chunk_days < 1:
        parser.error("--chunk-days must be at least 1")
    # time.sleep() raises ValueError for negative durations; reject it up
    # front instead of crashing after the first chunk has been processed.
    if args.delay < 0:
        parser.error("--delay must not be negative")

    api = TweedeKamerAPI()
    db = MotionDatabase(args.db_path)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=args.days)

    print(
        f"Downloading motions from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
    )
    print(f"DB: {args.db_path}")
    print()

    # Test connectivity first
    test_url = f"{api.odata_base_url}/Stemming"
    try:
        r = api.session.get(test_url, params={"$top": 1}, timeout=10)
    except Exception as exc:
        # Top-level boundary of a CLI script: turn any transport failure
        # (DNS, refused connection, timeout, ...) into a clean abort message
        # rather than a raw traceback.
        print(f"ERROR: could not reach API ({exc}). Aborting.")
        sys.exit(1)
    if r.status_code != 200:
        print(f"ERROR: API returned {r.status_code}. Aborting.")
        sys.exit(1)
    print("✅ API connection OK\n")

    chunk_start = start_date
    chunk_num = 0
    total_fetched = 0
    total_inserted = 0
    total_duplicates = 0
    failed_chunks = []  # labels of chunks whose fetch/insert raised

    while chunk_start < end_date:
        # The final chunk is clamped so the window never extends past end_date.
        chunk_end = min(chunk_start + timedelta(days=args.chunk_days), end_date)
        chunk_num += 1
        label = f"{chunk_start.strftime('%Y-%m-%d')} → {chunk_end.strftime('%Y-%m-%d')}"
        print(f"[Chunk {chunk_num}] {label}")

        try:
            motions = api.get_motions(
                start_date=chunk_start,
                end_date=chunk_end,
                limit=args.limit_per_chunk,
                skip_details=True,
            )
            print(f" Fetched {len(motions)} motions")
            total_fetched += len(motions)

            inserted = 0
            duplicates = 0
            for m in motions:
                if db.insert_motion(m):
                    inserted += 1
                else:
                    duplicates += 1

            total_inserted += inserted
            total_duplicates += duplicates
            print(f" Inserted {inserted} new | {duplicates} duplicates skipped")

        except Exception as e:
            # Best effort: one bad chunk must not abort the whole download,
            # but remember it so the summary does not hide the gap.
            print(f" ERROR: {e}")
            failed_chunks.append(label)

        chunk_start = chunk_end
        if chunk_start < end_date:
            print(f" Waiting {args.delay}s before next chunk…")
            time.sleep(args.delay)

    print()
    print("=" * 50)
    print(f"Done. Total fetched: {total_fetched}")
    print(f" Inserted: {total_inserted}")
    print(f" Duplicates: {total_duplicates}")
    if failed_chunks:
        print(f" Failed chunks: {len(failed_chunks)} ({', '.join(failed_chunks)})")
    print("=" * 50)


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
||||
Loading…
Reference in new issue