"""download_past_year.py — One-shot data download: parliamentary motions for a date range. Fetches Stemming records from the OData API in chunks (default 90-day windows), stores motions into data/motions.db using MotionDatabase.batch_insert_motions(). Skips AI summarisation — this is a raw data fetch for the embedding pipeline. Usage: uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365] uv run python scripts/download_past_year.py --start-date 2019-01-01 --end-date 2022-01-01 """ import argparse import sys import time from datetime import datetime, timedelta sys.path.insert(0, ".") # run from project root from api_client import TweedeKamerAPI from database import MotionDatabase def main(): parser = argparse.ArgumentParser(description="Download motions for a date range") parser.add_argument("--db-path", default="data/motions.db") parser.add_argument( "--days", type=int, default=365, help="How many days back to fetch (ignored if --start-date given)", ) parser.add_argument( "--start-date", type=str, default=None, help="Explicit start date YYYY-MM-DD (overrides --days)", ) parser.add_argument( "--end-date", type=str, default=None, help="Explicit end date YYYY-MM-DD (default: today)", ) parser.add_argument("--chunk-days", type=int, default=90, help="Days per API chunk") parser.add_argument( "--limit-per-chunk", type=int, default=50000, help="Max motions (Besluit) per chunk", ) parser.add_argument( "--delay", type=float, default=2.0, help="Seconds between chunks" ) args = parser.parse_args() api = TweedeKamerAPI() db = MotionDatabase(args.db_path) end_date = ( datetime.strptime(args.end_date, "%Y-%m-%d") if args.end_date else datetime.now() ) if args.start_date: start_date = datetime.strptime(args.start_date, "%Y-%m-%d") else: start_date = end_date - timedelta(days=args.days) print( f"Downloading motions from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}" ) print(f"DB: {args.db_path}") print() # Test connectivity first test_url = f"{api.odata_base_url}/Stemming" r = api.session.get(test_url, params={"$top": 1}, timeout=10) if r.status_code != 200: print(f"ERROR: API returned {r.status_code}. Aborting.") sys.exit(1) print("✅ API connection OK\n") chunk_start = start_date chunk_num = 0 total_fetched = 0 total_inserted = 0 total_duplicates = 0 while chunk_start < end_date: chunk_end = min(chunk_start + timedelta(days=args.chunk_days), end_date) chunk_num += 1 label = f"{chunk_start.strftime('%Y-%m-%d')} → {chunk_end.strftime('%Y-%m-%d')}" print(f"[Chunk {chunk_num}] {label}") try: motions = api.get_motions( start_date=chunk_start, end_date=chunk_end, limit=args.limit_per_chunk, skip_details=True, ) print(f" Fetched {len(motions)} motions") total_fetched += len(motions) inserted = 0 duplicates = 0 inserted, duplicates = db.batch_insert_motions(motions) total_inserted += inserted total_duplicates += duplicates print(f" Inserted {inserted} new | {duplicates} duplicates skipped") except Exception as e: print(f" ERROR: {e}") chunk_start = chunk_end if chunk_start < end_date: print(f" Waiting {args.delay}s before next chunk…") time.sleep(args.delay) print() print("=" * 50) print(f"Done. Total fetched: {total_fetched}") print(f" Inserted: {total_inserted}") print(f" Duplicates: {total_duplicates}") print("=" * 50) if __name__ == "__main__": main()