# motief/scripts/download_past_year.py

"""download_past_year.py — One-shot data download: past year of parliamentary motions.

Fetches Stemming records from the OData API in quarterly chunks (90-day windows by
default) and stores the motions in the SQLite database (data/motions.db by default)
using MotionDatabase.insert_motion().

Skips AI summarisation — this is a raw data fetch for the embedding pipeline.

Usage:
    uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365]
        [--chunk-days 90] [--limit-per-chunk 50000] [--delay 2.0]
"""

import argparse
import sys
import time
from datetime import datetime, timedelta

sys.path.insert(0, ".")  # run from project root

from api_client import TweedeKamerAPI
from database import MotionDatabase


def _parse_args(argv=None):
    """Parse and validate the command-line options.

    Args:
        argv: Argument strings to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``.

    Raises:
        SystemExit: With status 2 (raised by argparse) when an option is
            malformed or out of range.
    """
    parser = argparse.ArgumentParser(description="Download past year of motions")
    parser.add_argument("--db-path", default="data/motions.db")
    parser.add_argument(
        "--days", type=int, default=365, help="How many days back to fetch"
    )
    parser.add_argument("--chunk-days", type=int, default=90, help="Days per API chunk")
    parser.add_argument(
        "--limit-per-chunk",
        type=int,
        default=50000,
        help="Max motions (Besluit) per chunk",
    )
    parser.add_argument(
        "--delay", type=float, default=2.0, help="Seconds between chunks"
    )
    args = parser.parse_args(argv)

    # A zero or negative chunk size never advances the window start, so the
    # download loop would spin forever.
    if args.chunk_days <= 0:
        parser.error("--chunk-days must be a positive integer")
    # time.sleep() raises ValueError for negative values; written as
    # "not >= 0" so that NaN is rejected as well.
    if not args.delay >= 0:
        parser.error("--delay must be a non-negative number")

    return args


def main(argv=None):
    """Download motions chunk by chunk and store them in the database.

    The window from ``--days`` ago until now is walked in steps of
    ``--chunk-days``. Each chunk is fetched with
    ``api.get_motions(..., skip_details=True)`` and every returned motion is
    passed to ``db.insert_motion()``; a falsy return value is counted as a
    duplicate. A chunk that raises is reported and skipped so the remaining
    chunks still run, and the failed ranges are listed again in the final
    summary.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) reads
            the options from ``sys.argv``.

    Raises:
        SystemExit: Status 2 for invalid options; status 1 when the initial
            connectivity check fails.
    """
    # Validate options before touching the API or the database.
    args = _parse_args(argv)

    api = TweedeKamerAPI()
    db = MotionDatabase(args.db_path)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=args.days)

    print(
        f"Downloading motions from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
    )
    print(f"DB: {args.db_path}")
    print()

    # Test connectivity first
    test_url = f"{api.odata_base_url}/Stemming"
    try:
        r = api.session.get(test_url, params={"$top": 1}, timeout=10)
    except Exception as e:
        # Transport failures (DNS, refused connection, timeout) should abort
        # with the same clean message as a bad status code, not a traceback.
        print(f"ERROR: API request failed: {e}. Aborting.")
        sys.exit(1)
    if r.status_code != 200:
        print(f"ERROR: API returned {r.status_code}. Aborting.")
        sys.exit(1)
    print("✅ API connection OK\n")

    chunk_start = start_date
    chunk_num = 0
    total_fetched = 0
    total_inserted = 0
    total_duplicates = 0
    failed_chunks = []

    while chunk_start < end_date:
        # Clamp the last window so it never runs past "now".
        chunk_end = min(chunk_start + timedelta(days=args.chunk_days), end_date)
        chunk_num += 1
        label = f"{chunk_start.strftime('%Y-%m-%d')} → {chunk_end.strftime('%Y-%m-%d')}"
        print(f"[Chunk {chunk_num}] {label}")

        inserted = 0
        duplicates = 0
        try:
            motions = api.get_motions(
                start_date=chunk_start,
                end_date=chunk_end,
                limit=args.limit_per_chunk,
                skip_details=True,
            )
            print(f"  Fetched {len(motions)} motions")
            total_fetched += len(motions)

            for m in motions:
                if db.insert_motion(m):
                    inserted += 1
                else:
                    duplicates += 1

            print(f"  Inserted {inserted} new  |  {duplicates} duplicates skipped")

        except Exception as e:
            # Best effort: keep going with the next chunk, but remember the
            # failure so it shows up in the summary.
            print(f"  ERROR: {e}")
            failed_chunks.append(label)

        # Added outside the try so rows stored before a mid-chunk failure
        # still count towards the totals.
        total_inserted += inserted
        total_duplicates += duplicates

        chunk_start = chunk_end
        if chunk_start < end_date:
            print(f"  Waiting {args.delay}s before next chunk…")
            time.sleep(args.delay)

    print()
    print("=" * 50)
    print(f"Done. Total fetched: {total_fetched}")
    print(f"      Inserted:      {total_inserted}")
    print(f"      Duplicates:    {total_duplicates}")
    if failed_chunks:
        print(f"      Failed chunks: {len(failed_chunks)}")
        for failed_label in failed_chunks:
            print(f"        {failed_label}")
    print("=" * 50)


# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()