You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
motief/scripts/download_past_year.py

112 lines
3.4 KiB

"""download_past_year.py — One-shot data download: past year of parliamentary motions.
Fetches Stemming records from the OData API in quarterly chunks (90-day windows),
stores motions into data/motions.db using MotionDatabase.insert_motion().
Skips AI summarisation — this is a raw data fetch for the embedding pipeline.
Usage:
uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365]
"""
import argparse
import sys
import time
from datetime import datetime, timedelta
sys.path.insert(0, ".") # run from project root
from api_client import TweedeKamerAPI
from database import MotionDatabase
def _parse_args():
    """Parse and validate the command-line options read from ``sys.argv``.

    Exits with argparse's usage error (status 2) when ``--chunk-days`` is not
    positive or ``--delay`` is negative.
    """
    parser = argparse.ArgumentParser(description="Download past year of motions")
    parser.add_argument("--db-path", default="data/motions.db")
    parser.add_argument(
        "--days", type=int, default=365, help="How many days back to fetch"
    )
    parser.add_argument("--chunk-days", type=int, default=90, help="Days per API chunk")
    parser.add_argument(
        "--limit-per-chunk",
        type=int,
        default=50000,
        help="Max motions (Besluit) per chunk",
    )
    parser.add_argument(
        "--delay", type=float, default=2.0, help="Seconds between chunks"
    )
    args = parser.parse_args()
    # A non-positive window would never advance chunk_start, so the download
    # loop would spin forever; reject it up front.
    if args.chunk_days <= 0:
        parser.error("--chunk-days must be a positive integer")
    # time.sleep() raises ValueError on negative values.
    if args.delay < 0:
        parser.error("--delay must not be negative")
    return args


def main():
    """Download motions for the requested period and store them in the database.

    The period ``[now - --days, now]`` is walked in windows of ``--chunk-days``
    days. For each window the motions are fetched with
    ``TweedeKamerAPI.get_motions(..., skip_details=True)`` and passed one by one
    to ``MotionDatabase.insert_motion``; a falsy return value is counted as a
    duplicate. A failing chunk is reported and skipped so the remaining chunks
    are still attempted. A summary is printed at the end.

    Exits with status 1 when the initial connectivity probe fails.
    """
    args = _parse_args()
    api = TweedeKamerAPI()
    db = MotionDatabase(args.db_path)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=args.days)
    print(
        f"Downloading motions from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
    )
    print(f"DB: {args.db_path}")
    print()

    # Test connectivity first
    test_url = f"{api.odata_base_url}/Stemming"
    try:
        r = api.session.get(test_url, params={"$top": 1}, timeout=10)
    except Exception as e:  # top-level boundary: report and abort cleanly
        print(f"ERROR: API request failed: {e}. Aborting.")
        sys.exit(1)
    if r.status_code != 200:
        print(f"ERROR: API returned {r.status_code}. Aborting.")
        sys.exit(1)
    print("✅ API connection OK\n")

    chunk_start = start_date
    chunk_num = 0
    total_fetched = 0
    total_inserted = 0
    total_duplicates = 0
    failed_chunks = 0

    while chunk_start < end_date:
        # Clamp the last window so it never extends past end_date.
        chunk_end = min(chunk_start + timedelta(days=args.chunk_days), end_date)
        chunk_num += 1
        label = f"{chunk_start.strftime('%Y-%m-%d')} → {chunk_end.strftime('%Y-%m-%d')}"
        print(f"[Chunk {chunk_num}] {label}")

        # Counters live outside the try block so that rows stored before a
        # mid-chunk failure are still reflected in the totals.
        inserted = 0
        duplicates = 0
        try:
            motions = api.get_motions(
                start_date=chunk_start,
                end_date=chunk_end,
                limit=args.limit_per_chunk,
                skip_details=True,
            )
            print(f" Fetched {len(motions)} motions")
            total_fetched += len(motions)
            for m in motions:
                if db.insert_motion(m):
                    inserted += 1
                else:
                    duplicates += 1
            print(f" Inserted {inserted} new | {duplicates} duplicates skipped")
        except Exception as e:
            # Best effort: report the failure and carry on with the next chunk.
            failed_chunks += 1
            print(f" ERROR: {e}")
        total_inserted += inserted
        total_duplicates += duplicates

        chunk_start = chunk_end
        if chunk_start < end_date:
            print(f" Waiting {args.delay}s before next chunk…")
            time.sleep(args.delay)
        print()

    print("=" * 50)
    print(f"Done. Total fetched: {total_fetched}")
    print(f" Inserted: {total_inserted}")
    print(f" Duplicates: {total_duplicates}")
    if failed_chunks:
        # Make skipped windows visible so an incomplete download is not mistaken
        # for a complete one.
        print(f" Failed chunks: {failed_chunks}")
    print("=" * 50)
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()