You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
112 lines
3.4 KiB
112 lines
3.4 KiB
"""download_past_year.py — One-shot data download: past year of parliamentary motions.

Fetches Stemming records from the OData API in quarterly chunks (90-day windows),
stores motions into data/motions.db using MotionDatabase.insert_motion().
Skips AI summarisation — this is a raw data fetch for the embedding pipeline.

Usage:
    uv run python scripts/download_past_year.py [--db-path data/motions.db] [--days 365]
"""
|
|
|
|
import argparse
import sys
import time
from datetime import datetime, timedelta

# Make project-local modules importable when the script is launched
# from the repository root (e.g. `uv run python scripts/...`).
sys.path.insert(0, ".")  # run from project root

from api_client import TweedeKamerAPI
from database import MotionDatabase
|
|
|
|
|
|
def main():
    """Download the past N days of motions into a local SQLite database.

    Walks the requested date window in fixed-size chunks (default 90 days).
    For each chunk, fetches motions via ``TweedeKamerAPI.get_motions()``
    (``skip_details=True`` — raw fetch, no AI summarisation) and inserts them
    with ``MotionDatabase.insert_motion()``; rows already present are counted
    as duplicates and skipped. A failed chunk is reported and skipped so one
    bad window does not abort the whole download. Exits with status 1 if the
    API cannot be reached. Prints a summary of totals when done.
    """
    parser = argparse.ArgumentParser(description="Download past year of motions")
    parser.add_argument("--db-path", default="data/motions.db")
    parser.add_argument(
        "--days", type=int, default=365, help="How many days back to fetch"
    )
    parser.add_argument("--chunk-days", type=int, default=90, help="Days per API chunk")
    parser.add_argument(
        "--limit-per-chunk",
        type=int,
        default=50000,
        help="Max motions (Besluit) per chunk",
    )
    parser.add_argument(
        "--delay", type=float, default=2.0, help="Seconds between chunks"
    )
    args = parser.parse_args()

    api = TweedeKamerAPI()
    db = MotionDatabase(args.db_path)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=args.days)

    print(
        f"Downloading motions from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
    )
    print(f"DB: {args.db_path}")
    print()

    # Test connectivity first. Without the try/except a connection error,
    # DNS failure, or timeout would surface as an unhandled traceback
    # instead of the clean abort path used for non-200 responses.
    test_url = f"{api.odata_base_url}/Stemming"
    try:
        r = api.session.get(test_url, params={"$top": 1}, timeout=10)
    except Exception as e:  # top-level boundary: report and abort cleanly
        print(f"ERROR: could not reach API ({e}). Aborting.")
        sys.exit(1)
    if r.status_code != 200:
        print(f"ERROR: API returned {r.status_code}. Aborting.")
        sys.exit(1)
    print("✅ API connection OK\n")

    chunk_start = start_date
    chunk_num = 0
    total_fetched = 0
    total_inserted = 0
    total_duplicates = 0

    while chunk_start < end_date:
        # Last chunk is clamped so we never query past end_date.
        chunk_end = min(chunk_start + timedelta(days=args.chunk_days), end_date)
        chunk_num += 1
        label = f"{chunk_start.strftime('%Y-%m-%d')} → {chunk_end.strftime('%Y-%m-%d')}"
        print(f"[Chunk {chunk_num}] {label}")

        try:
            motions = api.get_motions(
                start_date=chunk_start,
                end_date=chunk_end,
                limit=args.limit_per_chunk,
                skip_details=True,
            )
            print(f" Fetched {len(motions)} motions")
            total_fetched += len(motions)

            inserted = 0
            duplicates = 0
            for m in motions:
                # insert_motion() is truthy for a new row, falsy for an
                # already-stored motion (duplicate).
                if db.insert_motion(m):
                    inserted += 1
                else:
                    duplicates += 1

            total_inserted += inserted
            total_duplicates += duplicates
            print(f" Inserted {inserted} new | {duplicates} duplicates skipped")

        except Exception as e:
            # Best-effort per chunk: report and move on rather than abort.
            print(f" ERROR: {e}")

        chunk_start = chunk_end
        if chunk_start < end_date:
            # Be polite to the API between chunks.
            print(f" Waiting {args.delay}s before next chunk…")
            time.sleep(args.delay)

    print()
    print("=" * 50)
    print(f"Done. Total fetched: {total_fetched}")
    print(f" Inserted: {total_inserted}")
    print(f" Duplicates: {total_duplicates}")
    print("=" * 50)
|
|
|
|
|
|
# Script entry point: run the download when executed directly.
if __name__ == "__main__":
    main()
|
|
|