|
|
# scheduler.py (fixed infinite loop issue)
|
|
|
import schedule
|
|
|
import time
|
|
|
import duckdb
|
|
|
from datetime import datetime, timedelta
|
|
|
from api_client import TweedeKamerAPI
|
|
|
from summarizer import summarizer
|
|
|
from database import db
|
|
|
from config import config
|
|
|
|
|
|
class DataUpdateScheduler:
    """Fetches Tweede Kamer motion data via the API client, stores it in the
    local database, triggers AI summarization, and runs scheduled refreshes.

    All user feedback is emitted via ``print`` — this class is intended to be
    driven from the command line (see ``run_scheduler`` / ``run_once``).
    """

    def __init__(self):
        # Single API client instance reused by every fetch in this scheduler.
        self.api_client = TweedeKamerAPI()

    def test_api_connection(self) -> bool:
        """Test API connection before proceeding.

        Returns:
            True when the API client reports a successful connection,
            False otherwise. Prints a status line either way.
        """
        print("Testing API connection...")
        if self.api_client.test_api_connection():
            print("✅ API connection successful")
            return True
        else:
            print("❌ API connection failed")
            return False

    def check_database_has_data(self) -> bool:
        """Check if database has any motion data.

        Returns:
            True when the ``motions`` table contains at least one row;
            False when it is empty or any database error occurs
            (e.g. missing file/table — the error is printed, not raised).
        """
        try:
            conn = duckdb.connect(config.DATABASE_PATH)
            # FIX: close the connection even when the query raises
            # (previously the close was skipped on exception, leaking the
            # connection handle / file lock).
            try:
                result = conn.execute("SELECT COUNT(*) FROM motions").fetchone()
            finally:
                conn.close()
            return result[0] > 0 if result else False
        except Exception as e:
            print(f"Error checking database: {e}")
            return False

    def update_motions_data(self, days_back: int = 30, max_records: int = 1000):
        """Fetch new motions from API and update database.

        Args:
            days_back: How far back (in days) from now to request motions.
            max_records: Maximum number of records requested from the API.

        Returns:
            True on a successful update (even if all rows were duplicates),
            False when the API is unreachable, returns nothing, or an
            exception occurs (printed, not raised).
        """
        print(f"Starting motion data update at {datetime.now()}")

        if not self.test_api_connection():
            return False

        try:
            # Fetch recent motions from API (respecting API limits)
            start_date = datetime.now() - timedelta(days=days_back)
            motions = self.api_client.get_motions(
                start_date=start_date,
                limit=max_records
            )
            print(f"Fetched {len(motions)} motions from API")

            if not motions:
                print("No motions received from API")
                return False

            # Insert new motions into database; insert_motion returning a
            # falsy value is treated as "already present" (duplicate).
            successful_inserts = 0
            duplicate_count = 0

            for motion in motions:
                if db.insert_motion(motion):
                    successful_inserts += 1
                else:
                    duplicate_count += 1

            print(f"Successfully inserted {successful_inserts} new motions")
            if duplicate_count > 0:
                print(f"Skipped {duplicate_count} duplicate motions")

            # Generate AI summaries for new motions (only if we have new data)
            if successful_inserts > 0:
                print("Generating AI summaries for new motions...")
                summarizer.update_motion_summaries()

            print("Motion data update completed successfully")
            return True

        except Exception as e:
            print(f"Error during motion data update: {e}")
            return False

    def initial_data_load(self):
        """Perform initial data load with comprehensive data.

        Walks backwards up to ~2 years in 90-day chunks (capped at
        ``max_chunks`` chunks as an infinite-loop guard), falls back to a
        30-day window when nothing is found, inserts everything, and then
        generates summaries.

        Returns:
            True when at least one new motion was inserted, False otherwise.
        """
        print("Performing initial comprehensive data load...")

        if not self.test_api_connection():
            return False

        try:
            # Start from 2 years ago but make sure we don't go into the future
            start_date = datetime.now() - timedelta(days=730)
            end_date = datetime.now()

            print(f"Loading data from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

            # Use a single request for recent data first, then expand if needed
            chunk_days = 90  # 3-month chunks
            current_date = start_date
            all_motions = []
            chunks_processed = 0
            max_chunks = 10  # Safety limit to prevent infinite loops

            while current_date < end_date and chunks_processed < max_chunks:
                chunk_end_date = min(current_date + timedelta(days=chunk_days), end_date)

                print(f"Fetching chunk {chunks_processed + 1}/{max_chunks}: {current_date.strftime('%Y-%m-%d')} to {chunk_end_date.strftime('%Y-%m-%d')}")

                try:
                    # Fetch data for this time chunk
                    chunk_motions = self.api_client.get_motions(
                        start_date=current_date,
                        end_date=chunk_end_date,
                        limit=250  # Reasonable limit per chunk
                    )

                    if chunk_motions:
                        all_motions.extend(chunk_motions)
                        print(f"✅ Found {len(chunk_motions)} motions in this chunk (Total: {len(all_motions)})")
                    else:
                        print(f"⚠️ No motions found in chunk {current_date.strftime('%Y-%m-%d')} to {chunk_end_date.strftime('%Y-%m-%d')}")

                except Exception as e:
                    print(f"❌ Error fetching chunk {current_date.strftime('%Y-%m-%d')} to {chunk_end_date.strftime('%Y-%m-%d')}: {e}")

                # IMPORTANT: Always increment the date to avoid infinite loop
                current_date = chunk_end_date
                chunks_processed += 1

                # Add delay between chunks (skip after the final chunk)
                if chunks_processed < max_chunks and current_date < end_date:
                    time.sleep(2)

            print(f"Data collection completed. Total motions fetched: {len(all_motions)}")

            if not all_motions:
                print("❌ No motions retrieved from API. This might be normal if the API doesn't have recent data.")
                print("💡 Try adjusting the date range or check if the API has data for the selected period.")

                # Try a broader date range as fallback
                print("🔄 Trying broader date range (last 30 days)...")
                fallback_start = datetime.now() - timedelta(days=30)
                fallback_motions = self.api_client.get_motions(
                    start_date=fallback_start,
                    limit=250
                )

                if fallback_motions:
                    all_motions = fallback_motions
                    print(f"✅ Fallback successful: Found {len(fallback_motions)} motions")
                else:
                    print("❌ No data found even with broader date range")
                    return False

            # Insert all motions with progress tracking
            successful_inserts = 0
            duplicate_count = 0

            print(f"Inserting {len(all_motions)} motions into database...")

            for i, motion in enumerate(all_motions):
                if i % 25 == 0:  # Progress indicator every 25 motions
                    print(f"Processing motion {i+1}/{len(all_motions)} ({((i+1)/len(all_motions)*100):.1f}%)")

                if db.insert_motion(motion):
                    successful_inserts += 1
                else:
                    duplicate_count += 1

            print(f"✅ Successfully inserted {successful_inserts} motions")
            if duplicate_count > 0:
                print(f"ℹ️ Skipped {duplicate_count} duplicate motions")

            # Generate summaries if we have data
            if successful_inserts > 0:
                print("🤖 Generating AI summaries...")
                summarizer.update_motion_summaries()

            print("🎉 Initial data load completed!")
            return successful_inserts > 0

        except Exception as e:
            print(f"❌ Error during initial data load: {e}")
            return False

    def weekly_update_job(self):
        """Weekly job to update with new motions (small, recent window)."""
        print(f"Starting weekly update job at {datetime.now()}")
        # Use smaller limits for regular updates
        self.update_motions_data(days_back=14, max_records=250)
        print("Weekly update job completed")

    def run_scheduler(self):
        """Main scheduler function.

        Bootstraps the database when empty (aborting if the initial load
        fails), optionally runs a manual refresh, registers the recurring
        jobs, and then loops forever running pending jobs once an hour
        until interrupted with Ctrl+C.
        """
        print("=" * 50)
        print("Dutch Political Compass Data Scheduler")
        print("=" * 50)

        # Check if database has data
        has_data = self.check_database_has_data()
        print(f"Database has existing data: {has_data}")

        if not has_data:
            print("\n🔄 No data found in database. Running initial data load...")
            success = self.initial_data_load()
            if success:
                print("✅ Initial data load completed successfully!")
            else:
                print("❌ Initial data load failed or no data available.")
                print("💡 You may need to check the API or adjust the date range.")
                return
        else:
            print("✅ Database already contains motion data.")

            # Ask if user wants to update anyway
            try:
                response = input("\nDo you want to fetch recent motions anyway? (y/n): ").lower().strip()
                if response in ['y', 'yes']:
                    print("🔄 Updating with recent motions...")
                    self.update_motions_data(days_back=7, max_records=250)
            except KeyboardInterrupt:
                print("\nSkipping manual update.")

        # Schedule regular updates
        print("\n📅 Scheduling regular updates...")
        schedule.every().monday.at("02:00").do(self.weekly_update_job)
        schedule.every().thursday.at("14:00").do(lambda: self.update_motions_data(days_back=7, max_records=250))

        print("Jobs scheduled:")
        print("- Weekly motion update: Every Monday at 02:00")
        print("- Mid-week update: Every Thursday at 14:00")
        print(f"- API limit per request: {config.API_MAX_LIMIT} records")
        print("\n🔄 Scheduler is now running. Press Ctrl+C to stop.")

        try:
            while True:
                schedule.run_pending()
                time.sleep(3600)  # Check every hour
        except KeyboardInterrupt:
            print("\n👋 Scheduler stopped by user.")
|
|
|
|
|
|
def run_once():
    """Perform a single data update pass and exit.

    When the database is empty, bootstraps it with a full initial load;
    otherwise fetches only recent motions (last 14 days, up to 250 records).
    """
    job = DataUpdateScheduler()

    print("Running one-time data update...")

    if job.check_database_has_data():
        print("Updating existing data with recent motions...")
        job.update_motions_data(days_back=14, max_records=250)
    else:
        print("No existing data found. Running initial data load...")
        job.initial_data_load()

    print("One-time update completed!")
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # "--once" as the first CLI argument runs a single update pass;
    # any other invocation starts the long-running scheduler loop.
    once_requested = len(sys.argv) > 1 and sys.argv[1] == "--once"

    if once_requested:
        run_once()
    else:
        DataUpdateScheduler().run_scheduler()
|
|
|
|