# scheduler.py (fixed infinite loop issue)
"""Scheduled and one-shot loading of motions from the API into the database.

Run with ``--once`` to perform a single update and exit; without arguments
the module starts a long-running scheduler loop.
"""
import time
from datetime import datetime, timedelta

import schedule
import duckdb

from api_client import TweedeKamerAPI
from summarizer import summarizer
from database import db
from config import config


class DataUpdateScheduler:
    """Fetch motions through ``TweedeKamerAPI`` and store them via ``db``."""

    def __init__(self):
        self.api_client = TweedeKamerAPI()

    def test_api_connection(self) -> bool:
        """Test API connection before proceeding.

        Returns:
            True when the API client reports a working connection.
        """
        print("Testing API connection...")
        if self.api_client.test_api_connection():
            print("✅ API connection successful")
            return True
        print("❌ API connection failed")
        return False

    def check_database_has_data(self) -> bool:
        """Check if database has any motion data.

        Returns:
            True when the ``motions`` table holds at least one row; False
            when it is empty or when the query fails for any reason.
        """
        try:
            conn = duckdb.connect(config.DATABASE_PATH)
            try:
                result = conn.execute("SELECT COUNT(*) FROM motions").fetchone()
            finally:
                # Close even when the query raises (e.g. the table is
                # missing) so the connection is never leaked.
                conn.close()
            return result[0] > 0 if result else False
        except Exception as e:
            print(f"Error checking database: {e}")
            return False

    def _insert_motions(self, motions, progress_every: int = 0):
        """Insert each motion via ``db.insert_motion`` and count the outcomes.

        Args:
            motions: Sequence of motions as returned by the API client.
            progress_every: Print a progress line every this many motions;
                0 disables progress output.

        Returns:
            ``(inserted, skipped)`` where ``skipped`` counts motions for
            which ``db.insert_motion`` returned a falsy value. Callers
            report these as duplicates.
            NOTE(review): a falsy result may also mean a failed insert;
            confirm against ``db.insert_motion``.
        """
        inserted = 0
        skipped = 0
        total = len(motions)
        for i, motion in enumerate(motions):
            if progress_every and i % progress_every == 0:
                print(f"Processing motion {i+1}/{total} ({((i+1)/total*100):.1f}%)")
            if db.insert_motion(motion):
                inserted += 1
            else:
                skipped += 1
        return inserted, skipped

    def _fetch_motions_in_chunks(self, start_date, end_date,
                                 chunk_days: int = 90, max_chunks: int = 10,
                                 limit: int = 250):
        """Fetch motions window by window between two dates.

        A failing chunk is reported and skipped; the remaining chunks are
        still fetched.

        Args:
            start_date: Start of the first window.
            end_date: End of the last window.
            chunk_days: Length of each window in days.
            max_chunks: Safety limit on the number of requests.
            limit: ``limit`` argument passed to the API for each window.

        Returns:
            List with the motions of all successfully fetched chunks.
        """
        all_motions = []
        current_date = start_date
        chunks_processed = 0

        while current_date < end_date and chunks_processed < max_chunks:
            chunk_end_date = min(current_date + timedelta(days=chunk_days), end_date)
            span = f"{current_date.strftime('%Y-%m-%d')} to {chunk_end_date.strftime('%Y-%m-%d')}"
            print(f"Fetching chunk {chunks_processed + 1}/{max_chunks}: {span}")

            try:
                chunk_motions = self.api_client.get_motions(
                    start_date=current_date,
                    end_date=chunk_end_date,
                    limit=limit,
                )
                if chunk_motions:
                    all_motions.extend(chunk_motions)
                    print(f"✅ Found {len(chunk_motions)} motions in this chunk (Total: {len(all_motions)})")
                else:
                    print(f"⚠️ No motions found in chunk {span}")
            except Exception as e:
                print(f"❌ Error fetching chunk {span}: {e}")

            # IMPORTANT: always advance the window, even after an error,
            # otherwise the loop would never terminate.
            current_date = chunk_end_date
            chunks_processed += 1

            # Pause between requests, but not after the final chunk.
            if chunks_processed < max_chunks and current_date < end_date:
                time.sleep(2)

        return all_motions

    def update_motions_data(self, days_back: int = 30, max_records: int = 1000):
        """Fetch new motions from API and update database.

        Args:
            days_back: How many days before now the fetch window starts.
            max_records: ``limit`` passed to the API request.

        Returns:
            True when motions were fetched and processed; False when the
            API is unreachable, returns nothing, or an error occurs.
        """
        print(f"Starting motion data update at {datetime.now()}")

        if not self.test_api_connection():
            return False

        try:
            start_date = datetime.now() - timedelta(days=days_back)
            motions = self.api_client.get_motions(
                start_date=start_date,
                limit=max_records,
            )
            print(f"Fetched {len(motions)} motions from API")

            if not motions:
                print("No motions received from API")
                return False

            successful_inserts, duplicate_count = self._insert_motions(motions)

            print(f"Successfully inserted {successful_inserts} new motions")
            if duplicate_count > 0:
                print(f"Skipped {duplicate_count} duplicate motions")

            # Summaries are only regenerated when something new was stored.
            if successful_inserts > 0:
                print("Generating AI summaries for new motions...")
                summarizer.update_motion_summaries()

            print("Motion data update completed successfully")
            return True
        except Exception as e:
            print(f"Error during motion data update: {e}")
            return False

    def initial_data_load(self):
        """Perform initial data load with comprehensive data.

        Loads the last 730 days in 90-day chunks and, when that yields
        nothing, retries once with a single 30-day request.

        Returns:
            True when at least one motion was inserted, otherwise False.
        """
        print("Performing initial comprehensive data load...")

        if not self.test_api_connection():
            return False

        try:
            # Capture "now" once so both ends of the window are consistent
            # and the window can never reach into the future.
            end_date = datetime.now()
            start_date = end_date - timedelta(days=730)
            print(f"Loading data from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

            all_motions = self._fetch_motions_in_chunks(
                start_date, end_date, chunk_days=90, max_chunks=10, limit=250
            )
            print(f"Data collection completed. Total motions fetched: {len(all_motions)}")

            if not all_motions:
                print("❌ No motions retrieved from API. This might be normal if the API doesn't have recent data.")
                print("💡 Try adjusting the date range or check if the API has data for the selected period.")

                # NOTE(review): the message says "broader", but 30 days is
                # narrower than the 730-day window above; the request
                # differs in being a single call without an end_date.
                print("🔄 Trying broader date range (last 30 days)...")
                fallback_start = datetime.now() - timedelta(days=30)
                fallback_motions = self.api_client.get_motions(
                    start_date=fallback_start,
                    limit=250,
                )
                if fallback_motions:
                    all_motions = fallback_motions
                    print(f"✅ Fallback successful: Found {len(fallback_motions)} motions")
                else:
                    print("❌ No data found even with broader date range")
                    return False

            print(f"Inserting {len(all_motions)} motions into database...")
            successful_inserts, duplicate_count = self._insert_motions(
                all_motions, progress_every=25
            )

            print(f"✅ Successfully inserted {successful_inserts} motions")
            if duplicate_count > 0:
                print(f"ℹ️ Skipped {duplicate_count} duplicate motions")

            if successful_inserts > 0:
                print("🤖 Generating AI summaries...")
                summarizer.update_motion_summaries()

            print("🎉 Initial data load completed!")
            return successful_inserts > 0
        except Exception as e:
            print(f"❌ Error during initial data load: {e}")
            return False

    def weekly_update_job(self):
        """Weekly job to update with new motions."""
        print(f"Starting weekly update job at {datetime.now()}")
        # Use smaller limits for regular updates
        self.update_motions_data(days_back=14, max_records=250)
        print("Weekly update job completed")

    def run_scheduler(self, poll_interval: int = 60):
        """Main scheduler function.

        Performs an initial load when the database is empty (otherwise
        offers a manual refresh), registers the recurring jobs and then
        blocks until interrupted with Ctrl+C.

        Args:
            poll_interval: Seconds to sleep between checks for due jobs.
                Kept short so jobs start close to their advertised time.
        """
        print("=" * 50)
        print("Dutch Political Compass Data Scheduler")
        print("=" * 50)

        has_data = self.check_database_has_data()
        print(f"Database has existing data: {has_data}")

        if not has_data:
            print("\n🔄 No data found in database. Running initial data load...")
            success = self.initial_data_load()
            if success:
                print("✅ Initial data load completed successfully!")
            else:
                print("❌ Initial data load failed or no data available.")
                print("💡 You may need to check the API or adjust the date range.")
                return
        else:
            print("✅ Database already contains motion data.")
            try:
                response = input("\nDo you want to fetch recent motions anyway? (y/n): ").lower().strip()
                if response in ['y', 'yes']:
                    print("🔄 Updating with recent motions...")
                    self.update_motions_data(days_back=7, max_records=250)
            except (KeyboardInterrupt, EOFError):
                # EOFError: stdin is closed (service, cron, container), so
                # nobody can answer the prompt; carry on with scheduling.
                print("\nSkipping manual update.")

        print("\n📅 Scheduling regular updates...")
        schedule.every().monday.at("02:00").do(self.weekly_update_job)
        schedule.every().thursday.at("14:00").do(
            self.update_motions_data, days_back=7, max_records=250
        )

        print("Jobs scheduled:")
        print("- Weekly motion update: Every Monday at 02:00")
        print("- Mid-week update: Every Thursday at 14:00")
        print(f"- API limit per request: {config.API_MAX_LIMIT} records")
        print("\n🔄 Scheduler is now running. Press Ctrl+C to stop.")

        try:
            while True:
                schedule.run_pending()
                time.sleep(poll_interval)
        except KeyboardInterrupt:
            print("\n👋 Scheduler stopped by user.")


def run_once():
    """Run data update once and exit."""
    scheduler = DataUpdateScheduler()
    print("Running one-time data update...")

    has_data = scheduler.check_database_has_data()
    if not has_data:
        print("No existing data found. Running initial data load...")
        scheduler.initial_data_load()
    else:
        print("Updating existing data with recent motions...")
        scheduler.update_motions_data(days_back=14, max_records=250)

    print("One-time update completed!")


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "--once":
        run_once()
    else:
        scheduler = DataUpdateScheduler()
        scheduler.run_scheduler()