You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
motief/scheduler.py

264 lines
11 KiB

# scheduler.py (fixed infinite loop issue)
import schedule
import time
import duckdb
from datetime import datetime, timedelta
from api_client import TweedeKamerAPI
from summarizer import summarizer
from database import db
from config import config
class DataUpdateScheduler:
    """Fetch motions from the Tweede Kamer API client and store them locally.

    The class offers a one-off initial load, an incremental update, and a
    blocking scheduler loop that runs the incremental update twice a week.
    All progress and errors are reported with ``print``; the data-loading
    methods return ``False`` instead of raising.
    """

    # Seconds between checks for due jobs in run_scheduler(). Jobs are pinned
    # to a specific minute ("02:00", "14:00"), so the poll interval bounds how
    # late they can fire.
    POLL_INTERVAL_SECONDS = 60

    def __init__(self):
        self.api_client = TweedeKamerAPI()

    def test_api_connection(self) -> bool:
        """Test API connection before proceeding.

        Returns:
            True when ``self.api_client.test_api_connection()`` is truthy,
            otherwise False.
        """
        print("Testing API connection...")
        if self.api_client.test_api_connection():
            print("✅ API connection successful")
            return True
        print("❌ API connection failed")
        return False

    def check_database_has_data(self) -> bool:
        """Check if the database has any motion data.

        Returns:
            True when the ``motions`` table holds at least one row. Any
            error (including a missing table) is printed and reported as
            False.
        """
        try:
            conn = duckdb.connect(config.DATABASE_PATH)
            try:
                row = conn.execute("SELECT COUNT(*) FROM motions").fetchone()
            finally:
                # Close even when the query fails so the connection (and any
                # lock it holds on the database file) is not leaked.
                conn.close()
            return row[0] > 0 if row else False
        except Exception as e:
            print(f"Error checking database: {e}")
            return False

    def _insert_motions(self, motions, progress_every: int = 0) -> tuple:
        """Insert each motion through ``db.insert_motion`` and count outcomes.

        Args:
            motions: Sized iterable of motion records as returned by the API
                client.
            progress_every: When non-zero, print a progress line for every
                ``progress_every``-th motion (starting with the first).

        Returns:
            ``(inserted, duplicates)``. A falsy result from
            ``db.insert_motion`` is counted as a duplicate, following the
            wording of the existing log messages; the helper's actual
            contract is not visible here.
        """
        inserted = 0
        duplicates = 0
        total = len(motions)
        for index, motion in enumerate(motions):
            if progress_every and index % progress_every == 0:
                print(f"Processing motion {index + 1}/{total} ({((index + 1) / total * 100):.1f}%)")
            if db.insert_motion(motion):
                inserted += 1
            else:
                duplicates += 1
        return inserted, duplicates

    def _fetch_motions_in_chunks(self, start_date, end_date, chunk_days: int = 90,
                                 max_chunks: int = 10, limit: int = 250,
                                 pause_seconds: float = 2) -> list:
        """Fetch motions between two dates in consecutive time windows.

        A failing window is logged and skipped so one bad request does not
        abort the whole load.

        Args:
            start_date: Start of the first window.
            end_date: End of the last window; no window extends past it.
            chunk_days: Length of each window in days.
            max_chunks: Upper bound on the number of windows requested.
            limit: ``limit`` argument passed to every API request.
            pause_seconds: Delay between consecutive requests.

        Returns:
            All motions collected across the windows, in request order.
        """
        # NOTE(review): each window's end is the next window's start, and
        # `limit` may truncate a busy window -- confirm how the API client
        # treats boundaries and limits.
        collected = []
        window_start = start_date
        chunks_done = 0
        while window_start < end_date and chunks_done < max_chunks:
            window_end = min(window_start + timedelta(days=chunk_days), end_date)
            span = f"{window_start.strftime('%Y-%m-%d')} to {window_end.strftime('%Y-%m-%d')}"
            print(f"Fetching chunk {chunks_done + 1}/{max_chunks}: {span}")
            try:
                chunk = self.api_client.get_motions(
                    start_date=window_start,
                    end_date=window_end,
                    limit=limit,
                )
                if chunk:
                    collected.extend(chunk)
                    print(f"✅ Found {len(chunk)} motions in this chunk (Total: {len(collected)})")
                else:
                    print(f" No motions found in chunk {span}")
            except Exception as e:
                print(f"❌ Error fetching chunk {span}: {e}")
            # Always advance, even after an error, so the loop terminates.
            window_start = window_end
            chunks_done += 1
            # Pause only when another request will follow.
            if chunks_done < max_chunks and window_start < end_date:
                time.sleep(pause_seconds)
        return collected

    def update_motions_data(self, days_back: int = 30, max_records: int = 1000):
        """Fetch new motions from the API and update the database.

        Args:
            days_back: How many days before now the request window starts.
            max_records: ``limit`` passed to the API request.

        Returns:
            True when motions were fetched and processed; False when the API
            is unreachable, returns nothing, or any step raises.
        """
        print(f"Starting motion data update at {datetime.now()}")
        if not self.test_api_connection():
            return False
        try:
            start_date = datetime.now() - timedelta(days=days_back)
            motions = self.api_client.get_motions(
                start_date=start_date,
                limit=max_records,
            )
            print(f"Fetched {len(motions)} motions from API")
            if not motions:
                print("No motions received from API")
                return False

            inserted, duplicates = self._insert_motions(motions)
            print(f"Successfully inserted {inserted} new motions")
            if duplicates > 0:
                print(f"Skipped {duplicates} duplicate motions")

            # Summaries are only regenerated when something new was stored.
            if inserted > 0:
                print("Generating AI summaries for new motions...")
                summarizer.update_motion_summaries()
            print("Motion data update completed successfully")
            return True
        except Exception as e:
            print(f"Error during motion data update: {e}")
            return False

    def initial_data_load(self):
        """Perform the initial load covering roughly the last two years.

        Data is requested in 90-day windows. If that yields nothing, a single
        request for the last 30 days is tried as a fallback.

        Returns:
            True when at least one motion was inserted; False when the API is
            unreachable, no data was found, nothing new was inserted, or any
            step raises.
        """
        print("Performing initial comprehensive data load...")
        if not self.test_api_connection():
            return False
        try:
            # One timestamp for both ends keeps the range from reaching into
            # the future.
            end_date = datetime.now()
            start_date = end_date - timedelta(days=730)
            print(f"Loading data from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

            all_motions = self._fetch_motions_in_chunks(start_date, end_date)
            print(f"Data collection completed. Total motions fetched: {len(all_motions)}")

            if not all_motions:
                print("❌ No motions retrieved from API. This might be normal if the API doesn't have recent data.")
                print("💡 Try adjusting the date range or check if the API has data for the selected period.")
                # The fallback is a single, unchunked request for a shorter
                # recent window (the old message called it "broader").
                print("🔄 Trying fallback request (last 30 days)...")
                fallback_start = datetime.now() - timedelta(days=30)
                fallback_motions = self.api_client.get_motions(
                    start_date=fallback_start,
                    limit=250,
                )
                if not fallback_motions:
                    print("❌ No data found even with fallback request")
                    return False
                all_motions = fallback_motions
                print(f"✅ Fallback successful: Found {len(fallback_motions)} motions")

            print(f"Inserting {len(all_motions)} motions into database...")
            inserted, duplicates = self._insert_motions(all_motions, progress_every=25)
            print(f"✅ Successfully inserted {inserted} motions")
            if duplicates > 0:
                print(f" Skipped {duplicates} duplicate motions")

            if inserted > 0:
                print("🤖 Generating AI summaries...")
                summarizer.update_motion_summaries()
            print("🎉 Initial data load completed!")
            return inserted > 0
        except Exception as e:
            print(f"❌ Error during initial data load: {e}")
            return False

    def weekly_update_job(self):
        """Weekly job: fetch motions from the last 14 days."""
        print(f"Starting weekly update job at {datetime.now()}")
        # Regular updates use a smaller window and limit than the defaults.
        self.update_motions_data(days_back=14, max_records=250)
        print("Weekly update job completed")

    def run_scheduler(self):
        """Bootstrap the database if needed, then run scheduled updates.

        Runs the initial load on an empty database and returns early if that
        load fails. On a populated database it offers an optional manual
        update. It then schedules the Monday and Thursday jobs and blocks
        until interrupted with Ctrl+C.
        """
        print("=" * 50)
        print("Dutch Political Compass Data Scheduler")
        print("=" * 50)

        has_data = self.check_database_has_data()
        print(f"Database has existing data: {has_data}")
        if not has_data:
            print("\n🔄 No data found in database. Running initial data load...")
            if self.initial_data_load():
                print("✅ Initial data load completed successfully!")
            else:
                print("❌ Initial data load failed or no data available.")
                print("💡 You may need to check the API or adjust the date range.")
                return
        else:
            print("✅ Database already contains motion data.")
            try:
                response = input("\nDo you want to fetch recent motions anyway? (y/n): ").lower().strip()
                if response in ['y', 'yes']:
                    print("🔄 Updating with recent motions...")
                    self.update_motions_data(days_back=7, max_records=250)
            except (KeyboardInterrupt, EOFError):
                # EOFError: no stdin is attached (service, container, cron).
                # Treat it like a declined prompt instead of crashing.
                print("\nSkipping manual update.")

        print("\n📅 Scheduling regular updates...")
        schedule.every().monday.at("02:00").do(self.weekly_update_job)
        schedule.every().thursday.at("14:00").do(
            self.update_motions_data, days_back=7, max_records=250
        )
        print("Jobs scheduled:")
        print("- Weekly motion update: Every Monday at 02:00")
        print("- Mid-week update: Every Thursday at 14:00")
        print(f"- API limit per request: {config.API_MAX_LIMIT} records")
        print("\n🔄 Scheduler is now running. Press Ctrl+C to stop.")
        try:
            while True:
                schedule.run_pending()
                time.sleep(self.POLL_INTERVAL_SECONDS)
        except KeyboardInterrupt:
            print("\n👋 Scheduler stopped by user.")
def run_once():
    """Perform a single data refresh and return.

    An empty database gets the full initial load; a populated one gets an
    incremental update covering the last 14 days.
    """
    updater = DataUpdateScheduler()
    print("Running one-time data update...")
    if updater.check_database_has_data():
        print("Updating existing data with recent motions...")
        updater.update_motions_data(days_back=14, max_records=250)
    else:
        print("No existing data found. Running initial data load...")
        updater.initial_data_load()
    print("One-time update completed!")
if __name__ == "__main__":
    import sys

    # "--once" as the first CLI argument selects a single update pass;
    # anything else starts the long-running scheduler.
    once_requested = sys.argv[1:2] == ["--once"]
    if once_requested:
        run_once()
    else:
        DataUpdateScheduler().run_scheduler()