# scraper.py
"""Scraper for Tweede Kamer motion voting results.

Fetches the paginated motion listing, follows each motion's detail page,
and extracts title/description/date/policy area plus per-party voting
results, computing a winning margin for each motion.
"""
import random
import re
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from database import db
from config import config

# Hoisted patterns: compiled once at import instead of on every call.
_MOTION_LINK_RE = re.compile(r'/stemmingsuitslagen/')
# Matches either d-m-yyyy or yyyy-m-d style dates found in page text.
_DATE_RE = re.compile(r'\d{1,2}-\d{1,2}-\d{4}|\d{4}-\d{1,2}-\d{1,2}')

# Votes that count toward the margin; 'afwezig' (absent) is excluded.
_COUNTED_VOTES = ('voor', 'tegen')


class MotionScraper:
    """Scrapes motions and their per-party voting results from tweedekamer.nl."""

    def __init__(self):
        self.session = requests.Session()
        # Desktop browser UA: some sites serve different markup (or block)
        # unknown clients.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_motion_list(self, start_date: Optional[datetime] = None,
                           end_date: Optional[datetime] = None) -> List[str]:
        """Collect motion detail URLs from the paginated listing.

        Args:
            start_date: earliest date of interest (default: 2 years ago).
            end_date: latest date of interest (default: now).

        Returns:
            De-duplicated list of motion URLs in discovery order.
        """
        if start_date is None:
            start_date = datetime.now() - timedelta(days=730)  # 2 years ago
        if end_date is None:
            end_date = datetime.now()

        # NOTE(review): start_date/end_date are not currently applied as a
        # filter anywhere below; pagination stops only when a page yields no
        # motion links. Confirm whether server-side date filtering is needed.
        motion_urls: List[str] = []
        seen = set()  # O(1) membership test instead of scanning the list
        page = 1
        while True:
            try:
                url = f"{config.BASE_URL}?page={page}"
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')
                # Find motion links (adjust selectors based on actual HTML structure)
                motion_links = soup.find_all('a', href=_MOTION_LINK_RE)
                if not motion_links:
                    break  # ran past the last page

                for link in motion_links:
                    href = link.get('href')
                    if href and href not in seen:
                        seen.add(href)
                        motion_urls.append(href)

                page += 1
                time.sleep(config.SCRAPING_DELAY)  # be polite to the server
            except Exception as e:
                # Best-effort crawl: keep whatever we collected so far.
                print(f"Error scraping page {page}: {e}")
                break

        return motion_urls

    def parse_motion_detail(self, motion_url: str) -> Optional[Dict]:
        """Parse a motion detail page into a record dict.

        Returns None when the page cannot be fetched, lacks a title or
        voting results, or records no counted ('voor'/'tegen') votes.
        """
        try:
            full_url = (f"https://www.tweedekamer.nl{motion_url}"
                        if motion_url.startswith('/') else motion_url)
            response = self.session.get(full_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract motion data (adjust selectors based on actual HTML structure)
            title = self._extract_title(soup)
            description = self._extract_description(soup)
            date = self._extract_date(soup)
            policy_area = self._extract_policy_area(soup)
            voting_results = self._extract_voting_results(soup)

            # Title and voting results are mandatory; everything else has a
            # fallback value below.
            if not title or not voting_results:
                return None

            # Winning margin: |for - against| / counted votes.
            # 0.0 means a tie, 1.0 means unanimous among those present.
            total_votes = sum(
                1 for vote in voting_results.values() if vote in _COUNTED_VOTES
            )
            if total_votes == 0:
                return None
            votes_for = sum(
                1 for vote in voting_results.values() if vote == 'voor'
            )
            winning_margin = abs(votes_for - (total_votes - votes_for)) / total_votes

            return {
                'title': title,
                'description': description or '',
                'date': date,
                'policy_area': policy_area or 'Onbekend',
                'voting_results': voting_results,
                'winning_margin': winning_margin,
                'url': full_url,
            }
        except Exception as e:
            print(f"Error parsing motion {motion_url}: {e}")
            return None

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract motion title from the first matching common selector."""
        selectors = ['h1', '.motion-title', '.title', 'h2']
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)
        return None

    def _extract_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract motion description (first 3 matching elements, joined)."""
        # 'p' is a broad last-resort selector and may pick up unrelated
        # boilerplate paragraphs — tighten once real markup is known.
        selectors = ['.motion-description', '.description', '.content', 'p']
        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                return ' '.join(el.get_text(strip=True) for el in elements[:3])
        return None

    def _extract_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the first date-like string; fall back to today."""
        match = _DATE_RE.search(soup.get_text())
        if match:
            return match.group()
        # Fallback keeps the record insertable even when no date is found.
        return datetime.now().strftime('%Y-%m-%d')

    def _extract_policy_area(self, soup: BeautifulSoup) -> Optional[str]:
        """Classify the motion by first configured policy area found in text."""
        text = soup.get_text().lower()
        for area in config.POLICY_AREAS[1:]:  # Skip "Alle"
            if area.lower() in text:
                return area
        return "Algemeen"

    def _extract_voting_results(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract per-party votes: {party: 'voor'|'tegen'|'afwezig'}.

        Scans all tables for rows whose second cell is a recognized vote.
        This is a simplified extraction - you'll need to adjust based on
        actual HTML.
        """
        voting_results: Dict[str, str] = {}
        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) >= 2:
                    party = cells[0].get_text(strip=True)
                    vote = cells[1].get_text(strip=True).lower()
                    if vote in ['voor', 'tegen', 'afwezig']:
                        voting_results[party] = vote

        # Fallback: simulate some voting data for testing.
        # WARNING: this fabricates non-deterministic data — remove before
        # relying on scraped results in production.
        if not voting_results:
            parties = ['VVD', 'PVV', 'CDA', 'D66', 'GL', 'SP', 'PvdA',
                       'CU', 'PvdD', 'FVD', '50PLUS', 'SGP']
            for party in parties:
                voting_results[party] = random.choice(['voor', 'tegen', 'afwezig'])

        return voting_results

    def run_scraping_job(self):
        """Main scraping job: crawl the listing, parse and persist each motion."""
        print("Starting motion scraping...")
        motion_urls = self.scrape_motion_list()
        print(f"Found {len(motion_urls)} motion URLs")

        successful_scrapes = 0
        for i, url in enumerate(motion_urls):
            print(f"Processing motion {i+1}/{len(motion_urls)}: {url}")
            motion_data = self.parse_motion_detail(url)
            if motion_data:
                if db.insert_motion(motion_data):
                    successful_scrapes += 1
            time.sleep(config.SCRAPING_DELAY)  # throttle between detail pages

        print(f"Scraping completed. Successfully scraped {successful_scrapes} motions.")


# Module-level singleton used by the rest of the application.
scraper = MotionScraper()