You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
183 lines
7.0 KiB
183 lines
7.0 KiB
# scraper.py
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import time
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Optional
|
|
from database import db
|
|
from config import config
|
|
|
|
class MotionScraper:
    """Scrape motions and their per-party votes and store them via ``db``.

    The scraper walks the paginated listing at ``config.BASE_URL``, collects
    links whose ``href`` contains ``/stemmingsuitslagen/``, fetches each
    detail page and hands the parsed record to ``db.insert_motion``.

    Attributes:
        session: ``requests.Session`` reused for every HTTP request.
        simulate_missing_votes: When true, ``_extract_voting_results``
            invents random votes for pages where no vote table could be
            parsed.  Off by default so fabricated data never reaches the
            database; enable it only for local testing.
    """

    # Host used to turn site-relative motion links into absolute URLs.
    _SITE_ROOT = "https://www.tweedekamer.nl"

    # Links on the listing page that point at a motion's detail page.
    _MOTION_HREF_PATTERN = re.compile(r'/stemmingsuitslagen/')

    # Candidate date tokens: day-first (d-m-yyyy) or ISO-like (yyyy-m-d).
    _DATE_PATTERN = re.compile(r'\d{1,2}-\d{1,2}-\d{4}|\d{4}-\d{1,2}-\d{1,2}')
    _DATE_FORMATS = ('%d-%m-%Y', '%Y-%m-%d')

    # Vote values accepted from a table cell (compared lower-cased).
    _RECORDED_VOTES = ('voor', 'tegen', 'afwezig')

    # Parties used only when vote simulation is explicitly enabled.
    _SIMULATED_PARTIES = (
        'VVD', 'PVV', 'CDA', 'D66', 'GL', 'SP',
        'PvdA', 'CU', 'PvdD', 'FVD', '50PLUS', 'SGP',
    )

    # Class-level default so instances created without __init__ are safe.
    simulate_missing_votes = False

    def __init__(self, simulate_missing_votes: bool = False):
        """Create the HTTP session and set a browser-like User-Agent.

        Args:
            simulate_missing_votes: Opt in to random placeholder votes when
                a detail page yields no parsable vote table.
        """
        self.simulate_missing_votes = simulate_missing_votes
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_motion_list(self, start_date: Optional[datetime] = None,
                           end_date: Optional[datetime] = None) -> List[str]:
        """Collect motion detail URLs from the paginated listing.

        Pages are requested as ``{config.BASE_URL}?page=N`` starting at 1.
        Paging stops when a page has no motion links, when a page adds no
        URL that was not already seen (guards against sites that repeat the
        last page forever), or when any error occurs.

        Args:
            start_date: Accepted for interface compatibility; currently not
                used to filter the result.
            end_date: Accepted for interface compatibility; currently not
                used to filter the result.

        Returns:
            The distinct ``href`` values in first-seen order.  On error the
            URLs gathered so far are returned.
        """
        motion_urls: List[str] = []
        seen = set()  # O(1) duplicate check; the list keeps the order
        page = 1

        while True:
            try:
                url = f"{config.BASE_URL}?page={page}"
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Find motion links (adjust selectors based on actual HTML structure)
                motion_links = soup.find_all('a', href=self._MOTION_HREF_PATTERN)
                if not motion_links:
                    break

                added = 0
                for link in motion_links:
                    href = link.get('href')
                    if href and href not in seen:
                        seen.add(href)
                        motion_urls.append(href)
                        added += 1

                # A page that only repeats known links means we ran past the
                # end of the listing; stop instead of looping forever.
                if added == 0:
                    break

                page += 1
                time.sleep(config.SCRAPING_DELAY)

            except Exception as e:
                # Best-effort boundary: report and return what we have.
                print(f"Error scraping page {page}: {e}")
                break

        return motion_urls

    def parse_motion_detail(self, motion_url: str) -> Optional[Dict]:
        """Fetch one motion page and turn it into a record dict.

        Args:
            motion_url: Absolute URL, or a site-relative path starting
                with ``/`` which is resolved against the site root.

        Returns:
            A dict with keys ``title``, ``description``, ``date``,
            ``policy_area``, ``voting_results``, ``winning_margin`` and
            ``url``; or ``None`` when the title or votes are missing, when
            no party voted 'voor' or 'tegen', or when any error occurs.
        """
        try:
            if motion_url.startswith('/'):
                full_url = f"{self._SITE_ROOT}{motion_url}"
            else:
                full_url = motion_url

            response = self.session.get(full_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract motion data (adjust selectors based on actual HTML structure)
            title = self._extract_title(soup)
            description = self._extract_description(soup)
            motion_date = self._extract_date(soup)
            policy_area = self._extract_policy_area(soup)
            voting_results = self._extract_voting_results(soup)

            if not title or not voting_results:
                return None

            # Margin is counted per party entry, ignoring 'afwezig'.
            votes_for = sum(1 for vote in voting_results.values() if vote == 'voor')
            votes_against = sum(1 for vote in voting_results.values() if vote == 'tegen')
            total_votes = votes_for + votes_against
            if total_votes == 0:
                return None

            winning_margin = abs(votes_for - votes_against) / total_votes

            return {
                'title': title,
                'description': description or '',
                'date': motion_date,
                'policy_area': policy_area or 'Onbekend',
                'voting_results': voting_results,
                'winning_margin': winning_margin,
                'url': full_url,
            }

        except Exception as e:
            # Best-effort boundary: one bad page must not abort the job.
            print(f"Error parsing motion {motion_url}: {e}")
            return None

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Return the first non-empty title text found, else ``None``.

        Selectors are tried in priority order; a match whose text is empty
        is skipped so a blank ``<h1>`` does not hide a later real title.
        """
        for selector in ('h1', '.motion-title', '.title', 'h2'):
            element = soup.select_one(selector)
            if element:
                text = element.get_text(strip=True)
                if text:
                    return text
        return None

    def _extract_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Return text of up to three elements of the first matching selector.

        Returns ``None`` when none of the selectors matches anything.
        """
        for selector in ('.motion-description', '.description', '.content', 'p'):
            elements = soup.select(selector)
            if elements:
                return ' '.join(el.get_text(strip=True) for el in elements[:3])
        return None

    def _extract_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Return the first valid date in the page text as ``YYYY-MM-DD``.

        Both ``d-m-yyyy`` and ``yyyy-m-d`` tokens are recognised and
        normalised to ISO format so callers always get one representation.
        Tokens that are not real calendar dates are skipped.  When the page
        contains no valid date, today's date is returned.
        """
        text = soup.get_text()
        for match in self._DATE_PATTERN.finditer(text):
            token = match.group()
            for fmt in self._DATE_FORMATS:
                try:
                    parsed = datetime.strptime(token, fmt)
                except ValueError:
                    continue
                return parsed.strftime('%Y-%m-%d')
        # NOTE: fallback kept from the original contract; the value is the
        # scrape date, not the motion's date.
        return datetime.now().strftime('%Y-%m-%d')

    def _extract_policy_area(self, soup: BeautifulSoup) -> Optional[str]:
        """Return the first configured policy area mentioned in the page.

        Matching is a case-insensitive substring search over the page text;
        ``"Algemeen"`` is returned when no area is mentioned.
        """
        text = soup.get_text().lower()
        for area in config.POLICY_AREAS[1:]:  # Skip "Alle"
            if area.lower() in text:
                return area
        return "Algemeen"

    def _extract_voting_results(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Map party name to vote ('voor', 'tegen' or 'afwezig').

        Every table row with at least two cells is inspected: the first cell
        is the party, the second the vote.  Rows whose second cell is not a
        recognised vote (for example header rows) are ignored.

        Returns:
            The parsed votes.  When nothing was parsed the result is empty,
            unless ``simulate_missing_votes`` is enabled, in which case
            random placeholder votes are generated for testing.
        """
        # This is a simplified extraction - you'll need to adjust based on actual HTML
        voting_results: Dict[str, str] = {}

        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 2:
                    continue
                party = cells[0].get_text(strip=True)
                vote = cells[1].get_text(strip=True).lower()
                if vote in self._RECORDED_VOTES:
                    voting_results[party] = vote

        # Test-only fallback: never fabricate votes unless explicitly asked.
        if not voting_results and self.simulate_missing_votes:
            import random
            for party in self._SIMULATED_PARTIES:
                voting_results[party] = random.choice(['voor', 'tegen', 'afwezig'])

        return voting_results

    def run_scraping_job(self):
        """Scrape the listing, parse every motion and store the results.

        Progress is printed to stdout.  A pause of ``config.SCRAPING_DELAY``
        seconds follows every detail request.
        """
        print("Starting motion scraping...")

        motion_urls = self.scrape_motion_list()
        print(f"Found {len(motion_urls)} motion URLs")

        successful_scrapes = 0
        for i, url in enumerate(motion_urls):
            print(f"Processing motion {i+1}/{len(motion_urls)}: {url}")

            motion_data = self.parse_motion_detail(url)
            if motion_data and db.insert_motion(motion_data):
                successful_scrapes += 1

            time.sleep(config.SCRAPING_DELAY)

        print(f"Scraping completed. Successfully scraped {successful_scrapes} motions.")
|
|
|
|
# Module-level scraper instance, created when this module is imported.
scraper = MotionScraper()
|
|
|