"""Web scraper for filmladder.nl Amsterdam cinemas."""
from __future__ import annotations
import re
from datetime import date, timedelta, time
import httpx
from bs4 import BeautifulSoup
from src.config import settings
from src.models import Cinema, Movie, Showtime
class ScrapingError(Exception):
    """Error signalling that a scraping operation did not succeed."""
# Dutch weekday names mapped to weekday indices (Monday == 0), the same
# numbering that date.weekday() uses.
DUTCH_DAYS = {
    name: index
    for index, name in enumerate(
        (
            "maandag",
            "dinsdag",
            "woensdag",
            "donderdag",
            "vrijdag",
            "zaterdag",
            "zondag",
        )
    )
}
def parse_rating(rating_text: str | None) -> float | None:
    """Extract the first number from rating text such as '7.6★'.

    Returns the number as a float, or None when the text is empty/None
    or contains no digits.
    """
    if not rating_text:
        return None

    found = re.search(r"(\d+\.?\d*)", rating_text)
    if found is None:
        return None

    try:
        return float(found.group(1))
    except ValueError:
        return None
def parse_time(time_str: str) -> time | None:
    """Convert an 'HH:MM' string (e.g. '21:00') into a time object.

    Returns None when the string is not exactly two colon-separated
    integers or when the values are out of range for a clock time.
    """
    pieces = time_str.strip().split(":")
    if len(pieces) != 2:
        return None

    try:
        return time(hour=int(pieces[0]), minute=int(pieces[1]))
    except (ValueError, IndexError):
        return None
def get_date_for_dutch_day(day_name: str, today: date) -> date:
    """Resolve a Dutch day label to a calendar date relative to *today*.

    'vandaag' maps to today and 'morgen' to tomorrow. A weekday name
    from DUTCH_DAYS maps to its next occurrence strictly after today
    (so today's own weekday name yields the date one week ahead).
    Any unrecognised label falls back to today.
    """
    key = day_name.lower().strip()

    if key == "vandaag":
        return today
    if key == "morgen":
        return today + timedelta(days=1)

    weekday_index = DUTCH_DAYS.get(key)
    if weekday_index is None:
        # Unknown label: fall back to today.
        return today

    # Map the weekday difference onto 1..7 so the result is always in
    # the future; a difference of 0 becomes 7 (next week).
    offset = (weekday_index - today.weekday() - 1) % 7 + 1
    return today + timedelta(days=offset)
async def fetch_html(url: str) -> str:
    """Download *url* and return the response body as text.

    The request uses the timeout from settings.request_timeout.

    Raises:
        ScrapingError: if the request fails or the server answers with
            an error status (any httpx.HTTPError).
    """
    client = httpx.AsyncClient(timeout=settings.request_timeout)
    async with client:
        try:
            reply = await client.get(url)
            reply.raise_for_status()
            return reply.text
        except httpx.HTTPError as exc:
            raise ScrapingError(f"Failed to fetch {url}: {exc}") from exc
def parse_cinema_data(html: str) -> list[Cinema]:
    """Parse cinemas, their movies and showtimes from a listing page.

    The document is scanned in order over h2/h3/h4/div elements:

    * an h2/h3 with non-empty text shorter than 100 characters starts a
      new cinema;
    * any other element whose text contains a rating such as '7.6★' is
      treated as a movie section; the title is taken from the first
      h4/h5/strong/b found inside the element's parent.

    If this pass yields no cinemas at all, parse_structured_cinemas is
    used as a fallback.

    Args:
        html: Raw HTML of the listing page.

    Returns:
        The cinemas found, each with its ``movies`` list populated.
    """
    soup = BeautifulSoup(html, "lxml")
    today = date.today()
    rating_pattern = re.compile(r"(\d+\.?\d*)\s*★")

    cinemas: list[Cinema] = []
    current_cinema: Cinema | None = None
    current_movies: list[Movie] = []

    # NOTE(review): nested divs each contain the same rating text, so one
    # movie may be recorded more than once here - confirm against the
    # real page markup before relying on uniqueness.
    for element in soup.find_all(["h2", "h3", "h4", "div"]):
        text = element.get_text(strip=True)

        # A short h2/h3 is taken to be a cinema name.
        if element.name in ("h2", "h3") and text and len(text) < 100:
            # Flush the cinema we were collecting movies for.
            if current_cinema is not None:
                current_cinema.movies = current_movies
                cinemas.append(current_cinema)
            current_cinema = Cinema(name=text, address=None)
            current_movies = []
            continue

        # Movies seen before the first cinema heading have no owner and
        # were always thrown away when the first heading reset the list,
        # so do not build them at all.
        if current_cinema is None:
            continue

        # Only elements mentioning a rating ("7.6★") are movie candidates.
        if not rating_pattern.search(text):
            continue

        parent = element.parent
        if parent is None:
            continue

        title_elem = parent.find(["h4", "h5", "strong", "b"])
        if title_elem is None:
            continue

        movie_title = title_elem.get_text(strip=True)
        # Reject empty titles and headings that merely repeat the cinema name.
        if not movie_title or movie_title == current_cinema.name:
            continue

        rating = parse_rating(text)
        showtimes = parse_showtimes_for_movie(
            element, current_cinema.name, movie_title, today
        )
        current_movies.append(
            Movie(title=movie_title, rating=rating, showtimes=showtimes)
        )

    # Flush the last cinema.
    if current_cinema is not None:
        current_cinema.movies = current_movies
        cinemas.append(current_cinema)

    # Nothing recognised: try the sibling-walking parser instead.
    if not cinemas:
        cinemas = parse_structured_cinemas(soup, today)

    return cinemas
def parse_showtimes_for_movie(element: BeautifulSoup, cinema_name: str, movie_title: str, today: date) -> list[Showtime]:
    """Extract showtimes for one movie from the text of *element*.

    Two passes are made over the element's text:

    1. explicit "<day> <word> <HH:MM>" sequences, e.g. "vandaag vr 21:00";
    2. every remaining "HH:MM" that has a Dutch day label somewhere in
       the 50 characters before it; the time is attached to the
       *nearest* preceding label.

    Args:
        element: Parsed HTML node whose text is searched.
        cinema_name: Cinema name stored on every returned showtime.
        movie_title: Movie title stored on every returned showtime.
        today: Reference date used to resolve relative day labels.

    Returns:
        Showtimes in discovery order, with at most one entry per
        (date, time) pair.
    """
    day_time_pattern = re.compile(
        r"(vandaag|morgen|maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\s+\w+\s+(\d{1,2}:\d{2})",
        re.IGNORECASE,
    )
    day_pattern = re.compile(
        r"(vandaag|morgen|maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)",
        re.IGNORECASE,
    )
    time_pattern = re.compile(r"(\d{1,2}:\d{2})")
    lookback_chars = 50  # how far back from a time to search for a day label

    text = element.get_text()
    showtimes: list[Showtime] = []
    # Cinema and movie are constant for this call, so (date, time) alone
    # identifies a showtime; a set makes the duplicate check O(1).
    seen: set[tuple[date, time]] = set()

    def add_showtime(day_name: str, time_str: str) -> None:
        """Record the showtime unless unparsable or already present."""
        showtime_time = parse_time(time_str)
        if showtime_time is None:
            return
        showtime_date = get_date_for_dutch_day(day_name, today)
        key = (showtime_date, showtime_time)
        if key in seen:
            return
        seen.add(key)
        showtimes.append(
            Showtime(
                cinema_name=cinema_name,
                movie_title=movie_title,
                showtime_date=showtime_date,
                showtime_time=showtime_time,
            )
        )

    # Pass 1: "vandaag vr 21:00" / "morgen za 20:55".
    for match in day_time_pattern.finditer(text):
        add_showtime(match.group(1), match.group(2))

    # Pass 2: standalone times following a day label.
    for time_match in time_pattern.finditer(text):
        start = time_match.start()
        window = text[max(0, start - lookback_chars):start]
        day_names = day_pattern.findall(window)
        if day_names:
            # Use the LAST label in the window: it is the one closest to
            # the time. Taking the first would pin "morgen" times onto an
            # earlier "vandaag" and invent showtimes.
            add_showtime(day_names[-1], time_match.group(1))

    return showtimes
def parse_structured_cinemas(soup: BeautifulSoup, today: date) -> list[Cinema]:
    """Fallback parser that treats each h2/h3 heading as a cinema section.

    For every heading with non-empty text of at most 100 characters, the
    sibling elements up to the next h2/h3 are inspected. A sibling whose
    text contains a rating such as '7.6★' contributes one movie, titled
    after the first h4/h5/strong/b/a inside it. Cinemas that end up with
    no movies are left out of the result.
    """

    def section_nodes(heading):
        """Yield the siblings after *heading* until the next h2/h3."""
        node = heading.find_next_sibling()
        while node and node.name not in ["h2", "h3"]:
            yield node
            node = node.find_next_sibling()

    results: list[Cinema] = []

    for heading in soup.find_all(["h2", "h3"]):
        cinema_name = heading.get_text(strip=True)
        if not cinema_name or len(cinema_name) > 100:
            continue

        cinema = Cinema(name=cinema_name, address=None)
        found_movies: list[Movie] = []

        for node in section_nodes(heading):
            node_text = node.get_text()
            # Only siblings that mention a rating are movie candidates.
            if not re.search(r"(\d+\.?\d*)\s*★", node_text):
                continue

            title_elem = node.find(["h4", "h5", "strong", "b", "a"])
            if not title_elem:
                continue

            movie_title = title_elem.get_text(strip=True)
            if not movie_title or movie_title == cinema_name:
                continue

            rating = parse_rating(node_text)
            showtimes = parse_showtimes_for_movie(node, cinema_name, movie_title, today)
            found_movies.append(
                Movie(title=movie_title, rating=rating, showtimes=showtimes)
            )

        cinema.movies = found_movies
        if cinema.movies:  # keep only cinemas that have movies
            results.append(cinema)

    return results
async def fetch_amsterdam_cinemas() -> list[Cinema]:
    """Download the Amsterdam listing page and return its parsed cinemas.

    The page URL is settings.base_url followed by settings.amsterdam_path.
    """
    page_html = await fetch_html(f"{settings.base_url}{settings.amsterdam_path}")
    return parse_cinema_data(page_html)