Filmladder MCP Server

scraper.py•8.76 KiB

"""Web scraper for filmladder.nl Amsterdam cinemas.""" from __future__ import annotations import re from datetime import date, timedelta, time import httpx from bs4 import BeautifulSoup from src.config import settings from src.models import Cinema, Movie, Showtime class ScrapingError(Exception): """Raised when scraping fails.""" pass # Dutch day names mapping DUTCH_DAYS = { "maandag": 0, "dinsdag": 1, "woensdag": 2, "donderdag": 3, "vrijdag": 4, "zaterdag": 5, "zondag": 6, } def parse_rating(rating_text: str | None) -> float | None: """Parse rating from text like '7.6★' or '8.0★'.""" if not rating_text: return None match = re.search(r"(\d+\.?\d*)", rating_text) if match: try: return float(match.group(1)) except ValueError: return None return None def parse_time(time_str: str) -> time | None: """Parse time string like '21:00' or '18:30'.""" try: parts = time_str.strip().split(":") if len(parts) == 2: hour = int(parts[0]) minute = int(parts[1]) return time(hour=hour, minute=minute) except (ValueError, IndexError): pass return None def get_date_for_dutch_day(day_name: str, today: date) -> date: """Convert Dutch day name to actual date.""" day_name_lower = day_name.lower().strip() if day_name_lower == "vandaag": return today if day_name_lower == "morgen": return today + timedelta(days=1) # Handle day names (maandag, dinsdag, etc.) 
if day_name_lower in DUTCH_DAYS: target_weekday = DUTCH_DAYS[day_name_lower] days_ahead = target_weekday - today.weekday() if days_ahead <= 0: days_ahead += 7 # Next week return today + timedelta(days=days_ahead) # If we can't parse it, return today as fallback return today async def fetch_html(url: str) -> str: """Fetch HTML content from URL.""" async with httpx.AsyncClient(timeout=settings.request_timeout) as client: try: response = await client.get(url) response.raise_for_status() return response.text except httpx.HTTPError as e: raise ScrapingError(f"Failed to fetch {url}: {e}") from e def parse_cinema_data(html: str) -> list[Cinema]: """Parse cinema data from HTML.""" soup = BeautifulSoup(html, "lxml") cinemas: list[Cinema] = [] today = date.today() # Find all cinema sections # Based on the HTML structure, cinemas appear to be in sections # We'll look for cinema names and their associated movies cinema_sections = soup.find_all(["h2", "h3"], string=re.compile(r".+")) current_cinema: Cinema | None = None current_movies: list[Movie] = [] for element in soup.find_all(["h2", "h3", "h4", "div"]): text = element.get_text(strip=True) # Check if this is a cinema name (usually in h2 or h3) if element.name in ["h2", "h3"] and text and len(text) < 100: # Save previous cinema if exists if current_cinema: current_cinema.movies = current_movies cinemas.append(current_cinema) # Start new cinema current_cinema = Cinema(name=text, address=None) current_movies = [] continue # Look for movie titles and ratings # Movies typically have ratings like "7.6★" nearby rating_match = re.search(r"(\d+\.?\d*)\s*★", text) if rating_match: # This might be a movie section # Try to find the movie title (usually before the rating) # Or look for movie title in nearby elements parent = element.parent if parent: # Look for movie title in siblings or parent movie_title_elem = parent.find(["h4", "h5", "strong", "b"]) if movie_title_elem: movie_title = movie_title_elem.get_text(strip=True) rating = 
parse_rating(text) if movie_title and movie_title != current_cinema.name if current_cinema else True: # Create movie and parse showtimes showtimes = parse_showtimes_for_movie(element, current_cinema.name if current_cinema else "", movie_title, today) movie = Movie(title=movie_title, rating=rating, showtimes=showtimes) current_movies.append(movie) # Add last cinema if current_cinema: current_cinema.movies = current_movies cinemas.append(current_cinema) # Alternative parsing approach: look for structured patterns if not cinemas: cinemas = parse_structured_cinemas(soup, today) return cinemas def parse_showtimes_for_movie(element: BeautifulSoup, cinema_name: str, movie_title: str, today: date) -> list[Showtime]: """Parse showtimes for a movie from HTML element.""" showtimes: list[Showtime] = [] text = element.get_text() # Look for day names and times in the text # Pattern: "vandaag vr 21:00" or "morgen za 20:55" day_time_pattern = r"(vandaag|morgen|maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\s+\w+\s+(\d{1,2}:\d{2})" matches = re.finditer(day_time_pattern, text, re.IGNORECASE) for match in matches: day_name = match.group(1) time_str = match.group(2) showtime_date = get_date_for_dutch_day(day_name, today) showtime_time = parse_time(time_str) if showtime_time: showtime = Showtime( cinema_name=cinema_name, movie_title=movie_title, showtime_date=showtime_date, showtime_time=showtime_time, ) showtimes.append(showtime) # Also look for standalone times after day names # Find all time patterns time_pattern = r"(\d{1,2}:\d{2})" time_matches = re.finditer(time_pattern, text) # Try to associate times with nearby day names for time_match in time_matches: time_str = time_match.group(1) # Look backwards in text for day name text_before = text[: time_match.start()] day_match = re.search( r"(vandaag|morgen|maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)", text_before[-50:], re.IGNORECASE, ) if day_match: day_name = day_match.group(1) showtime_date = 
get_date_for_dutch_day(day_name, today) showtime_time = parse_time(time_str) if showtime_time: showtime = Showtime( cinema_name=cinema_name, movie_title=movie_title, showtime_date=showtime_date, showtime_time=showtime_time, ) # Avoid duplicates if showtime not in showtimes: showtimes.append(showtime) return showtimes def parse_structured_cinemas(soup: BeautifulSoup, today: date) -> list[Cinema]: """Alternative parsing: look for structured cinema/movie patterns.""" cinemas: list[Cinema] = [] # Look for cinema headings (h2 or h3 with cinema names) for heading in soup.find_all(["h2", "h3"]): cinema_name = heading.get_text(strip=True) if not cinema_name or len(cinema_name) > 100: continue cinema = Cinema(name=cinema_name, address=None) movies: list[Movie] = [] # Look for movies in the section following this heading current = heading.find_next_sibling() while current and current.name not in ["h2", "h3"]: # Look for movie titles and ratings text = current.get_text() rating_match = re.search(r"(\d+\.?\d*)\s*★", text) if rating_match: # Find movie title (might be in a heading or bold) title_elem = current.find(["h4", "h5", "strong", "b", "a"]) if title_elem: movie_title = title_elem.get_text(strip=True) if movie_title and movie_title != cinema_name: rating = parse_rating(text) showtimes = parse_showtimes_for_movie(current, cinema_name, movie_title, today) movie = Movie(title=movie_title, rating=rating, showtimes=showtimes) movies.append(movie) current = current.find_next_sibling() cinema.movies = movies if cinema.movies: # Only add cinemas with movies cinemas.append(cinema) return cinemas async def fetch_amsterdam_cinemas() -> list[Cinema]: """Fetch and parse all cinemas in Amsterdam from filmladder.nl.""" url = f"{settings.base_url}{settings.amsterdam_path}" html = await fetch_html(url) return parse_cinema_data(html)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/danielsteman/filmladder-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

scraper.py•8.76 KiB