
SNCF API MCP Server

by Kryzo
csv_station_finder.py (24.5 kB)
from typing import Dict, Any, List, Optional, Tuple, Set
import csv
import re
import os.path
import time
import json
import unicodedata
import math
import logging
from difflib import SequenceMatcher

# Configure logging with a file handler to prevent stdout interference
logger = logging.getLogger('csv_station_finder')

# Don't propagate logs to root handler (which might output to stdout)
logger.propagate = False

# Ensure handler only added once
if not logger.handlers:
    # Create logs directory if it doesn't exist
    os.makedirs('logs', exist_ok=True)
    file_handler = logging.FileHandler(os.path.join('logs', 'csv_station_finder.log'))
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)


class CSVStationFinder:
    """
    A class that efficiently finds train stations in France using a CSV database
    """

    def __init__(self, csv_file_path, encoding='utf-8'):
        """
        Initialize with the CSV file path

        Args:
            csv_file_path: Path to the CSV file containing station data
            encoding: File encoding (default: utf-8)
        """
        self.csv_file_path = csv_file_path
        self.encoding = encoding
        self.stations_by_city = {}   # City name -> list of stations
        self.stations_by_id = {}     # Station ID -> station data
        self.cities = set()          # Set of all city names (normalized)
        self.city_name_map = {}      # Normalized city name -> original city name
        self._cached_queries = {}    # Cache for search queries
        self._last_cache_cleanup = time.time()
        self._cache_expiry = 24 * 60 * 60  # Cache expiry in seconds (24 hours)

        # Try different encodings if the default one fails
        encodings_to_try = [encoding, 'utf-8', 'latin1', 'cp1252', 'iso-8859-1']
        for enc in encodings_to_try:
            try:
                logger.info(f"Trying encoding: {enc}")
                self.encoding = enc
                self.load_stations()
                if len(self.stations_by_id) > 0:
                    logger.info(f"Successfully loaded with encoding: {enc}")
                    break
            except Exception as e:
                logger.error(f"Failed with encoding {enc}: {str(e)}")

        # If we couldn't load any stations, add some hardcoded major stations
        if len(self.stations_by_id) == 0:
            logger.warning("Falling back to hardcoded stations")
            self._add_hardcoded_stations()

    def _normalize_text(self, text: str) -> str:
        """Normalize text for case-insensitive, accent-insensitive searching"""
        if not text:
            return ""
        # Convert to lowercase and strip spaces
        text = text.lower().strip()
        # Remove accents
        text = unicodedata.normalize('NFKD', text)
        text = ''.join([c for c in text if not unicodedata.combining(c)])
        # Replace special characters with spaces
        text = re.sub(r'[^\w\s]', ' ', text)
        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def _similarity_score(self, str1: str, str2: str) -> float:
        """Calculate the similarity between two strings (0-1 where 1 is identical)"""
        return SequenceMatcher(None, str1, str2).ratio()

    def load_stations(self) -> None:
        """Load station data from the CSV file"""
        try:
            logger.info(f"Loading CSV file from: {self.csv_file_path}")
            if not os.path.exists(self.csv_file_path):
                logger.error(f"File not found: {self.csv_file_path}")
                return

            with open(self.csv_file_path, 'r', encoding=self.encoding) as csvfile:
                reader = csv.DictReader(csvfile)
                if not reader.fieldnames:
                    logger.error("CSV file has no headers")
                    return

                # Don't log the entire fieldnames collection as it might cause JSON parsing issues
                logger.info("CSV headers read successfully")

                row_count = 0
                valid_stations = 0

                for row in reader:
                    row_count += 1

                    # Skip entries without coordinates
                    if not row.get('latitude') or not row.get('longitude'):
                        continue

                    # Extract data
                    station_id = row.get('id')
                    name = row.get('name', '')

                    # Handle potential parsing errors for numeric fields
                    try:
                        latitude = float(row.get('latitude', 0))
                        longitude = float(row.get('longitude', 0))
                    except (ValueError, TypeError):
                        logger.warning(f"Invalid coordinates for station ID: {station_id}")
                        continue

                    country = row.get('country', '')

                    # Handle boolean fields safely
                    is_city_val = row.get('is_city', '').upper()
                    is_main_val = row.get('is_main_station', '').upper()
                    is_city = is_city_val == 'TRUE' if is_city_val else False
                    is_main_station = is_main_val == 'TRUE' if is_main_val else False

                    parent_id = row.get('parent_station_id', '')

                    # Only include French stations
                    if country != 'FR':
                        continue

                    # Create station object
                    station = {
                        'id': f"stop_area:SNCF:{station_id}",  # Format to match SNCF API
                        'name': name,
                        'type': 'stop_area',
                        'coord': {'lat': latitude, 'lon': longitude},
                        'is_main_station': is_main_station,
                        'is_city': is_city,
                        'parent_id': parent_id if parent_id and parent_id != 'NA' else None
                    }

                    # Store in id lookup dictionary
                    self.stations_by_id[station_id] = station
                    valid_stations += 1

                    # Extract city name - either it's a city entry itself, or we extract from name
                    city_name = None
                    if is_city:
                        # This entry represents a city
                        city_name = name
                    elif ' (' in name and ')' in name.split(' (')[-1]:
                        # Extract city name from parentheses in station name: "Station Name (City)"
                        city_part = name.split(' (')[-1]
                        city_name = city_part[:-1]  # Remove the closing parenthesis
                    elif ' - ' in name:
                        # Handle format "City - Station"
                        city_name = name.split(' - ')[0]

                    # If we have identified a city name
                    if city_name:
                        normalized_city = self._normalize_text(city_name)

                        # Store the mapping from normalized to original city name
                        if normalized_city not in self.city_name_map:
                            self.city_name_map[normalized_city] = city_name

                        # Add to cities set
                        self.cities.add(normalized_city)

                        # Add to city lookup dictionary
                        if normalized_city not in self.stations_by_city:
                            self.stations_by_city[normalized_city] = []
                        self.stations_by_city[normalized_city].append(station)

                # Sort stations within each city by is_main_station flag
                for city, stations in self.stations_by_city.items():
                    self.stations_by_city[city] = sorted(
                        stations,
                        key=lambda s: (
                            0 if s['is_main_station'] else 1,  # Main stations first
                            0 if not s['parent_id'] else 1     # Parent stations before child stations
                        )
                    )

                logger.info(f"Loaded {valid_stations} stations in {len(self.cities)} cities")

        except Exception as e:
            logger.error(f"Error loading station data: {str(e)}")
            # Initialize with empty data
            self.stations_by_city = {}
            self.stations_by_id = {}
            self.cities = set()

    def _cleanup_cache(self, force: bool = False) -> None:
        """Clean up expired cache entries"""
        current_time = time.time()

        # Only clean up once per hour unless forced
        if not force and (current_time - self._last_cache_cleanup < 3600):
            return

        expired_keys = []

        # Find expired cache entries
        for key, (timestamp, _) in self._cached_queries.items():
            if current_time - timestamp > self._cache_expiry:
                expired_keys.append(key)

        # Remove expired entries
        for key in expired_keys:
            del self._cached_queries[key]

        # Update last cleanup time
        self._last_cache_cleanup = current_time

    def find_stations_by_city(self, city: str) -> List[Dict[str, Any]]:
        """
        Find all stations in a given city

        Args:
            city: City name

        Returns:
            List of station data dictionaries
        """
        # Clean up cache periodically
        self._cleanup_cache()

        normalized_city = self._normalize_text(city)

        # Check cache
        cache_key = f"city:{normalized_city}"
        if cache_key in self._cached_queries:
            timestamp, stations = self._cached_queries[cache_key]
            if time.time() - timestamp <= self._cache_expiry:
                return stations

        # Exact match
        if normalized_city in self.stations_by_city:
            result = self.stations_by_city[normalized_city]
            self._cached_queries[cache_key] = (time.time(), result)
            return result

        # Fuzzy matching for city names
        best_match = None
        best_score = 0
        for city_name in self.cities:
            score = self._similarity_score(normalized_city, city_name)
            if score > best_score and score > 0.8:  # Threshold for similarity
                best_score = score
                best_match = city_name

        if best_match:
            result = self.stations_by_city[best_match]
            self._cached_queries[cache_key] = (time.time(), result)
            return result

        # No match found
        self._cached_queries[cache_key] = (time.time(), [])
        return []

    def find_station_by_name(self, city: str, station_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Find a station by city and optionally station name

        Args:
            city: City name
            station_name: Optional specific station name (if None, returns main station in city)

        Returns:
            Station data or None if not found
        """
        # Clean up cache periodically
        self._cleanup_cache()

        # Create cache key
        normalized_city = self._normalize_text(city)
        normalized_station = self._normalize_text(station_name) if station_name else ""
        cache_key = f"city_station:{normalized_city}:{normalized_station}"

        # Check cache
        if cache_key in self._cached_queries:
            timestamp, station = self._cached_queries[cache_key]
            if time.time() - timestamp <= self._cache_expiry:
                return station

        # Find all stations in the city
        city_stations = self.find_stations_by_city(city)
        if not city_stations:
            self._cached_queries[cache_key] = (time.time(), None)
            return None

        # If no station name provided, return the main station
        if not station_name:
            # Find the main station (should be first due to sorting in load_stations)
            for station in city_stations:
                if station['is_main_station']:
                    self._cached_queries[cache_key] = (time.time(), station)
                    return station

            # If no main station found, return the first station
            self._cached_queries[cache_key] = (time.time(), city_stations[0])
            return city_stations[0]

        # Find station by name
        normalized_station_name = self._normalize_text(station_name)

        # First try exact match
        for station in city_stations:
            if self._normalize_text(station['name']) == normalized_station_name:
                self._cached_queries[cache_key] = (time.time(), station)
                return station

        # Then try partial match
        best_match = None
        best_score = 0
        for station in city_stations:
            score = self._similarity_score(normalized_station_name, self._normalize_text(station['name']))
            if score > best_score and score > 0.6:  # Threshold for similarity
                best_score = score
                best_match = station

        if best_match:
            self._cached_queries[cache_key] = (time.time(), best_match)
            return best_match

        # No match found
        self._cached_queries[cache_key] = (time.time(), None)
        return None

    def find_station_by_coordinates(self, lat: float, lon: float, max_distance_km: float = 2.0) -> Optional[Dict[str, Any]]:
        """
        Find the nearest station to given coordinates

        Args:
            lat: Latitude
            lon: Longitude
            max_distance_km: Maximum distance in kilometers (default: 2km)

        Returns:
            Nearest station data or None if no station within max_distance
        """
        # Clean up cache periodically
        self._cleanup_cache()

        # Create cache key
        cache_key = f"coord:{lat:.5f}:{lon:.5f}:{max_distance_km}"

        # Check cache
        if cache_key in self._cached_queries:
            timestamp, station = self._cached_queries[cache_key]
            if time.time() - timestamp <= self._cache_expiry:
                return station

        # Find nearest station
        nearest_station = None
        min_distance = float('inf')

        for station_id, station in self.stations_by_id.items():
            station_lat = station['coord']['lat']
            station_lon = station['coord']['lon']

            # Calculate distance using Haversine formula
            distance = self._haversine_distance(lat, lon, station_lat, station_lon)

            if distance < min_distance:
                min_distance = distance
                nearest_station = station

        # Check if the nearest station is within max_distance
        if nearest_station and min_distance <= max_distance_km:
            self._cached_queries[cache_key] = (time.time(), nearest_station)
            return nearest_station

        # No station found within max_distance
        self._cached_queries[cache_key] = (time.time(), None)
        return None

    def _haversine_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
        """
        Calculate the great circle distance between two points
        on the earth (specified in decimal degrees)

        Returns:
            Distance in kilometers
        """
        # Convert decimal degrees to radians
        lat1, lon1, lat2, lon2 = map(lambda x: x * math.pi / 180, [lat1, lon1, lat2, lon2])

        # Haversine formula
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
        c = 2 * math.asin(math.sqrt(a))
        r = 6371  # Radius of earth in kilometers

        return c * r

    def search_stations(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search for stations by name (across all cities)

        Args:
            query: Search query
            limit: Maximum number of results to return

        Returns:
            List of matching stations
        """
        # Clean up cache periodically
        self._cleanup_cache()

        # Create cache key
        normalized_query = self._normalize_text(query)
        cache_key = f"search:{normalized_query}:{limit}"

        # Check cache
        if cache_key in self._cached_queries:
            timestamp, stations = self._cached_queries[cache_key]
            if time.time() - timestamp <= self._cache_expiry:
                return stations

        # Search all stations
        matched_stations = []

        for station_id, station in self.stations_by_id.items():
            station_name = self._normalize_text(station['name'])

            # Calculate match score
            score = self._similarity_score(normalized_query, station_name)

            # Also check if query is a substring of station name
            if normalized_query in station_name:
                score += 0.2  # Bonus for substring match

            # Store if score is good enough
            if score > 0.5:  # Threshold for similarity
                matched_stations.append((score, station))

        # Sort by score (descending) and limit results
        results = [station for _, station in sorted(matched_stations, key=lambda x: x[0], reverse=True)[:limit]]

        # Cache results
        self._cached_queries[cache_key] = (time.time(), results)
        return results

    def get_all_cities(self) -> List[str]:
        """
        Get a list of all cities with train stations

        Returns:
            List of city names (original formatting, not normalized)
        """
        return [self.city_name_map.get(city, city) for city in self.cities]

    def find_journey_with_city_names(
        self,
        from_city: str,
        to_city: str,
        from_station: Optional[str] = None,
        to_station: Optional[str] = None
    ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
        """
        Find departure and arrival stations for a journey between two cities

        Args:
            from_city: Departure city
            to_city: Arrival city
            from_station: Optional specific departure station
            to_station: Optional specific arrival station

        Returns:
            Tuple of (departure_station, arrival_station), either may be None if not found
        """
        # Find departure station
        departure_station = self.find_station_by_name(from_city, from_station)

        # Find arrival station
        arrival_station = self.find_station_by_name(to_city, to_station)

        return departure_station, arrival_station

    def _add_hardcoded_stations(self):
        """Add hardcoded major stations as a fallback"""
        # Major stations data
        hardcoded_stations = [
            {"id": "87686006", "name": "Paris Gare de Lyon", "city": "Paris", "lat": 48.844, "lon": 2.373, "is_main": True},
            {"id": "87751008", "name": "Marseille Saint-Charles", "city": "Marseille", "lat": 43.303, "lon": 5.380, "is_main": True},
            {"id": "87747006", "name": "Grenoble", "city": "Grenoble", "lat": 45.192, "lon": 5.716, "is_main": True},
            {"id": "87722025", "name": "Lyon Part-Dieu", "city": "Lyon", "lat": 45.760, "lon": 4.860, "is_main": True},
            {"id": "87723197", "name": "Lyon Perrache", "city": "Lyon", "lat": 45.750, "lon": 4.826, "is_main": False},
            {"id": "87318964", "name": "Aix-en-Provence TGV", "city": "Aix en Provence", "lat": 43.455, "lon": 5.317, "is_main": True},
            {"id": "87611004", "name": "Versailles-Chantiers", "city": "Versailles", "lat": 48.7942, "lon": 2.1347, "is_main": True},
            {"id": "87711309", "name": "Versailles Rive Gauche", "city": "Versailles", "lat": 48.8031, "lon": 2.1271, "is_main": False},
            {"id": "87545210", "name": "Versailles Rive Droite", "city": "Versailles", "lat": 48.809, "lon": 2.134, "is_main": False},
            {"id": "87773002", "name": "Toulouse Matabiau", "city": "Toulouse", "lat": 43.611, "lon": 1.454, "is_main": True},
            {"id": "87756056", "name": "Nice Ville", "city": "Nice", "lat": 43.704, "lon": 7.262, "is_main": True},
        ]

        for station_data in hardcoded_stations:
            city = station_data["city"]
            normalized_city = self._normalize_text(city)

            # Create station object
            station = {
                'id': f"stop_area:SNCF:{station_data['id']}",
                'name': station_data["name"],
                'type': 'stop_area',
                'coord': {'lat': station_data["lat"], 'lon': station_data["lon"]},
                'is_main_station': station_data["is_main"],
                'is_city': False,
                'parent_id': None
            }

            # Add to dictionaries
            self.stations_by_id[station_data['id']] = station

            if normalized_city not in self.city_name_map:
                self.city_name_map[normalized_city] = city

            self.cities.add(normalized_city)

            if normalized_city not in self.stations_by_city:
                self.stations_by_city[normalized_city] = []
            self.stations_by_city[normalized_city].append(station)

        # Sort stations
        for city, stations in self.stations_by_city.items():
            self.stations_by_city[city] = sorted(
                stations,
                key=lambda s: (0 if s['is_main_station'] else 1)
            )

        logger.info(f"Added {len(hardcoded_stations)} hardcoded stations")
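A minimal usage sketch of the class above. The path "stations.csv" is hypothetical; the loader expects the columns read by load_stations() (id, name, latitude, longitude, country, is_city, is_main_station, parent_station_id). Because __init__ falls back to hardcoded major stations when no CSV can be loaded, the snippet also runs without a real data file.

    from csv_station_finder import CSVStationFinder

    # "stations.csv" is a hypothetical path for illustration
    finder = CSVStationFinder("stations.csv")

    # Main station of a city; lookup is case- and accent-insensitive
    paris_main = finder.find_station_by_name("paris")

    # Resolve both ends of a journey, optionally pinning a specific station
    departure, arrival = finder.find_journey_with_city_names(
        "Lyon", "Marseille", from_station="Part-Dieu"
    )

    # Fuzzy free-text search across all stations (tolerates the typo)
    for station in finder.search_stations("versaille", limit=3):
        print(station["id"], station["name"])

    # Nearest station within the default 2 km of a coordinate pair
    nearby = finder.find_station_by_coordinates(48.844, 2.373)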
