# scraper.py
"""Scrape NFL transaction data from prosportstransactions.com, with filters
for team, player, date range, and transaction type."""
import datetime
import logging
import time
from io import StringIO
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Set up logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Constants
BASE_URL = "https://www.prosportstransactions.com/football/Search/SearchResults.php"
TEAM_MAPPING = {
"49ers": "San Francisco 49ers",
"Bears": "Chicago Bears",
"Bengals": "Cincinnati Bengals",
"Bills": "Buffalo Bills",
"Broncos": "Denver Broncos",
"Browns": "Cleveland Browns",
"Buccaneers": "Tampa Bay Buccaneers",
"Cardinals": "Arizona Cardinals",
"Chargers": "Los Angeles Chargers",
"Chiefs": "Kansas City Chiefs",
"Colts": "Indianapolis Colts",
"Cowboys": "Dallas Cowboys",
"Dolphins": "Miami Dolphins",
"Eagles": "Philadelphia Eagles",
"Falcons": "Atlanta Falcons",
"Giants": "New York Giants",
"Jaguars": "Jacksonville Jaguars",
"Jets": "New York Jets",
"Lions": "Detroit Lions",
"Packers": "Green Bay Packers",
"Panthers": "Carolina Panthers",
"Patriots": "New England Patriots",
"Raiders": "Las Vegas Raiders",
"Rams": "Los Angeles Rams",
"Ravens": "Baltimore Ravens",
"Commanders": "Washington Commanders",
"Saints": "New Orleans Saints",
"Seahawks": "Seattle Seahawks",
"Steelers": "Pittsburgh Steelers",
"Texans": "Houston Texans",
"Titans": "Tennessee Titans",
"Vikings": "Minnesota Vikings",
}
TRANSACTION_TYPE_MAPPING = {
"Player": "PlayerMovementChkBx",
"Injury": "InjuriesChkBx",
"Legal": "LegalChkBx",
"Disciplinary": "DisciplinaryChkBx",
"MinorLeague": "MinorLeagueChkBx",
"All": None
}
def validate_date(date_str: str) -> bool:
"""Validate date string format (YYYY-MM-DD)"""
try:
datetime.datetime.strptime(date_str, '%Y-%m-%d')
return True
except ValueError:
return False
def get_request_params(
team: Optional[str] = None,
player: Optional[str] = None,
start_date: str = "",
end_date: str = "",
transaction_type: str = "All"
) -> Dict[str, str]:
"""
Generate request parameters for the ProSportsTransactions search.
Args:
team: Team name (optional)
player: Player name (optional)
        start_date: Start date in YYYY-MM-DD format (empty leaves it unbounded)
        end_date: End date in YYYY-MM-DD format (empty leaves it unbounded)
        transaction_type: Type of transaction to filter

    Returns:
        Dictionary of request parameters
    """
    # Validate dates; the search form treats empty dates as open-ended, so
    # only reject non-empty values that fail to parse
    for date_str in (start_date, end_date):
        if date_str and not validate_date(date_str):
            raise ValueError("Dates must be in YYYY-MM-DD format")
# Base params
params = {
"Player": player or "",
"Team": team or "",
"BeginDate": start_date,
"EndDate": end_date,
"submit": "Search"
}
# Add transaction type checkbox if not "All"
tx_type = TRANSACTION_TYPE_MAPPING.get(transaction_type)
if tx_type:
params[tx_type] = "yes"
return params
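# Illustrative sanity check (the team and dates below are hypothetical values,
# not part of the module): an "Injury" query for one team over a date range
# should produce parameters like
#   get_request_params(team="Philadelphia Eagles", start_date="2023-09-01",
#                      end_date="2024-01-10", transaction_type="Injury")
#   == {"Player": "", "Team": "Philadelphia Eagles", "BeginDate": "2023-09-01",
#       "EndDate": "2024-01-10", "submit": "Search", "InjuriesChkBx": "yes"}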
def fetch_page(params: Dict[str, str], page: int = 1) -> Optional[str]:
"""
Fetch a single page from ProSportsTransactions.
Args:
params: Search parameters
page: Page number
Returns:
HTML content or None if failed
"""
    # Work on a copy so pagination state never leaks back into the caller's dict
    params = dict(params)
    if page > 1:
        # The site paginates 25 rows at a time via a "start" offset
        params["start"] = str((page - 1) * 25)
    try:
        logger.debug(f"Fetching page {page} with params: {params}")
        response = requests.get(BASE_URL, params=params, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"Error fetching page {page}: {e}")
        return None
def has_next_page(html: str) -> bool:
    """Check whether the results page links to a next page of results."""
    soup = BeautifulSoup(html, 'lxml')
    # The site renders pagination as plain "Next" text links; also try a
    # class-based selector in case the markup carries one
    next_link = soup.select_one('a.next') or soup.find(
        'a', string=lambda s: s is not None and s.strip().lower().startswith('next')
    )
    return next_link is not None
def parse_transactions_table(html: str, transaction_type: str) -> pd.DataFrame:
"""
Parse the HTML table of transactions.
Args:
html: HTML content
transaction_type: Type of transaction (for tagging)
Returns:
DataFrame of transaction data
"""
    try:
        # Parse the table using pandas; wrap in StringIO since passing a raw
        # HTML string to read_html is deprecated in recent pandas versions
        tables = pd.read_html(StringIO(html))
        if not tables:
            logger.warning("No tables found in HTML")
            return pd.DataFrame()
        # The results table should be the first one on the page
        df = tables[0]
        # Normalize column names (read_html can yield non-string labels)
        df.columns = [str(col).strip() for col in df.columns]
        # Tag each row with the transaction type used for the query
        df['transaction_type'] = transaction_type
        return df
except Exception as e:
logger.error(f"Error parsing transactions table: {e}")
return pd.DataFrame()
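# Note: for this site the parsed table typically carries the columns "Date",
# "Team", "Acquired", "Relinquished", and "Notes" (plus the "transaction_type"
# tag added above). Treat the exact column set as an assumption about the
# current page markup rather than a guarantee.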
def fetch_all_transactions(
team: Optional[str] = None,
player: Optional[str] = None,
start_date: str = "",
end_date: str = "",
transaction_type: str = "All"
) -> pd.DataFrame:
"""
Fetch all transactions matching the criteria, handling pagination.
Args:
team: Team name (optional)
player: Player name (optional)
        start_date: Start date in YYYY-MM-DD format (empty leaves it unbounded)
        end_date: End date in YYYY-MM-DD format (empty leaves it unbounded)
transaction_type: Type of transaction to filter
Returns:
DataFrame containing all transactions
"""
logger.info(f"Fetching transactions: {start_date} to {end_date}, team={team}, player={player}, type={transaction_type}")
params = get_request_params(team, player, start_date, end_date, transaction_type)
all_transactions = []
page = 1
    while True:
        html = fetch_page(params, page)
        if not html:
            break
        df = parse_transactions_table(html, transaction_type)
        if df.empty:
            break
        all_transactions.append(df)
        if not has_next_page(html):
            break
        page += 1
        time.sleep(1)  # be polite to the server between page requests
# Combine all dataframes
if not all_transactions:
logger.warning("No transactions found")
return pd.DataFrame()
result = pd.concat(all_transactions, ignore_index=True)
logger.info(f"Found {len(result)} transactions")
return result
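# Example (hypothetical query): pull a month of league-wide injury listings
# and count rows per team. The "Team" column name assumes the site's current
# markup, per the note above.
#   df = fetch_all_transactions(start_date="2023-10-01",
#                               end_date="2023-10-31",
#                               transaction_type="Injury")
#   print(df.groupby("Team").size())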
def get_all_nfl_teams() -> List[str]:
"""Return a list of all NFL teams"""
return list(TEAM_MAPPING.values())
def get_available_transaction_types() -> List[str]:
"""Return a list of available transaction types"""
return list(TRANSACTION_TYPE_MAPPING.keys())
def save_to_csv(df: pd.DataFrame, output_path: str) -> str:
"""Save dataframe to CSV file"""
try:
df.to_csv(output_path, index=False)
return output_path
except Exception as e:
logger.error(f"Error saving to CSV: {e}")
raise
def save_to_json(df: pd.DataFrame, output_path: str) -> str:
"""Save dataframe to JSON file"""
try:
result = df.to_json(orient='records')
with open(output_path, 'w') as f:
f.write(result)
return output_path
except Exception as e:
logger.error(f"Error saving to JSON: {e}")
raise
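if __name__ == "__main__":
    # Minimal end-to-end sketch: fetch one player's movement transactions and
    # write them to both output formats. The player name, dates, and output
    # paths are illustrative placeholders, not part of the module's API.
    transactions = fetch_all_transactions(
        player="Tom Brady",
        start_date="2020-01-01",
        end_date="2020-12-31",
        transaction_type="Player",
    )
    if not transactions.empty:
        save_to_csv(transactions, "transactions.csv")
        save_to_json(transactions, "transactions.json")
        logger.info("Saved %d transactions", len(transactions))
    else:
        logger.warning("Query returned no transactions")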