MLB Weather Project

I’m trying to build a project where I pull game and weather data from APIs to determine the likelihood that a game would be canceled, based on historical data. If I pull a few dates it works; if I pull two years of games, all the results come up as N/A. Here is the code.

import csv
import os
import time
import urllib.parse
from datetime import datetime, timedelta

import pandas as pd
import requests
import statsapi


def fetch_historical_mlb_games(start_date, end_date, delay=1.0):
    """
    Fetch historical MLB game schedules and statuses between two dates.

    :param start_date: str, start date in 'YYYY-MM-DD' format
    :param end_date: str, end date in 'YYYY-MM-DD' format (inclusive)
    :param delay: float, seconds to sleep between daily requests so the
        statsapi backend is not hammered (default 1.0)
    :return: list of dicts, one per game, with date/venue/status/team fields
    """
    games = []
    current = datetime.strptime(start_date, "%Y-%m-%d")
    last = datetime.strptime(end_date, "%Y-%m-%d")

    while current <= last:
        date_str = current.strftime("%Y-%m-%d")
        print(f"Fetching games for {date_str}...")

        try:
            # Only the network call belongs in the try; dict-building
            # below cannot raise and should not be swallowed if it did.
            schedule = statsapi.schedule(date=date_str)
        except Exception as e:
            # Log and keep going on a bad day rather than losing the run.
            print(f"Error fetching games for {date_str}: {e}")
            schedule = []

        for game in schedule:
            games.append({
                "date": game.get("game_date", "Unknown Date"),
                "venue": game.get("venue_name", "Unknown Venue"),
                "status": game.get("status", "Unknown Status"),
                "home_team": game.get("home_name", "Unknown Team"),
                "away_team": game.get("away_name", "Unknown Team"),
            })

        # Throttle to stay under the API's rate limits.
        time.sleep(delay)
        current += timedelta(days=1)

    return games


def fetch_historical_weather(venue, date, api_key):
    """
    Fetch historical weather for a venue and date via Visual Crossing.

    :param venue: str, venue name or city; percent-encoded before being
        placed in the URL path (names like "Fenway Park" contain spaces)
    :param date: str, date in 'YYYY-MM-DD' format
    :param api_key: str, Visual Crossing API key
    :return: dict with weather details, or a dict with an "error" key
    """
    base_url = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline"
    try:
        # Encode the venue so spaces/punctuation cannot corrupt the URL
        # path, and let requests build/encode the query string itself.
        request_url = f"{base_url}/{urllib.parse.quote(venue)}/{date}"
        response = requests.get(
            request_url,
            params={"unitGroup": "metric", "key": api_key, "include": "days"},
            timeout=10,  # avoid hanging forever on a stalled connection
        )
        response.raise_for_status()
        weather_data = response.json()

        days = weather_data.get("days") or []
        if days:
            day_data = days[0]
            return {
                "temperature": day_data.get("temp", "N/A"),
                "humidity": day_data.get("humidity", "N/A"),
                "precipitation": day_data.get("precip", "N/A"),
                "wind_speed": day_data.get("windspeed", "N/A"),
                "description": day_data.get("conditions", "N/A"),
            }
        return {"error": "No weather data found for this date."}
    except Exception as e:
        # Callers treat any dict containing "error" as "weather unavailable".
        return {"error": str(e)}


def save_historical_data_to_csv(historical_data, output_file="historical_games_with_weather.csv"):
    """
    Save historical game and weather data to a CSV file.

    Each record may carry a nested "weather" dict (as produced by
    fetch_historical_weather); missing fields are written as "N/A".

    :param historical_data: list of dicts containing game and weather details
    :param output_file: str, name of the output CSV file
    """
    fieldnames = [
        "date", "venue", "home_team", "away_team", "status",
        "temperature", "humidity", "precipitation", "wind_speed", "description",
    ]
    try:
        # Explicit UTF-8: venue/team names may contain non-ASCII characters,
        # and the platform default encoding is not guaranteed to handle them.
        with open(output_file, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()

            for game in historical_data:
                weather = game.get("weather", {})
                writer.writerow({
                    "date": game.get("date", "N/A"),
                    "venue": game.get("venue", "N/A"),
                    "home_team": game.get("home_team", "N/A"),
                    "away_team": game.get("away_team", "N/A"),
                    "status": game.get("status", "N/A"),
                    "temperature": weather.get("temperature", "N/A"),
                    "humidity": weather.get("humidity", "N/A"),
                    "precipitation": weather.get("precipitation", "N/A"),
                    "wind_speed": weather.get("wind_speed", "N/A"),
                    "description": weather.get("description", "N/A"),
                })

        print(f"Historical data saved to {output_file}")
    except Exception as e:
        print(f"Error saving to CSV: {e}")


def calculate_cancellation_likelihood(historical_data_file):
    """
    Calculate the likelihood of game cancellations based on historical data.

    :param historical_data_file: str, path to the CSV file of historical data
    :return: dict mapping weather description -> cancellation likelihood (%)
    """
    data = pd.read_csv(historical_data_file)

    # Flag postponed games (1/0). Cast to str first: missing statuses are
    # read back by pandas as NaN floats, and `"Postponed" in nan` would
    # raise TypeError in the original lambda-based substring check.
    data["is_canceled"] = (
        data["status"].astype(str).str.contains("Postponed").astype(int)
    )

    # Per weather description: share of games that were postponed.
    grouped = data.groupby("description").agg(
        total_games=("is_canceled", "count"),
        canceled_games=("is_canceled", "sum"),
    )
    grouped["cancellation_likelihood"] = (
        grouped["canceled_games"] / grouped["total_games"] * 100
    )

    print("Cancellation likelihood by weather:")
    print(grouped)

    return grouped["cancellation_likelihood"].to_dict()


def apply_cancellation_likelihood(data_file, likelihood_dict, output_file):
    """
    Apply cancellation likelihood to a dataset based on weather descriptions.

    :param data_file: str, path to the CSV file containing games data
    :param likelihood_dict: dict, mapping weather descriptions to cancellation likelihood
    :param output_file: str, path to save the enhanced dataset
    """
    games = pd.read_csv(data_file)

    def _likelihood_for(description):
        # Weather descriptions never seen in the training data get 0.
        return likelihood_dict.get(description, 0)

    games["cancellation_likelihood"] = games["description"].map(_likelihood_for)

    games.to_csv(output_file, index=False)
    print(f"Enhanced dataset saved to {output_file}")


def main():
    """
    Entry point: fetch two seasons of games, enrich each game with
    historical weather, then derive cancellation likelihoods from 2023
    data and apply them to 2024 data.

    The weather API key is read from the VISUAL_CROSSING_API_KEY
    environment variable. Without it, no weather is attached and every
    weather column in the CSVs is written as "N/A" — which also makes
    every description group in the likelihood step collapse to "N/A".
    """
    print("Welcome to the Game Cancellation Prediction Tool!")

    api_key = os.environ.get("VISUAL_CROSSING_API_KEY")
    if not api_key:
        print("Warning: VISUAL_CROSSING_API_KEY is not set; "
              "weather columns will all be 'N/A'.")

    def _attach_weather(games):
        # The original pipeline never called fetch_historical_weather, so
        # no game ever gained a "weather" key and the CSV writer fell back
        # to "N/A" for every weather field. Attach weather here.
        if not api_key:
            return games
        for game in games:
            # game["date"] comes from statsapi's game_date — assumed to be
            # 'YYYY-MM-DD' as the weather API expects; verify if it drifts.
            game["weather"] = fetch_historical_weather(
                game["venue"], game["date"], api_key)
            time.sleep(1)  # respect the weather API's rate limit
        return games

    # Step 1: Fetch historical data for 2023
    print("Fetching 2023 data...")
    games_2023 = _attach_weather(
        fetch_historical_mlb_games("2023-04-01", "2023-10-01"))
    save_historical_data_to_csv(games_2023, "historical_games_with_weather_2023.csv")

    # Step 2: Fetch historical data for 2024
    print("Fetching 2024 data...")
    games_2024 = _attach_weather(
        fetch_historical_mlb_games("2024-04-01", "2024-10-01"))
    save_historical_data_to_csv(games_2024, "historical_games_with_weather_2024.csv")

    # Step 3: Calculate cancellation likelihood from 2023 data
    likelihood_dict = calculate_cancellation_likelihood("historical_games_with_weather_2023.csv")

    # Step 4: Apply cancellation likelihood to 2024 data
    apply_cancellation_likelihood(
        "historical_games_with_weather_2024.csv",
        likelihood_dict,
        "enhanced_games_with_weather_2024.csv",
    )


# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

I suggest you add some logging.

Record the query and the response.

Try for 1 day and 2 days. Are the responses OK?

1 Like

Hello,

First, please specify your interpretation of “a few dates” (a number, please).

Second, specify how many dates two years of games covers (again, a quantity). Have you determined the upper limit just before it starts failing?

Assuming your script is actually working, from wherever you’re pulling the data from, does it provide two years of games? How about for the weather data? Have you independently confirmed that the data being pulled is valid from both sources - for the duration being entered?

Have you zeroed in on which part of the script is failing (i.e., which function)? If so, have you tried backtracking via either print statements or being in debug mode to see where in the script the values begin deviating from the expected values?

2 Likes

Curious …

Assuming you need more time to fetch the data, can you increase the timeout to a larger value — for test purposes only, set it to 100. Does that make a difference?

1 Like