import pandas as pd
import numpy as np
# Note, you will likely have to change these paths. If your data is in the same folder as this project, 
# the paths will likely be fixed for you by deleting ../../Data/schedule_project/ from each string.
schedule = pd.read_csv("schedule.csv")
draft_schedule = pd.read_csv("schedule_24_partial.csv")
locations = pd.read_csv("locations.csv")
game_data = pd.read_csv("team_game_data.csv")

#Data Manipulation : want only OKC Games + sort dates into chronological order
okc = draft_schedule[draft_schedule['team'] == 'OKC'].copy()
okc["gamedate"] = pd.to_datetime(okc["gamedate"])
okc = okc.sort_values("gamedate").reset_index(drop=True)
dates = okc["gamedate"].values 

#find 4 games in 6 nights
left_idxs = np.searchsorted(dates, dates - np.timedelta64(5, "D"), side="left") #look back the 5 days
counts_last_6_nights = np.arange(len(dates)) - left_idxs + 1
mask_4_in_6 = counts_last_6_nights >= 4 #boolean for 4 games in 6 nights
count_4_in_6 = int(mask_4_in_6.sum())
count_4_in_6

26

# gamedate is datetime and data sorted
schedule["gamedate"] = pd.to_datetime(schedule["gamedate"])

all_per82 = [] #store count of 4-in-6 games in a list

for (season, team), group in schedule.groupby(["season", "team"]): #group by season, team ex) OKC, 2020
    #Q1 setup
    dates = group.sort_values("gamedate")["gamedate"].values
    left = np.searchsorted(dates, dates - np.timedelta64(5, "D"), side="left")
    counts = np.arange(len(dates)) - left + 1
    four_in_six = (counts >= 4).sum()
    per82 = four_in_six * 82 / len(dates)
    if 2014 <= season <= 2023:   # keep only 2014–15 through 2023–24
        all_per82.append((team,per82)) #Q3 DF

#code for question 3 (makes my life easier)
df = pd.DataFrame(all_per82, columns=["team", "per82"])

avg_per82 = np.mean([x[1] for x in all_per82])
avg_per82

25.10330883503872

team_avg = df.groupby("team", as_index=False)["per82"].mean()
high = team_avg.loc[team_avg["per82"].idxmax()]
low = team_avg.loc[team_avg["per82"].idxmin()]
high, low

(team           CHA
 per82    28.109188
 Name: 3, dtype: object,
 team           NYK
 per82    22.186111
 Name: 19, dtype: object)

#Formula for eFG: (FGM + 0.5 * 3PM) / FGA. Found through https://www.nba.com/bucks/features/boeder-120917

schedule["gamedate"] = pd.to_datetime(schedule["gamedate"])
game_data["gamedate"] = pd.to_datetime(game_data["gamedate"])

#Data Manipulation for 2023, BKN
opp_off = game_data[(game_data["season"] == 2023) & (game_data["def_team"] == "BKN")].copy() #BKN is the defensive team

# BKN's defensive eFG%
opp_off["efg"] = (opp_off["fgmade"] + 0.5 * opp_off["fg3made"]) / opp_off["fgattempted"]
def_efg = opp_off["efg"].mean()

# opp back-to-back
team_games = schedule.groupby("team")["gamedate"].apply(set).to_dict()

opp_off["is_b2b"] = opp_off.apply(
    lambda row: (row["gamedate"] - pd.Timedelta(days=1)) in team_games[row["off_team"]],
    axis=1
)

def_efg_b2b = opp_off.loc[opp_off["is_b2b"], "efg"].mean()

def_efg, def_efg_b2b

(0.5450563595207142, 0.5363431117395119)

import matplotlib.pyplot as plt


schedule["gamedate"] = pd.to_datetime(schedule["gamedate"])

def four_in_six_count(dates):
    left = np.searchsorted(dates, dates - np.timedelta64(5,"D"), side="left")
    counts = np.arange(len(dates)) - left + 1
    return (counts >= 4).sum()

season_trends = []
for (season, team), g in schedule.groupby(["season","team"]):
    dates = np.sort(g["gamedate"].values)
    season_trends.append({"season": season, "team": team, "four_in_six": four_in_six_count(dates)})

df_trends = pd.DataFrame(season_trends)
season_avg = df_trends.groupby("season")["four_in_six"].mean().reset_index()

plt.figure(figsize=(8,5))
plt.plot(season_avg["season"], season_avg["four_in_six"], marker="o")
plt.title("Average 4-in-6 Games per Team by Season")
plt.xlabel("Season (year = start of season)")
plt.ylabel("Avg 4-in-6 per Team")
plt.grid(True)
plt.show()

# count games per team-season
games_per_season = schedule.groupby(["season","team"]).size().reset_index(name="games")

# average across teams for each season
avg_games = games_per_season.groupby("season")["games"].mean().reset_index()

plt.figure(figsize=(8,5))
plt.plot(avg_games["season"], avg_games["games"], marker="o", linewidth=2)
plt.title("Average Number of Games per Team by Season")
plt.xlabel("Season (start year)")
plt.ylabel("Games per Team")
plt.grid(True)
plt.show()

from matplotlib.lines import Line2D

#HAVERSINE FORMULA (CHATGPT)
def haversine(lat1, lon1, lat2, lon2, miles=True):
    R = 6371.0088 * (0.621371 if miles else 1.0)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    return 2 * R * np.arcsin(np.sqrt(
        np.sin((lat2-lat1)/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin((lon2-lon1)/2)**2
    ))

def plot_team_scatter(team_code, use_miles=True):
#DATA MANIPULATION (chronological order + reset the index)
    df = draft_schedule.query("team == @team_code").copy()
    df["gamedate"] = pd.to_datetime(df["gamedate"])
    df = df.sort_values("gamedate").reset_index(drop=True)

#LAT AND LON
    loc = locations.set_index("team")[["latitude","longitude"]]
    home_lat, home_lon = loc.loc[team_code]
    #set lat, lon for the game based on home and away
    df["venue_lat"] = np.where(df["home"]==1, home_lat, df["opponent"].map(loc["latitude"]))
    df["venue_lon"] = np.where(df["home"]==1, home_lon, df["opponent"].map(loc["longitude"]))
    #compute distance and shift one down + fill the first game as 0 distance and rest day as 1 for the first game (asusmption)
    df["travel"] = haversine(df["venue_lat"].shift(), df["venue_lon"].shift(),
                             df["venue_lat"], df["venue_lon"], miles=use_miles).fillna(0)
    df["rest_days"] = df["gamedate"].diff().dt.days.fillna(1)

#4-IN-6 DETECTION
    #Pulled from earlier Q's
    dates = df["gamedate"].values.astype("datetime64[D]")
    left = np.searchsorted(dates, dates - np.timedelta64(5, "D"), side="left")
    df["is_4in6"] = (np.arange(len(dates)) - left + 1) >= 4


#VISUAL ENCODINGS (CHATGPT)
    sizes = (2 - np.clip(df["rest_days"],0,2))*120 + 30
    colors = np.where(df["home"]==1, "tab:blue", "tab:orange")

    fig, ax = plt.subplots(figsize=(12,5))
    ax.scatter(df["gamedate"], df["travel"], s=sizes, c=colors, alpha=0.7, edgecolor="k", lw=0.3)
    if df["is_4in6"].any():
        ax.scatter(df.loc[df["is_4in6"],"gamedate"], df.loc[df["is_4in6"],"travel"],
                   s=sizes[df["is_4in6"]]*1.1, marker="*", facecolors="none",
                   edgecolors="crimson", lw=1.2, label="4-in-6")
#LEGEND
    legend = [
    Line2D([0],[0], marker='o', color='w', label='Home',
           markerfacecolor='tab:blue', markeredgecolor='k', markersize=8, linestyle='None'),
    Line2D([0],[0], marker='o', color='w', label='Away',
           markerfacecolor='tab:orange', markeredgecolor='k', markersize=8, linestyle='None'),
    Line2D([0],[0], marker='*', color='crimson', label='4-in-6',
           markerfacecolor='none', markeredgecolor='crimson', markersize=12, linestyle='None'),
            ]
    
    ax.legend(handles=legend, loc="upper left", framealpha=0.95, title="Encoding")
    ax.text(1.0, 1.02, "Point size = shorter rest", transform=ax.transAxes, ha="right", va="bottom", fontsize=9, color="#444")
    ax.set(title=f"{team_code} Schedule — Travel vs Date", xlabel="Date",
           ylabel=f"Travel from previous game ({'miles' if use_miles else 'km'})")
    fig.autofmt_xdate(); plt.tight_layout(); plt.show()

#PLOT
plot_team_scatter("OKC")
plot_team_scatter("DEN")

print(game_data.columns.tolist())

['season', 'gametype', 'nbagameid', 'gamedate', 'offensivenbateamid', 'off_team_name', 'off_team', 'off_home', 'off_win', 'defensivenbateamid', 'def_team_name', 'def_team', 'def_home', 'def_win', 'fg2made', 'fg2missed', 'fg2attempted', 'fg3made', 'fg3missed', 'fg3attempted', 'fgmade', 'fgmissed', 'fgattempted', 'ftmade', 'ftmissed', 'ftattempted', 'reboffensive', 'rebdefensive', 'reboundchance', 'assists', 'stealsagainst', 'turnovers', 'blocksagainst', 'defensivefouls', 'offensivefouls', 'shootingfoulsdrawn', 'possessions', 'points', 'shotattempts', 'andones', 'shotattemptpoints']

#STEP 1: CREATE AGG
def _haversine(lat1, lon1, lat2, lon2, miles=True):
    R_km = 6371.0088
    R = R_km * (0.621371 if miles else 1.0)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

def _four_in_six(date_series: pd.Series) -> pd.Series:
    d = pd.to_datetime(date_series, errors="coerce").values.astype("datetime64[D]")
    left = np.searchsorted(d, d - np.timedelta64(5, "D"), side="left")
    cnt = np.arange(len(d)) - left + 1
    return pd.Series(cnt >= 4, index=date_series.index)

# ADD SEASON
sched = schedule.copy()
sched["gamedate"] = pd.to_datetime(sched["gamedate"], errors="coerce")
sched["season"] = (sched["gamedate"] - pd.Timedelta(days=185)).dt.year
sched = sched.sort_values(["team", "gamedate"]).reset_index(drop=True)

# COMPUTE VENUE
loc = locations.set_index("team")[["latitude","longitude"]]
home_lat = sched["team"].map(loc["latitude"])
home_lon = sched["team"].map(loc["longitude"])
opp_lat  = sched["opponent"].map(loc["latitude"])
opp_lon  = sched["opponent"].map(loc["longitude"])

sched["venue_lat"] = np.where(sched["home"].eq(1), home_lat, opp_lat)
sched["venue_lon"] = np.where(sched["home"].eq(1), home_lon, opp_lon)

# TRAVEL MILES
sched["prev_lat"] = sched.groupby("team")["venue_lat"].shift(1)
sched["prev_lon"] = sched.groupby("team")["venue_lon"].shift(1)
sched["travel_miles"] = np.where(
    sched["prev_lat"].notna(),
    _haversine(sched["prev_lat"], sched["prev_lon"], sched["venue_lat"], sched["venue_lon"], miles=True),
    0.0
)

# B2B and REST DAYS
sched["rest_days"] = sched.groupby("team")["gamedate"].diff().dt.days
sched["is_b2b"] = sched["rest_days"].fillna(np.inf).eq(0)


sched["is_4in6"] = sched.groupby("team", group_keys=False)["gamedate"].apply(_four_in_six)

# BUILD AGG
agg = (sched.groupby(["team","season"])
       .agg(
           travel_miles=("travel_miles","sum"),
           b2b_games=("is_b2b","sum"),
           four_in_six=("is_4in6","sum"),
           games=("opponent","size")
       ).reset_index())

# STEP 2: CALCULATE ON-COURT STRENGTH OF SCHEDULED OPPONENTS USING SRS = MOV + SOS

# SETUP 
gd = game_data.copy()
TEAM_COL, OPP_COL, DATE_COL, GAME_ID, PTS_COL = "off_team", "def_team", "gamedate", "nbagameid", "points"
gd[DATE_COL] = pd.to_datetime(gd[DATE_COL], errors="coerce")

# BUILD OPP VIEW + TEAMS
opp_view = gd[[GAME_ID, TEAM_COL, "season", DATE_COL, PTS_COL]].rename(
    columns={TEAM_COL: OPP_COL, PTS_COL: "opp_points"}
)

# MERGE TEAMS
gd_pair = gd.merge(
    opp_view,
    on=[OPP_COL, "season", DATE_COL, GAME_ID],
    how="inner",
    validate="m:1"
)

# CALC MOV + WINS
gd_pair["mov"] = gd_pair[PTS_COL] - gd_pair["opp_points"]
gd_pair["win"] = (gd_pair["mov"] > 0).astype(int)

# TEAM-SEASON LEVEL STATS 
# Average MOV
mov_team = (gd_pair.groupby([TEAM_COL, "season"])["mov"]
            .mean().rename("avg_mov").reset_index())

# AVG TEAM WINS
wins_team = (gd_pair.groupby([TEAM_COL, "season"])["win"]
             .sum().rename("wins").reset_index())

# SOS proxy: average opponent MOV faced
gd_with_opp_mov = gd_pair.merge(
    mov_team.rename(columns={TEAM_COL: OPP_COL, "avg_mov": "opp_avg_mov"}),
    on=[OPP_COL, "season"], how="left"
)
sos_team = (gd_with_opp_mov.groupby([TEAM_COL, "season"])["opp_avg_mov"]
            .mean().rename("sos").reset_index())

# SRS = MOV + SOS
srs = mov_team.merge(sos_team, on=[TEAM_COL, "season"], how="left")
srs["srs"] = srs["avg_mov"] + srs["sos"]

# Combine MOV, SRS, and wins
srs = srs.merge(wins_team, on=[TEAM_COL, "season"], how="left")

print("\nSRS preview (with wins):\n", srs.head())

# MERGE INTO AGG
agg = agg.merge(
    srs[[TEAM_COL, "season", "srs", "wins"]].rename(columns={TEAM_COL: "team"}),
    on=["team", "season"],
    how="left"
)
if "srs_x" in agg.columns and "srs_y" in agg.columns:
    agg["srs"] = agg["srs_x"].combine_first(agg["srs_y"])
    agg = agg.drop(columns=["srs_x", "srs_y"])

SRS preview (with wins):
   off_team  season   avg_mov       sos       srs  wins
0      ATL    2014  5.426829 -0.585068  4.841761    60
1      ATL    2015  3.609756 -0.083730  3.526026    48
2      ATL    2016 -0.853659 -0.306811 -1.160470    43
3      ATL    2017 -5.487805  0.210738 -5.277067    24
4      ATL    2018 -6.048780  0.056663 -5.992118    29

#STEP 3: OLS TIME

features = ["srs", "travel_miles", "b2b_games", "four_in_six"]
model_df = agg.dropna(subset=["wins"] + features).copy()

Y = model_df["wins"].values.astype(float)
X = model_df[features].values
X = np.c_[np.ones(len(X)), X]   # add intercept


# 2. OLS Estimate 
B = np.linalg.pinv(X.T @ X) @ (X.T @ Y)

predictors = ["Intercept"] + features
print("OLS Regression Results \n")
for i, b in enumerate(B):
    print(f"{predictors[i]:>12}: {b: .4f}")

# Predictions
Y_hat = X @ B

# good of fit
mse = np.mean((Y - Y_hat)**2)
ss_total = np.sum((Y - Y.mean())**2)
ss_resid = np.sum((Y - Y_hat)**2)
R2 = 1 - ss_resid/ss_total
adj_R2 = R2 - (1 - R2) * (X.shape[1]-1)/(len(Y) - X.shape[1]-1)

print(f"\nR²: {R2:.3f}")
print(f"Adjusted R²: {adj_R2:.3f}")
print(f"MSE: {mse:.3f}")


# 3. Schedule effect: Actual vs Neutralized schedule
X_actual = model_df[features].values
X_neutral = X_actual.copy()

# Neutralize schedule factors (set to season avg)
for j, col in enumerate(features[1:], start=1):  # skip srs, keep only travel/b2b/4-in-6
    season_means = model_df.groupby("season")[col].transform("mean").values
    X_neutral[:, j] = season_means

X_actual = np.c_[np.ones(len(X_actual)), X_actual]
X_neutral = np.c_[np.ones(len(X_neutral)), X_neutral]

yhat_actual = X_actual @ B
yhat_neutral = X_neutral @ B
model_df["schedule_wins_delta"] = yhat_actual - yhat_neutral

totals = model_df.groupby("team")["schedule_wins_delta"].sum().sort_values()

print("\nMost hurt by schedule (wins lost):")
print(totals.head(5).round(2))

print("\nMost helped by schedule (wins gained):")
print(totals.tail(5)[::-1].round(2))

OLS Regression Results 

   Intercept:  35.7228
         srs:  2.4868
travel_miles:  0.0001
   b2b_games:  0.0000
 four_in_six:  0.0073

R²: 0.901
Adjusted R²: 0.900
MSE: 14.286

Most hurt by schedule (wins lost):
team
CLE   -8.44
IND   -6.29
DET   -6.17
TOR   -5.82
WAS   -5.29
Name: schedule_wins_delta, dtype: float64

Most helped by schedule (wins gained):
team
POR    8.98
GSW    5.89
MIN    4.60
SAC    4.12
MIA    4.08
Name: schedule_wins_delta, dtype: float64

Name: Rachel Yun¶

Date: 08/21/2025¶

Introduction¶

Answers¶

Part 1¶

Part 2¶

Part 3¶

Setup and Data¶

Part 1 -- Schedule Analysis¶

Question 1¶

Question 2¶

Question 3¶

Question 4¶

Question 5¶

Part 2 -- Trends and Visualizations¶

Question 6¶

Question 7¶

Question 8¶

Part 3 -- Modeling¶

Question 9¶