In this post I'm going to explore Dean Oliver's Four Factors, compute the weight of each factor based on linear regression and see how those factors are influenced by home court advantage.

For a quick overview of the four factors read this: https://www.basketball-reference.com/about/factors.html

Or, even better, read Dean Oliver's book Basketball on Paper: http://www.basketballonpaper.com/

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import NBAapi as nba
from sklearn import linear_model,preprocessing
import matplotlib.gridspec as gridspec
#from scipy.stats import ttest_ind

plt.style.use('fivethirtyeight') # just recently found this cool style that makes the plots look like fivethirtyeight

%matplotlib inline
In [2]:
def seasons_string(start,end):
    '''
    Build the NBA season labels for every starting year from start to end
    (both inclusive).

    A label is the starting year, a dash, and the last two digits of the
    following year, e.g. 1999 -> '1999-00'. An empty list is returned when
    end < start.
    '''
    return ['{}-{}'.format(str(first_year), str(first_year + 1)[-2:])
            for first_year in np.arange(start, end + 1)]

Getting game log data for the last 30 years:

In [3]:
# Seasons to analyse: 1987-88 through 2016-17
seasons = seasons_string(1987, 2016)

per_season = []
for season in seasons:
    # Fetch the season's game log and order it by game
    log = nba.league.gamelog(season=season)
    log = log.sort_values('GAME_ID')

    # Rows whose MATCHUP contains ' @ ' are treated as the away team's rows;
    # everything else is treated as the home team's rows
    is_away = log['MATCHUP'].str.contains(' @ ')
    home_rows = log.loc[~is_away, :].copy()
    away_rows = log.loc[is_away, :].copy()

    # Join the two sides on GAME_ID: home columns keep their names,
    # away columns get the _OPP suffix
    merged = home_rows.merge(away_rows, on='GAME_ID', suffixes=('', '_OPP'))
    per_season.append(merged)

# Stack all seasons into a single dataframe with a fresh 0..n-1 index
game_logs = pd.concat(per_season, ignore_index=True)

How often does the home team win?

In [4]:
# Count games per home-team result: after the home/away merge the
# un-suffixed WL column belongs to the home team ('W' or 'L').
game_logs.groupby('WL').size()
Out[4]:
WL
L    13558
W    21038
dtype: int64

Home team wins 60.8% of the games (21038/(21038+13558))

Calculating the four factors + more:

In [5]:
def _efg(fgm, fg3m, fga):
    # Effective FG%: (FGM + 0.5*FG3M) / FGA
    return 1.0*(fgm + 0.5*fg3m)/fga


def _oreb_pct(oreb, dreb_other):
    # Offensive rebound %: own OREB / (own OREB + other team's DREB)
    return 1.0*(oreb)/(oreb + dreb_other)


def _tov_pct(tov, fga, fta):
    # Turnover %: TOV / (TOV + FGA + 0.44*FTA)
    return 1.0*(tov)/(tov + fga + 0.44*fta)


def _ft_factor(ftm, fga):
    # Free throw factor: FTM / FGA
    return 1.0*ftm/fga


# Un-suffixed columns are the home team's, _OPP columns are the away team's.

# Shooting
EFG_home = _efg(game_logs['FGM'], game_logs['FG3M'], game_logs['FGA'])
EFG_away = _efg(game_logs['FGM_OPP'], game_logs['FG3M_OPP'], game_logs['FGA_OPP'])

# Offensive rebounding
OREB_PCT_home = _oreb_pct(game_logs['OREB'], game_logs['DREB_OPP'])
OREB_PCT_away = _oreb_pct(game_logs['OREB_OPP'], game_logs['DREB'])

# Turnovers
TOV_PCT_home = _tov_pct(game_logs['TOV'], game_logs['FGA'], game_logs['FTA'])
TOV_PCT_away = _tov_pct(game_logs['TOV_OPP'], game_logs['FGA_OPP'], game_logs['FTA_OPP'])

# Free throws: raw attempts, percentage and the FTM/FGA factor
FTA_home = game_logs['FTA']
FTA_away = game_logs['FTA_OPP']
FT_PCT_home = 1.0*(game_logs['FT_PCT'])
FT_PCT_away = 1.0*(game_logs['FT_PCT_OPP'])
FT_factor_home = _ft_factor(game_logs['FTM'], game_logs['FGA'])
FT_factor_away = _ft_factor(game_logs['FTM_OPP'], game_logs['FGA_OPP'])

# Final scores
PTS_home = game_logs['PTS']
PTS_away = game_logs['PTS_OPP']

# Win flags for each side
homeW = game_logs['WL'].str.contains('W')
awayW = game_logs['WL_OPP'].str.contains('W')

# Row mask used by the later cells: keeps every game whose home result
# compares equal to either False or True
idx = (homeW==False) | (homeW==True)

Plotting the distributions:

  • See legend for the average
In [6]:
plt.style.use('fivethirtyeight')

# One entry per figure:
# (home series, away series, bins, range, home legend template,
#  away legend template, title)
distribution_specs = [
    (EFG_home, EFG_away, 51, (0.2,0.8),
     'EFG - Home {:.3f}', 'EFG - Away {:.3f}',
     'Effective Field Goal Distribution'),
    (OREB_PCT_home, OREB_PCT_away, 51, (0.0,0.7),
     'OREB_PCT - Home {:.3f}', 'OREB_PCT - Away {:.3f}',
     'Offensive Rebounds % Distribution'),
    (TOV_PCT_home, TOV_PCT_away, 51, (0.0,0.3),
     'TOV_PCT - Home {:.3f}', 'TOV_PCT - Away {:.3f}',
     'Turnovers % Distribution'),
    (FT_PCT_home, FT_PCT_away, 31, (0.3,1.0),
     'FT_PCT - Home {:.3f}', 'FT_PCT - Away {:.3f}',
     'Free Throw % Distribution'),
    (FTA_home, FTA_away, 61, (0.0,60.0),
     'FTA - Home {:.2f}', 'FTA - Away {:.2f}',
     'Free Throws Attempts Distribution'),
    (FT_factor_home, FT_factor_away, 36, (0.0,1.0),
     'FT_factor - Home {:.3f}', 'FT_factor - Away {:.3f}',
     'Free Throw Factor Distribution'),
    (PTS_home, PTS_away, 101, (50.0,150.0),
     'PTS - Home {:.3f}', 'PTS - Away {:.3f}',
     'Points Distribution'),
]

for home, away, n_bins, span, home_label, away_label, title in distribution_specs:
    plt.figure(figsize=(9,6))
    # Overlay the normalised home and away histograms on the same axes.
    # NOTE(review): `normed` is deprecated in newer matplotlib releases in
    # favour of `density` - switch when upgrading matplotlib.
    plt.hist(home[idx], bins=n_bins, range=span, alpha=0.5, normed=True)
    plt.hist(away[idx], bins=n_bins, range=span, alpha=0.5, normed=True)
    # The legend doubles as a readout of each side's mean
    plt.legend([home_label.format(np.mean(home[idx])),
                away_label.format(np.mean(away[idx]))])
    plt.title(title)
Out[6]:

What can we learn from these distributions? The home team does better at almost everything. Could these differences be a fluke? I ran a t-test, and because we are looking at so many games, even a very small difference between the home and away teams is statistically significant rather than a coincidence.

Let's summarize the differences we see for the home team:

  1. EFG is about 1.3% higher.
  2. The OREB % is about 1.5% higher.
  3. TOV % is 0.5% lower.
  4. FT factor is 1.1% higher. Since the Free Throw % is almost identical, we can attribute this change to getting to the line more often (1.25 times more per game).
  5. Home team scores 3.5 points more than the away team.

So basically the home team does better at each of the four factors, which leads to an average margin of 3.5 points!

Using regression to find the weight of each factor:

First decision I made is to compute the difference between the four factors per game instead of having 8 factors (4 for each team). This reduces the number of features to 4 and at least to me it is more intuitive.

For the logistic regression the target (response variable) is either win (1) or lose (0). For the linear regression the target is the final score point difference.

In [7]:
# Home-team win flag: True when the home WL string contains 'W'
homeW = game_logs['WL'].str.contains('W')

# Regression target for the linear model: home score minus away score
PTSd = PTS_home[idx] - PTS_away[idx]

# Per-game factor differentials, always home minus away
EFGd = EFG_home[idx] - EFG_away[idx]
OREBd = OREB_PCT_home[idx] - OREB_PCT_away[idx]
TOVd = TOV_PCT_home[idx] - TOV_PCT_away[idx]
FTd = FT_factor_home[idx] - FT_factor_away[idx]

# Feature matrix with one column per factor, in the order EFG, OREB, TOV, FT
X = np.column_stack((EFGd, OREBd, TOVd, FTd))

# Classification target for the logistic model: 1 = home win, 0 = home loss
Y = np.where(homeW[idx], 1, 0)

# Put every factor on a common scale so the fitted coefficients can be
# compared with each other
Xscaled = preprocessing.scale(X)

Another important decision when computing the weights is whether we should scale the different factors so that their standard deviations are the same. For example, we can see that the EFG has a much wider distribution than any other factor, so its weight would be under-emphasized if we did not scale the factors. I decided to scale each feature.

I'm going to use logistic regression to see how many games can be predicted correctly with it. To find the coefficients I'm going to use linear regression:

In [8]:
# Win/lose classifier on the scaled factor differentials
logreg = linear_model.LogisticRegression(C=1.0).fit(Xscaled, Y)

# Point-differential regression on the same features; its coefficients are
# the ones used as factor weights below
linreg = linear_model.LinearRegression().fit(Xscaled, PTSd)

How many did we get right?

In [9]:
# Both scores are evaluated on the same data the models were fitted on.
# The parenthesised single-argument form prints the same text whether `print`
# is the Python 2 statement or the Python 3 function.
print('Using Logistic Regression we can predict at: {} accuracy'.format(logreg.score(Xscaled,Y)))
print('Using Linear Regression the R^2 factor is: {}'.format(linreg.score(Xscaled,PTSd)))
Using Logistic Regression we can predict at: 0.957480633599 accuracy
Using Linear Regression the R^2 factor is: 0.935771070796

The four factors are very good predictors of winning as can be seen by the Logistic regression.

Let's take a look at the coefficients:

In [10]:
# Column order of X / Xscaled
features = ['EFG','OREB','TOV','FT']
# Pair each factor name with its linear-regression coefficient.
# list(...) keeps the displayed result a list on Python 3, where zip is lazy;
# the former np.transpose call was a no-op on the 1-D ravel() output.
list(zip(features,linreg.coef_.ravel()))
Out[10]:
[('EFG', 11.971324398404628),
 ('OREB', 4.5787955438162742),
 ('TOV', -5.9907585214746533),
 ('FT', 3.1606929546319997)]

Weighted coefficients (to give 100%):

In [11]:
print('Linear Regression Weights:')
# Express each coefficient as a signed percentage of the summed absolute
# coefficients, so the magnitudes add up to 100
coefs = linreg.coef_.ravel()
weights = 100.0*coefs/np.sum(np.abs(coefs))
for f,n in zip(features,weights):
    # Printing an explicit tuple gives the same "(name, value)" output under
    # both the Python 2 print statement and the Python 3 print function
    print((f,n))
Linear Regression Weights:
('EFG', 46.578180779514454)
('OREB', 17.81523576628928)
('TOV', -23.308919225081691)
('FT', 12.297664229114561)

Dean Oliver's work:

  1. Shooting (40%)
  2. Turnovers (25%)
  3. Rebounding (20%)
  4. Free Throws (15%)

My work:

  1. Shooting (47%)
  2. Turnovers (23%)
  3. Rebounding (18%)
  4. Free Throws (12%)

So overall the weights are pretty close to Dean Oliver's work.

Let's try to visualize the two most important parameters:

In [12]:
plt.style.use('fivethirtyeight')
plt.figure(figsize=(13.3,10))

# Masks for games the home team won / lost
home_won = homeW[idx]
home_lost = home_won==False

# EFG differential on x, TOV % differential on y, one series per outcome
plt.scatter(EFGd[home_won],TOVd[home_won],alpha=0.05,label='Win')
plt.scatter(EFGd[home_lost],TOVd[home_lost],alpha=0.05,marker='s',label='Lose')

# The points are nearly transparent, so make the legend markers opaque
leg = plt.legend()
for handle in leg.legendHandles:
    handle.set_alpha(1)

plt.title('Can We Predict Outcome By EFG and TOV %?')
plt.xlabel('EFG % Differential')
plt.ylabel('TOV % Differential')
plt.axis('equal');
plt.xlim([-0.25,0.25]);

We can see that there is a fairly good separation between the wins and losses if we explore just the EFG and TOV % differential.

In [13]:
def _binned_win_rate(values, wins, bins):
    '''
    Return the fraction of wins among the games whose value falls in each
    half-open bin [bins[i], bins[i+1]); NaN for bins that contain no games.
    '''
    rates = np.full(len(bins)-1, np.nan)
    for i, (low, high) in enumerate(zip(bins[:-1], bins[1:])):
        in_bin = (values >= low) & (values < high)
        n_games = np.sum(in_bin)
        if n_games > 0:
            rates[i] = 1.0*np.sum(wins[in_bin])/n_games
    return rates


def plot_winning_chances(ddata,W1,W2,bin_numbers=21,lim=(-0.16,0.16)):
    '''
    Plot the win probability as a function of a binned differential.

    ddata       : per-game differential, seen from the first team's side
    W1          : boolean wins of the first team, aligned with ddata
    W2          : boolean wins of the second team, aligned with ddata; its
                  differential is taken to be -ddata
    bin_numbers : number of bin edges, i.e. bin_numbers-1 bins
    lim         : (low, high) range covered by the bin edges

    Draws three lines on the current matplotlib axes: the first team's win
    rate, the second team's win rate, and the dashed average of the two.
    Returns None.
    '''
    bins = np.linspace(lim[0],lim[1],bin_numbers)
    centers = bins[:-1]+(bins[1]-bins[0])/2

    # Each side is binned and guarded by its own game count. Previously the
    # second team's rate was only computed when the first team's bin was
    # non-empty, and was divided by a count that could itself be zero.
    WP = _binned_win_rate(ddata, W1, bins)
    WP2 = _binned_win_rate(-1.0*ddata, W2, bins)

    plt.plot(centers,WP,centers,WP2)
    # The average is NaN (a gap in the line) wherever either side has no games
    plt.plot(centers,0.5*(WP+WP2),'k--',alpha=0.5,linewidth=2)

What are the chances of winning the game based on each factor:

In [14]:
plt.style.use('fivethirtyeight')

# Win flags for each side, restricted to the analysed games
hW = game_logs['WL'].str.contains('W')[idx]
aW = game_logs['WL_OPP'].str.contains('W')[idx]

# One entry per figure:
# (differential, bin_numbers, binning range, x label, title, x-axis limits)
win_chance_specs = [
    (EFGd, 61, (-0.3,0.3),
     'Effective Field Goal Differential',
     'Team Chances of Winning Game - EFG',
     [-0.20,0.20]),
    (TOVd, 21, (-0.1,0.1),
     'Turnover % Differential',
     'Team Chances of Winning Game - TOV %',
     [-0.1,0.1]),
    (OREBd, 21, (-0.16,0.16),
     'OREB % Differential',
     'Team Chances of Winning Game - OREB %',
     [-0.16,0.16]),
    (FTd, 21, (-0.25,0.25),
     'FT Factor % Differential',
     'Team Chances of Winning Game - FT Factor %',
     [-0.25,0.25]),
]

for diff, n_edges, bin_lim, x_label, title, x_limits in win_chance_specs:
    plt.figure(figsize=(9.3,7))
    plot_winning_chances(diff,hW,aW,bin_numbers=n_edges,lim=bin_lim)
    plt.ylabel('Probability of Winning Game')
    plt.xlabel(x_label)
    plt.title(title)
    plt.legend(['Home Team','Away Team','Average'],fontsize=16.0)
    # Same tick grid on every figure; xlim then crops to the range of interest
    plt.xticks(np.arange(-0.30,0.35,0.05))
    plt.yticks(np.arange(0.0,1.1,0.1))
    plt.xlim(x_limits)