The goal of the project was to compare different NBA players based on their shot selection and cluster them into groups. These new groups can be compared to the assigned position of players to check for correlation.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import misc,ndimage
import scipy.cluster.hierarchy as sch
import seaborn as sns
import NBAapi as nba
import urllib, cStringIO
from collections import defaultdict

Load shot chart data

In [2]:
# Fetch the shot chart for the 2014-15 season; the second value returned by
# the call is not used in this notebook section.
df,_ = nba.shotchart.shotchartdetail(season='2014-15')
# Preview the first rows of the shot table.
df.head()
Out[2]:
GRID_TYPE GAME_ID GAME_EVENT_ID PLAYER_ID PLAYER_NAME TEAM_ID TEAM_NAME PERIOD MINUTES_REMAINING SECONDS_REMAINING ... SHOT_ZONE_AREA SHOT_ZONE_RANGE SHOT_DISTANCE LOC_X LOC_Y SHOT_ATTEMPTED_FLAG SHOT_MADE_FLAG GAME_DATE HTM VTM
0 Shot Chart Detail 0021400001 2 203076 Anthony Davis 1610612740 New Orleans Pelicans 1 11 43 ... Center(C) 16-24 ft. 20 50 194 1 0 20141028 NOP ORL
1 Shot Chart Detail 0021400001 4 202696 Nikola Vucevic 1610612753 Orlando Magic 1 11 31 ... Center(C) 16-24 ft. 18 -8 189 1 1 20141028 NOP ORL
2 Shot Chart Detail 0021400001 7 203076 Anthony Davis 1610612740 New Orleans Pelicans 1 11 6 ... Left Side Center(LC) 16-24 ft. 18 -131 127 1 0 20141028 NOP ORL
3 Shot Chart Detail 0021400001 9 203901 Elfrid Payton 1610612753 Orlando Magic 1 10 54 ... Center(C) Less Than 8 ft. 1 -15 4 1 0 20141028 NOP ORL
4 Shot Chart Detail 0021400001 25 203076 Anthony Davis 1610612740 New Orleans Pelicans 1 10 29 ... Center(C) Less Than 8 ft. 0 0 1 1 1 20141028 NOP ORL

5 rows × 24 columns

Let's write a few useful functions that will be used throughout this project:

In [3]:
def KDE_heatmap(df,sigma=3):
    '''
    This function performs KDE calculation for a given shot chart.

    Shots are binned on a 500 x 500 grid spanning [-25, 25] in x and
    [-5.25, 44.75] in y (0.1 units per bin, after the coordinates are scaled
    by 0.1) and the histogram is smoothed with a Gaussian filter.

    Input  - dataframe with 'LOC_X' and 'LOC_Y' shot coordinates.
    Option - sigma (in feet) of the Gaussian kernel.
    Output - KDE of shot chart: a 500 x 500 array normalised to sum to 1.
             If no shot falls inside the grid an all-zero array is returned
             instead of an array of NaNs.
    '''
    N,_,_ = np.histogram2d(0.1*df['LOC_X'].values, 0.1*df['LOC_Y'].values,
                           bins = [500, 500],
                           range = [[-25,25],[-5.25,44.75]])
    # Each bin is 0.1 wide, so the kernel width in bins is 10 * sigma.
    # gaussian_filter is called from the public scipy.ndimage namespace; the
    # ndimage.filters alias is deprecated.
    KDE = ndimage.gaussian_filter(N,10.0*sigma)
    total = np.sum(KDE)
    if total == 0:
        # Nothing to normalise: avoid a 0/0 division that would yield NaNs.
        return KDE
    return 1.0*KDE/total

def players_picture(player_id):
    '''
    Download a player's picture from stats.nba.com.

    Input: player ID (formatted into the URL with %d, so it must be an integer)
    Output: players picture, as returned by misc.imread
    '''
    URL = "http://stats.nba.com/media/players/230x185/%d.png" %player_id
    response = urllib.urlopen(URL)
    try:
        # Read the whole body into an in-memory buffer before decoding.
        buf = cStringIO.StringIO(response.read())
    finally:
        # Release the connection even if the read fails.
        response.close()
    return misc.imread(buf)

def correlation_distance(N1,N2):
    '''
    Distance between two 2D arrays produced by the KDE function.

    The distance is half of the summed absolute element-wise difference.
    For two non-negative arrays that each sum to 1 the value lies between
    0 and 1, where 0 is identical and 1 is no similarity.
    '''
    abs_diff = np.absolute(N1 - N2)
    return np.sum(abs_diff) / 2.0

def shot_scatter(df,player_pic=True,ax=None,noise=True,**kwargs):
    '''
    Plotting scatter plot of shots on top of a court drawing.
    input - dataframe with x and y coordinates (LOC_X, LOC_Y); PLAYER_NAME and
            PLAYER_ID are also read when player_pic is True.
    optional - player_pic (default True) loads the picture and name of the player
               in the first row. Use if dataframe is for a single player.
               ax (default None) can pass plot axis. When omitted, the current
               axes is used and configured with the default court limits.
               noise (default True) adds some random scatter to the data for better visualization
               other - any variables that can be passed into the scatter function (e.g. transparency value)
    '''
    if ax is None:
        # Recent matplotlib releases no longer accept Axes properties as
        # keyword arguments to plt.gca(), so fetch the axes and configure it.
        ax = plt.gca()
        ax.set(xlim=[30,-30], ylim=[-7,43], xticks=[], yticks=[], aspect=1.0)
    nba.plot.court(ax,outer_lines=True,color='black',lw=2.0,direction='down')
    ax.axis('off')
    X = df.LOC_X.values
    Y = df.LOC_Y.values
    if noise:
        # Gaussian jitter (std 1.5 in raw LOC units) so overlapping shots are visible.
        X = X + np.random.normal(loc=0.0, scale=1.5, size=len(X))
        Y = Y + np.random.normal(loc=0.0, scale=1.5, size=len(Y))
    # x is negated and both coordinates are scaled by 0.1 before plotting.
    ax.scatter(-0.1*X,0.1*Y,**kwargs)
    if player_pic:
        name = df.PLAYER_NAME.values[0]
        player_id = df.PLAYER_ID.values[0]
        pic = players_picture(player_id)
        ax.imshow(pic,extent=[15,25,30,37.8261])
        ax.text(20,29,name,fontsize=16,horizontalalignment='center',verticalalignment='center')
    ax.text(0,-7,'By: Doingthedishes',color='black',horizontalalignment='center',fontsize=20,fontweight='bold')
    
def shot_heatmap(df,sigma = 1,log=False,player_pic=True,ax=None,cmap='jet'):
    '''
    This function plots a heatmap based on the shot chart.
    input - dataframe with x and y coordinates (LOC_X, LOC_Y); PLAYER_ID is
            also read when player_pic is True.
    optional - sigma - the sigma of the Gaussian kernel. In feet (default=1)
               log (default False) plots log10(KDE + 1) instead of the KDE.
               player_pic (default True) adds the picture of the player in the first row.
               ax (default None) can pass plot axis. When omitted, the current
               axes is used and configured with the default court limits.
               cmap (default 'jet') colormap passed to imshow.
    '''
    N = KDE_heatmap(df,sigma)
    if ax is None:
        # Recent matplotlib releases no longer accept Axes properties as
        # keyword arguments to plt.gca(), so fetch the axes and configure it.
        ax = plt.gca()
        ax.set(xlim=[30,-30], ylim=[-7,43], xticks=[], yticks=[], aspect=1.0)
    nba.plot.court(ax,outer_lines=True,color='black',lw=2.0,direction='down')
    ax.axis('off')
    # NOTE(review): KDE_heatmap returns a map that sums to 1, so N + 1 stays
    # close to 1 and log10(N + 1) is nearly proportional to N -- confirm this
    # is the intended "log scale".
    image = np.log10(N+1) if log else N
    ax.imshow(np.rot90(image),cmap=cmap,extent=[25.0, -25.0, -5.25, 44.75])
    if player_pic:
        player_id = df.PLAYER_ID.values[0]
        pic = players_picture(player_id)
        ax.imshow(pic,extent=[15,25,30,37.8261])
    ax.text(0,-7,'By: Doingthedishes',color='white',horizontalalignment='center',fontsize=20,fontweight='bold')

Choose players with more than 500 shots:

In [4]:
# Count shots per (PLAYER_ID, PLAYER_NAME) pair.
shot_counts = df.groupby(by=['PLAYER_ID','PLAYER_NAME']).size()
player_df = pd.DataFrame({'shots' : shot_counts}).reset_index()
# Keep only players with more than 500 shots. Boolean-mask selection uses
# .loc because the .ix indexer has been removed from pandas.
idx = player_df.shots.values > 500
player_df = player_df.loc[idx]
# players[k] is the PLAYER_ID of the k-th row of the filtered player_df.
players = player_df.PLAYER_ID.values
player_df.head()
Out[4]:
PLAYER_ID PLAYER_NAME shots
1 977 Kobe Bryant 713
2 1495 Tim Duncan 819
4 1717 Dirk Nowitzki 1062
5 1718 Paul Pierce 656
12 1938 Manu Ginobili 589

Plot example heatmaps + scatter plots

In [5]:
# Scatter plot (left) and log-option heatmap (right) for a single player.
name = 'Stephen Curry'
f, axarr = plt.subplots(1,2,figsize=(20,10),facecolor='white')
for panel in axarr:
    panel.set(ylim=[-10,41.5], xlim=[25,-25], aspect=1, xticks=[], yticks=[])
    panel.axis('off')
f.subplots_adjust(hspace=0,wspace=0)
player_shots = df[df['PLAYER_NAME']==name]
shot_scatter(player_shots,ax=axarr[0],alpha = 0.2)
shot_heatmap(player_shots,ax=axarr[1],player_pic=False,log=True)
In [6]:
# Scatter plot (left) and linear heatmap (right) for a single player.
name = 'Dirk Nowitzki'
f, axarr = plt.subplots(1,2,figsize=(20,10),facecolor='white')
for panel in axarr:
    panel.set(ylim=[-10,41.5], xlim=[25,-25], aspect=1, xticks=[], yticks=[])
    panel.axis('off')
f.subplots_adjust(hspace=0,wspace=0)
player_shots = df[df['PLAYER_NAME']==name]
shot_scatter(player_shots,ax=axarr[0],alpha = 0.2)
shot_heatmap(player_shots,ax=axarr[1],player_pic = False)

Comparing players

In order to compare players I'm going to calculate the KDE for each player based on their shots during the entire season. The KDE is conceptually equivalent to calculating the heatmap for each player.

Note: when computing the KDE, the $\sigma$ of the Gaussian kernel needs to be chosen. A larger $\sigma$ means that shots that are further apart are still going to be correlated, but we do not want to choose a $\sigma$ that is too large. I found that $\sigma = 3$ (feet) was a good compromise between resolution and ensuring that close shots are correlated.

In [7]:
# One 500 x 500 KDE per selected player, stacked along the last axis;
# hmaps[:, :, k] belongs to players[k].
hmaps = np.zeros([500,500,len(players)])
for i,player_id in enumerate(players):
    hmaps[:,:,i] = KDE_heatmap(df[df['PLAYER_ID']==player_id])

Now that we have a heatmap for each player (i.e. KDE) we can take each pair of players and compare how similar those heatmaps are. To do so I'm using the correlation_distance function that I defined above. There are several ways to compute a similarity measure; I chose this one after exploring a few different options. Another similarity measure that I like (but am not showing here) is the Kernel Distance (https://arxiv.org/abs/1103.1625).

By comparing each pair of players we can create a similarity matrix $S$ using $1 - D$ where D is the player's shot density distance (i.e. how different they are from each other):

In [8]:
# Symmetric similarity matrix: S[a, b] = 1 - distance between the KDEs of
# players a and b. Only the upper triangle is computed; each value is
# mirrored into the lower triangle.
n_players = len(players)
S = np.zeros([n_players, n_players])
for i in range(n_players):
    for j in range(i, n_players):
        similarity = 1 - correlation_distance(hmaps[:,:,i], hmaps[:,:,j])
        S[i,j] = S[j,i] = similarity

Let's plot the distance matrix $D = 1 - S$:

In [9]:
# Show the pairwise distance matrix (1 - S) as an image with a colorbar.
fig, ax = plt.subplots(figsize=(8,8))
im = ax.imshow(1-S, cmap='jet')
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
plt.show()

At this point it does not look like much. We need to work a little more to get some interesting information out of this data.

Find players with minimum similarity:

In [10]:
# Row/column of the smallest entry of S, i.e. the least similar pair of players.
i,j = np.unravel_index(np.argmin(S),np.shape(S))
# players was taken from player_df.PLAYER_ID, so position k in players is
# row k of player_df and the name can be read by position.
name1 = player_df.PLAYER_NAME.values[i]
name2 = player_df.PLAYER_NAME.values[j]
f, axarr = plt.subplots(1,2,figsize=(20,10),facecolor='white')
for panel in axarr:
    panel.set(ylim=[-10,41.5], xlim=[25,-25], aspect=1, xticks=[], yticks=[])
    panel.axis('off')
f.suptitle('Similarity = {}'.format(np.round(S[i,j],2)),fontsize=20.0,y=1.02)
# Select shots by PLAYER_ID, the same key used to build the KDEs, so that
# another player sharing the same name cannot leak into the plot.
shot_scatter(df[df['PLAYER_ID']==players[i]],ax=axarr[0],alpha=0.2)
shot_scatter(df[df['PLAYER_ID']==players[j]],ax=axarr[1],alpha=0.2)
f.tight_layout()

Find players with maximum similarity

In [11]:
# Most similar pair of *different* players: exclude the diagonal (each
# player compared with themselves) by setting it to -inf on a copy of S.
S_offdiag = S.copy()
np.fill_diagonal(S_offdiag, -np.inf)
i,j = np.unravel_index(np.argmax(S_offdiag),np.shape(S))
# players was taken from player_df.PLAYER_ID, so position k in players is
# row k of player_df and the name can be read by position.
name1 = player_df.PLAYER_NAME.values[i]
name2 = player_df.PLAYER_NAME.values[j]
f, axarr = plt.subplots(1,2,figsize=(20,10),facecolor='white')
for panel in axarr:
    panel.set(ylim=[-10,41.5], xlim=[25,-25], aspect=1, xticks=[], yticks=[])
    panel.axis('off')
f.suptitle('Similarity = {}'.format(np.round(S[i,j],2)),fontsize=20.0,y=1.02)
# Select shots by PLAYER_ID, the same key used to build the KDEs, so that
# another player sharing the same name cannot leak into the plot.
shot_scatter(df[df['PLAYER_ID']==players[i]],ax=axarr[0],alpha=0.2)
shot_scatter(df[df['PLAYER_ID']==players[j]],ax=axarr[1],alpha=0.2)
f.tight_layout()