数据分析--赛季球员数据分析-24直播网

数据分析--赛季球员数据分析

来源：24直播网 2026-05-25 20:58:34

直播信号

一键直击直播高清直播{看球热门} 热门直播 4k超清推荐

数据分析--赛季球员数据分析

1 import numpy as np

2 import pandas as pd

3 import matplotlib.pyplot as plt

4 import seaborn as sns

5 载入数据：

7 ## 插入数据

# Raw string: in a normal literal "\U" starts a unicode escape (a SyntaxError
# in Python 3) and "\n" would be read as a newline inside the Windows path.
data = pd.read_csv(r"C:\Users\60424\Downloads\nba_players_with_salary.csv")

9 data.head()

12 ## 描述统计

13 data.describe()

16 data['Height'] = data['Height'].str.replace('cm','')

17 data['Weight'] = data['Weight'].str.replace('kg','')

18 data['Height']

19 data['Height'] = data['Height'].astype('int')

20 data['Weight'] = data['Weight'].astype('int')

22 data['Weight'].describe()

24 import matplotlib.pyplot as plt

25 #解决中文显示问题

26 plt.rcParams['font.sans-serif'] = ['KaiTi'] # 指定默认字体

27 plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题

28 # 使用直方图显示体重数值分布

29 data['Weight'].plot(kind='hist')

31 #分析身高和得分是否有关

# NOTE(review): the listing called `player.plot`, but `player` is never
# defined; `data` is the frame whose Height column was cleaned above.
# Confirm that the CSV really has a 'Rating' column.
data.plot(kind='scatter', x='Height', y='Rating')

37 dat_cor=data.loc[:,['RPM','AGE','SALARY_MILLIONS','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','POINTS','GP','MPG','ORPM','DRPM']]

38 coor=dat_cor.corr()

39 sns.heatmap(coor,square=True, linewidths=0.02, annot=False)

40 #seaborn中的heatmap函数，是将多维度数值变量按数值大小进行交叉热图展示。

44 #薪资最高的10名运动员

45 data.loc[:,['PLAYER','SALARY_MILLIONS','RPM','AGE','MPG']].sort_values(by='SALARY_MILLIONS',ascending=False).head(10)

46 #效率值最高的10名运动员

47 data.loc[:,['PLAYER','RPM','SALARY_MILLIONS','AGE','MPG']].sort_values(by='RPM',ascending=False).head(10)

48 #出场时间最高的10名运动员

49 data.loc[:,['PLAYER','RPM','SALARY_MILLIONS','AGE','MPG']].sort_values(by='MPG',ascending=False).head(10)

52 #分布及核密度展示

53 sns.set_style('darkgrid') #设置seaborn的面板风格

54 plt.figure(figsize=(12,12))

55 plt.subplot(3,1,1) #拆分页面，多图展示

# Distribution + kernel density of salaries (first of three stacked panels).
sns.distplot(data['SALARY_MILLIONS'])

57 plt.xticks(np.linspace(0,40,9))

58 plt.ylabel(u'$Salary$',size=10)

60 plt.subplot(3,1,2)

61 sns.distplot(data['RPM'])

62 plt.xticks(np.linspace(-10,10,9))

63 plt.ylabel(u'$RPM$',size=10)

65 plt.subplot(3,1,3)

66 sns.distplot(data['AGE'])

67 plt.xticks(np.linspace(20,40,11))

68 plt.ylabel(u'$AGE$',size=10)

73 dat1=data.loc[:,['RPM','SALARY_MILLIONS','AGE','POINTS']]

# Pass x/y by keyword and use `height` (the renamed `size` argument) so the
# call is accepted by current seaborn releases.
sns.jointplot(x='SALARY_MILLIONS', y='AGE', data=dat1, kind='kde', height=8)

77 dat1=data.loc[:,['RPM','SALARY_MILLIONS','AGE','POINTS']]

78 sns.pairplot(dat1) #相关性展示，斜对角为分布展示，可以直观地看变量是否具有线性关系

82 #根据已有变量生成新的变量

83 data['avg_point']=data['POINTS']/data['MP'] #每分钟得分

def age_cut(df):
    """Label a player row by career stage based on its ``AGE`` attribute.

    Returns ``'young'`` for ages up to and including 24, ``'old'`` for ages
    30 and above, and ``'best'`` for everything in between.
    """
    age = df.AGE
    if age >= 30:
        return 'old'
    if age <= 24:
        return 'young'
    return 'best'

91 data['age_cut']=data.apply(lambda x: age_cut(x),axis=1) #球员是否处于黄金年龄

92 data['cnt']=1 #计数用

94 ### 球员薪水与效率值按年龄段来看

95 sns.set_style('darkgrid') #设置seaborn的面板风格

96 plt.figure(figsize=(8,8))

97 plt.title(u'$RPM and SALARY$',size=15)

98 X1=data.loc[data.age_cut=='old'].SALARY_MILLIONS

99 Y1=data.loc[data.age_cut=='old'].RPM

100 X2=data.loc[data.age_cut=='best'].SALARY_MILLIONS

101 Y2=data.loc[data.age_cut=='best'].RPM

102 X3=data.loc[data.age_cut=='young'].SALARY_MILLIONS

103 Y3=data.loc[data.age_cut=='young'].RPM

104 plt.plot(X1,Y1,'.')

105 plt.plot(X2,Y2,'.')

106 plt.plot(X3,Y3,'.')

107 plt.xlim(0,30)

108 plt.ylim(-8,8)

109 plt.xlabel('Salary',size=10)

110 plt.ylabel('RPM',size=10)

111 plt.xticks(np.arange(0,30,3))

112 plt.legend(['old','best','young'])

113

114

115

116

117 ### 分组操作按球队

118 dat_grp=data.groupby(by=['TEAM'],as_index=False).agg({'SALARY_MILLIONS':np.mean,'RPM':np.mean,'PLAYER':np.size})

119 dat_grp=dat_grp.loc[dat_grp.PLAYER>5] #不考虑在赛季中转会的球员

120 dat_grp.sort_values(by='SALARY_MILLIONS',ascending=False).head(10)

121

122

123 ### 分组操作按场上位置

124 dat_grp2=data.groupby(by=['TEAM','age_cut'],as_index=False).agg({'SALARY_MILLIONS':np.mean,'RPM':np.mean,'PLAYER':np.size})

125 dat_grp2=dat_grp2.loc[dat_grp2.PLAYER>3] ##剔除掉少量的position摇摆人

# Closing parenthesis was a full-width "）", which Python rejects.
dat_grp2.sort_values(by=['PLAYER','RPM'],ascending=False).head(15)

127

128

129

130

131 ##数据可视化按球队

132 dat_grp3=data.groupby(by=['TEAM'],as_index=False).agg({'SALARY_MILLIONS':np.mean,'RPM':np.mean,'PLAYER':np.size,'POINTS':np.mean,'eFG%':np.mean,'MPG':np.mean,'AGE':np.mean})

133 dat_grp3=dat_grp3.loc[dat_grp3.PLAYER>5]

# The call was missing its closing parenthesis.
dat_grp3.sort_values(by=['RPM'],ascending=False).head(10)

135

136 sns.set_style('whitegrid')#设置seaborn的面板风格

137 plt.figure(figsize=(12,8))

138 dat_grp4=data[data['TEAM'].isin(['GS','CLE','SA','LAC','OKC','UTAH','CHA','TOR','NO','BOS'])]

139 plt.subplot(3,1,1)

140 sns.boxplot(x='TEAM',y='AGE',data=dat_grp4)

141 plt.subplot(3,1,2)

142 sns.boxplot(x='TEAM',y='SALARY_MILLIONS',data=dat_grp4)

143 plt.subplot(3,1,3)

144 sns.boxplot(x='TEAM',y='MPG',data=dat_grp4)

145

146

147 plt.figure(figsize=(12,8))

148 plt.subplot(3,1,1)

149 sns.violinplot(x='TEAM',y='POINTS',data=dat_grp4)

150 plt.subplot(3,1,2)

151 sns.violinplot(x='TEAM',y='eFG%',data=dat_grp4)

152 plt.subplot(3,1,3)

153 sns.violinplot(x='TEAM',y='RPM',data=dat_grp4)

154

155

156

157 data.loc[data.TEAM=='SA'].sort_values(by='RPM',ascending=True).head(3)

158

159

# .copy() gives point_guards its own data, so the column assignment below does
# not write into a filtered slice of `nba` (pandas SettingWithCopyWarning).
# NOTE(review): `nba` is not defined in the visible listing — confirm it is
# loaded earlier.
point_guards = nba[nba['pos'] == 'PG'].copy()
point_guards['ppg'] = point_guards['pts'] / point_guards['g']

162 #ppg =pts/g

163 point_guards[['pts', 'g', 'ppg']].head(5)

164

165

166

167

168

# Drop rows with zero turnovers so the ratio below never divides by zero;
# .copy() so the new column is written to an independent frame rather than a
# filtered slice.
point_guards = point_guards[point_guards['tov'] != 0].copy()
point_guards['atr'] = point_guards['ast'] / point_guards['tov']

171 plt.scatter(point_guards['ppg'], point_guards['atr'], c='y')

172 plt.title("Point Guards")

173 plt.xlabel('Points Per Game', fontsize=13)

174 plt.ylabel('Assist Turnover Ratio', fontsize=13)

175 plt.show()

176

177

178

179

180 num_clusters = 5

181

# replace=False: np.random.choice samples with replacement by default, which
# could pick the same row twice and start k-means with duplicate centroids.
random_initial_points = np.random.choice(point_guards.index, size=num_clusters, replace=False)

183

184 centroids = point_guards.loc[random_initial_points]

185 plt.scatter(point_guards['ppg'], point_guards['atr'], c='yellow')

186 plt.scatter(centroids['ppg'], centroids['atr'], c='red')

187 plt.title("Centroids")

188 plt.xlabel('Points Per Game', fontsize=13)

189 plt.ylabel('Assist Turnover Ratio', fontsize=13)

190 plt.show()

191 　

192

193

194

def centroids_to_dict(centroids):
    """Convert a centroid DataFrame into ``{cluster_id: [ppg, atr]}``.

    Cluster ids are consecutive integers starting at 0, assigned in the row
    order of ``centroids``; the DataFrame's own index labels are ignored.
    """
    return {
        cluster_id: [centroid['ppg'], centroid['atr']]
        for cluster_id, (_, centroid) in enumerate(centroids.iterrows())
    }

207

208 centroids_dict = centroids_to_dict(centroids)

209

210 import math

211

def calculate_distance(centroid, player_values):
    """Return the Euclidean distance between two coordinate sequences.

    Only the first ``len(centroid)`` entries of ``player_values`` are used.
    """
    sum_of_squares = sum(
        (centroid[i] - player_values[i]) ** 2
        for i in range(len(centroid))
    )
    return math.sqrt(sum_of_squares)

222

223 q = [5, 2]

224 p = [3,1]

225

226 # Sqrt(5) = ~2.24

227 print(calculate_distance(q, p))

228

229

230

231

def assign_to_cluster(row):
    """Return the id of the centroid in ``centroids_dict`` nearest to ``row``.

    Distance is measured on the row's ``'ppg'`` and ``'atr'`` values with
    ``calculate_distance``. On ties the centroid that comes first in
    ``centroids_dict`` wins; ``-1`` is returned when the dict is empty.
    """
    def distance_to(item):
        # item is a (cluster_id, centroid) pair from centroids_dict.items()
        return calculate_distance(item[1], [row['ppg'], row['atr']])

    nearest = min(centroids_dict.items(), key=distance_to, default=None)
    return -1 if nearest is None else nearest[0]

247

248 point_guards['cluster'] = point_guards.apply(lambda row: assign_to_cluster(row), axis=1)

249

def visualize_clusters(df, num_clusters):
    """Scatter-plot each cluster of ``df`` in its own colour and show the figure.

    Args:
        df: DataFrame providing ``'ppg'``, ``'atr'`` and ``'cluster'`` columns.
        num_clusters: Number of cluster ids to draw; rows whose ``'cluster'``
            equals ``0 .. num_clusters - 1`` are plotted.
    """
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

    for n in range(num_clusters):
        clustered_df = df[df['cluster'] == n]
        # Wrap around the palette so more clusters than colours cannot raise
        # IndexError; for n <= 7 this picks the same colour as colors[n - 1].
        color = colors[(n - 1) % len(colors)]
        plt.scatter(clustered_df['ppg'], clustered_df['atr'], c=color)

    # The axis labels are identical for every cluster, so set them once.
    plt.xlabel('Points Per Game', fontsize=13)
    plt.ylabel('Assist Turnover Ratio', fontsize=13)
    plt.show()

259

260 visualize_clusters(point_guards, 5)

261 　　

262

263

264

265

266

def recalculate_centroids(df, n_clusters=None):
    """Return new centroids as the mean position of each cluster's members.

    Args:
        df: DataFrame providing ``'ppg'``, ``'atr'`` and ``'cluster'`` columns.
        n_clusters: Number of cluster ids (``0 .. n_clusters - 1``) to process.
            Defaults to the module-level ``num_clusters``.

    Returns:
        dict mapping cluster id to ``[mean_ppg, mean_atr]``. Clusters with no
        members are left out: their mean would be NaN, and a NaN centroid
        breaks the ``<`` comparison used to pick the nearest centroid.
    """
    if n_clusters is None:
        n_clusters = num_clusters

    new_centroids_dict = dict()
    for cluster_id in range(0, n_clusters):
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        new_centroids_dict[cluster_id] = [
            float(members['ppg'].mean()),
            float(members['atr'].mean()),
        ]
    return new_centroids_dict

273

274 centroids_dict = recalculate_centroids(point_guards)

275

276 point_guards['cluster'] = point_guards.apply(lambda row: assign_to_cluster(row), axis=1)

277

278 centroids_dict = recalculate_centroids(point_guards)

279 point_guards['cluster'] = point_guards.apply(lambda row: assign_to_cluster(row), axis=1)

280

281 from sklearn.cluster import KMeans

282

283 kmeans = KMeans(n_clusters=num_clusters)

284 kmeans.fit(point_guards[['ppg', 'atr']])

285 point_guards['cluster'] = kmeans.labels_

286

287 visualize_clusters(point_guards, num_clusters)

288

289

290 ## 聚合

# Use DataFrame.groupby: the top-level pd.groupby helper is not available in
# current pandas, and the rest of this listing already calls data.groupby.
grp = data.groupby(['name','season','team','team_op'],as_index=False).agg({'mp':np.mean,'pts':np.mean,'ast':np.mean,'trb':np.mean,'tov':np.mean,})

292 grp.head()

293 grp=grp[(grp['season']!='07-08')&(grp['season']!='08-09')&(grp['season']!='19-20')]

294 tmp=grp#[grp['name']=='拉塞尔-威斯布鲁克'] #'凯文-杜兰特'#'詹姆斯-哈登' #拉塞尔-威斯布鲁克

295

296 px.scatter(tmp,x='ast',y='trb',color='team_op',size='pts',size_max=50,

297 title='雷霆三少',text='team_op',

298 facet_col='name',

299 animation_frame='season',animation_group='name',range_x=[0,20],range_y=[0,20])

300

301

302

303

# Use DataFrame.groupby: the top-level pd.groupby helper is not available in
# current pandas.
tmp = data.groupby(['name','season'],as_index=False).agg({'mp':np.mean,'pts':np.mean,'ast':np.mean,'trb':np.mean,'tov':np.mean,})

305

306 px.scatter_3d(tmp,x='ast',y='trb',z='pts',size='mp',size_max=50,

307 title='雷霆三少',text='name',

308 animation_frame='season',animation_group='name',range_x=[0,15],range_y=[0,20],range_z=[10,40])

标签：

上一篇: NBA今日赛况：鲍威尔41分逆转，哈登49分钟苦战 下一篇: 22岁阿德耶米与29岁说唱歌手结婚