-
Notifications
You must be signed in to change notification settings - Fork 14
/
gradientDescent.py
346 lines (260 loc) · 11.1 KB
/
gradientDescent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import numpy as np
import scipy as sp
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, LinearRegression
import pandas as pd
import mysql.connector
import os
import datetime as dt
from itertools import chain
import matplotlib.pyplot as plt
import constants
import warnings
def getDate(day, month, year, cursor):
    '''
    GETDATE Look up the database id of a single calendar date.

    :param day: day of month
    :param month: month number
    :param year: four digit year
    :param cursor: DB-API cursor; it is executed and then iterated for rows
    :return: the ``iddates`` value of the last row returned for that date,
             or 0 when the query yields no rows
    '''
    target = dt.date(year, month, day)
    cursor.execute("SELECT iddates FROM dates WHERE date = %s", (target,))

    # Walk every returned row; the last one wins, 0 stays when there is none.
    dateID = 0
    for row in cursor:
        dateID = row[0]
    return dateID
def getDates(day, month, year, numdays, cursor):
    '''
    GETDATES Collect the database ids of a run of consecutive dates.

    The run starts at the given date and walks backwards one day at a time
    for ``numdays`` days (the start date itself is included).

    :param day: day of month of the start date
    :param month: month number of the start date
    :param year: four digit year of the start date
    :param numdays: how many days to cover, counting backwards
    :param cursor: DB-API cursor; it is executed once per date and iterated
    :return: list of ``iddates`` values in query order (newest date first);
             dates without a row contribute nothing
    '''
    query = "SELECT iddates FROM dates WHERE date = %s"
    start = dt.date(year, month, day)
    wantedDates = [start - dt.timedelta(days=offset) for offset in range(numdays)]

    # One lookup per date; keep every id the database hands back.
    dateIDs = []
    for wanted in wantedDates:
        cursor.execute(query, (wanted,))
        dateIDs.extend(row[0] for row in cursor)
    return dateIDs
def mapFeatures(X, degree=4):
    '''
    MAPFEATURES Expand a feature matrix into polynomial features.

    Delegates to sklearn's PolynomialFeatures with its default settings, so
    the result holds every polynomial combination of the input columns whose
    total degree is at most ``degree``, including the leading constant
    (bias) column that PolynomialFeatures adds by default.

    :param X: 2-D array-like of shape (samples, features)
    :param degree: maximum total degree of the generated terms; defaults to 4,
                   the value that used to be hard-coded here
    :return: XTransform, the expanded feature array
    '''
    poly = PolynomialFeatures(degree)
    XTransform = poly.fit_transform(X)
    return XTransform
def computCostMulti(X, y, theta):
    '''
    COMPUTCOSTMULTI Squared-error cost for multi-variable linear regression.

    Evaluates J = (X*theta' - y)' * (X*theta' - y) / (2*m), where m is the
    number of rows in y.

    :param X: design matrix, one row per training example
    :param y: target values, one row per training example
    :param theta: parameters laid out as a row (it is transposed before use)
    :return: J, the result of the matrix product above (a 1x1 array when y
             is a column vector)
    '''
    numExamples = np.shape(y)[0]

    # Residual between the model's predictions and the observed targets.
    residual = X.dot(np.transpose(theta)) - y

    return np.transpose(residual).dot(residual) / (2 * numExamples)
def featureNormalize(X):
    '''
    FEATURENORMALIZE Normalizes the features in X.

    Returns a copy of X in which every column has mean 0 and (population)
    standard deviation 1. Columns with zero variance - such as a bias column
    of ones - cannot be scaled and are passed through unchanged; for those
    columns mu is reported as 0 and sigma as 1, so that (x - mu) / sigma
    still reproduces the stored values for new data.

    The input array is never modified.

    :param X: 2-D array-like of shape (samples, features)
    :return: (X_norm, mu, sigma) where X_norm is a float array shaped like X
             and mu / sigma are float arrays of shape (1, features)
    '''
    # Work in floating point so integer inputs are not truncated.
    X = np.asarray(X, dtype=float)
    numFeatures = X.shape[1]

    # With no rows there is nothing to average; return neutral statistics
    # instead of NaNs.
    if X.shape[0] == 0:
        return X.copy(), np.zeros((1, numFeatures)), np.ones((1, numFeatures))

    mu = X.mean(axis=0, keepdims=True)
    sigma = X.std(axis=0, keepdims=True)

    # Constant columns would divide by zero; leave them as they are.
    constant = sigma == 0
    mu[constant] = 0.0
    sigma[constant] = 1.0

    # Broadcasting builds a new array, leaving the caller's X intact.
    X_norm = (X - mu) / sigma
    return X_norm, mu, sigma
def gradientDescentMulti(X, y, theta, alpha, numIters):
    '''
    GRADIENTDESCENTMULTI Batch gradient descent for linear regression.

    Runs exactly numIters update steps of
        theta := theta - (alpha / m) * (X*theta' - y)' * X
    and records the cost after every step. The first iteration whose cost
    improves on the previous one by at most 0.001 is printed once as the
    "optimal" iteration count; the loop itself never stops early. Note that
    a cost that rises (divergence) also satisfies that test.

    :param X: design matrix of shape (m, n)
    :param y: targets of shape (m, 1)
    :param theta: initial parameters, n values laid out as a (1, n) row;
                  the caller's array is left untouched
    :param alpha: learning rate
    :param numIters: number of update steps to run
    :return: (theta, costHistory, iterHistory) - the fitted (1, n) float
             parameters, the cost after each step, and the matching
             zero-based iteration indices
    '''
    # number of training examples
    m = np.shape(y)[0]

    # Float copy: keeps the caller's array intact and stops an integer
    # dtype from truncating the updates.
    theta = np.array(theta, dtype=float).reshape(1, -1)

    costHistory = []
    iterHistory = []
    previousJ = 1000000000
    convergenceReported = False

    for i in range(numIters):
        error = X.dot(np.transpose(theta)) - y

        # All partial derivatives at once; every parameter is updated from
        # the same error vector (a simultaneous update).
        gradient = np.transpose(error).dot(X) / m
        theta = theta - alpha * gradient

        J = computCostMulti(X, y, theta)[0, 0]

        if (previousJ - J) <= 0.001 and not convergenceReported:
            convergenceReported = True
            print("Optimal # of Iterations: " + str(i + 1))

        previousJ = J
        costHistory.append(J)
        iterHistory.append(i)

    return theta, costHistory, iterHistory
if __name__ == "__main__":
    # Script entry point: fit several linear models on historical batter
    # rows and store their DraftKings point projections for one target date.
    #
    # Steps:
    #   1. connect to MySQL using the settings in `constants`
    #   2. prompt for the paL / paR row filters
    #   3. load training rows for the configured date window
    #   4. fit homemade gradient descent, LinearRegression, Ridge, and Ridge
    #      on polynomial features
    #   5. project every starting batter of the prediction date and write
    #      the four projections back to battersdaily

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    print("Loading data...")

    cnx = mysql.connector.connect(user=constants.databaseUser,
                                  host=constants.databaseHost,
                                  database=constants.databaseName,
                                  password=constants.databasePassword)
    cursor = cnx.cursor()

    try:
        # date to predict
        yearP = constants.yearP
        monthP = constants.monthP
        dayP = constants.dayP

        # start date and length of the training window
        year = constants.gdStartYear
        month = constants.gdStartMonth
        day = constants.gdStartDay
        numdays = constants.numdaysGradientDescent

        warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

        gameIDs = getDates(day, month, year, numdays, cursor)

        # Row filters typed in by the operator, e.g. operand ">" and value "50".
        # NOTE(review): both answers are pasted verbatim into the SQL text
        # below, so this prompt must only ever be answered by a trusted
        # operator.
        batterConstraints = ['paL', 'paR']
        batterConstraintsValues = {}
        batterConstraintsTypes = {}
        for con in batterConstraints:
            batterConstraintsTypes[con] = readLine("Enter operand/statement for constraint " + con + ": ")
            batterConstraintsValues[con] = readLine("Enter value for constraint " + con + ": ")

        constraintsString = "(" + " AND ".join(
            con + " " + batterConstraintsTypes[con] + " " + batterConstraintsValues[con]
            for con in batterConstraints) + ")"

        features = ['adjwOBA', 'adjBBP', 'adjBABIP', 'adjISO', 'adjOBP', 'adjSLG', 'adjKP', 'battingOrder', 'parkFactor',
                    'battersdaily.wOBA', 'battersdaily.ISO', 'battersdaily.SLG', 'battersdaily.OBP', 'battersdaily.BB',
                    'battersdaily.BABIP', 'battersdaily.adjSB', 'tab', 'thits', 'tsingles', 'tdoubles', 'ttriples', 'thomeruns',
                    'tstolenbases', 'truns', 'trbis', 'twalks', 'terrors', 'thbp', 'tstrikeouts']
        targets = ['dkpoints']

        # join() also fixes the old loop, which overwrote featuresString
        # instead of extending targetsString when there was more than one
        # target column.
        featuresString = ", ".join(features)
        targetsString = ", ".join(targets)

        print("Loading data...")

        # Known data caveats: parkFactor can be NULL for some rows, and
        # batters who did not start have no prediction.
        getTestData = ("SELECT batterID, " + featuresString + ", " + targetsString +
                       " FROM battersdaily LEFT JOIN batters ON battersdaily.batterID = batters.idbatters "
                       "WHERE battersdaily.bgameID = %s AND oppPitcher != 0 AND dkPointsPred IS NOT NULL AND " +
                       constraintsString)

        # execute command + load into numpy arrays, one per date
        numpyDataArrays = []
        for game in gameIDs:
            cursor.execute(getTestData, (game,))
            results = cursor.fetchall()
            if not results:
                # A date without matching rows cannot be reshaped; skip it.
                continue
            D = np.fromiter(chain.from_iterable(results), dtype=float, count=-1)
            numpyDataArrays.append(D.reshape(len(results), -1))

        if not numpyDataArrays:
            raise SystemExit("No training data found for the selected dates and constraints")

        testData = np.vstack(numpyDataArrays)

        # split data: [batterID | features | target]
        numFeatures = len(features)
        batterIDs, testX = np.split(testData, [1], 1)
        testX, testY = np.split(testX, [numFeatures], 1)

        print("Number of training examples: " + str(np.shape(testX)[0]))

        # add bias term
        ones = np.ones((np.shape(testX)[0], 1), dtype=float)
        testX = np.hstack((ones, testX))

        # learning rate + iterations
        alpha = 0.01
        num_iters = 1000

        # theta initialization: one row holding the bias plus one weight per feature
        theta = np.zeros((1, numFeatures + 1))

        theta, JHistory, iterHistory = gradientDescentMulti(testX, testY, theta, alpha, num_iters)

        print("Batter Theta Values from Homemade Gradient Descent")
        print(theta)

        # The former normalize=True argument is dropped from the three sklearn
        # models below: sklearn documents it as ignored when
        # fit_intercept=False, and newer releases no longer accept it.
        reg = LinearRegression(fit_intercept=False)
        reg.fit(testX, testY)
        thetaSKLearn = reg.coef_

        print("Batter Theta Values from Sklearn Linear Regression")
        print(thetaSKLearn)

        ridge = Ridge(alpha=1, fit_intercept=False)
        ridge.fit(testX, testY)
        thetaSKLearnRidge = ridge.coef_

        print("Batter Theta Values from Sklearn Ridge Regression")
        print(thetaSKLearnRidge)

        ridgeP = Ridge(alpha=150, fit_intercept=False)
        newFeatures = mapFeatures(testX)
        ridgeP.fit(newFeatures, testY)
        thetaSKLearnRidgeP = ridgeP.coef_

        print("Batter Theta Values from Sklearn Ridge Regression w/ Polynomial Feature Mapping")
        print(thetaSKLearnRidgeP)

        gameIDP = getDate(dayP, monthP, yearP, cursor)

        # get projections
        # predict for all in battersdaily - don't project for pitchers/pinch hitters/newbies
        getPredictData = ("SELECT batterID, " + featuresString +
                          " FROM battersdaily LEFT JOIN batters ON battersdaily.batterID = batters.idbatters "
                          "WHERE battersdaily.bgameID = %s AND start = 1 AND dkSalary IS NOT NULL")
        cursor.execute(getPredictData, (gameIDP,))
        results = cursor.fetchall()

        if not results:
            raise SystemExit("No batters to project for the prediction date")

        D = np.fromiter(chain.from_iterable(results), dtype=float, count=-1)
        targetData = D.reshape(len(results), -1)

        # split data: [batterID | features]
        batterIDs, targetX = np.split(targetData, [1], 1)

        print("Number of targets: " + str(np.shape(targetX)[0]))

        # add bias term
        ones = np.ones((np.shape(targetX)[0], 1), dtype=float)
        targetX = np.hstack((ones, targetX))

        # predict
        targetY = targetX.dot(np.transpose(theta))
        targetYSKLearn = targetX.dot(np.transpose(thetaSKLearn))
        targetYSKLearnRidge = targetX.dot(np.transpose(thetaSKLearnRidge))
        newTargetX = mapFeatures(targetX)
        targetYSKLearnRidgeP = newTargetX.dot(np.transpose(thetaSKLearnRidgeP))

        # load predictions into database, one UPDATE per batter
        updateBattersDKPoints = "UPDATE battersdaily SET dkPointsPred = %s, dkPointsPredSKLin = %s, dkPointsPredRidge = %s, dkPointsPredRidgeP = %s WHERE bgameID = %s AND batterID = %s"
        projections = zip(batterIDs[:, 0], targetY[:, 0], targetYSKLearn[:, 0],
                          targetYSKLearnRidge[:, 0], targetYSKLearnRidgeP[:, 0])
        for batterID, homemade, skLinear, skRidge, skRidgeP in projections:
            updateBatterDKPointsData = (float(homemade), float(skLinear), float(skRidge),
                                        float(skRidgeP), gameIDP, int(batterID))
            cursor.execute(updateBattersDKPoints, updateBatterDKPointsData)

        print("Predicted DK Points for Batters")

        # Only persist the projections once every UPDATE has gone through.
        cnx.commit()
    finally:
        # Release the cursor and connection even when something above failed.
        cursor.close()
        cnx.close()