"""
libSLM implements the Second order Linear Model and its variants. In SLM, the label y is assumed to be generated by model
y = x'*w + x'*M*x
where x is the feature, x' is the transpose of x. w is the first order coefficient, M is the second order coefficient.
We minimize the following loss function:
|| x'*w + x'*M*x - y ||^2 + lambda_w*||w |^2 + lambda_M*||M||^2
We assume that M is rank k.
"""
import sklearn.decomposition
from sklearn.base import BaseEstimator, RegressorMixin
import scipy.sparse
import numpy
class SLM(BaseEstimator, RegressorMixin):
"""
The second order linear model estimator.
"""
    def __init__(self, rank_M=4, max_iter=50, tol=1e-6, max_init_iter=20, init_tol=1e-2, lambda_w=0, lambda_M=0,
                 using_cache=True, solver_algorithm='Greedy', als_max_iter_w=10, als_max_iter_M=1,
                 learning_rate=None, diag_zero=None, truncate_y=None):
"""
Initialize the SLM estimator.
:param rank_M: The rank of the second order coefficient matrix M
:param max_iter: The maximal number of iterations in training..
:param tol: The error tolerance in the stopping criteria
:param max_init_iter: The maximal number of iterations in the initialization stage
:param init_tol: The error tolerance in the initialization stage
:param lambda_w: The regularizer of w
:param lambda_M: The regularizer of M
:param using_cache: True if using cached variables between several fit() calls to save some computation.
:param solver_algorithm: 'Greedy' or 'Provable'. 'Greedy' use greedy algorithm, usually in real-world datasets when features are sparse. In 'Greedy' algorithm, only diag_zero=True is implemented.
'Provable' use the provable algorithm but may be much slower than 'Greedy' in real-world datasets. 'Provable' algorithm only accepts dense features.
:param learning_rate: the learning rate. Default value: 1) If solver_algorithm='Greedy', learning_rate is greedily
selected. 2) solver_algorithm='Provable', learning_rate=1.0
:param als_max_iter_w: When using 'Greedy' algorithm, the number of iterations to compute the least square solution of w
:param als_max_iter_M: When using 'Greedy' algorithm, the number of iterations to compute the least square solution of M
:param diag_zero: True or False. Whether M is assumed to be diagonal-zero. diag_zeor=True is a necessary assumption when learning non-MIP distribution.
Default value: 1) If solver_algorithm='Greedy', diag_zero must be 'True'; 2) If solver_algorithm='Provable', diag_zero=False
:param truncate_y: True of False. When training, whether thresholding the predicted y into [-1,+1]. Only valid in 'Greedy' algorithm.
"""
self.rank_M = rank_M
self.max_iter = max_iter
self.tol = tol
self.max_init_iter = max_init_iter
self.init_tol = init_tol
self.lambda_w = lambda_w
self.lambda_M = lambda_M
self.using_cache = using_cache
self.solver_algorithm = solver_algorithm
self.als_max_iter_w = als_max_iter_w
self.als_max_iter_M = als_max_iter_M
self.learning_rate = learning_rate
self.diag_zero = diag_zero
self.truncate_y = truncate_y
if self.learning_rate is None:
if solver_algorithm=='Greedy': self.learning_rate=numpy.inf
else: self.learning_rate = 1.0
if self.diag_zero is None:
if solver_algorithm=='Greedy': self.diag_zero=True
else: self.diag_zero = False
if self.truncate_y is None:
if solver_algorithm=='Greedy': self.truncate_y=True
else: self.truncate_y = False
pass
pass # end def
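    # Illustrative configurations (a sketch, not from the original source):
    #   SLM(rank_M=4, solver_algorithm='Greedy')                    # sparse features; implies diag_zero=True
    #   SLM(rank_M=4, solver_algorithm='Provable')                  # dense features; implies diag_zero=False
    #   SLM(rank_M=4, solver_algorithm='Provable', diag_zero=True)  # dense features with diag(M)=0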
    def predict(self, X, X_is_z_score_normalized=False):
        """
        Predict the label y of testing instances X.
        :param X: The feature matrix. Should be an (n,d) numpy.ndarray (or a scipy.sparse matrix when solver_algorithm='Greedy'), i.e., each row represents an instance and each column represents a feature.
        :param X_is_z_score_normalized: whether X has already been z-score normalized
        :return: the predicted y as an (n,) numpy.ndarray of floats
        """
        if self.solver_algorithm == 'Provable' and not self.diag_zero:
            return self.predict_provable_diag_not_zero(X, X_is_z_score_normalized)
        if self.solver_algorithm == 'Provable' and self.diag_zero:
            return self.predict_provable_diag_zero(X, X_is_z_score_normalized)
        if self.solver_algorithm == 'Greedy' and self.diag_zero:
            return self.predict_greedy_diag_zero(X, X_is_z_score_normalized)
        raise ValueError('unsupported combination: solver_algorithm=%r, diag_zero=%r' % (self.solver_algorithm, self.diag_zero))
        pass # end def predict
def predict_provable_diag_not_zero(self, X, X_is_z_score_normalized):
X = X.T
if not X_is_z_score_normalized:
X = (X - self.data_mean_) / self.data_std_
the_decision_values = A_(X, self.U_, self.V_) + X.T.dot(self.w_) + self.b_
return the_decision_values.flatten()
pass # end def predict_provable_diag_not_zero
def predict_provable_diag_zero(self,X,X_is_z_score_normalized):
X = X.T
if not X_is_z_score_normalized:
X = (X - self.data_mean_) / self.data_std_
the_decision_values = A_diag0(self.U_, self.V_, X) + X.T.dot(self.w_) + self.b_
return the_decision_values.flatten()
pass # end def predict_provable_diag_zero
def predict_greedy_diag_zero(self, X, X_is_z_score_normalized):
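        # Centering by data_mean_ is folded into A_diag0_sparse and into the
        # data_mean_'w correction below, so the sparse X never needs to be
        # explicitly centered (which would densify it).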
X = X.T
# X = X[numpy.nonzero(self.fea_sel_bool_)[0], :]
the_decision_values = numpy.asarray(
A_diag0_sparse(self.U_, self.V_, X, self.data_mean_) + X.T.dot(self.w_) + self.b_ - self.data_mean_.T.dot(self.w_))
return the_decision_values.flatten()
pass # end def predict_greedy_diag_zero
    def fit(self, X, y=None, sample_weight=None, n_more_iter=None, X_is_z_score_normalized=False):
        """
        Train/fit the estimator with training data {X,y}.
        :param X: The feature matrix. Should be an (n,d) numpy.ndarray (or a scipy.sparse matrix when solver_algorithm='Greedy'), i.e., each row represents an instance and each column represents a feature.
        :param y: the label vector. Should be an (n,) numpy.ndarray. For classification problems, y should be in the set {-1,+1}
        :param sample_weight: the weight of each training instance. Should be an (n,) numpy.ndarray. Usually used in boosting.
        :param n_more_iter: The number of iterations in this fit() call. Overrides max_iter if not None
        :param X_is_z_score_normalized: whether X is z-score normalized. If not, z-score normalization is performed automatically at the beginning of fit()
        :return: self
        """
        if self.solver_algorithm == 'Provable' and not self.diag_zero:
            return self.fit_provable_diag_not_zero(X, y, sample_weight, n_more_iter, X_is_z_score_normalized)
        if self.solver_algorithm == 'Provable' and self.diag_zero:
            return self.fit_provable_diag_zero(X, y, sample_weight, n_more_iter, X_is_z_score_normalized)
        if self.solver_algorithm == 'Greedy' and self.diag_zero:
            return self.fit_greedy_diag_zero(X, y, sample_weight, n_more_iter)
        raise ValueError('unsupported combination: solver_algorithm=%r, diag_zero=%r' % (self.solver_algorithm, self.diag_zero))
        pass # end def fit
def fit_provable_diag_zero(self, X, y, sample_weight, n_more_iter, X_is_z_score_normalized):
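        # Provable solver for the diag(M)=0 case: z-score-normalize X (weighted by
        # sample_weight), initialize U by orthogonalized power iterations, then
        # alternately update (U, V) and (w, b), re-orthonormalizing U via QR.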
X = X.T
y = y[:, numpy.newaxis]
        y = numpy.asarray(y, dtype=float)
n = X.shape[1]
if sample_weight is None:
sample_weight = numpy.ones((n,))
sample_weight = sample_weight / numpy.sum(sample_weight)
sample_weight = sample_weight[:, numpy.newaxis]
X_new = None
if not hasattr(self, 'is_init_stage_'):
self.is_init_stage_ = True
self.d_ = X.shape[0]
X_times_sample_weight = n * X * sample_weight.T
if not X_is_z_score_normalized:
self.data_mean_ = X_times_sample_weight.mean(axis=1, keepdims=True)
X_new = X - self.data_mean_
else:
self.data_mean_ = numpy.zeros((X.shape[0], 1))
X_new = X
X_weighted_std = numpy.sqrt(n * numpy.mean((X_new ** 2) * sample_weight.T, axis=1, keepdims=True))
if not X_is_z_score_normalized:
self.data_std_ = numpy.maximum(X_weighted_std, 1e-12)
X_new /= self.data_std_
else:
self.data_std_ = numpy.ones((X.shape[0], 1))
Xp2 = X_new ** 2
Xp3 = Xp2 * X_new
if self.using_cache:
self.cached_Xp2_ = Xp2
pass # end if self.using_cache
self.data_moment3_ = numpy.mean(n * (Xp3) * sample_weight.T, axis=1, keepdims=True)
U, _ = numpy.linalg.qr(numpy.random.randn(self.d_, self.rank_M))
self.U_ = U
self.V_ = numpy.zeros(U.shape)
self.w_ = numpy.zeros((self.d_, 1))
self.b_ = 0
# end if
        if not X_is_z_score_normalized and X_new is None:
            X = (X - self.data_mean_) / self.data_std_
        if not X_is_z_score_normalized and X_new is not None:
            X = X_new
pass # end if not hasattr(self, 'is_init_stage_'):
if self.is_init_stage_:
p0 = numpy.sum(y)
p1 = X.dot(y)
cache_1 = self.data_moment3_ * p1 + p0
            # power-iteration style initialization of U
            for t in range(self.max_init_iter):
U_new = (ApA_diag0(y, self.U_, X) - cache_1 * self.U_) / (2 * n)
U_new, _ = numpy.linalg.qr(U_new)
if numpy.sum(numpy.abs(self.U_ - U_new)) < self.init_tol:
self.is_init_stage_ = False
self.U_ = U_new
                    print('early stop in initialization')
break
# end if numpy.sum(numpy.abs(self.U-U_new)) < self.init_tol:
self.U_ = U_new
            # end for t in range(self.max_init_iter)
            print('stop initialization')
self.is_init_stage_ = False
# update V
V_new = (ApA_diag0(y, self.U_, X) - cache_1 * self.U_) / (2 * n) * self.learning_rate
hat_y = A_diag0(self.U_, V_new, X)
self.V_ = V_new
# update w and b
dy = y - hat_y
p0 = numpy.sum(dy)
p1 = X.dot(dy)
w_new = p1 / n * self.learning_rate
b_new = p0 / n * self.learning_rate
hat_y = A_diag0(self.U_, self.V_, X) + X.T.dot(w_new) + b_new
self.w_ = w_new
self.b_ = b_new
pass # end if
pass # end if self.is_init_stage_
if not self.is_init_stage_:
if n_more_iter is None:
the_num_iter = self.max_iter
else:
the_num_iter = n_more_iter
U = self.U_
V = self.V_
w = self.w_
b = self.b_
            for t in range(the_num_iter):
# update U,V
hat_y = A_diag0(U, V, X) + X.T.dot(w) + b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
cache_1 = self.data_moment3_ * p1 + p0
U_new = (ApA_diag0(dy,U,X) - cache_1*U)/(2 * n) * self.learning_rate + \
U.dot(V.T.dot(U)) / 2 + V.dot(U.T.dot(U)) / 2
U_new, _ = numpy.linalg.qr(U_new)
V_new = (ApA_diag0(dy,U_new,X) - cache_1*U_new) / (2 * n) * self.learning_rate + \
U.dot(V.T.dot(U_new)) / 2 + V.dot(U.T.dot(U_new)) / 2
hat_y = A_diag0(U_new, V_new, X) + X.T.dot(w) + b
# update w and b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
w_new = p1/n * self.learning_rate + w
b_new = p0/n*self.learning_rate + b
hat_y = A_diag0(U_new, V_new, X) + X.T.dot(w_new) + b_new
if numpy.sum(numpy.abs(U-U_new))<self.tol and numpy.sum(numpy.abs(V-V_new))<self.tol and numpy.sum(numpy.abs(w-w_new))<self.tol and numpy.abs(b-b_new)<self.tol:
U = U_new
V = V_new
w = w_new
b = b_new
                    print('early stop')
break
# end if ... < tol
                # update old with new variables
U = U_new
V = V_new
w = w_new
b = b_new
# end for t
self.U_ = U
self.V_ = V
self.w_ = w
self.b_ = b
# end if not self.is_init_stage_:
return self
        pass # end def fit_provable_diag_zero
def fit_provable_diag_not_zero(self, X, y, sample_weight, n_more_iter, X_is_z_score_normalized):
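        # Provable solver for general M: besides the normalization done in the
        # diag-zero case, it estimates third and fourth data moments and
        # precomputes per-feature correction terms Z_ and G_ (via pseudo-inverses)
        # used by the \mathcal{M} and \mathcal{W} operators below.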
X = X.T
y = y[:, numpy.newaxis]
        y = numpy.asarray(y, dtype=float)
n = X.shape[1]
if sample_weight is None:
sample_weight = numpy.ones((n,))
sample_weight = sample_weight / numpy.sum(sample_weight)
sample_weight = sample_weight[:, numpy.newaxis]
X_new = None
if not hasattr(self,'is_init_stage_'):
self.is_init_stage_ = True
self.d_ = X.shape[0]
X_times_sample_weight = n * X * sample_weight.T
if not X_is_z_score_normalized:
self.data_mean_ = X_times_sample_weight.mean(axis=1, keepdims=True)
X_new = X - self.data_mean_
else:
self.data_mean_ = numpy.zeros((X.shape[0],1))
X_new = X
X_weighted_std = numpy.sqrt(n * numpy.mean((X_new ** 2) * sample_weight.T, axis=1, keepdims=True))
if not X_is_z_score_normalized:
self.data_std_ = numpy.maximum(X_weighted_std, 1e-12)
X_new /= self.data_std_
else:
self.data_std_ = numpy.ones((X.shape[0],1))
Xp2 = X_new ** 2
Xp3 = Xp2*X_new
Xp4 = Xp3 * X_new
if self.using_cache:
self.cached_Xp2_ = Xp2
# end if self.using_cache
self.data_moment3_ = numpy.mean(n * Xp3 * sample_weight.T, axis=1, keepdims=True)
self.data_moment4_ = numpy.mean(n * Xp4 * sample_weight.T, axis=1, keepdims=True)
tmp_A = numpy.zeros((2, 2, self.d_))
tmp_A[0, 0, :] = 1
tmp_A[0, 1, :] = self.data_moment3_.ravel()
# tmp_A[0, 2, :] = self.data_moment4.ravel()
tmp_A[1, 0, :] = self.data_moment3_.ravel()
tmp_A[1, 1, :] = self.data_moment4_.ravel() - 1
# tmp_A[1, 2, :] = self.data_moment5.ravel() - self.data_moment3.ravel()
tmp_b = numpy.zeros((2, 1, self.d_))
tmp_b[0, 0, :] = self.data_moment3_.ravel()
tmp_b[1, 0, :] = self.data_moment4_.ravel() - 3
tmp_bw = numpy.zeros((2, 1, self.d_))
tmp_bw[0, 0, :] = 1
tmp_bw[1, 0, :] = 0
self.Z_ = numpy.zeros((self.d_, 2))
self.G_ = numpy.zeros((self.d_, 2))
sv_record = numpy.zeros((self.d_,))
            for i in range(self.d_):
tmpu_u, tmp_s, tmp_v = numpy.linalg.svd(tmp_A[:, :, i], full_matrices=False)
sv_record[i] = tmp_s[1]
if tmp_s[1] < 0.01:
                    print('warning! small singular value when computing Z and G! sv = %f' % (tmp_s[1]))
sv_threshold = numpy.max(sv_record)*1e-3
            for i in range(self.d_):
pinv_tmpA = numpy.linalg.pinv(tmp_A[:, :, i], sv_threshold)
self.G_[i, :] = numpy.ravel(pinv_tmpA.dot(tmp_bw[:, :, i]))
self.Z_[i, :] = numpy.ravel(pinv_tmpA.dot(tmp_b[:, :, i]))
U, _ = numpy.linalg.qr(numpy.random.randn(self.d_, self.rank_M))
self.U_ = U
self.V_ = numpy.zeros(U.shape)
self.w_ = numpy.zeros((self.d_,1))
self.b_ = 0
pass # end if
        if not X_is_z_score_normalized and X_new is None:
            X = (X - self.data_mean_) / self.data_std_
        if not X_is_z_score_normalized and X_new is not None:
            X = X_new
if self.using_cache:
Xp2 = self.cached_Xp2_
else:
Xp2 = X**2
p0 = numpy.sum(y)
p1 = X.dot(y)
p2 = Xp2.dot(y)
if self.is_init_stage_:
self.is_init_stage_ = False
            for t in range(self.max_init_iter):
U_new = mathcal_M_(n * y * sample_weight, self.U_, X, self.Z_, p0, p1, p2, ) / (2 * n)
U_new,_ = numpy.linalg.qr(U_new)
if numpy.sum(numpy.abs(self.U_-U_new)) < self.init_tol:
self.U_ = U_new
                    print('early stop in initialization')
break
# end if numpy.sum(numpy.abs(self.U-U_new)) < self.init_tol:
self.U_ = U_new
            # end for t in range(self.max_init_iter)
# update V
V = mathcal_M_(n * y * sample_weight, self.U_, X, self.Z_, p0, p1, p2) / (2 * n) * self.learning_rate
self.V_ = V
# update w and b
w_new = mathcal_W_(self.G_, p0, p1, p2) / n * self.learning_rate
self.w_ = w_new
b = (numpy.mean(y) - numpy.sum(self.U_ * self.V_)) * self.learning_rate
self.b_ = b
pass # end if self.is_init_stage_
if not self.is_init_stage_:
if n_more_iter is None:
the_num_iter = self.max_iter
else:
the_num_iter = n_more_iter
U = self.U_
V = self.V_
w = self.w_
b = self.b_
            for t in range(the_num_iter):
hat_y = A_(X, U, V) + X.T.dot(w) + b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
p2 = Xp2.dot(dy)
# update U
U_new = mathcal_M_(dy, U, X, self.Z_, p0, p1, p2) / (2 * n) * self.learning_rate + \
U.dot(V.T.dot(U)) / 2 + V.dot(U.T.dot(U)) / 2
# V_new = U_new
U_new, _ = numpy.linalg.qr(U_new)
# update V
                V_new = mathcal_M_(dy, U_new, X, self.Z_, p0, p1, p2) / (2 * n) * self.learning_rate + \
                        U.dot(V.T.dot(U_new)) / 2 + V.dot(U.T.dot(U_new)) / 2
# update w and b
hat_y = A_(X, U_new, V_new) + X.T.dot(w) + b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
p2 = Xp2.dot(dy)
w_new = mathcal_W_(self.G_, p0, p1, p2) / n * self.learning_rate + w
b_new = numpy.mean(dy)*self.learning_rate + b
if numpy.sum(numpy.abs(U-U_new))<self.tol and numpy.sum(numpy.abs(V-V_new))<self.tol and numpy.sum(numpy.abs(w-w_new))<self.tol and numpy.abs(b-b_new)<self.tol:
U = U_new
V = V_new
w = w_new
b = b_new
                    print('early stop')
break
# end if ... < tol
                # update old with new variables
U = U_new
V = V_new
w = w_new
b = b_new
# end for t
self.U_ = U
self.V_ = V
self.w_ = w
self.b_ = b
pass # end if not self.is_init_stage_:
return self
        pass # end def fit_provable_diag_not_zero
def fit_greedy_diag_zero(self, X, y, sample_weight=None, n_more_iter=None,):
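        # Greedy solver for sparse X with diag(M)=0: alternates gradient steps on
        # (w, b) (als_max_iter_w inner iterations) and on (U, V) (als_max_iter_M
        # inner iterations); each step size is chosen greedily by an exact line
        # search capped at self.learning_rate.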
X = X.T
y = y[:, numpy.newaxis]
        y = numpy.asarray(y, dtype=float)
n = X.shape[1]
reg_w = float(self.lambda_w) / n
reg_M = float(self.lambda_M) / n
if n_more_iter is None: n_more_iter = self.max_iter
if not hasattr(self, 'has_initialized_'):
self.has_initialized_ = True
self.U_not_initialized_ = True
# tmp_data_mean_ = numpy.asarray(X.mean(axis=1))
# tmp_data_std_ = numpy.sqrt(numpy.asarray(X.power(2).mean(axis=1)) - tmp_data_mean_ ** 2)
# self.fea_sel_bool_ = tmp_data_std_ > 1e-2
# new_X = X[numpy.nonzero(self.fea_sel_bool_)[0], :]
new_X = X
tmp_data_mean_ = numpy.asarray(new_X.mean(axis=1))
self.data_std_ = numpy.sqrt(numpy.asarray(new_X.power(2).mean(axis=1)) - tmp_data_mean_ ** 2)
self.data_mean_ = numpy.asarray(new_X.mean(axis=1))
            self.data_moment3 = numpy.asarray(new_X.power(3).mean(axis=1))
            self.data_moment3 -= 3 * self.data_mean_ * numpy.asarray(new_X.power(2).mean(axis=1))
            self.data_moment3 += 2 * self.data_mean_ ** 3  # central third moment: E[x^3] - 3*mu*E[x^2] + 2*mu^3
self.d_ = new_X.shape[0]
U, _ = numpy.linalg.qr(numpy.random.randn(self.d_, self.rank_M))
self.U_ = U
self.V_ = numpy.zeros(U.shape)
self.w_ = numpy.zeros((self.d_, 1))
self.b_ = 0
pass # end if not hasattr(self, 'has_initialized_')
# X = X[numpy.nonzero(self.fea_sel_bool_)[0], :]
# X_power_2 = X.power(2)
U = self.U_
V = self.V_
w = self.w_
b = self.b_
        hat_y = numpy.asarray(A_diag0_sparse(U, V, X, self.data_mean_) + X.T.dot(w) + b) - self.data_mean_.T.dot(w)
        truncated_y = hat_y.copy()
        if self.truncate_y:
            truncated_y[hat_y > 1] = 1
            truncated_y[hat_y < -1] = -1
        for t in range(n_more_iter):
# als w
            for als_count in range(self.als_max_iter_w):
                dy = truncated_y - y
X_dy = numpy.asarray(X.dot(dy) - self.data_mean_*numpy.sum(dy)) # X.dot(dy)
grad_t = X_dy / n + reg_w *w
XT_grad_t = numpy.asarray(X.T.dot(grad_t)) - self.data_mean_.T.dot(grad_t)
learning_rate_w_ = ( grad_t.T.dot( X_dy )/n + reg_w*numpy.sum(grad_t*w) )/( numpy.linalg.norm(XT_grad_t)**2/n + reg_w*numpy.linalg.norm(grad_t)**2+1e-10)
learning_rate_w_ = numpy.min([self.learning_rate,learning_rate_w_])
# learning_rate_w_ = self.learning_rate
w_new = w - learning_rate_w_*grad_t
hat_y = hat_y-learning_rate_w_*(XT_grad_t )
truncated_y = hat_y.copy()
if self.truncate_y:
truncated_y[hat_y > 1] = 1
truncated_y[hat_y < -1] = -1
w = w_new
# update b
delta_b = float(numpy.sum(y - truncated_y))/n
b_new = b + delta_b
hat_y = hat_y + delta_b
truncated_y = hat_y.copy()
if self.truncate_y:
truncated_y[hat_y > 1] = 1
truncated_y[hat_y < -1] = -1
b = b_new
pass # end for als_count in xrange(self.als_max_iter_w)
# update U,V
            for als_count in range(self.als_max_iter_M):
                dy = truncated_y - y
p0 = numpy.sum(dy)
p1 = X.dot(dy) - self.data_mean_ * numpy.sum(dy)
cache_1 = self.data_moment3 * p1 + p0
# cache_1 = p0
if self.U_not_initialized_:
                    for init_U_ite in range(self.max_init_iter):
ApA_dy_Ut = numpy.asarray(ApA_diag0_sparse(dy, U, X, self.data_mean_)) - cache_1 * U
U_new = ApA_dy_Ut
U,_ = numpy.linalg.qr(U_new)
pass # end for init_U_ite
self.U_not_initialized_ = False
pass # end if self.U_not_initialized_
ApA_dy_Ut = numpy.asarray(ApA_diag0_sparse(dy, U, X, self.data_mean_)) - cache_1*U
grad_UV = (ApA_dy_Ut/(2*n) + reg_M*V) # grad
# grad_diag = (numpy.asarray(X_power_2.dot(dy)))/(2*n)*U
# grad_diag -= 2*self.data_mean_*(numpy.asarray(X.dot(dy)))/(2*n)*U
# grad_diag += self.data_mean_**2 * numpy.sum(dy) /(2*n) * U
# grad_t = grad_UV - grad_diag
grad_t = grad_UV
A_Ut_grad_t = numpy.asarray(A_diag0_sparse(U,grad_t,X,self.data_mean_))
learning_rate_M_ = (dy.T.dot(A_Ut_grad_t)/n + reg_M*numpy.sum(grad_t*V))/( numpy.linalg.norm(A_Ut_grad_t)**2/n + reg_M*numpy.linalg.norm(grad_t)**2+1e-8)
learning_rate_M_ = numpy.min([self.learning_rate, learning_rate_M_])
# learning_rate_M_ = self.learning_rate
V_new = V - learning_rate_M_*grad_t
hat_y = hat_y - learning_rate_M_*A_Ut_grad_t
truncated_y = hat_y.copy()
if self.truncate_y:
truncated_y[hat_y > 1] = 1
truncated_y[hat_y < -1] = -1
Q, R = numpy.linalg.qr(V_new)
V_new = U.dot(R.T)
U_new = Q
U = U_new
V = V_new
pass # end for als_count in xrange(self.als_max_iter_M):
pass # end for t
self.U_ = U
self.V_ = V
self.w_ = w
self.b_ = b
return self
pass # end def fit_greedy_diag_zero
pass # end class SLM
# ----- Auxiliary functions for fit_provable_diag_not_zero ----- #
def mathcal_W_(G, p0, p1, p2):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return $\mathcal{W}(y) \cdot n$ given the constant parameters. X should be zero-mean and unit-variance.
    """
# p0 = numpy.sum(y)
# p1 = X.dot(y)
# p2 = (X**2).dot(y)
# p3 = (X**3).dot(y)
# return G[:,0,numpy.newaxis]*p1 + G[:,1,numpy.newaxis]*(p2-p0) + G[:,2,numpy.newaxis]*( p3 - data_moment3*p0 )
return G[:, 0, numpy.newaxis] * p1 + G[:, 1, numpy.newaxis] * (p2 - p0)
# end def
def mathcal_M_(y, U, X, Z, p0, p1, p2):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return $\mathcal{M}(y)U \cdot 2n$, given the constant parameters. X should be zero-mean and unit-variance.
    """
# p0 = numpy.sum(y)
# p1 = X.dot(y)
# p2 = (X**2).dot(y)
# p3 = (X**3).dot(y)
term1 = (X*y.T).dot(X.T.dot(U))
# term2 = p0+ Z[:,0,numpy.newaxis]*p1 + Z[:,1,numpy.newaxis]*(p2-p0) + Z[:,2,numpy.newaxis]*( p3 - data_moment3*p0)
term2 = p0+ Z[:, 0, numpy.newaxis] * p1 + Z[:, 1, numpy.newaxis] * (p2 - p0)
return term1 - term2*U
# end def
def A_(X, U, V):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    The sensing operator A in gFM. X is the data matrix; UV'=M as in gFM. X should be zero-mean and unit-variance.
    \mathcal{A}(M) = \{ x_i'(M + M')x_i/2 \}_{i=1}^n where M=UV'
    :param X: a $d \times n$ feature matrix
    :param U: $d \times k$ matrix
    :param V: $d \times k$ matrix
    :return: z = A(UV'), an $n \times 1$ vector
    """
z = numpy.sum( X.T.dot(U) * X.T.dot(V), axis=1, keepdims=True)
return z
# end def
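# A minimal sanity-check sketch for A_ (an illustration, not part of the
# original library). With M = U V', A_(X, U, V)[i] equals x_i' M x_i:
#   X = numpy.random.randn(5, 3); U = numpy.random.randn(5, 2); V = numpy.random.randn(5, 2)
#   M = U.dot(V.T)
#   assert numpy.allclose(A_(X, U, V).ravel(), numpy.einsum('an,ab,bn->n', X, M, X))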
def ApW_(X, p, W):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Compute z = A'(p)W. X should be zero-mean and unit-variance.
    :param X: feature matrix, $d \times n$
    :param p: $n \times 1$ vector
    :param W: $d \times k$ matrix
    :return: $d \times k$ matrix
    """
z = X.dot(p*X.T.dot(W))
return z
# end def
def ApA_diag0(y, U, X):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A'A(y)*U = (\sum_i x_i y_i x_i') * U
    :param y: $n \times 1$ vector
    :param U: $d \times k$ matrix
    :param X: $d \times n$ feature matrix
    :return: $d \times k$ matrix
    """
return (X*y.T).dot(X.T.dot(U))
pass # end def
def A_diag0(U, V, X):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A(UV' - diag(UV')) = (X'U * X'V) 1 - (X*(((U*V)1)*X))1
    :param U: $d \times k$ matrix
    :param V: $d \times k$ matrix
    :param X: $d \times n$ feature matrix
    :return: $n \times 1$ vector
    """
z = numpy.sum(X.T.dot(U) * X.T.dot(V), axis=1, keepdims=True)
z_diag = numpy.sum(X*(numpy.sum(U*V,axis=1,keepdims=True)*X), axis=0, keepdims=True).T
return z-z_diag
pass # end def
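# A minimal sanity-check sketch for A_diag0 (an illustration, not part of the
# original library). With M = U V' and M0 = M - diag(M), A_diag0(U, V, X)[i]
# equals x_i' M0 x_i:
#   M = U.dot(V.T); M0 = M - numpy.diag(numpy.diag(M))
#   assert numpy.allclose(A_diag0(U, V, X).ravel(), numpy.einsum('an,ab,bn->n', X, M0, X))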
def ApA_diag0_sparse(y, U, X, data_mean):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A'A(y)*U = (\sum_i x_i y_i x_i') * U, where the sparse X is centered implicitly by data_mean.
    :param y: $n \times 1$ vector
    :param U: $d \times k$ matrix
    :param X: $d \times n$ sparse feature matrix
    :param data_mean: $d \times 1$ feature mean used to center X implicitly
    :return: $d \times k$ matrix
    """
XTdotU = numpy.asarray(X.T.dot(U))
data_meanTdotU = data_mean.T.dot(U)
r = X.dot(y*XTdotU) - data_mean.dot(y.T.dot(XTdotU))
r -= X.dot(y).dot(data_meanTdotU)
r += numpy.sum(y) * data_mean.dot(data_meanTdotU)
return numpy.asarray(r)
pass # end def
def A_diag0_sparse(U, V, X, data_mean):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A(UV' - diag(UV')) = (X'U * X'V) 1 - (X*(((U*V)1)*X))1, where the sparse X is centered implicitly by data_mean.
    :param U: $d \times k$ matrix
    :param V: $d \times k$ matrix
    :param X: $d \times n$ sparse feature matrix
    :param data_mean: $d \times 1$ feature mean used to center X implicitly
    :return: $n \times 1$ vector
    """
n = X.shape[1]
XTdotU = numpy.asarray(X.T.dot(U))
XTdotV = numpy.asarray(X.T.dot(V))
data_meanTdotU = numpy.asarray(data_mean.T.dot(U))
data_meanTdotV = numpy.asarray(data_mean.T.dot(V))
sum_UV = numpy.sum(U*V,axis=1,keepdims=True)
z = numpy.sum(XTdotU *XTdotV, axis=1, keepdims=True)
z -= numpy.sum( data_meanTdotU*XTdotV, axis=1, keepdims=True)
z -= numpy.sum( XTdotU * data_meanTdotV, axis=1, keepdims=True)
z += numpy.sum(data_meanTdotU *data_meanTdotV)
z_diag = numpy.asarray(X.T.power(2).dot(sum_UV))
z_diag -= 2*numpy.asarray(X.T.dot(data_mean * sum_UV))
z_diag += numpy.asarray(numpy.sum( (data_mean**2)*(sum_UV), axis=0, keepdims=True).T)
return numpy.asarray(z-z_diag)
pass # end def
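# --------------------------------------------------------------------------- #
# Minimal usage sketch (an illustration, not part of the original library; the
# sizes, seed and data below are made up). It draws synthetic data from the
# SLM model y = x'*w + x'*M*x with a rank-k M and fits the 'Provable' solver,
# which expects dense features. The provable solver assumes (roughly)
# z-score-normalized, Gaussian-like features, which this synthetic data
# satisfies.
if __name__ == '__main__':
    numpy.random.seed(0)
    n, d, k = 2000, 20, 2
    X_demo = numpy.random.randn(n, d)
    w_true = numpy.random.randn(d, 1)
    U_true = numpy.random.randn(d, k) / numpy.sqrt(d)
    M_true = U_true.dot(U_true.T)
    # quadratic term x_i'*M*x_i, computed row-wise
    quad = numpy.sum(X_demo.dot(M_true) * X_demo, axis=1, keepdims=True)
    y_demo = (X_demo.dot(w_true) + quad).ravel()
    model = SLM(rank_M=k, max_iter=100, solver_algorithm='Provable')
    model.fit(X_demo, y_demo)
    y_hat = model.predict(X_demo)
    print('training RMSE: %.4f' % numpy.sqrt(numpy.mean((y_hat - y_demo) ** 2)))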