"""
libSLM implements the Second order Linear Model and its variants. In SLM, the label y is assumed to be generated by model
y = x'*w + x'*M*x
where x is the feature, x' is the transpose of x. w is the first order coefficient, M is the second order coefficient.
We minimize the following loss function:
|| x'*w + x'*M*x - y ||^2 + lambda_w*||w |^2 + lambda_M*||M||^2
We assume that M is rank k.
"""
import sklearn.decomposition
from sklearn.base import BaseEstimator, RegressorMixin
import scipy.sparse
import numpy
class SLM(BaseEstimator, RegressorMixin):
"""
The second order linear model estimator.
"""
    def __init__(self, rank_M=4, max_iter=50, tol=1e-6, max_init_iter=20, init_tol=1e-2, lambda_w=0, lambda_M=0,
                 using_cache=True, solver_algorithm='Greedy', als_max_iter_w=10, als_max_iter_M=1,
                 learning_rate=None, diag_zero=None, truncate_y=None):
"""
Initialize the SLM estimator.
:param rank_M: The rank of the second order coefficient matrix M
:param max_iter: The maximal number of iterations in training..
:param tol: The error tolerance in the stopping criteria
:param max_init_iter: The maximal number of iterations in the initialization stage
:param init_tol: The error tolerance in the initialization stage
:param lambda_w: The regularizer of w
:param lambda_M: The regularizer of M
:param using_cache: True if using cached variables between several fit() calls to save some computation.
:param solver_algorithm: 'Greedy' or 'Provable'. 'Greedy' use greedy algorithm, usually in real-world datasets when features are sparse. In 'Greedy' algorithm, only diag_zero=True is implemented.
'Provable' use the provable algorithm but may be much slower than 'Greedy' in real-world datasets. 'Provable' algorithm only accepts dense features.
:param learning_rate: the learning rate. Default value: 1) If solver_algorithm='Greedy', learning_rate is greedily
selected. 2) solver_algorithm='Provable', learning_rate=1.0
:param als_max_iter_w: When using 'Greedy' algorithm, the number of iterations to compute the least square solution of w
:param als_max_iter_M: When using 'Greedy' algorithm, the number of iterations to compute the least square solution of M
:param diag_zero: True or False. Whether M is assumed to be diagonal-zero. diag_zeor=True is a necessary assumption when learning non-MIP distribution.
Default value: 1) If solver_algorithm='Greedy', diag_zero must be 'True'; 2) If solver_algorithm='Provable', diag_zero=False
:param truncate_y: True of False. When training, whether thresholding the predicted y into [-1,+1]. Only valid in 'Greedy' algorithm.
"""
self.rank_M = rank_M
self.max_iter = max_iter
self.tol = tol
self.max_init_iter = max_init_iter
self.init_tol = init_tol
self.lambda_w = lambda_w
self.lambda_M = lambda_M
self.using_cache = using_cache
self.solver_algorithm = solver_algorithm
self.als_max_iter_w = als_max_iter_w
self.als_max_iter_M = als_max_iter_M
self.learning_rate = learning_rate
self.diag_zero = diag_zero
self.truncate_y = truncate_y
if self.learning_rate is None:
if solver_algorithm=='Greedy': self.learning_rate=numpy.inf
else: self.learning_rate = 1.0
if self.diag_zero is None:
if solver_algorithm=='Greedy': self.diag_zero=True
else: self.diag_zero = False
if self.truncate_y is None:
if solver_algorithm=='Greedy': self.truncate_y=True
else: self.truncate_y = False
pass
pass # end def
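    # Illustrative configurations (a sketch, not from the original source):
    #   SLM(rank_M=4, solver_algorithm='Greedy')                    # sparse features; implies diag_zero=True
    #   SLM(rank_M=4, solver_algorithm='Provable')                  # dense features; implies diag_zero=False
    #   SLM(rank_M=4, solver_algorithm='Provable', diag_zero=True)  # dense features with diag(M)=0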
    def predict(self, X, X_is_z_score_normalized=False):
        """
        Predict the label y of testing instances X.
        :param X: The feature matrix. Should be an (n,d) numpy.ndarray (or a scipy.sparse matrix when solver_algorithm='Greedy'), i.e., each row represents an instance and each column represents a feature.
        :param X_is_z_score_normalized: whether X has already been z-score normalized
        :return: the predicted y as an (n,) numpy.ndarray of floats
        """
        if self.solver_algorithm == 'Provable' and not self.diag_zero:
            return self.predict_provable_diag_not_zero(X, X_is_z_score_normalized)
        if self.solver_algorithm == 'Provable' and self.diag_zero:
            return self.predict_provable_diag_zero(X, X_is_z_score_normalized)
        if self.solver_algorithm == 'Greedy' and self.diag_zero:
            return self.predict_greedy_diag_zero(X, X_is_z_score_normalized)
        raise ValueError('unsupported combination: solver_algorithm=%r, diag_zero=%r' % (self.solver_algorithm, self.diag_zero))
        pass # end def predict
def predict_provable_diag_not_zero(self, X, X_is_z_score_normalized):
X = X.T
if not X_is_z_score_normalized:
X = (X - self.data_mean_) / self.data_std_
the_decision_values = A_(X, self.U_, self.V_) + X.T.dot(self.w_) + self.b_
return the_decision_values.flatten()
pass # end def predict_provable_diag_not_zero
def predict_provable_diag_zero(self,X,X_is_z_score_normalized):
X = X.T
if not X_is_z_score_normalized:
X = (X - self.data_mean_) / self.data_std_
the_decision_values = A_diag0(self.U_, self.V_, X) + X.T.dot(self.w_) + self.b_
return the_decision_values.flatten()
pass # end def predict_provable_diag_zero
def predict_greedy_diag_zero(self, X, X_is_z_score_normalized):
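        # Centering by data_mean_ is folded into A_diag0_sparse and into the
        # data_mean_'w correction below, so the sparse X never needs to be
        # explicitly centered (which would densify it).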
X = X.T
# X = X[numpy.nonzero(self.fea_sel_bool_)[0], :]
the_decision_values = numpy.asarray(
A_diag0_sparse(self.U_, self.V_, X, self.data_mean_) + X.T.dot(self.w_) + self.b_ - self.data_mean_.T.dot(self.w_))
return the_decision_values.flatten()
pass # end def predict_greedy_diag_zero
    def fit(self, X, y=None, sample_weight=None, n_more_iter=None, X_is_z_score_normalized=False):
        """
        Train/fit the estimator with training data {X,y}.
        :param X: The feature matrix. Should be an (n,d) numpy.ndarray (or a scipy.sparse matrix when solver_algorithm='Greedy'), i.e., each row represents an instance and each column represents a feature.
        :param y: the label vector. Should be an (n,) numpy.ndarray. For classification problems, y should be in the set {-1,+1}
        :param sample_weight: the weight of each training instance. Should be an (n,) numpy.ndarray. Usually used in boosting.
        :param n_more_iter: The number of iterations in this fit() call. Overrides max_iter if not None
        :param X_is_z_score_normalized: whether X is z-score normalized. If not, z-score normalization is performed automatically at the beginning of fit()
        :return: self
        """
        if self.solver_algorithm == 'Provable' and not self.diag_zero:
            return self.fit_provable_diag_not_zero(X, y, sample_weight, n_more_iter, X_is_z_score_normalized)
        if self.solver_algorithm == 'Provable' and self.diag_zero:
            return self.fit_provable_diag_zero(X, y, sample_weight, n_more_iter, X_is_z_score_normalized)
        if self.solver_algorithm == 'Greedy' and self.diag_zero:
            return self.fit_greedy_diag_zero(X, y, sample_weight, n_more_iter)
        raise ValueError('unsupported combination: solver_algorithm=%r, diag_zero=%r' % (self.solver_algorithm, self.diag_zero))
        pass # end def fit
def fit_provable_diag_zero(self, X, y, sample_weight, n_more_iter, X_is_z_score_normalized):
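        # Provable solver for the diag(M)=0 case: z-score-normalize X (weighted by
        # sample_weight), initialize U by orthogonalized power iterations, then
        # alternately update (U, V) and (w, b), re-orthonormalizing U via QR.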
X = X.T
y = y[:, numpy.newaxis]
        y = numpy.asarray(y, dtype=float)
n = X.shape[1]
if sample_weight is None:
sample_weight = numpy.ones((n,))
sample_weight = sample_weight / numpy.sum(sample_weight)
sample_weight = sample_weight[:, numpy.newaxis]
X_new = None
if not hasattr(self, 'is_init_stage_'):
self.is_init_stage_ = True
self.d_ = X.shape[0]
X_times_sample_weight = n * X * sample_weight.T
if not X_is_z_score_normalized:
self.data_mean_ = X_times_sample_weight.mean(axis=1, keepdims=True)
X_new = X - self.data_mean_
else:
self.data_mean_ = numpy.zeros((X.shape[0], 1))
X_new = X
X_weighted_std = numpy.sqrt(n * numpy.mean((X_new ** 2) * sample_weight.T, axis=1, keepdims=True))
if not X_is_z_score_normalized:
self.data_std_ = numpy.maximum(X_weighted_std, 1e-12)
X_new /= self.data_std_
else:
self.data_std_ = numpy.ones((X.shape[0], 1))
Xp2 = X_new ** 2
Xp3 = Xp2 * X_new
if self.using_cache:
self.cached_Xp2_ = Xp2
pass # end if self.using_cache
self.data_moment3_ = numpy.mean(n * (Xp3) * sample_weight.T, axis=1, keepdims=True)
U, _ = numpy.linalg.qr(numpy.random.randn(self.d_, self.rank_M))
self.U_ = U
self.V_ = numpy.zeros(U.shape)
self.w_ = numpy.zeros((self.d_, 1))
self.b_ = 0
# end if
        if not X_is_z_score_normalized and X_new is None:
            X = (X - self.data_mean_) / self.data_std_
        if not X_is_z_score_normalized and X_new is not None:
            X = X_new
pass # end if not hasattr(self, 'is_init_stage_'):
if self.is_init_stage_:
p0 = numpy.sum(y)
p1 = X.dot(y)
cache_1 = self.data_moment3_ * p1 + p0
            # power-iteration style initialization of U
            for t in range(self.max_init_iter):
U_new = (ApA_diag0(y, self.U_, X) - cache_1 * self.U_) / (2 * n)
U_new, _ = numpy.linalg.qr(U_new)
if numpy.sum(numpy.abs(self.U_ - U_new)) < self.init_tol:
self.is_init_stage_ = False
self.U_ = U_new
                    print('early stop in initialization')
break
# end if numpy.sum(numpy.abs(self.U-U_new)) < self.init_tol:
self.U_ = U_new
            # end for t in range(self.max_init_iter)
            print('stop initialization')
self.is_init_stage_ = False
# update V
V_new = (ApA_diag0(y, self.U_, X) - cache_1 * self.U_) / (2 * n) * self.learning_rate
hat_y = A_diag0(self.U_, V_new, X)
self.V_ = V_new
# update w and b
dy = y - hat_y
p0 = numpy.sum(dy)
p1 = X.dot(dy)
w_new = p1 / n * self.learning_rate
b_new = p0 / n * self.learning_rate
hat_y = A_diag0(self.U_, self.V_, X) + X.T.dot(w_new) + b_new
self.w_ = w_new
self.b_ = b_new
pass # end if
pass # end if self.is_init_stage_
if not self.is_init_stage_:
if n_more_iter is None:
the_num_iter = self.max_iter
else:
the_num_iter = n_more_iter
U = self.U_
V = self.V_
w = self.w_
b = self.b_
            for t in range(the_num_iter):
# update U,V
hat_y = A_diag0(U, V, X) + X.T.dot(w) + b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
cache_1 = self.data_moment3_ * p1 + p0
U_new = (ApA_diag0(dy,U,X) - cache_1*U)/(2 * n) * self.learning_rate + \
U.dot(V.T.dot(U)) / 2 + V.dot(U.T.dot(U)) / 2
U_new, _ = numpy.linalg.qr(U_new)
V_new = (ApA_diag0(dy,U_new,X) - cache_1*U_new) / (2 * n) * self.learning_rate + \
U.dot(V.T.dot(U_new)) / 2 + V.dot(U.T.dot(U_new)) / 2
hat_y = A_diag0(U_new, V_new, X) + X.T.dot(w) + b
# update w and b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
w_new = p1/n * self.learning_rate + w
b_new = p0/n*self.learning_rate + b
hat_y = A_diag0(U_new, V_new, X) + X.T.dot(w_new) + b_new
if numpy.sum(numpy.abs(U-U_new))<self.tol and numpy.sum(numpy.abs(V-V_new))<self.tol and numpy.sum(numpy.abs(w-w_new))<self.tol and numpy.abs(b-b_new)<self.tol:
U = U_new
V = V_new
w = w_new
b = b_new
                    print('early stop')
break
# end if ... < tol
                # update old with new variables
U = U_new
V = V_new
w = w_new
b = b_new
# end for t
self.U_ = U
self.V_ = V
self.w_ = w
self.b_ = b
# end if not self.is_init_stage_:
return self
        pass # end def fit_provable_diag_zero
def fit_provable_diag_not_zero(self, X, y, sample_weight, n_more_iter, X_is_z_score_normalized):
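        # Provable solver for general M: besides the normalization done in the
        # diag-zero case, it estimates third and fourth data moments and
        # precomputes per-feature correction terms Z_ and G_ (via pseudo-inverses)
        # used by the \mathcal{M} and \mathcal{W} operators below.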
X = X.T
y = y[:, numpy.newaxis]
        y = numpy.asarray(y, dtype=float)
n = X.shape[1]
if sample_weight is None:
sample_weight = numpy.ones((n,))
sample_weight = sample_weight / numpy.sum(sample_weight)
sample_weight = sample_weight[:, numpy.newaxis]
X_new = None
if not hasattr(self,'is_init_stage_'):
self.is_init_stage_ = True
self.d_ = X.shape[0]
X_times_sample_weight = n * X * sample_weight.T
if not X_is_z_score_normalized:
self.data_mean_ = X_times_sample_weight.mean(axis=1, keepdims=True)
X_new = X - self.data_mean_
else:
self.data_mean_ = numpy.zeros((X.shape[0],1))
X_new = X
X_weighted_std = numpy.sqrt(n * numpy.mean((X_new ** 2) * sample_weight.T, axis=1, keepdims=True))
if not X_is_z_score_normalized:
self.data_std_ = numpy.maximum(X_weighted_std, 1e-12)
X_new /= self.data_std_
else:
self.data_std_ = numpy.ones((X.shape[0],1))
Xp2 = X_new ** 2
Xp3 = Xp2*X_new
Xp4 = Xp3 * X_new
if self.using_cache:
self.cached_Xp2_ = Xp2
# end if self.using_cache
self.data_moment3_ = numpy.mean(n * Xp3 * sample_weight.T, axis=1, keepdims=True)
self.data_moment4_ = numpy.mean(n * Xp4 * sample_weight.T, axis=1, keepdims=True)
tmp_A = numpy.zeros((2, 2, self.d_))
tmp_A[0, 0, :] = 1
tmp_A[0, 1, :] = self.data_moment3_.ravel()
# tmp_A[0, 2, :] = self.data_moment4.ravel()
tmp_A[1, 0, :] = self.data_moment3_.ravel()
tmp_A[1, 1, :] = self.data_moment4_.ravel() - 1
# tmp_A[1, 2, :] = self.data_moment5.ravel() - self.data_moment3.ravel()
tmp_b = numpy.zeros((2, 1, self.d_))
tmp_b[0, 0, :] = self.data_moment3_.ravel()
tmp_b[1, 0, :] = self.data_moment4_.ravel() - 3
tmp_bw = numpy.zeros((2, 1, self.d_))
tmp_bw[0, 0, :] = 1
tmp_bw[1, 0, :] = 0
self.Z_ = numpy.zeros((self.d_, 2))
self.G_ = numpy.zeros((self.d_, 2))
sv_record = numpy.zeros((self.d_,))
            for i in range(self.d_):
tmpu_u, tmp_s, tmp_v = numpy.linalg.svd(tmp_A[:, :, i], full_matrices=False)
sv_record[i] = tmp_s[1]
if tmp_s[1] < 0.01:
                    print('warning! small singular value when computing Z and G! sv = %f' % (tmp_s[1]))
sv_threshold = numpy.max(sv_record)*1e-3
            for i in range(self.d_):
pinv_tmpA = numpy.linalg.pinv(tmp_A[:, :, i], sv_threshold)
self.G_[i, :] = numpy.ravel(pinv_tmpA.dot(tmp_bw[:, :, i]))
self.Z_[i, :] = numpy.ravel(pinv_tmpA.dot(tmp_b[:, :, i]))
U, _ = numpy.linalg.qr(numpy.random.randn(self.d_, self.rank_M))
self.U_ = U
self.V_ = numpy.zeros(U.shape)
self.w_ = numpy.zeros((self.d_,1))
self.b_ = 0
pass # end if
        if not X_is_z_score_normalized and X_new is None:
            X = (X - self.data_mean_) / self.data_std_
        if not X_is_z_score_normalized and X_new is not None:
            X = X_new
if self.using_cache:
Xp2 = self.cached_Xp2_
else:
Xp2 = X**2
p0 = numpy.sum(y)
p1 = X.dot(y)
p2 = Xp2.dot(y)
if self.is_init_stage_:
self.is_init_stage_ = False
            for t in range(self.max_init_iter):
U_new = mathcal_M_(n * y * sample_weight, self.U_, X, self.Z_, p0, p1, p2, ) / (2 * n)
U_new,_ = numpy.linalg.qr(U_new)
if numpy.sum(numpy.abs(self.U_-U_new)) < self.init_tol:
self.U_ = U_new
                    print('early stop in initialization')
break
# end if numpy.sum(numpy.abs(self.U-U_new)) < self.init_tol:
self.U_ = U_new
            # end for t in range(self.max_init_iter)
# update V
V = mathcal_M_(n * y * sample_weight, self.U_, X, self.Z_, p0, p1, p2) / (2 * n) * self.learning_rate
self.V_ = V
# update w and b
w_new = mathcal_W_(self.G_, p0, p1, p2) / n * self.learning_rate
self.w_ = w_new
b = (numpy.mean(y) - numpy.sum(self.U_ * self.V_)) * self.learning_rate
self.b_ = b
pass # end if self.is_init_stage_
if not self.is_init_stage_:
if n_more_iter is None:
the_num_iter = self.max_iter
else:
the_num_iter = n_more_iter
U = self.U_
V = self.V_
w = self.w_
b = self.b_
            for t in range(the_num_iter):
hat_y = A_(X, U, V) + X.T.dot(w) + b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
p2 = Xp2.dot(dy)
# update U
U_new = mathcal_M_(dy, U, X, self.Z_, p0, p1, p2) / (2 * n) * self.learning_rate + \
U.dot(V.T.dot(U)) / 2 + V.dot(U.T.dot(U)) / 2
# V_new = U_new
U_new, _ = numpy.linalg.qr(U_new)
# update V
                V_new = mathcal_M_(dy, U_new, X, self.Z_, p0, p1, p2) / (2 * n) * self.learning_rate + \
                        U.dot(V.T.dot(U_new)) / 2 + V.dot(U.T.dot(U_new)) / 2
# update w and b
hat_y = A_(X, U_new, V_new) + X.T.dot(w) + b
dy = y - hat_y
dy = n * dy * sample_weight
p0 = numpy.sum(dy)
p1 = X.dot(dy)
p2 = Xp2.dot(dy)
w_new = mathcal_W_(self.G_, p0, p1, p2) / n * self.learning_rate + w
b_new = numpy.mean(dy)*self.learning_rate + b
if numpy.sum(numpy.abs(U-U_new))<self.tol and numpy.sum(numpy.abs(V-V_new))<self.tol and numpy.sum(numpy.abs(w-w_new))<self.tol and numpy.abs(b-b_new)<self.tol:
U = U_new
V = V_new
w = w_new
b = b_new
                    print('early stop')
break
# end if ... < tol
                # update old with new variables
U = U_new
V = V_new
w = w_new
b = b_new
# end for t
self.U_ = U
self.V_ = V
self.w_ = w
self.b_ = b
pass # end if not self.is_init_stage_:
return self
        pass # end def fit_provable_diag_not_zero
def fit_greedy_diag_zero(self, X, y, sample_weight=None, n_more_iter=None,):
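        # Greedy solver for sparse X with diag(M)=0: alternates gradient steps on
        # (w, b) (als_max_iter_w inner iterations) and on (U, V) (als_max_iter_M
        # inner iterations); each step size is chosen greedily by an exact line
        # search capped at self.learning_rate.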
X = X.T
y = y[:, numpy.newaxis]
        y = numpy.asarray(y, dtype=float)
n = X.shape[1]
reg_w = float(self.lambda_w) / n
reg_M = float(self.lambda_M) / n
if n_more_iter is None: n_more_iter = self.max_iter
if not hasattr(self, 'has_initialized_'):
self.has_initialized_ = True
self.U_not_initialized_ = True
# tmp_data_mean_ = numpy.asarray(X.mean(axis=1))
# tmp_data_std_ = numpy.sqrt(numpy.asarray(X.power(2).mean(axis=1)) - tmp_data_mean_ ** 2)
# self.fea_sel_bool_ = tmp_data_std_ > 1e-2
# new_X = X[numpy.nonzero(self.fea_sel_bool_)[0], :]
new_X = X
tmp_data_mean_ = numpy.asarray(new_X.mean(axis=1))
self.data_std_ = numpy.sqrt(numpy.asarray(new_X.power(2).mean(axis=1)) - tmp_data_mean_ ** 2)
self.data_mean_ = numpy.asarray(new_X.mean(axis=1))
            self.data_moment3 = numpy.asarray(new_X.power(3).mean(axis=1))
            self.data_moment3 -= 3 * self.data_mean_ * numpy.asarray(new_X.power(2).mean(axis=1))
            self.data_moment3 += 2 * self.data_mean_ ** 3  # central third moment: E[x^3] - 3*mu*E[x^2] + 2*mu^3
self.d_ = new_X.shape[0]
U, _ = numpy.linalg.qr(numpy.random.randn(self.d_, self.rank_M))
self.U_ = U
self.V_ = numpy.zeros(U.shape)
self.w_ = numpy.zeros((self.d_, 1))
self.b_ = 0
pass # end if not hasattr(self, 'has_initialized_')
# X = X[numpy.nonzero(self.fea_sel_bool_)[0], :]
# X_power_2 = X.power(2)
U = self.U_
V = self.V_
w = self.w_
b = self.b_
        hat_y = numpy.asarray(A_diag0_sparse(U, V, X, self.data_mean_) + X.T.dot(w) + b) - self.data_mean_.T.dot(w)
        truncated_y = hat_y.copy()
        if self.truncate_y:
            truncated_y[hat_y > 1] = 1
            truncated_y[hat_y < -1] = -1
        for t in range(n_more_iter):
# als w
            for als_count in range(self.als_max_iter_w):
                dy = truncated_y - y
X_dy = numpy.asarray(X.dot(dy) - self.data_mean_*numpy.sum(dy)) # X.dot(dy)
grad_t = X_dy / n + reg_w *w
XT_grad_t = numpy.asarray(X.T.dot(grad_t)) - self.data_mean_.T.dot(grad_t)
learning_rate_w_ = ( grad_t.T.dot( X_dy )/n + reg_w*numpy.sum(grad_t*w) )/( numpy.linalg.norm(XT_grad_t)**2/n + reg_w*numpy.linalg.norm(grad_t)**2+1e-10)
learning_rate_w_ = numpy.min([self.learning_rate,learning_rate_w_])
# learning_rate_w_ = self.learning_rate
w_new = w - learning_rate_w_*grad_t
hat_y = hat_y-learning_rate_w_*(XT_grad_t )
truncated_y = hat_y.copy()
if self.truncate_y:
truncated_y[hat_y > 1] = 1
truncated_y[hat_y < -1] = -1
w = w_new
# update b
delta_b = float(numpy.sum(y - truncated_y))/n
b_new = b + delta_b
hat_y = hat_y + delta_b
truncated_y = hat_y.copy()
if self.truncate_y:
truncated_y[hat_y > 1] = 1
truncated_y[hat_y < -1] = -1
b = b_new
pass # end for als_count in xrange(self.als_max_iter_w)
# update U,V
            for als_count in range(self.als_max_iter_M):
                dy = truncated_y - y
p0 = numpy.sum(dy)
p1 = X.dot(dy) - self.data_mean_ * numpy.sum(dy)
cache_1 = self.data_moment3 * p1 + p0
# cache_1 = p0
if self.U_not_initialized_:
                    for init_U_ite in range(self.max_init_iter):
ApA_dy_Ut = numpy.asarray(ApA_diag0_sparse(dy, U, X, self.data_mean_)) - cache_1 * U
U_new = ApA_dy_Ut
U,_ = numpy.linalg.qr(U_new)
pass # end for init_U_ite
self.U_not_initialized_ = False
pass # end if self.U_not_initialized_
ApA_dy_Ut = numpy.asarray(ApA_diag0_sparse(dy, U, X, self.data_mean_)) - cache_1*U
grad_UV = (ApA_dy_Ut/(2*n) + reg_M*V) # grad
# grad_diag = (numpy.asarray(X_power_2.dot(dy)))/(2*n)*U
# grad_diag -= 2*self.data_mean_*(numpy.asarray(X.dot(dy)))/(2*n)*U
# grad_diag += self.data_mean_**2 * numpy.sum(dy) /(2*n) * U
# grad_t = grad_UV - grad_diag
grad_t = grad_UV
A_Ut_grad_t = numpy.asarray(A_diag0_sparse(U,grad_t,X,self.data_mean_))
learning_rate_M_ = (dy.T.dot(A_Ut_grad_t)/n + reg_M*numpy.sum(grad_t*V))/( numpy.linalg.norm(A_Ut_grad_t)**2/n + reg_M*numpy.linalg.norm(grad_t)**2+1e-8)
learning_rate_M_ = numpy.min([self.learning_rate, learning_rate_M_])
# learning_rate_M_ = self.learning_rate
V_new = V - learning_rate_M_*grad_t
hat_y = hat_y - learning_rate_M_*A_Ut_grad_t
truncated_y = hat_y.copy()
if self.truncate_y:
truncated_y[hat_y > 1] = 1
truncated_y[hat_y < -1] = -1
Q, R = numpy.linalg.qr(V_new)
V_new = U.dot(R.T)
U_new = Q
U = U_new
V = V_new
pass # end for als_count in xrange(self.als_max_iter_M):
pass # end for t
self.U_ = U
self.V_ = V
self.w_ = w
self.b_ = b
return self
pass # end def fit_greedy_diag_zero
pass # end class SLM
# ----- Auxiliary functions for fit_provable_diag_not_zero ----- #
def mathcal_W_(G, p0, p1, p2):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return $\mathcal{W}(y) \cdot n$ given the constant parameters. X should be zero-mean and unit-variance.
    """
# p0 = numpy.sum(y)
# p1 = X.dot(y)
# p2 = (X**2).dot(y)
# p3 = (X**3).dot(y)
# return G[:,0,numpy.newaxis]*p1 + G[:,1,numpy.newaxis]*(p2-p0) + G[:,2,numpy.newaxis]*( p3 - data_moment3*p0 )
return G[:, 0, numpy.newaxis] * p1 + G[:, 1, numpy.newaxis] * (p2 - p0)
# end def
def mathcal_M_(y, U, X, Z, p0, p1, p2):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return $\mathcal{M}(y)U \cdot 2n$, given the constant parameters. X should be zero-mean and unit-variance.
    """
# p0 = numpy.sum(y)
# p1 = X.dot(y)
# p2 = (X**2).dot(y)
# p3 = (X**3).dot(y)
term1 = (X*y.T).dot(X.T.dot(U))
# term2 = p0+ Z[:,0,numpy.newaxis]*p1 + Z[:,1,numpy.newaxis]*(p2-p0) + Z[:,2,numpy.newaxis]*( p3 - data_moment3*p0)
term2 = p0+ Z[:, 0, numpy.newaxis] * p1 + Z[:, 1, numpy.newaxis] * (p2 - p0)
return term1 - term2*U
# end def
def A_(X, U, V):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    The sensing operator A in gFM. X is the data matrix; UV'=M as in gFM. X should be zero-mean and unit-variance.
    \mathcal{A}(M) = \{ x_i'(M + M')x_i/2 \}_{i=1}^n where M=UV'
    :param X: a $d \times n$ feature matrix
    :param U: $d \times k$ matrix
    :param V: $d \times k$ matrix
    :return: z = A(UV'), an $n \times 1$ vector
    """
z = numpy.sum( X.T.dot(U) * X.T.dot(V), axis=1, keepdims=True)
return z
# end def
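# A minimal sanity-check sketch for A_ (an illustration, not part of the
# original library). With M = U V', A_(X, U, V)[i] equals x_i' M x_i:
#   X = numpy.random.randn(5, 3); U = numpy.random.randn(5, 2); V = numpy.random.randn(5, 2)
#   M = U.dot(V.T)
#   assert numpy.allclose(A_(X, U, V).ravel(), numpy.einsum('an,ab,bn->n', X, M, X))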
def ApW_(X, p, W):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Compute z = A'(p)W. X should be zero-mean and unit-variance.
    :param X: feature matrix, $d \times n$
    :param p: $n \times 1$ vector
    :param W: $d \times k$ matrix
    :return: $d \times k$ matrix
    """
z = X.dot(p*X.T.dot(W))
return z
# end def
def ApA_diag0(y, U, X):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A'A(y)*U = (\sum_i x_i y_i x_i') * U
    :param y: $n \times 1$ vector
    :param U: $d \times k$ matrix
    :param X: $d \times n$ feature matrix
    :return: $d \times k$ matrix
    """
return (X*y.T).dot(X.T.dot(U))
pass # end def
def A_diag0(U, V, X):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A(UV' - diag(UV')) = (X'U * X'V) 1 - (X*(((U*V)1)*X))1
    :param U: $d \times k$ matrix
    :param V: $d \times k$ matrix
    :param X: $d \times n$ feature matrix
    :return: $n \times 1$ vector
    """
z = numpy.sum(X.T.dot(U) * X.T.dot(V), axis=1, keepdims=True)
z_diag = numpy.sum(X*(numpy.sum(U*V,axis=1,keepdims=True)*X), axis=0, keepdims=True).T
return z-z_diag
pass # end def
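# A minimal sanity-check sketch for A_diag0 (an illustration, not part of the
# original library). With M = U V' and M0 = M - diag(M), A_diag0(U, V, X)[i]
# equals x_i' M0 x_i:
#   M = U.dot(V.T); M0 = M - numpy.diag(numpy.diag(M))
#   assert numpy.allclose(A_diag0(U, V, X).ravel(), numpy.einsum('an,ab,bn->n', X, M0, X))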
def ApA_diag0_sparse(y, U, X, data_mean):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A'A(y)*U = (\sum_i x_i y_i x_i') * U, where the sparse X is centered implicitly by data_mean.
    :param y: $n \times 1$ vector
    :param U: $d \times k$ matrix
    :param X: $d \times n$ sparse feature matrix
    :param data_mean: $d \times 1$ feature mean used to center X implicitly
    :return: $d \times k$ matrix
    """
XTdotU = numpy.asarray(X.T.dot(U))
data_meanTdotU = data_mean.T.dot(U)
r = X.dot(y*XTdotU) - data_mean.dot(y.T.dot(XTdotU))
r -= X.dot(y).dot(data_meanTdotU)
r += numpy.sum(y) * data_mean.dot(data_meanTdotU)
return numpy.asarray(r)
pass # end def
def A_diag0_sparse(U, V, X, data_mean):
    # type: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> numpy.ndarray
    """
    Return A(UV' - diag(UV')) = (X'U * X'V) 1 - (X*(((U*V)1)*X))1, where the sparse X is centered implicitly by data_mean.
    :param U: $d \times k$ matrix
    :param V: $d \times k$ matrix
    :param X: $d \times n$ sparse feature matrix
    :param data_mean: $d \times 1$ feature mean used to center X implicitly
    :return: $n \times 1$ vector
    """
n = X.shape[1]
XTdotU = numpy.asarray(X.T.dot(U))
XTdotV = numpy.asarray(X.T.dot(V))
data_meanTdotU = numpy.asarray(data_mean.T.dot(U))
data_meanTdotV = numpy.asarray(data_mean.T.dot(V))
sum_UV = numpy.sum(U*V,axis=1,keepdims=True)
z = numpy.sum(XTdotU *XTdotV, axis=1, keepdims=True)
z -= numpy.sum( data_meanTdotU*XTdotV, axis=1, keepdims=True)
z -= numpy.sum( XTdotU * data_meanTdotV, axis=1, keepdims=True)
z += numpy.sum(data_meanTdotU *data_meanTdotV)
z_diag = numpy.asarray(X.T.power(2).dot(sum_UV))
z_diag -= 2*numpy.asarray(X.T.dot(data_mean * sum_UV))
z_diag += numpy.asarray(numpy.sum( (data_mean**2)*(sum_UV), axis=0, keepdims=True).T)
return numpy.asarray(z-z_diag)
pass # end def
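# --------------------------------------------------------------------------- #
# Minimal usage sketch (an illustration, not part of the original library; the
# sizes, seed and data below are made up). It draws synthetic data from the
# SLM model y = x'*w + x'*M*x with a rank-k M and fits the 'Provable' solver,
# which expects dense features. The provable solver assumes (roughly)
# z-score-normalized, Gaussian-like features, which this synthetic data
# satisfies.
if __name__ == '__main__':
    numpy.random.seed(0)
    n, d, k = 2000, 20, 2
    X_demo = numpy.random.randn(n, d)
    w_true = numpy.random.randn(d, 1)
    U_true = numpy.random.randn(d, k) / numpy.sqrt(d)
    M_true = U_true.dot(U_true.T)
    # quadratic term x_i'*M*x_i, computed row-wise
    quad = numpy.sum(X_demo.dot(M_true) * X_demo, axis=1, keepdims=True)
    y_demo = (X_demo.dot(w_true) + quad).ravel()
    model = SLM(rank_M=k, max_iter=100, solver_algorithm='Provable')
    model.fit(X_demo, y_demo)
    y_hat = model.predict(X_demo)
    print('training RMSE: %.4f' % numpy.sqrt(numpy.mean((y_hat - y_demo) ** 2)))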