From 5fee62e3c51ff9b7629cf58adea3bdc2b134cdd7 Mon Sep 17 00:00:00 2001
From: auxten <auxtenwpc@gmail.com>
Date: Mon, 31 Oct 2022 18:13:52 +0800
Subject: [PATCH 1/4] Use float32 to reduce mem usage

---
 example/futuresales/main_test.go             |   9 +-
 example/movielens/dinimpl.go                 |  41 +---
 example/movielens/dinimpl_test.go            |  15 +-
 example/movielens/feature.go                 |  35 +--
 example/movielens/feature_test.go            |  20 +-
 example/movielens/mlp.go                     |  43 +++-
 example/movielens/mlpimpl.go                 |  39 +--
 example/movielens/mlpimpl_test.go            |  21 +-
 feature/embedding/model/model.go             |   1 +
 feature/embedding/model/word2vec/word2vec.go |  47 +++-
 feature/multihot.go                          |  11 +
 model/din/activation_test.go                 |  30 +--
 model/din/din.go                             |  48 ++--
 model/din/model.go                           |  67 +++--
 model/din/model_test.go                      |  28 +--
 model/din/simplemlp.go                       |  30 +--
 ps/batch.go                                  | 187 --------------
 ps/batch_test.go                             |  39 ---
 ps/printer.go                                |  68 ------
 ps/sample.go                                 |  58 -----
 ps/sample_test.go                            |  39 ---
 ps/solver.go                                 |  95 --------
 ps/trainer.go                                | 109 ---------
 ps/trainer_test.go                           | 244 -------------------
 recommend/rcmd.go                            | 116 +++++----
 utils/util.go                                |   8 +
 26 files changed, 361 insertions(+), 1087 deletions(-)
 delete mode 100644 ps/batch.go
 delete mode 100644 ps/batch_test.go
 delete mode 100644 ps/printer.go
 delete mode 100644 ps/sample.go
 delete mode 100644 ps/sample_test.go
 delete mode 100644 ps/solver.go
 delete mode 100644 ps/trainer.go
 delete mode 100644 ps/trainer_test.go

diff --git a/example/futuresales/main_test.go b/example/futuresales/main_test.go
index ebb8521..544f662 100644
--- a/example/futuresales/main_test.go
+++ b/example/futuresales/main_test.go
@@ -13,6 +13,7 @@ import (
 	"github.com/auxten/edgeRec/feature"
 	"github.com/auxten/edgeRec/nn"
 	"github.com/auxten/edgeRec/ps"
+	"github.com/auxten/edgeRec/recommend"
 	"github.com/auxten/edgeRec/schema"
 	"github.com/auxten/edgeRec/utils"
 	log "github.com/sirupsen/logrus"
@@ -84,7 +85,7 @@ func TestTrain(t *testing.T) {
 
 		fmt.Printf("training data count: %d\n", trainCount)
 		// training
-		trainSample := make(ps.Samples, trainCount)
+		trainSample := make(recommend.Samples, trainCount)
 		{
 			trainRows, err := scanner.GetRows(fmt.Sprintf(`select date, date_block_num, shop_id, s.item_id, item_price, item_category_id, item_name, item_cnt_day from sales_train s 
 				left join items i on s.item_id = i.item_id limit %d`, trainCount))
@@ -104,7 +105,7 @@ func TestTrain(t *testing.T) {
 				if err != nil {
 					log.Fatal(err)
 				}
-				trainSample[i] = ps.Sample{
+				trainSample[i] = recommend.Sample{
 					Input:    featureTransform(date, date_block_num, shop_id, item_id, item_price, item_category_id.Float64, item_name.String),
 					Response: []float64{outputTransform(item_cnt_day)}}
 				i++
@@ -118,7 +119,7 @@ func TestTrain(t *testing.T) {
 		// 10% test data
 		fmt.Printf("test data count: %d\n", testCount)
 		i = 0
-		testSample := make(ps.Samples, testCount)
+		testSample := make(recommend.Samples, testCount)
 		rows, err := scanner.GetRows(fmt.Sprintf(`select date, date_block_num, shop_id, s.item_id, item_price, item_category_id, item_name, 
 			item_cnt_day from sales_train s 
 				left join items i on s.item_id = i.item_id limit %d, %d`, trainCount, testCount))
@@ -138,7 +139,7 @@ func TestTrain(t *testing.T) {
 			if err != nil {
 				log.Fatal(err)
 			}
-			testSample[i] = ps.Sample{
+			testSample[i] = recommend.Sample{
 				Input:    featureTransform(date, date_block_num, shop_id, item_id, item_price, item_category_id.Float64, item_name.String),
 				Response: []float64{outputTransform(item_cnt_day)}}
 			i++
diff --git a/example/movielens/dinimpl.go b/example/movielens/dinimpl.go
index 634ab46..d816869 100644
--- a/example/movielens/dinimpl.go
+++ b/example/movielens/dinimpl.go
@@ -6,7 +6,6 @@ import (
 	"github.com/auxten/edgeRec/model/din"
 	rcmd "github.com/auxten/edgeRec/recommend"
 	log "github.com/sirupsen/logrus"
-	"gonum.org/v1/gonum/mat"
 	"gorgonia.org/tensor"
 )
 
@@ -29,18 +28,14 @@ type dinImpl struct {
 	pred    *din.DinNet
 }
 
-func (d *dinImpl) Predict(X mat.Matrix, Y mat.Mutable) *mat.Dense {
-	numPred, _ := X.Dims()
-	inputTensor := tensor.New(tensor.WithShape(X.Dims()), tensor.WithBacking(X.(*mat.Dense).RawMatrix().Data))
-	y, err := din.Predict(d.pred, numPred, d.predBatchSize, d.sampleInfo, inputTensor)
+func (d *dinImpl) Predict(X tensor.Tensor) tensor.Tensor {
+	numPred := X.Shape()[0]
+	y, err := din.Predict(d.pred, numPred, d.predBatchSize, d.sampleInfo, X)
 	if err != nil {
 		log.Errorf("predict din model failed: %v", err)
 		return nil
 	}
-	yDense := mat.NewDense(numPred, 1, y)
-	if Y != nil {
-		Y.(*mat.Dense).SetRawMatrix(yDense.RawMatrix())
-	}
+	yDense := tensor.NewDense(din.DT, tensor.Shape{numPred, 1}, tensor.WithBacking(y))
 
 	return yDense
 }
@@ -53,31 +48,19 @@ func (d *dinImpl) Fit(trainSample *rcmd.TrainSample) (pred rcmd.PredictAbstract,
 	d.cFeatureDim = trainSample.Info.CtxFeatureRange[1] - trainSample.Info.CtxFeatureRange[0]
 	d.sampleInfo = &trainSample.Info
 
-	sampleLen := len(trainSample.Data)
-	X := mat.NewDense(sampleLen, len(trainSample.Data[0].Input), nil)
-	for i, sample := range trainSample.Data {
-		X.SetRow(i, sample.Input)
-	}
-	Y := mat.NewDense(sampleLen, 1, nil)
-	for i, sample := range trainSample.Data {
-		Y.Set(i, 0, sample.Response[0])
+	if trainSample.Rows != len(trainSample.Y) {
+		err = fmt.Errorf("number of examples %d and labels %d do not match",
+			trainSample.Rows, len(trainSample.Y))
+		return
 	}
 
+	inputs := tensor.New(tensor.WithShape(trainSample.Rows, trainSample.XCols), tensor.WithBacking(trainSample.X))
+	labels := tensor.New(tensor.WithShape(trainSample.Rows, 1), tensor.WithBacking(trainSample.Y))
+
 	d.learner = din.NewDinNet(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim)
-	var (
-		inputs, labels tensor.Tensor
-		numExamples, _ = X.Dims()
-		numLabels, _   = Y.Dims()
-	)
-	if numExamples != numLabels {
-		err = fmt.Errorf("number of examples and labels do not match")
-		return
-	}
 
-	inputs = tensor.New(tensor.WithShape(X.Dims()), tensor.WithBacking(X.RawMatrix().Data))
-	labels = tensor.New(tensor.WithShape(Y.Dims()), tensor.WithBacking(Y.RawMatrix().Data))
 	err = din.Train(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim,
-		numExamples, d.batchSize, d.epochs, d.earlyStop,
+		trainSample.Rows, d.batchSize, d.epochs, d.earlyStop,
 		d.sampleInfo,
 		inputs, labels,
 		d.learner,
diff --git a/example/movielens/dinimpl_test.go b/example/movielens/dinimpl_test.go
index 6ceee76..6e63c85 100644
--- a/example/movielens/dinimpl_test.go
+++ b/example/movielens/dinimpl_test.go
@@ -6,10 +6,9 @@ import (
 	"math/rand"
 	"testing"
 
-	"github.com/auxten/edgeRec/nn/metrics"
+	"github.com/auxten/edgeRec/model/din"
 	rcmd "github.com/auxten/edgeRec/recommend"
 	. "github.com/smartystreets/goconvey/convey"
-	"gonum.org/v1/gonum/mat"
 )
 
 type dinPredictor struct {
@@ -52,9 +51,9 @@ func TestDinOnMovielens(t *testing.T) {
 		var (
 			userId     int
 			itemId     int
-			rating     float64
+			rating     float32
 			timestamp  int64
-			yTrue      = mat.NewDense(testCount, 1, nil)
+			yTrue      []float32
 			sampleKeys = make([]rcmd.Sample, 0, testCount)
 		)
 		for i := 0; rows.Next(); i++ {
@@ -62,7 +61,8 @@ func TestDinOnMovielens(t *testing.T) {
 			if err != nil {
 				t.Errorf("scan error: %v", err)
 			}
-			yTrue.Set(i, 0, BinarizeLabel(rating))
+			//yTrue.Set(i, 0, BinarizeLabel(rating))
+			yTrue = append(yTrue, BinarizeLabel32(rating))
 			sampleKeys = append(sampleKeys, rcmd.Sample{userId, itemId, 0, timestamp})
 		}
 		batchPredictCtx := context.Background()
@@ -73,8 +73,9 @@ func TestDinOnMovielens(t *testing.T) {
 		}
 		yPred, err := rcmd.BatchPredict(batchPredictCtx, dinPred, sampleKeys)
 		So(err, ShouldBeNil)
-		rocAuc := metrics.ROCAUCScore(yTrue, yPred, "", nil)
-		rowCount, _ := yTrue.Dims()
+		rocAuc := din.RocAuc32(yPred.Data().([]float32), yTrue)
+		//rocAuc := metrics.ROCAUCScore(yTrue, yPred, "", nil)
+		rowCount := len(yTrue)
 		fmt.Printf("rocAuc on test set %d: %f\n", rowCount, rocAuc)
 	})
 }
diff --git a/example/movielens/feature.go b/example/movielens/feature.go
index 7635824..361dfe6 100644
--- a/example/movielens/feature.go
+++ b/example/movielens/feature.go
@@ -39,7 +39,7 @@ func initDb(dbPath string) (err error) {
 type MovielensRec struct {
 	DataPath   string
 	SampleCnt  int
-	mRatingMap map[int][2]float64
+	mRatingMap map[int][2]float32
 	ubcTrain   *ubcache.UserBehaviorCache
 	ubcPredict *ubcache.UserBehaviorCache
 }
@@ -103,8 +103,8 @@ func (recSys *MovielensRec) GetItemFeature(ctx context.Context, itemId int) (ten
 		var (
 			movieYear             int
 			itemTitle, itemGenres string
-			avgRating, cntRating  float64
-			GenreTensor           [50]float64 // 5 * 10
+			avgRating, cntRating  float32
+			GenreTensor           [50]float32 // 5 * 10
 		)
 		if err = rows.Scan(&itemTitle, &itemGenres); err != nil {
 			log.Errorf("failed to scan item %d: %v", itemId, err)
@@ -129,11 +129,11 @@ func (recSys *MovielensRec) GetItemFeature(ctx context.Context, itemId int) (ten
 		}
 		if mr, ok := recSys.mRatingMap[itemId]; ok {
 			avgRating = mr[0] / 5.
-			cntRating = math.Log2(mr[1])
+			cntRating = float32(math.Log2(float64(mr[1])))
 		}
 
-		tensor = utils.ConcatSlice(tensor, GenreTensor[:], rcmd.Tensor{
-			float64(movieYear-1990) / 20.0, avgRating, cntRating,
+		tensor = utils.ConcatSlice32(tensor, GenreTensor[:], rcmd.Tensor{
+			float32(movieYear-1990) / 20.0, avgRating, cntRating,
 		})
 		return
 	} else {
@@ -149,7 +149,7 @@ func (recSys *MovielensRec) GetUserFeature(ctx context.Context, userId int) (ten
 		ugenres          sql.NullString
 		avgRating        sql.NullFloat64
 		cntRating        sql.NullFloat64
-		top5GenresTensor [50]float64
+		top5GenresTensor [50]float32
 	)
 	// get stage value from ctx
 	stage := ctx.Value(rcmd.StageKey).(rcmd.Stage)
@@ -183,7 +183,7 @@ func (recSys *MovielensRec) GetUserFeature(ctx context.Context, userId int) (ten
 		for i, genre := range top5Genres {
 			copy(top5GenresTensor[i*10:], genreFeature(genre.Key))
 		}
-		tensor = utils.ConcatSlice(rcmd.Tensor{avgRating.Float64 / 5., cntRating.Float64 / 100.}, top5GenresTensor[:])
+		tensor = utils.ConcatSlice32(rcmd.Tensor{float32(avgRating.Float64) / 5., float32(cntRating.Float64) / 100.}, top5GenresTensor[:])
 		if rcmd.DebugItemId != 0 && userId == rcmd.DebugUserId {
 			log.Infof("user %d: %v ", userId, tensor)
 		}
@@ -196,7 +196,7 @@ func (recSys *MovielensRec) GetUserFeature(ctx context.Context, userId int) (ten
 }
 
 func genreFeature(genre string) (tensor rcmd.Tensor) {
-	return feature.HashOneHot([]byte(genre), 10)
+	return feature.HashOneHot32([]byte(genre), 10)
 }
 
 func (recSys *MovielensRec) SampleGenerator(_ context.Context) (ret <-chan rcmd.Sample, err error) {
@@ -228,14 +228,14 @@ func (recSys *MovielensRec) SampleGenerator(_ context.Context) (ret <-chan rcmd.
 			i++
 			var (
 				userId, movieId int
-				rating, label   float64
+				rating, label   float32
 				timestamp       int64
 			)
 			if err = rows.Scan(&userId, &movieId, &rating, &timestamp); err != nil {
 				log.Errorf("failed to scan ratings: %v", err)
 				return
 			}
-			label = BinarizeLabel(rating)
+			label = BinarizeLabel32(rating)
 			// label = rating / 5.0
 
 			sampleCh <- rcmd.Sample{
@@ -268,18 +268,18 @@ func (recSys *MovielensRec) PreTrain(ctx context.Context) (err error) {
 		return
 	}
 	defer rows1.Close()
-	recSys.mRatingMap = make(map[int][2]float64)
+	recSys.mRatingMap = make(map[int][2]float32)
 	for rows1.Next() {
 		var (
 			movieId int
-			avgR    float64
+			avgR    float32
 			cntR    int
 		)
 		if err = rows1.Scan(&movieId, &avgR, &cntR); err != nil {
 			log.Errorf("failed to scan movieId: %v", err)
 			return
 		}
-		recSys.mRatingMap[movieId] = [2]float64{avgR, float64(cntR)}
+		recSys.mRatingMap[movieId] = [2]float32{avgR, float32(cntR)}
 	}
 
 	// fill user behavior cache
@@ -389,3 +389,10 @@ func BinarizeLabel(rating float64) float64 {
 	}
 	return 0.0
 }
+
+func BinarizeLabel32(rating float32) float32 {
+	if rating > 3.5 {
+		return 1.0
+	}
+	return 0.0
+}
diff --git a/example/movielens/feature_test.go b/example/movielens/feature_test.go
index 46ea86d..6d05e88 100644
--- a/example/movielens/feature_test.go
+++ b/example/movielens/feature_test.go
@@ -46,7 +46,7 @@ func TestFeatureEngineer(t *testing.T) {
 		testData := []struct {
 			userId   int
 			itemId   int
-			expected float64
+			expected float32
 		}{
 			{8, 527, 1.},
 			{8, 432, 0.},
@@ -68,8 +68,8 @@ func TestFeatureEngineer(t *testing.T) {
 			fmt.Printf("userId:%d, itemId:%d, expected:%f, pred:%f\n",
 				test.userId, test.itemId, test.expected, score[0].Score)
 			//So(pred.At(0, 0), ShouldAlmostEqual, test.expected)
-			yTrue.Set(i, 0, test.expected)
-			yPred.Set(i, 0, score[0].Score)
+			yTrue.Set(i, 0, float64(test.expected))
+			yPred.Set(i, 0, float64(score[0].Score))
 		}
 
 		rocAuc := metrics.ROCAUCScore(yTrue, yPred, "", nil)
@@ -84,9 +84,10 @@ func TestFeatureEngineer(t *testing.T) {
 		var (
 			userId     int
 			itemId     int
-			rating     float64
+			rating     float32
 			timestamp  int64
 			yTrue      = mat.NewDense(testCount, 1, nil)
+			yPredDense = mat.NewDense(testCount, 1, nil)
 			sampleKeys = make([]rcmd.Sample, 0, testCount)
 		)
 		for i := 0; rows.Next(); i++ {
@@ -94,13 +95,20 @@ func TestFeatureEngineer(t *testing.T) {
 			if err != nil {
 				t.Errorf("scan error: %v", err)
 			}
-			yTrue.Set(i, 0, BinarizeLabel(rating))
+			yTrue.Set(i, 0, BinarizeLabel(float64(rating)))
 			sampleKeys = append(sampleKeys, rcmd.Sample{userId, itemId, 0, timestamp})
 		}
 		batchPredictCtx := context.Background()
 		yPred, err := rcmd.BatchPredict(batchPredictCtx, model, sampleKeys)
 		So(err, ShouldBeNil)
-		rocAuc := metrics.ROCAUCScore(yTrue, yPred, "", nil)
+		for i := 0; i < testCount; i++ {
+			val, err := yPred.At(i, 0)
+			if err != nil {
+				t.Errorf("yPred.At error: %v", err)
+			}
+			yPredDense.Set(i, 0, float64(val.(float32)))
+		}
+		rocAuc := metrics.ROCAUCScore(yTrue, yPredDense, "", nil)
 		rowCount, _ := yTrue.Dims()
 		fmt.Printf("rocAuc on test set %d: %f\n", rowCount, rocAuc)
 	})
diff --git a/example/movielens/mlp.go b/example/movielens/mlp.go
index 5714bb5..94542f4 100644
--- a/example/movielens/mlp.go
+++ b/example/movielens/mlp.go
@@ -5,14 +5,37 @@ import (
 	nn "github.com/auxten/edgeRec/nn/neural_network"
 	rcmd "github.com/auxten/edgeRec/recommend"
 	"gonum.org/v1/gonum/mat"
+	"gorgonia.org/tensor"
 )
 
 type predWrap struct {
 	pred base.Predicter
 }
 
-func (p *predWrap) Predict(X mat.Matrix, Y mat.Mutable) *mat.Dense {
-	return p.pred.Predict(X, Y)
+func (p *predWrap) Predict(X tensor.Tensor) tensor.Tensor {
+	numPred := X.Shape()[0]
+	xWidth := X.Shape()[1]
+	//convert float32 tensor to float64 mat.Dense
+	xDense := mat.NewDense(numPred, X.Shape()[1], nil)
+	for i := 0; i < numPred; i++ {
+		for j := 0; j < xWidth; j++ {
+			val, err := X.At(i, j)
+			if err != nil {
+				return nil
+			}
+			xDense.Set(i, j, float64(val.(float32)))
+		}
+	}
+	yMutable := mat.NewDense(numPred, 1, nil)
+	p.pred.Predict(xDense, yMutable)
+
+	//convert float64 mat.Dense to float32 tensor
+	y := make([]float32, numPred)
+	for i := 0; i < numPred; i++ {
+		val := yMutable.At(i, 0)
+		y[i] = float32(val)
+	}
+	return tensor.NewDense(tensor.Float32, tensor.Shape{numPred, 1}, tensor.WithBacking(y))
 }
 
 type fitWrap struct {
@@ -20,14 +43,18 @@ type fitWrap struct {
 }
 
 func (fit *fitWrap) Fit(trainSample *rcmd.TrainSample) (rcmd.PredictAbstract, error) {
-	sampleLen := len(trainSample.Data)
-	sampleDense := mat.NewDense(sampleLen, len(trainSample.Data[0].Input), nil)
-	for i, sample := range trainSample.Data {
-		sampleDense.SetRow(i, sample.Input)
+	sampleLen := trainSample.Rows
+	x64 := make([]float64, sampleLen*trainSample.XCols)
+	for i := 0; i < sampleLen; i++ {
+		for j := 0; j < trainSample.XCols; j++ {
+			x64[i*trainSample.XCols+j] = float64(trainSample.X[i*trainSample.XCols+j])
+		}
 	}
+	sampleDense := mat.NewDense(sampleLen, trainSample.XCols, x64)
+
 	yClass := mat.NewDense(sampleLen, 1, nil)
-	for i, sample := range trainSample.Data {
-		yClass.Set(i, 0, sample.Response[0])
+	for i, sample := range trainSample.Y {
+		yClass.Set(i, 0, float64(sample))
 	}
 
 	pred := fit.model.Fit(sampleDense, yClass)
diff --git a/example/movielens/mlpimpl.go b/example/movielens/mlpimpl.go
index b46a013..3280636 100644
--- a/example/movielens/mlpimpl.go
+++ b/example/movielens/mlpimpl.go
@@ -6,7 +6,6 @@ import (
 	"github.com/auxten/edgeRec/model/din"
 	rcmd "github.com/auxten/edgeRec/recommend"
 	log "github.com/sirupsen/logrus"
-	"gonum.org/v1/gonum/mat"
 	"gorgonia.org/tensor"
 )
 
@@ -29,18 +28,14 @@ type mlpImpl struct {
 	pred    *din.SimpleMLP
 }
 
-func (d *mlpImpl) Predict(X mat.Matrix, Y mat.Mutable) *mat.Dense {
-	numPred, _ := X.Dims()
-	inputTensor := tensor.New(tensor.WithShape(X.Dims()), tensor.WithBacking(X.(*mat.Dense).RawMatrix().Data))
-	y, err := din.Predict(d.pred, numPred, d.predBatchSize, d.sampleInfo, inputTensor)
+func (d *mlpImpl) Predict(X tensor.Tensor) tensor.Tensor {
+	numPred := X.Shape()[0]
+	y, err := din.Predict(d.pred, numPred, d.predBatchSize, d.sampleInfo, X)
 	if err != nil {
 		log.Errorf("predict din model failed: %v", err)
 		return nil
 	}
-	yDense := mat.NewDense(numPred, 1, y)
-	if Y != nil {
-		Y.(*mat.Dense).SetRawMatrix(yDense.RawMatrix())
-	}
+	yDense := tensor.NewDense(din.DT, tensor.Shape{numPred, 1}, tensor.WithBacking(y))
 
 	return yDense
 }
@@ -53,31 +48,17 @@ func (d *mlpImpl) Fit(trainSample *rcmd.TrainSample) (pred rcmd.PredictAbstract,
 	d.cFeatureDim = trainSample.Info.CtxFeatureRange[1] - trainSample.Info.CtxFeatureRange[0]
 	d.sampleInfo = &trainSample.Info
 
-	sampleLen := len(trainSample.Data)
-	X := mat.NewDense(sampleLen, len(trainSample.Data[0].Input), nil)
-	for i, sample := range trainSample.Data {
-		X.SetRow(i, sample.Input)
-	}
-	Y := mat.NewDense(sampleLen, 1, nil)
-	for i, sample := range trainSample.Data {
-		Y.Set(i, 0, sample.Response[0])
-	}
-
-	d.learner = din.NewSimpleMLP(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim)
-	var (
-		inputs, labels tensor.Tensor
-		numExamples, _ = X.Dims()
-		numLabels, _   = Y.Dims()
-	)
-	if numExamples != numLabels {
+	if trainSample.Rows != len(trainSample.Y) {
 		err = fmt.Errorf("number of examples and labels do not match")
 		return
 	}
 
-	inputs = tensor.New(tensor.WithShape(X.Dims()), tensor.WithBacking(X.RawMatrix().Data))
-	labels = tensor.New(tensor.WithShape(Y.Dims()), tensor.WithBacking(Y.RawMatrix().Data))
+	d.learner = din.NewSimpleMLP(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim)
+
+	inputs := tensor.New(tensor.WithShape(trainSample.Rows, trainSample.XCols), tensor.WithBacking(trainSample.X))
+	labels := tensor.New(tensor.WithShape(trainSample.Rows, 1), tensor.WithBacking(trainSample.Y))
 	err = din.Train(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim,
-		numExamples, d.batchSize, d.epochs, d.earlyStop,
+		trainSample.Rows, d.batchSize, d.epochs, d.earlyStop,
 		d.sampleInfo,
 		inputs, labels,
 		d.learner,
diff --git a/example/movielens/mlpimpl_test.go b/example/movielens/mlpimpl_test.go
index c9be287..c203e59 100644
--- a/example/movielens/mlpimpl_test.go
+++ b/example/movielens/mlpimpl_test.go
@@ -6,10 +6,9 @@ import (
 	"math/rand"
 	"testing"
 
-	"github.com/auxten/edgeRec/nn/metrics"
+	"github.com/auxten/edgeRec/model/din"
 	rcmd "github.com/auxten/edgeRec/recommend"
 	. "github.com/smartystreets/goconvey/convey"
-	"gonum.org/v1/gonum/mat"
 )
 
 func TestSimpleMLPOnMovielens(t *testing.T) {
@@ -49,9 +48,9 @@ func TestSimpleMLPOnMovielens(t *testing.T) {
 		var (
 			userId     int
 			itemId     int
-			rating     float64
+			rating     float32
 			timestamp  int64
-			yTrue      = mat.NewDense(testCount, 1, nil)
+			yTrue      []float32
 			sampleKeys = make([]rcmd.Sample, 0, testCount)
 		)
 		for i := 0; rows.Next(); i++ {
@@ -59,14 +58,20 @@ func TestSimpleMLPOnMovielens(t *testing.T) {
 			if err != nil {
 				t.Errorf("scan error: %v", err)
 			}
-			yTrue.Set(i, 0, BinarizeLabel(rating))
+			//yTrue.Set(i, 0, BinarizeLabel(rating))
+			yTrue = append(yTrue, BinarizeLabel32(rating))
 			sampleKeys = append(sampleKeys, rcmd.Sample{userId, itemId, 0, timestamp})
 		}
 		batchPredictCtx := context.Background()
-		yPred, err := rcmd.BatchPredict(batchPredictCtx, model, sampleKeys)
+		dinPred := &dinPredictor{
+			PreRanker:    movielens,
+			Predictor:    model,
+			UserBehavior: movielens,
+		}
+		yPred, err := rcmd.BatchPredict(batchPredictCtx, dinPred, sampleKeys)
 		So(err, ShouldBeNil)
-		rocAuc := metrics.ROCAUCScore(yTrue, yPred, "", nil)
-		rowCount, _ := yTrue.Dims()
+		rocAuc := din.RocAuc32(yPred.Data().([]float32), yTrue)
+		rowCount := len(yTrue)
 		fmt.Printf("rocAuc on test set %d: %f\n", rowCount, rocAuc)
 	})
 }
diff --git a/feature/embedding/model/model.go b/feature/embedding/model/model.go
index ff187b9..aa73095 100644
--- a/feature/embedding/model/model.go
+++ b/feature/embedding/model/model.go
@@ -26,5 +26,6 @@ type Model interface {
 	Save(io.Writer, vector.Type) error
 	WordVector(vector.Type) *matrix.Matrix
 	GenEmbeddingMap() (map[string][]float64, error)
+	GenEmbeddingMap32() (map[string][]float32, error)
 	EmbeddingByWord(word string) ([]float64, bool)
 }
diff --git a/feature/embedding/model/word2vec/word2vec.go b/feature/embedding/model/word2vec/word2vec.go
index 118a25f..c1b6718 100644
--- a/feature/embedding/model/word2vec/word2vec.go
+++ b/feature/embedding/model/word2vec/word2vec.go
@@ -42,23 +42,30 @@ type word2vec struct {
 
 	corpus corpus.Corpus
 
-	param        *matrix.Matrix
-	subsampler   *subsample.Subsampler
-	currentlr    float64
-	mod          mod
-	optimizer    optimizer
-	embeddingMap EmbeddingMap
+	param          *matrix.Matrix
+	subsampler     *subsample.Subsampler
+	currentlr      float64
+	mod            mod
+	optimizer      optimizer
+	embeddingMap   EmbeddingMap
+	embeddingMap32 EmbeddingMap32
 
 	verbose *verbose.Verbose
 }
 
 type EmbeddingMap map[string][]float64
+type EmbeddingMap32 map[string][]float32
 
 func (m *EmbeddingMap) Get(word string) ([]float64, bool) {
 	vec, ok := (*m)[word]
 	return vec, ok
 }
 
+func (m *EmbeddingMap32) Get(word string) ([]float32, bool) {
+	vec, ok := (*m)[word]
+	return vec, ok
+}
+
 func New(opts ...ModelOption) (model.Model, error) {
 	options := DefaultOptions()
 	for _, fn := range opts {
@@ -288,6 +295,34 @@ func (w *word2vec) GenEmbeddingMap() (embMap map[string][]float64, err error) {
 	return w.embeddingMap, nil
 }
 
+func (w *word2vec) GenEmbeddingMap32() (embMap map[string][]float32, err error) {
+	dict := w.corpus.Dictionary()
+	wordVec := w.WordVector(vector.Agg)
+	if dict.Len() != wordVec.Row() {
+		err = fmt.Errorf("dictionary and word vector size mismatch")
+		return
+	}
+	if dict.Len() == 0 {
+		err = fmt.Errorf("dictionary is empty")
+		return
+	}
+
+	w.embeddingMap32 = make(map[string][]float32)
+	clk := clock.New()
+	for i := 0; i < dict.Len(); i++ {
+		word, _ := dict.Word(i)
+		vec := wordVec.Slice(i)
+		w.embeddingMap32[word] = make([]float32, len(vec))
+		for j := 0; j < len(vec); j++ {
+			w.embeddingMap32[word][j] = float32(vec[j])
+		}
+	}
+
+	log.Debugf("embedding map size %d created %v", len(w.embeddingMap32), clk.AllElapsed())
+
+	return w.embeddingMap32, nil
+}
+
 func LoadEmbeddingMap(f io.Reader) (embMap map[string][]float64, err error) {
 	var (
 		embeddings emb.Embeddings
diff --git a/feature/multihot.go b/feature/multihot.go
index 63cd20f..301012e 100644
--- a/feature/multihot.go
+++ b/feature/multihot.go
@@ -23,6 +23,17 @@ func HashOneHot(buf []byte, size int) []float64 {
 	return result
 }
 
+func HashOneHot32(buf []byte, size int) []float32 {
+	result := make([]float32, size)
+	hash := fnv.New32()
+	_, err := hash.Write(buf)
+	if err != nil {
+		return nil
+	}
+	result[int(hash.Sum32())%size] = 1
+	return result
+}
+
 func StringSplitMultiHot(str string, sep string, size int) []float64 {
 	result := make([]float64, size)
 	for _, s := range strings.Split(str, sep) {
diff --git a/model/din/activation_test.go b/model/din/activation_test.go
index 0608116..2a03a2a 100644
--- a/model/din/activation_test.go
+++ b/model/din/activation_test.go
@@ -11,8 +11,8 @@ import (
 func TestPRelu(t *testing.T) {
 	Convey("prelu", t, func() {
 		g := G.NewGraph()
-		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(4, 1), tensor.WithBacking([]float64{-1, -2, 3, 4})), G.WithName("x"))
-		a := G.NewScalar(g, G.Float64, G.WithValue(0.1), G.WithName("a"))
+		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(4, 1), tensor.WithBacking([]float32{-1, -2, 3, 4})), G.WithName("x"))
+		a := G.NewScalar(g, G.Float32, G.WithValue(0.1), G.WithName("a"))
 		output := PRelu(x, a)
 		//cost := G.Must(G.Mean(output))
 		//
@@ -24,15 +24,15 @@ func TestPRelu(t *testing.T) {
 			t.Fatalf("%+v", err)
 		}
 		defer m.Close()
-		So(output.Value().Data(), ShouldResemble, []float64{-0.1, -0.2, 3, 4})
+		So(output.Value().Data(), ShouldResemble, []float32{-0.1, -0.2, 3, 4})
 	})
 }
 
 func TestEucDistance(t *testing.T) {
 	Convey("euc distance 2 dim", t, func() {
 		g := G.NewGraph()
-		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(2, 4), tensor.WithBacking([]float64{1, 2, -3, 4, -1, 0, -1, 2})), G.WithName("x"))
-		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(2, 4), tensor.WithBacking([]float64{1, 2, -3, 4, 0, 1, 0, 1})), G.WithName("y"))
+		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(2, 4), tensor.WithBacking([]float32{1, 2, -3, 4, -1, 0, -1, 2})), G.WithName("x"))
+		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(2, 4), tensor.WithBacking([]float32{1, 2, -3, 4, 0, 1, 0, 1})), G.WithName("y"))
 		output := EucDistance(x, y)
 		m := G.NewTapeMachine(g)
 		if err := m.RunAll(); err != nil {
@@ -40,12 +40,12 @@ func TestEucDistance(t *testing.T) {
 		}
 		defer m.Close()
 		So([]int(output.Shape()), ShouldResemble, []int{2})
-		So(output.Value().Data(), ShouldResemble, []float64{0, 2})
+		So(output.Value().Data(), ShouldResemble, []float32{0, 2})
 	})
 	Convey("euc distance 3 dim no broadcast", t, func() {
 		g := G.NewGraph()
-		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float64{1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0})), G.WithName("x"))
-		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})), G.WithName("y"))
+		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float32{1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0})), G.WithName("x"))
+		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})), G.WithName("y"))
 		output := EucDistance(x, y)
 		m := G.NewTapeMachine(g)
 		if err := m.RunAll(); err != nil {
@@ -53,12 +53,12 @@ func TestEucDistance(t *testing.T) {
 		}
 		defer m.Close()
 		So([]int(output.Shape()), ShouldResemble, []int{3, 2})
-		So(output.Value().Data(), ShouldResemble, []float64{1, 0, 1, 1, 0, 1})
+		So(output.Value().Data(), ShouldResemble, []float32{1, 0, 1, 1, 0, 1})
 	})
 	Convey("euc distance 3 dim broadcast y", t, func() {
 		g := G.NewGraph()
-		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float64{0, 2, 0, 0, -1, 1, -1, 1, 1, 2, 2, 2})), G.WithName("x"))
-		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 1, 2), tensor.WithBacking([]float64{0, 1, 0, 1, 1, 2})), G.WithName("y"))
+		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float32{0, 2, 0, 0, -1, 1, -1, 1, 1, 2, 2, 2})), G.WithName("x"))
+		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 1, 2), tensor.WithBacking([]float32{0, 1, 0, 1, 1, 2})), G.WithName("y"))
 		output := EucDistance(x, y)
 		m := G.NewTapeMachine(g)
 		if err := m.RunAll(); err != nil {
@@ -66,12 +66,12 @@ func TestEucDistance(t *testing.T) {
 		}
 		defer m.Close()
 		So([]int(output.Shape()), ShouldResemble, []int{3, 2})
-		So(output.Value().Data(), ShouldResemble, []float64{1, 1, 1, 1, 0, 1})
+		So(output.Value().Data(), ShouldResemble, []float32{1, 1, 1, 1, 0, 1})
 	})
 	Convey("euc distance 3 dim broadcast x", t, func() {
 		g := G.NewGraph()
-		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 1, 2), tensor.WithBacking([]float64{0, 1, 0, 1, 1, 2})), G.WithName("x"))
-		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float64{0, 2, 0, 0, -1, 1, -1, 1, 1, 2, 2, 2})), G.WithName("y"))
+		x := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 1, 2), tensor.WithBacking([]float32{0, 1, 0, 1, 1, 2})), G.WithName("x"))
+		y := G.NodeFromAny(g, tensor.New(tensor.WithShape(3, 2, 2), tensor.WithBacking([]float32{0, 2, 0, 0, -1, 1, -1, 1, 1, 2, 2, 2})), G.WithName("y"))
 		output := EucDistance(x, y)
 		m := G.NewTapeMachine(g)
 		if err := m.RunAll(); err != nil {
@@ -79,6 +79,6 @@ func TestEucDistance(t *testing.T) {
 		}
 		defer m.Close()
 		So([]int(output.Shape()), ShouldResemble, []int{3, 2})
-		So(output.Value().Data(), ShouldResemble, []float64{1, 1, 1, 1, 0, 1})
+		So(output.Value().Data(), ShouldResemble, []float32{1, 1, 1, 1, 0, 1})
 	})
 }
diff --git a/model/din/din.go b/model/din/din.go
index 6a4a1d9..59c1c81 100644
--- a/model/din/din.go
+++ b/model/din/din.go
@@ -23,7 +23,7 @@ type DinNet struct {
 	xUserProfile, xUbMatrix, xItemFeature, xCtxFeature *G.Node
 
 	mlp0, mlp1, mlp2 *G.Node // weights of MLP layers
-	d0, d1           float64 // dropout probabilities
+	d0, d1           float32 // dropout probabilities
 	att0             *G.Node // weights of attention layer
 	//att1       *G.Node // weights of Attention layers
 
@@ -36,11 +36,11 @@ type dinModel struct {
 	UBehaviorDim  int       `json:"uBehaviorDim"`
 	IFeatureDim   int       `json:"iFeatureDim"`
 	CFeatureDim   int       `json:"cFeatureDim"`
-	Mlp0          []float64 `json:"mlp0"`
-	Mlp1          []float64 `json:"mlp1"`
-	Mlp2          []float64 `json:"mlp2"`
-	Att0          []float64 `json:"att0"`
-	//Att1          []float64 `json:"att1"`
+	Mlp0          []float32 `json:"mlp0"`
+	Mlp1          []float32 `json:"mlp1"`
+	Mlp2          []float32 `json:"mlp2"`
+	Att0          []float32 `json:"att0"`
+	//Att1          []float32 `json:"att1"`
 }
 
 func (din *DinNet) Vm() G.VM {
@@ -58,11 +58,11 @@ func (din *DinNet) Marshal() (data []byte, err error) {
 		UBehaviorDim:  din.uBehaviorDim,
 		IFeatureDim:   din.iFeatureDim,
 		CFeatureDim:   din.cFeatureDim,
-		Mlp0:          din.mlp0.Value().Data().([]float64),
-		Mlp1:          din.mlp1.Value().Data().([]float64),
-		Mlp2:          din.mlp2.Value().Data().([]float64),
-		Att0:          din.att0.Value().Data().([]float64),
-		//Att1:          din.att1.Value().Data().([]float64),
+		Mlp0:          din.mlp0.Value().Data().([]float32),
+		Mlp1:          din.mlp1.Value().Data().([]float32),
+		Mlp2:          din.mlp2.Value().Data().([]float32),
+		Att0:          din.att0.Value().Data().([]float32),
+		//Att1:          din.att1.Value().Data().([]float32),
 	}
 
 	//marshal to json
@@ -88,20 +88,20 @@ func NewDinNetFromJson(data []byte) (din *DinNet, err error) {
 	// attention layer
 	att0 := G.NewMatrix(
 		g,
-		dt,
+		DT,
 		G.WithShape(1, uBehaviorSize),
 		G.WithValue(tensor.New(tensor.WithShape(1, uBehaviorSize), tensor.WithBacking(m.Att0))),
 		G.WithName("att0"),
 	)
 	//att1 := G.NewMatrix(
 	//	g,
-	//	dt,
+	//	DT,
 	//	G.WithShape(att0_1, 1),
 	//	G.WithValue(tensor.New(tensor.WithShape(att0_1, 1), tensor.WithBacking(m.Att1[i]))),
 	//	G.WithName("att1"),
 	//)
 
-	mlp0 := G.NewMatrix(g, dt,
+	mlp0 := G.NewMatrix(g, DT,
 		G.WithShape(uProfileDim+uBehaviorDim+iFeatureDim+cFeatureDim, mlp0_1),
 		G.WithName("mlp0"),
 		G.WithValue(tensor.New(
@@ -110,13 +110,13 @@ func NewDinNetFromJson(data []byte) (din *DinNet, err error) {
 		),
 	)
 
-	mlp1 := G.NewMatrix(g, dt,
+	mlp1 := G.NewMatrix(g, DT,
 		G.WithShape(mlp0_1, mlp1_2),
 		G.WithName("mlp1"),
 		G.WithValue(tensor.New(tensor.WithShape(mlp0_1, mlp1_2), tensor.WithBacking(m.Mlp1))),
 	)
 
-	mlp2 := G.NewMatrix(g, dt,
+	mlp2 := G.NewMatrix(g, DT,
 		G.WithShape(mlp1_2, 1),
 		G.WithName("mlp2"),
 		G.WithValue(tensor.New(tensor.WithShape(mlp1_2, 1), tensor.WithBacking(m.Mlp2))),
@@ -170,17 +170,17 @@ func NewDinNet(
 	}
 	g := G.NewGraph()
 	// attention layer
-	att0 := G.NewTensor(g, dt, 2, G.WithShape(1, uBehaviorSize), G.WithName("att0"), G.WithInit(G.ValuesOf(float64(1.0/uBehaviorSize))))
-	//att1 := G.NewTensor(g, dt, 3, G.WithShape(uBehaviorSize, att0_1, 1), G.WithName("att1"), G.WithInit(G.HeN(1.0)))
+	att0 := G.NewTensor(g, DT, 2, G.WithShape(1, uBehaviorSize), G.WithName("att0"), G.WithInit(G.ValuesOf(float32(1.0/uBehaviorSize))))
+	//att1 := G.NewTensor(g, DT, 3, G.WithShape(uBehaviorSize, att0_1, 1), G.WithName("att1"), G.WithInit(G.Gaussian(0, 1.0)))
 
 	// user behaviors are represented as a sequence of item embeddings. Before
 	// being fed into the MLP, we need to flatten the sequence into a single with
 	// sum pooling with Attention as the weights which is the key point of DIN model.
-	mlp0 := G.NewMatrix(g, dt, G.WithShape(uProfileDim+uBehaviorDim+iFeatureDim+cFeatureDim, mlp0_1), G.WithName("mlp0"), G.WithInit(G.HeN(1.0)))
+	mlp0 := G.NewMatrix(g, DT, G.WithShape(uProfileDim+uBehaviorDim+iFeatureDim+cFeatureDim, mlp0_1), G.WithName("mlp0"), G.WithInit(G.Gaussian(0, 1.0)))
 
-	mlp1 := G.NewMatrix(g, dt, G.WithShape(mlp0_1, mlp1_2), G.WithName("mlp1"), G.WithInit(G.HeN(1.0)))
+	mlp1 := G.NewMatrix(g, DT, G.WithShape(mlp0_1, mlp1_2), G.WithName("mlp1"), G.WithInit(G.Gaussian(0, 1.0)))
 
-	mlp2 := G.NewMatrix(g, dt, G.WithShape(mlp1_2, 1), G.WithName("mlp2"), G.WithInit(G.HeN(1.0)))
+	mlp2 := G.NewMatrix(g, DT, G.WithShape(mlp1_2, 1), G.WithName("mlp2"), G.WithInit(G.Gaussian(0, 1.0)))
 
 	return &DinNet{
 		uProfileDim:   uProfileDim,
@@ -260,7 +260,7 @@ func (din *DinNet) Fwd(xUserProfile, xUbMatrix, xItemFeature, xCtxFeature *G.Nod
 		nil, []byte{2},
 	))
 
-	//actOuts := G.NewTensor(din.Graph(), dt, 2, G.WithShape(batchSize, uBehaviorDim), G.WithName("actOuts"), G.WithInit(G.Zeroes()))
+	//actOuts := G.NewTensor(din.Graph(), DT, 2, G.WithShape(batchSize, uBehaviorDim), G.WithName("actOuts"), G.WithInit(G.Zeroes()))
 	//for i := 0; i < uBehaviorSize; i++ {
 	//	// xUserBehaviors[:, i, :], ub.shape: [batchSize, uBehaviorDim]
 	//	ub := G.Must(G.Slice(xUserBehaviors, []tensor.Slice{nil, G.S(i)}...))
@@ -290,11 +290,11 @@ func (din *DinNet) Fwd(xUserProfile, xUbMatrix, xItemFeature, xCtxFeature *G.Nod
 	// mlp0.Shape: [userProfileDim+userBehaviorDim+itemFeatureDim+contextFeatureDim, 200]
 	// out.Shape: [batchSize, 200]
 	mlp0Out := G.Must(G.Sigmoid(G.Must(G.Mul(concat, din.mlp0))))
-	mlp0Out = G.Must(G.Dropout(mlp0Out, din.d0))
+	mlp0Out = G.Must(G.Dropout(mlp0Out, float64(din.d0)))
 	// mlp1.Shape: [200, 80]
 	// out.Shape: [batchSize, 80]
 	mlp1Out := G.Must(G.Sigmoid(G.Must(G.Mul(mlp0Out, din.mlp1))))
-	mlp1Out = G.Must(G.Dropout(mlp1Out, din.d1))
+	mlp1Out = G.Must(G.Dropout(mlp1Out, float64(din.d1)))
 	// mlp2.Shape: [80, 1]
 	// out.Shape: [batchSize, 1]
 	mlp2Out := G.Must(G.Sigmoid(G.Must(G.Mul(mlp1Out, din.mlp2))))
diff --git a/model/din/model.go b/model/din/model.go
index 7a044fe..3748354 100644
--- a/model/din/model.go
+++ b/model/din/model.go
@@ -13,7 +13,7 @@ import (
 	"gorgonia.org/tensor"
 )
 
-var dt = tensor.Float64
+var DT = tensor.Float32
 
 const (
 	// magic numbers for din paper
@@ -41,12 +41,12 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 	m Model,
 ) (err error) {
 	g := m.Graph()
-	xUserProfile := G.NewMatrix(g, dt, G.WithShape(batchSize, uProfileDim), G.WithName("xUserProfile"))
-	//xUserBehaviors := G.NewTensor(g, dt, 3, G.WithShape(batchSize, uBehaviorSize, uBehaviorDim), G.WithName("xUserBehaviors"))
-	xUserBehaviorMatrix := G.NewMatrix(g, dt, G.WithShape(batchSize, uBehaviorSize*uBehaviorDim), G.WithName("xUserBehaviorMatrix"))
-	xItemFeature := G.NewMatrix(g, dt, G.WithShape(batchSize, iFeatureDim), G.WithName("xItemFeature"))
-	xCtxFeature := G.NewMatrix(g, dt, G.WithShape(batchSize, cFeatureDim), G.WithName("xCtxFeature"))
-	y := G.NewTensor(g, dt, 2, G.WithShape(batchSize, 1), G.WithName("y"))
+	xUserProfile := G.NewMatrix(g, DT, G.WithShape(batchSize, uProfileDim), G.WithName("xUserProfile"))
+	//xUserBehaviors := G.NewTensor(g, DT, 3, G.WithShape(batchSize, uBehaviorSize, uBehaviorDim), G.WithName("xUserBehaviors"))
+	xUserBehaviorMatrix := G.NewMatrix(g, DT, G.WithShape(batchSize, uBehaviorSize*uBehaviorDim), G.WithName("xUserBehaviorMatrix"))
+	xItemFeature := G.NewMatrix(g, DT, G.WithShape(batchSize, iFeatureDim), G.WithName("xItemFeature"))
+	xCtxFeature := G.NewMatrix(g, DT, G.WithShape(batchSize, cFeatureDim), G.WithName("xCtxFeature"))
+	y := G.NewTensor(g, DT, 2, G.WithShape(batchSize, 1), G.WithName("y"))
 	//m := NewDinNet(g, uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim)
 	if err = m.Fwd(xUserProfile, xUserBehaviorMatrix, xItemFeature, xCtxFeature, batchSize, uBehaviorSize, uBehaviorDim); err != nil {
 		log.Fatalf("%+v", err)
@@ -55,8 +55,8 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 	//losses := G.Must(G.HadamardProd(G.Must(G.Neg(G.Must(G.Log(m.out)))), y))
 	//losses := G.Must(G.Square(G.Must(G.Sub(m.Out(), y))))
 	positive := G.Must(G.HadamardProd(G.Must(G.Log(m.Out())), y))
-	negative := G.Must(G.HadamardProd(G.Must(G.Log(G.Must(G.Sub(G.NewConstant(float64(1.0+1e-8)), m.Out())))), G.Must(G.Sub(G.NewConstant(float64(1.0)), y))))
-	//negative := G.Must(G.Log(G.Must(G.Sub(G.NewConstant(float64(1.000000001)), m.Out()))))
+	negative := G.Must(G.HadamardProd(G.Must(G.Log(G.Must(G.Sub(G.NewConstant(float32(1.0+1e-8)), m.Out())))), G.Must(G.Sub(G.NewConstant(float32(1.0)), y))))
+	//negative := G.Must(G.Log(G.Must(G.Sub(G.NewConstant(float32(1.000000001)), m.Out()))))
 	cost := G.Must(G.Neg(G.Must(G.Mean(G.Must(G.Add(positive, negative))))))
 
 	// we want to track costs
@@ -93,11 +93,11 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 	)
 	m.SetVM(vm)
 
-	//solver := G.NewRMSPropSolver(G.WithBatchSize(float64(batchSize)))
-	//solver := G.NewVanillaSolver(G.WithBatchSize(float64(batchSize)), G.WithLearnRate(0.001))
-	//solver := G.NewBarzilaiBorweinSolver(G.WithBatchSize(float64(batchSize)), G.WithLearnRate(0.001))
-	//solver := G.NewAdaGradSolver(G.WithBatchSize(float64(batchSize)), G.WithLearnRate(0.001))
-	//solver := G.NewMomentum(G.WithBatchSize(float64(batchSize)), G.WithLearnRate(0.001))
+	//solver := G.NewRMSPropSolver(G.WithBatchSize(float32(batchSize)))
+	//solver := G.NewVanillaSolver(G.WithBatchSize(float32(batchSize)), G.WithLearnRate(0.001))
+	//solver := G.NewBarzilaiBorweinSolver(G.WithBatchSize(float32(batchSize)), G.WithLearnRate(0.001))
+	//solver := G.NewAdaGradSolver(G.WithBatchSize(float32(batchSize)), G.WithLearnRate(0.001))
+	//solver := G.NewMomentum(G.WithBatchSize(float32(batchSize)), G.WithLearnRate(0.001))
 	solver := G.NewAdamSolver(G.WithLearnRate(0.01), G.WithBatchSize(float64(batchSize)), G.WithL2Reg(0.0001))
 	//defer func() {
 	//	vm.Close()
@@ -110,7 +110,7 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 	log.Printf("Batches %d", batches)
 	bar := pb.New(batches)
 	var (
-		bestCost  = math.MaxFloat64
+		bestCost  float32 = math.MaxFloat32
 		noImprove int
 	)
 
@@ -180,7 +180,7 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 			vm.Reset()
 			bar.Increment()
 		}
-		costVal := cost.Value().Data().(float64)
+		costVal := cost.Value().Data().(float32)
 		if costVal < bestCost {
 			bestCost = costVal
 			noImprove = 0
@@ -202,10 +202,10 @@ func InitForwardOnlyVm(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cF
 	m Model,
 ) (err error) {
 	g := m.Graph()
-	xUserProfile := G.NewMatrix(g, dt, G.WithShape(batchSize, uProfileDim), G.WithName("xUserProfile"))
-	xUserBehaviorMatrix := G.NewMatrix(g, dt, G.WithShape(batchSize, uBehaviorSize*uBehaviorDim), G.WithName("xUserBehaviorMatrix"))
-	xItemFeature := G.NewMatrix(g, dt, G.WithShape(batchSize, iFeatureDim), G.WithName("xItemFeature"))
-	xCtxFeature := G.NewMatrix(g, dt, G.WithShape(batchSize, cFeatureDim), G.WithName("xCtxFeature"))
+	xUserProfile := G.NewMatrix(g, DT, G.WithShape(batchSize, uProfileDim), G.WithName("xUserProfile"))
+	xUserBehaviorMatrix := G.NewMatrix(g, DT, G.WithShape(batchSize, uBehaviorSize*uBehaviorDim), G.WithName("xUserBehaviorMatrix"))
+	xItemFeature := G.NewMatrix(g, DT, G.WithShape(batchSize, iFeatureDim), G.WithName("xItemFeature"))
+	xCtxFeature := G.NewMatrix(g, DT, G.WithShape(batchSize, cFeatureDim), G.WithName("xCtxFeature"))
 	if err = m.Fwd(xUserProfile, xUserBehaviorMatrix, xItemFeature, xCtxFeature,
 		batchSize, uBehaviorSize, uBehaviorDim); err != nil {
 		return
@@ -224,7 +224,7 @@ func InitForwardOnlyVm(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cF
 	return
 }
 
-func Predict(m Model, numExamples, batchSize int, si *rcmd.SampleInfo, inputs tensor.Tensor) (y []float64, err error) {
+func Predict(m Model, numExamples, batchSize int, si *rcmd.SampleInfo, inputs tensor.Tensor) (y []float32, err error) {
 	//input nodes
 	inputNodes := m.In()
 	xUserProfile := inputNodes[0]
@@ -299,7 +299,7 @@ func Predict(m Model, numExamples, batchSize int, si *rcmd.SampleInfo, inputs te
 		}
 
 		//get y
-		yVal := outputNode.Value().Data().([]float64)
+		yVal := outputNode.Value().Data().([]float32)
 		for i := 0; i < end-start; i++ {
 			y = append(y, yVal[i])
 		}
@@ -312,14 +312,14 @@ func Predict(m Model, numExamples, batchSize int, si *rcmd.SampleInfo, inputs te
 func accuracy(prediction, y []float64) float64 {
 	var ok float64
 	for i := 0; i < len(prediction); i++ {
-		if math.Round(prediction[i]-y[i]) == 0 {
+		if math.Round(float64(prediction[i]-y[i])) == 0 {
 			ok += 1.0
 		}
 	}
 	return ok / float64(len(y))
 }
 
-func rocauc(pred, y []float64) float64 {
+func RocAuc(pred, y []float64) float64 {
 	boolY := make([]float64, len(y))
 	for i := 0; i < len(y); i++ {
 		if y[i] > 0.5 {
@@ -333,3 +333,22 @@ func rocauc(pred, y []float64) float64 {
 
 	return metrics.ROCAUCScore(yTrue, yScore, "", nil)
 }
+
+func RocAuc32(pred, y []float32) float32 {
+	boolY := make([]float64, len(y))
+	for i := 0; i < len(y); i++ {
+		if y[i] > 0.5 {
+			boolY[i] = 1.0
+		} else {
+			boolY[i] = 0.0
+		}
+	}
+	pred64 := make([]float64, len(pred))
+	for i := 0; i < len(pred); i++ {
+		pred64[i] = float64(pred[i])
+	}
+	yTrue := mat.NewDense(len(y), 1, boolY)
+	yScore := mat.NewDense(len(pred), 1, pred64)
+
+	return float32(metrics.ROCAUCScore(yTrue, yScore, "", nil))
+}
diff --git a/model/din/model_test.go b/model/din/model_test.go
index 36f973f..acb3de6 100644
--- a/model/din/model_test.go
+++ b/model/din/model_test.go
@@ -37,39 +37,39 @@ func TestMultiModel(t *testing.T) {
 		}
 		inputWidth = uProfileDim + uBehaviorSize*uBehaviorDim + iFeatureDim + cFeatureDim
 	)
-	inputSlice := make([]float64, numExamples*inputWidth)
+	inputSlice := make([]float32, numExamples*inputWidth)
 	for i := 0; i < numExamples; i++ {
 		for j := 0; j < sampleInfo.UserProfileRange[1]; j++ {
-			inputSlice[i*inputWidth+j] = rand.Float64()
+			inputSlice[i*inputWidth+j] = rand.Float32()
 		}
 		for j := sampleInfo.CtxFeatureRange[0]; j < sampleInfo.CtxFeatureRange[1]; j++ {
-			inputSlice[i*inputWidth+j] = rand.Float64()
+			inputSlice[i*inputWidth+j] = rand.Float32()
 		}
 		for j := sampleInfo.UserBehaviorRange[0] + uBehaviorDim; j < sampleInfo.UserBehaviorRange[0]+2*uBehaviorDim; j++ {
-			inputSlice[i*inputWidth+j] = rand.Float64()
+			inputSlice[i*inputWidth+j] = rand.Float32()
 		}
 		for j := sampleInfo.ItemFeatureRange[0]; j < sampleInfo.ItemFeatureRange[1]; j++ {
-			inputSlice[i*inputWidth+j] = rand.Float64()
+			inputSlice[i*inputWidth+j] = rand.Float32()
 		}
 	}
 	//for i := 0; i < numExamples*inputWidth; i++ {
-	//	inputSlice[i] = rand.Float64()
+	//	inputSlice[i] = rand.Float32()
 	//}
 	inputs := tensor.New(tensor.WithShape(numExamples, inputWidth), tensor.WithBacking(inputSlice))
-	labelSlice := make([]float64, numExamples)
+	labelSlice := make([]float32, numExamples)
 	for i := 0; i < numExamples; i++ {
 		//distance of uProfile and cFeature slice
-		var dist1, dist2 float64
+		var dist1, dist2 float32
 		for j := 0; j < uProfileDim; j++ {
-			dist1 += math.Abs(inputSlice[i*inputWidth+sampleInfo.UserProfileRange[0]+j] - inputSlice[i*inputWidth+sampleInfo.CtxFeatureRange[0]+j])
+			dist1 += float32(math.Abs(float64(inputSlice[i*inputWidth+sampleInfo.UserProfileRange[0]+j] - inputSlice[i*inputWidth+sampleInfo.CtxFeatureRange[0]+j])))
 		}
-		labelSlice[i] = dist1 / float64(uProfileDim)
+		labelSlice[i] = dist1 / float32(uProfileDim)
 
 		//distance of 2nd uBehavior and iFeature
 		for j := 0; j < uBehaviorDim; j++ {
-			dist2 += math.Abs(inputSlice[i*inputWidth+sampleInfo.UserBehaviorRange[0]+uBehaviorDim+j] - inputSlice[i*inputWidth+sampleInfo.ItemFeatureRange[0]+j])
+			dist2 += float32(math.Abs(float64(inputSlice[i*inputWidth+sampleInfo.UserBehaviorRange[0]+uBehaviorDim+j] - inputSlice[i*inputWidth+sampleInfo.ItemFeatureRange[0]+j])))
 		}
-		labelSlice[i] = math.Round((labelSlice[i] + (dist2 / float64(uBehaviorDim))) * 0.6)
+		labelSlice[i] = float32(math.Round(float64((labelSlice[i] + (dist2 / float32(uBehaviorDim))) * 0.6)))
 	}
 
 	labels := tensor.New(tensor.WithShape(numExamples, 1), tensor.WithBacking(labelSlice))
@@ -103,7 +103,7 @@ func TestMultiModel(t *testing.T) {
 		So(predictions, ShouldNotBeNil)
 		So(predictions, ShouldHaveLength, testSamples)
 		log.Debugf("predictions: %+v", predictions)
-		auc := rocauc(predictions, labels.Data().([]float64)[:testSamples])
+		auc := RocAuc32(predictions, labels.Data().([]float32)[:testSamples])
 		log.Debugf("auc: %f", auc)
 		So(auc, ShouldBeGreaterThan, 0.5)
 	})
@@ -136,7 +136,7 @@ func TestMultiModel(t *testing.T) {
 		So(predictions, ShouldNotBeNil)
 		So(predictions, ShouldHaveLength, testSamples)
 		log.Debugf("predictions: %+v", predictions)
-		auc := rocauc(predictions, labels.Data().([]float64)[:testSamples])
+		auc := RocAuc32(predictions, labels.Data().([]float32)[:testSamples])
 		log.Debugf("auc: %f", auc)
 		So(auc, ShouldBeGreaterThan, 0.5)
 	})
diff --git a/model/din/simplemlp.go b/model/din/simplemlp.go
index c993e33..fa8c40d 100644
--- a/model/din/simplemlp.go
+++ b/model/din/simplemlp.go
@@ -19,7 +19,7 @@ type SimpleMLP struct {
 	xUserProfile, xUbMatrix, xItemFeature, xCtxFeature *G.Node
 	//learnable nodes
 	mlp0, mlp1, mlp2 *G.Node
-	d0, d1           float64 // dropout probabilities
+	d0, d1           float32 // dropout probabilities
 	out              *G.Node
 }
 
@@ -33,9 +33,9 @@ type mlpModel struct {
 	UBehaviorDim  int       `json:"uBehaviorDim"`
 	IFeatureDim   int       `json:"iFeatureDim"`
 	CFeatureDim   int       `json:"cFeatureDim"`
-	Mlp0          []float64 `json:"mlp0"`
-	Mlp1          []float64 `json:"mlp1"`
-	Mlp2          []float64 `json:"mlp2"`
+	Mlp0          []float32 `json:"mlp0"`
+	Mlp1          []float32 `json:"mlp1"`
+	Mlp2          []float32 `json:"mlp2"`
 }
 
 func (mlp *SimpleMLP) Marshal() (data []byte, err error) {
@@ -45,9 +45,9 @@ func (mlp *SimpleMLP) Marshal() (data []byte, err error) {
 		UBehaviorDim:  mlp.uBehaviorDim,
 		IFeatureDim:   mlp.iFeatureDim,
 		CFeatureDim:   mlp.cFeatureDim,
-		Mlp0:          mlp.mlp0.Value().Data().([]float64),
-		Mlp1:          mlp.mlp1.Value().Data().([]float64),
-		Mlp2:          mlp.mlp2.Value().Data().([]float64),
+		Mlp0:          mlp.mlp0.Value().Data().([]float32),
+		Mlp1:          mlp.mlp1.Value().Data().([]float32),
+		Mlp2:          mlp.mlp2.Value().Data().([]float32),
 	}
 	return json.Marshal(model)
 }
@@ -67,19 +67,19 @@ func NewSimpleMLPFromJson(data []byte) (mlp *SimpleMLP, err error) {
 		mlp0_0        = uProfileDim + uBehaviorSize*uBehaviorDim + iFeatureDim + cFeatureDim
 	)
 
-	mlp0 := G.NewMatrix(g, dt,
+	mlp0 := G.NewMatrix(g, DT,
 		G.WithShape(mlp0_0, mlp0_1),
 		G.WithName("mlp0"),
 		G.WithValue(tensor.New(tensor.WithShape(mlp0_0, mlp0_1), tensor.WithBacking(m.Mlp0))),
 	)
 
-	mlp1 := G.NewMatrix(g, dt,
+	mlp1 := G.NewMatrix(g, DT,
 		G.WithShape(mlp0_1, mlp1_2),
 		G.WithName("mlp1"),
 		G.WithValue(tensor.New(tensor.WithShape(mlp0_1, mlp1_2), tensor.WithBacking(m.Mlp1))),
 	)
 
-	mlp2 := G.NewMatrix(g, dt,
+	mlp2 := G.NewMatrix(g, DT,
 		G.WithShape(mlp1_2, 1),
 		G.WithName("mlp2"),
 		G.WithValue(tensor.New(tensor.WithShape(mlp1_2, 1), tensor.WithBacking(m.Mlp2))),
@@ -114,9 +114,9 @@ func NewSimpleMLP(
 	cFeatureDim int,
 ) (mlp *SimpleMLP) {
 	g := G.NewGraph()
-	mlp0 := G.NewMatrix(g, G.Float64, G.WithShape(uProfileDim+uBehaviorSize*uBehaviorDim+iFeatureDim+cFeatureDim, mlp0_1), G.WithName("mlp0"), G.WithInit(G.HeN(1.0)))
-	mlp1 := G.NewMatrix(g, G.Float64, G.WithShape(mlp0_1, mlp1_2), G.WithName("mlp1"), G.WithInit(G.HeN(1.0)))
-	mlp2 := G.NewMatrix(g, G.Float64, G.WithShape(mlp1_2, 1), G.WithName("mlp2"), G.WithInit(G.HeN(1.0)))
+	mlp0 := G.NewMatrix(g, G.Float32, G.WithShape(uProfileDim+uBehaviorSize*uBehaviorDim+iFeatureDim+cFeatureDim, mlp0_1), G.WithName("mlp0"), G.WithInit(G.Gaussian(0, 1.0)))
+	mlp1 := G.NewMatrix(g, G.Float32, G.WithShape(mlp0_1, mlp1_2), G.WithName("mlp1"), G.WithInit(G.Gaussian(0, 1.0)))
+	mlp2 := G.NewMatrix(g, G.Float32, G.WithShape(mlp1_2, 1), G.WithName("mlp2"), G.WithInit(G.Gaussian(0, 1.0)))
 	return &SimpleMLP{
 		uProfileDim:   uProfileDim,
 		uBehaviorSize: uBehaviorSize,
@@ -154,9 +154,9 @@ func (mlp *SimpleMLP) Fwd(xUserProfile, ubMatrix, xItemFeature, xCtxFeature *G.N
 	x := G.Must(G.Concat(1, xUserProfile, xUserBehaviors, xItemFeature, xCtxFeature))
 	// mlp
 	mlp0Out := G.Must(G.Sigmoid(G.Must(G.Mul(x, mlp.mlp0))))
-	mlp0Out = G.Must(G.Dropout(mlp0Out, mlp.d0))
+	mlp0Out = G.Must(G.Dropout(mlp0Out, float64(mlp.d0)))
 	mlp1Out := G.Must(G.Sigmoid(G.Must(G.Mul(mlp0Out, mlp.mlp1))))
-	mlp1Out = G.Must(G.Dropout(mlp1Out, mlp.d1))
+	mlp1Out = G.Must(G.Dropout(mlp1Out, float64(mlp.d1)))
 
 	mlp.out = G.Must(G.Sigmoid(G.Must(G.Mul(mlp1Out, mlp.mlp2))))
 	mlp.xUserProfile = xUserProfile
diff --git a/ps/batch.go b/ps/batch.go
deleted file mode 100644
index 91d5e64..0000000
--- a/ps/batch.go
+++ /dev/null
@@ -1,187 +0,0 @@
-package ps
-
-import (
-	"sync"
-	"time"
-
-	"github.com/auxten/edgeRec/nn"
-)
-
-// BatchTrainer implements parallelized batch training
-type BatchTrainer struct {
-	*internalb
-	verbosity   int
-	batchSize   int
-	parallelism int
-	solver      Solver
-	printer     *StatsPrinter
-}
-
-type internalb struct {
-	deltas            [][][]float64
-	partialDeltas     [][][][]float64
-	accumulatedDeltas [][][]float64
-	moments           [][][]float64
-}
-
-func newBatchTraining(layers []*nn.Layer, parallelism int) *internalb {
-	deltas := make([][][]float64, parallelism)
-	partialDeltas := make([][][][]float64, parallelism)
-	accumulatedDeltas := make([][][]float64, len(layers))
-	for w := 0; w < parallelism; w++ {
-		deltas[w] = make([][]float64, len(layers))
-		partialDeltas[w] = make([][][]float64, len(layers))
-
-		for i, l := range layers {
-			deltas[w][i] = make([]float64, len(l.Neurons))
-			accumulatedDeltas[i] = make([][]float64, len(l.Neurons))
-			partialDeltas[w][i] = make([][]float64, len(l.Neurons))
-			for j, n := range l.Neurons {
-				partialDeltas[w][i][j] = make([]float64, len(n.In))
-				accumulatedDeltas[i][j] = make([]float64, len(n.In))
-			}
-		}
-	}
-	return &internalb{
-		deltas:            deltas,
-		partialDeltas:     partialDeltas,
-		accumulatedDeltas: accumulatedDeltas,
-	}
-}
-
-// NewBatchTrainer returns a BatchTrainer
-func NewBatchTrainer(solver Solver, verbosity, batchSize, parallelism int) *BatchTrainer {
-	return &BatchTrainer{
-		solver:      solver,
-		verbosity:   verbosity,
-		batchSize:   iparam(batchSize, 1),
-		parallelism: iparam(parallelism, 1),
-		printer:     NewStatsPrinter(),
-	}
-}
-
-// Train trains n
-func (t *BatchTrainer) Train(n *nn.Neural, examples, validation Samples, iterations int, shuffle bool) {
-	t.internalb = newBatchTraining(n.Layers, t.parallelism)
-
-	train := make(Samples, len(examples))
-	copy(train, examples)
-
-	workCh := make(chan Sample, t.parallelism)
-	nets := make([]*nn.Neural, t.parallelism)
-
-	wg := sync.WaitGroup{}
-	for i := 0; i < t.parallelism; i++ {
-		nets[i] = nn.NewNeural(n.Config)
-
-		go func(id int, workCh <-chan Sample) {
-			n := nets[id]
-			for e := range workCh {
-				n.Forward(e.Input)
-				t.calculateDeltas(n, e.Response, id)
-				wg.Done()
-			}
-		}(i, workCh)
-	}
-
-	t.printer.Init(n)
-	t.solver.Init(n.NumWeights())
-
-	ts := time.Now()
-	for it := 1; it <= iterations; it++ {
-		if shuffle {
-			train.Shuffle()
-		}
-		batches := train.SplitSize(t.batchSize)
-
-		for _, b := range batches {
-			currentWeights := n.Weights()
-			for _, n := range nets {
-				n.ApplyWeights(currentWeights)
-			}
-
-			wg.Add(len(b))
-			for _, item := range b {
-				workCh <- item
-			}
-			wg.Wait()
-
-			for _, wPD := range t.partialDeltas {
-				for i, iPD := range wPD {
-					iAD := t.accumulatedDeltas[i]
-					for j, jPD := range iPD {
-						jAD := iAD[j]
-						for k, v := range jPD {
-							jAD[k] += v
-							jPD[k] = 0
-						}
-					}
-				}
-			}
-
-			t.update(n, it)
-		}
-
-		if t.verbosity > 0 && it%t.verbosity == 0 && len(validation) > 0 {
-			t.printer.PrintProgress(n, validation, time.Since(ts), it)
-		}
-	}
-}
-
-func (t *BatchTrainer) calculateDeltas(n *nn.Neural, ideal []float64, wid int) {
-	loss := nn.GetLoss(n.Config.Loss)
-	deltas := t.deltas[wid]
-	partialDeltas := t.partialDeltas[wid]
-	lastDeltas := deltas[len(n.Layers)-1]
-
-	for i, n := range n.Layers[len(n.Layers)-1].Neurons {
-		lastDeltas[i] = loss.Df(
-			n.Value,
-			ideal[i],
-			n.DActivate(n.Value))
-	}
-
-	for i := len(n.Layers) - 2; i >= 0; i-- {
-		l := n.Layers[i]
-		iD := deltas[i]
-		nextD := deltas[i+1]
-		for j, n := range l.Neurons {
-			var sum float64
-			for k, s := range n.Out {
-				sum += s.Weight * nextD[k]
-			}
-			iD[j] = n.DActivate(n.Value) * sum
-		}
-	}
-
-	for i, l := range n.Layers {
-		iD := deltas[i]
-		iPD := partialDeltas[i]
-		for j, n := range l.Neurons {
-			jD := iD[j]
-			jPD := iPD[j]
-			for k, s := range n.In {
-				jPD[k] += jD * s.In
-			}
-		}
-	}
-}
-
-func (t *BatchTrainer) update(n *nn.Neural, it int) {
-	var idx int
-	for i, l := range n.Layers {
-		iAD := t.accumulatedDeltas[i]
-		for j, n := range l.Neurons {
-			jAD := iAD[j]
-			for k, s := range n.In {
-				update := t.solver.Update(s.Weight,
-					jAD[k],
-					it,
-					idx)
-				s.Weight += update
-				jAD[k] = 0
-				idx++
-			}
-		}
-	}
-}
diff --git a/ps/batch_test.go b/ps/batch_test.go
deleted file mode 100644
index 7c11a79..0000000
--- a/ps/batch_test.go
+++ /dev/null
@@ -1,39 +0,0 @@
-package ps
-
-import (
-	"math/rand"
-	"runtime"
-	"testing"
-
-	"github.com/auxten/edgeRec/nn"
-)
-
-func Benchmark_xor(b *testing.B) {
-	rand.Seed(0)
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     2,
-		Layout:     []int{32, 32, 1},
-		Activation: nn.ActivationSigmoid,
-		Mode:       nn.ModeBinary,
-		Weight:     nn.NewUniform(.25, 0),
-		Bias:       true,
-	})
-	exs := Samples{
-		{[]float64{0, 0}, []float64{0}},
-		{[]float64{1, 0}, []float64{1}},
-		{[]float64{0, 1}, []float64{1}},
-		{[]float64{1, 1}, []float64{0}},
-	}
-	const minSamples = 4000
-	var dupExs Samples
-	for len(dupExs) < minSamples {
-		dupExs = append(dupExs, exs...)
-	}
-
-	for i := 0; i < b.N; i++ {
-		const iterations = 20
-		solver := NewAdam(0.001, 0.9, 0.999, 1e-8)
-		trainer := NewBatchTrainer(solver, iterations, len(dupExs)/2, runtime.NumCPU())
-		trainer.Train(n, dupExs, dupExs, iterations, true)
-	}
-}
diff --git a/ps/printer.go b/ps/printer.go
deleted file mode 100644
index ccae573..0000000
--- a/ps/printer.go
+++ /dev/null
@@ -1,68 +0,0 @@
-package ps
-
-import (
-	"fmt"
-	"os"
-	"text/tabwriter"
-	"time"
-
-	"github.com/auxten/edgeRec/nn"
-)
-
-// StatsPrinter prints training progress
-type StatsPrinter struct {
-	w *tabwriter.Writer
-}
-
-// NewStatsPrinter creates a StatsPrinter
-func NewStatsPrinter() *StatsPrinter {
-	return &StatsPrinter{tabwriter.NewWriter(os.Stdout, 16, 0, 3, ' ', 0)}
-}
-
-// Init initializes printer
-func (p *StatsPrinter) Init(n *nn.Neural) {
-	fmt.Fprintf(p.w, "Epochs\tElapsed\tLoss (%s)\t", n.Config.Loss)
-	if n.Config.Mode == nn.ModeMultiClass {
-		fmt.Fprintf(p.w, "Accuracy\t\n---\t---\t---\t---\t\n")
-	} else {
-		fmt.Fprintf(p.w, "\n---\t---\t---\t\n")
-	}
-}
-
-// PrintProgress prints the current state of training
-func (p *StatsPrinter) PrintProgress(n *nn.Neural, validation Samples, elapsed time.Duration, iteration int) {
-	fmt.Fprintf(p.w, "%d\t%s\t%.4f\t%s\n",
-		iteration,
-		elapsed.String(),
-		crossValidate(n, validation),
-		formatAccuracy(n, validation))
-	p.w.Flush()
-}
-
-func formatAccuracy(n *nn.Neural, validation Samples) string {
-	if n.Config.Mode == nn.ModeMultiClass {
-		return fmt.Sprintf("%.2f\t", accuracy(n, validation))
-	}
-	return ""
-}
-
-func accuracy(n *nn.Neural, validation Samples) float64 {
-	correct := 0
-	for _, e := range validation {
-		est := n.Predict(e.Input)
-		if nn.ArgMax(e.Response) == nn.ArgMax(est) {
-			correct++
-		}
-	}
-	return float64(correct) / float64(len(validation))
-}
-
-func crossValidate(n *nn.Neural, validation Samples) float64 {
-	predictions, responses := make([][]float64, len(validation)), make([][]float64, len(validation))
-	for i := 0; i < len(validation); i++ {
-		predictions[i] = n.Predict(validation[i].Input)
-		responses[i] = validation[i].Response
-	}
-
-	return nn.GetLoss(n.Config.Loss).F(predictions, responses)
-}
diff --git a/ps/sample.go b/ps/sample.go
deleted file mode 100644
index 8a57fdb..0000000
--- a/ps/sample.go
+++ /dev/null
@@ -1,58 +0,0 @@
-package ps
-
-import "math/rand"
-
-// Sample is an input-target pair
-type Sample struct {
-	Input    []float64
-	Response []float64
-}
-
-// Samples is a set of input-output pairs
-type Samples []Sample
-
-// Shuffle shuffles slice in-place
-func (e Samples) Shuffle() {
-	for i := range e {
-		j := rand.Intn(i + 1)
-		e[i], e[j] = e[j], e[i]
-	}
-}
-
-// Split assigns each element to two new slices
-// according to probability p
-func (e Samples) Split(p float64) (first, second Samples) {
-	for i := 0; i < len(e); i++ {
-		if p > rand.Float64() {
-			first = append(first, e[i])
-		} else {
-			second = append(second, e[i])
-		}
-	}
-	return
-}
-
-// SplitSize splits slice into parts of size size
-func (e Samples) SplitSize(size int) []Samples {
-	res := make([]Samples, 0)
-	for i := 0; i < len(e); i += size {
-		res = append(res, e[i:min(i+size, len(e))])
-	}
-	return res
-}
-
-// SplitN splits slice into n parts
-func (e Samples) SplitN(n int) []Samples {
-	res := make([]Samples, n)
-	for i, el := range e {
-		res[i%n] = append(res[i%n], el)
-	}
-	return res
-}
-
-func min(a, b int) int {
-	if a <= b {
-		return a
-	}
-	return b
-}
diff --git a/ps/sample_test.go b/ps/sample_test.go
deleted file mode 100644
index 57255d2..0000000
--- a/ps/sample_test.go
+++ /dev/null
@@ -1,39 +0,0 @@
-package ps
-
-import (
-	"math/rand"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func Test_SplitSize(t *testing.T) {
-	e := make(Samples, 10)
-
-	batches := e.SplitSize(2)
-	assert.Len(t, batches, 5)
-	for _, batch := range batches {
-		assert.Equal(t, 2, len(batch))
-	}
-}
-
-func Test_SplitN(t *testing.T) {
-	e := make(Samples, 10)
-
-	partitions := e.SplitN(3)
-	assert.Len(t, partitions, 3)
-	assert.Len(t, partitions[0], 4)
-	assert.Len(t, partitions[1], 3)
-	assert.Len(t, partitions[2], 3)
-}
-
-func Test_Split(t *testing.T) {
-	rand.Seed(0)
-
-	e := make(Samples, 100)
-
-	a, b := e.Split(0.5)
-
-	assert.InEpsilon(t, len(a), 50, 0.1)
-	assert.InEpsilon(t, len(b), 50, 0.1)
-}
diff --git a/ps/solver.go b/ps/solver.go
deleted file mode 100644
index 42674e5..0000000
--- a/ps/solver.go
+++ /dev/null
@@ -1,95 +0,0 @@
-package ps
-
-import "math"
-
-// Solver implements an update rule for training a NN
-type Solver interface {
-	Init(size int)
-	Update(value, gradient float64, iteration, idx int) float64
-}
-
-// SGD is stochastic gradient descent with nesterov/momentum
-type SGD struct {
-	lr       float64
-	decay    float64
-	momentum float64
-	nesterov bool
-	moments  []float64
-}
-
-// NewSGD returns a new SGD solver
-func NewSGD(lr, momentum, decay float64, nesterov bool) *SGD {
-	return &SGD{
-		lr:       fparam(lr, 0.01),
-		decay:    decay,
-		momentum: momentum,
-		nesterov: nesterov,
-	}
-}
-
-// Init initializes vectors using number of weights in network
-func (o *SGD) Init(size int) {
-	o.moments = make([]float64, size)
-}
-
-// Update returns the update for a given weight
-func (o *SGD) Update(value, gradient float64, iteration, idx int) float64 {
-	lr := o.lr / (1 + o.decay*float64(iteration))
-
-	o.moments[idx] = o.momentum*o.moments[idx] - lr*gradient
-
-	if o.nesterov {
-		o.moments[idx] = o.momentum*o.moments[idx] - lr*gradient
-	}
-
-	return o.moments[idx]
-}
-
-// Adam is an Adam solver
-type Adam struct {
-	lr      float64
-	beta    float64
-	beta2   float64
-	epsilon float64
-
-	v, m []float64
-}
-
-// NewAdam returns a new Adam solver
-func NewAdam(lr, beta, beta2, epsilon float64) *Adam {
-	return &Adam{
-		lr:      fparam(lr, 0.001),
-		beta:    fparam(beta, 0.9),
-		beta2:   fparam(beta2, 0.999),
-		epsilon: fparam(epsilon, 1e-8),
-	}
-}
-
-// Init initializes vectors using number of weights in network
-func (o *Adam) Init(size int) {
-	o.v, o.m = make([]float64, size), make([]float64, size)
-}
-
-// Update returns the update for a given weight
-func (o *Adam) Update(value, gradient float64, t, idx int) float64 {
-	lrt := o.lr * (math.Sqrt(1.0 - math.Pow(o.beta2, float64(t)))) /
-		(1.0 - math.Pow(o.beta, float64(t)))
-	o.m[idx] = o.beta*o.m[idx] + (1.0-o.beta)*gradient
-	o.v[idx] = o.beta2*o.v[idx] + (1.0-o.beta2)*math.Pow(gradient, 2.0)
-
-	return -lrt * (o.m[idx] / (math.Sqrt(o.v[idx]) + o.epsilon))
-}
-
-func fparam(val, fallback float64) float64 {
-	if val == 0.0 {
-		return fallback
-	}
-	return val
-}
-
-func iparam(val, fallback int) int {
-	if val == 0 {
-		return fallback
-	}
-	return val
-}
diff --git a/ps/trainer.go b/ps/trainer.go
deleted file mode 100644
index a1cbb25..0000000
--- a/ps/trainer.go
+++ /dev/null
@@ -1,109 +0,0 @@
-package ps
-
-import (
-	"time"
-
-	"github.com/auxten/edgeRec/nn"
-)
-
-// Trainer is a neural network trainer
-type Trainer interface {
-	Train(n *nn.Neural, examples, validation Samples, iterations int, shuffle bool)
-}
-
-// OnlineTrainer is a basic, online network trainer
-type OnlineTrainer struct {
-	*internal
-	solver    Solver
-	printer   *StatsPrinter
-	verbosity int
-}
-
-// NewTrainer creates a new trainer
-func NewTrainer(solver Solver, verbosity int) *OnlineTrainer {
-	return &OnlineTrainer{
-		solver:    solver,
-		printer:   NewStatsPrinter(),
-		verbosity: verbosity,
-	}
-}
-
-type internal struct {
-	deltas [][]float64
-}
-
-func newTraining(layers []*nn.Layer) *internal {
-	deltas := make([][]float64, len(layers))
-	for i, l := range layers {
-		deltas[i] = make([]float64, len(l.Neurons))
-	}
-	return &internal{
-		deltas: deltas,
-	}
-}
-
-// Train trains n
-func (t *OnlineTrainer) Train(n *nn.Neural, examples, validation Samples, iterations int, shuffle bool) {
-	t.internal = newTraining(n.Layers)
-
-	t.printer.Init(n)
-	t.solver.Init(n.NumWeights())
-
-	ts := time.Now()
-	for i := 1; i <= iterations; i++ {
-		if shuffle {
-			examples.Shuffle()
-		}
-		for j := 0; j < len(examples); j++ {
-			t.learn(n, examples[j], i)
-		}
-		if t.verbosity > 0 && i%t.verbosity == 0 && len(validation) > 0 {
-			t.printer.PrintProgress(n, validation, time.Since(ts), i)
-		}
-	}
-}
-
-func (t *OnlineTrainer) learn(n *nn.Neural, e Sample, it int) {
-	n.Forward(e.Input)
-	t.calculateDeltas(n, e.Response)
-	t.update(n, it)
-}
-
-func (t *OnlineTrainer) Predict(n *nn.Neural, input []float64) []float64 {
-	return n.Predict(input)
-}
-
-func (t *OnlineTrainer) calculateDeltas(n *nn.Neural, ideal []float64) {
-	for i, neuron := range n.Layers[len(n.Layers)-1].Neurons {
-		t.deltas[len(n.Layers)-1][i] = nn.GetLoss(n.Config.Loss).Df(
-			neuron.Value,
-			ideal[i],
-			neuron.DActivate(neuron.Value))
-	}
-
-	for i := len(n.Layers) - 2; i >= 0; i-- {
-		for j, neuron := range n.Layers[i].Neurons {
-			var sum float64
-			for k, s := range neuron.Out {
-				sum += s.Weight * t.deltas[i+1][k]
-			}
-			t.deltas[i][j] = neuron.DActivate(neuron.Value) * sum
-		}
-	}
-}
-
-func (t *OnlineTrainer) update(n *nn.Neural, it int) {
-	var idx int
-	for i, l := range n.Layers {
-		for j := range l.Neurons {
-			for k := range l.Neurons[j].In {
-				update := t.solver.Update(l.Neurons[j].In[k].Weight,
-					t.deltas[i][j]*l.Neurons[j].In[k].In,
-					it,
-					idx)
-				l.Neurons[j].In[k].Weight += update
-				idx++
-			}
-		}
-	}
-}
diff --git a/ps/trainer_test.go b/ps/trainer_test.go
deleted file mode 100644
index 0bdce69..0000000
--- a/ps/trainer_test.go
+++ /dev/null
@@ -1,244 +0,0 @@
-package ps
-
-import (
-	"fmt"
-	"math"
-	"math/rand"
-	"testing"
-
-	"github.com/auxten/edgeRec/nn"
-	"github.com/stretchr/testify/assert"
-)
-
-func Test_BoundedRegression(t *testing.T) {
-	rand.Seed(0)
-
-	funcs := []func(float64) float64{
-		math.Sin,
-		func(x float64) float64 { return math.Pow(x, 2) },
-		math.Sqrt,
-	}
-
-	for _, f := range funcs {
-
-		data := Samples{}
-		for i := 0.0; i < 1; i += 0.01 {
-			data = append(data, Sample{Input: []float64{i}, Response: []float64{f(i)}})
-		}
-		n := nn.NewNeural(&nn.Config{
-			Inputs:     1,
-			Layout:     []int{4, 4, 1},
-			Activation: nn.ActivationTanh,
-			Mode:       nn.ModeRegression,
-			Weight:     nn.NewUniform(0.5, 0),
-			Bias:       true,
-		})
-
-		trainer := NewTrainer(NewSGD(0.25, 0.5, 0, false), 0)
-		trainer.Train(n, data, nil, 5000, true)
-
-		tests := []float64{0.0, 0.1, 0.25, 0.5, 0.75, 0.9}
-		for _, x := range tests {
-			assert.InEpsilon(t, f(x)+1, n.Predict([]float64{x})[0]+1, 0.1)
-		}
-	}
-}
-
-func Test_RegressionLinearOuts(t *testing.T) {
-	rand.Seed(0)
-	squares := Samples{}
-	for i := 0.0; i < 100.0; i++ {
-		squares = append(squares, Sample{Input: []float64{i}, Response: []float64{math.Sqrt(i)}})
-	}
-	squares.Shuffle()
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     1,
-		Layout:     []int{3, 3, 1},
-		Activation: nn.ActivationReLU,
-		Mode:       nn.ModeRegression,
-		Weight:     nn.NewNormal(0.5, 0.5),
-		Bias:       true,
-	})
-
-	trainer := NewBatchTrainer(NewAdam(0.01, 0, 0, 0), 0, 25, 2)
-	trainer.Train(n, squares, nil, 25000, true)
-
-	for i := 0; i < 100; i++ {
-		x := float64(rand.Intn(99) + 1)
-		assert.InEpsilon(t, math.Sqrt(x)+1, n.Predict([]float64{x})[0]+1, 0.1)
-	}
-}
-
-func Test_Training(t *testing.T) {
-	rand.Seed(0)
-
-	data := Samples{
-		Sample{[]float64{0}, []float64{0}},
-		Sample{[]float64{0}, []float64{0}},
-		Sample{[]float64{0}, []float64{0}},
-		Sample{[]float64{5}, []float64{1}},
-		Sample{[]float64{5}, []float64{1}},
-	}
-
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     1,
-		Layout:     []int{5, 1},
-		Activation: nn.ActivationSigmoid,
-		Weight:     nn.NewUniform(0.5, 0),
-		Bias:       true,
-	})
-
-	trainer := NewTrainer(NewSGD(0.5, 0.1, 0, false), 0)
-	trainer.Train(n, data, nil, 1000, true)
-
-	v := n.Predict([]float64{0})
-	assert.InEpsilon(t, 1, 1+v[0], 0.1)
-	v = n.Predict([]float64{5})
-	assert.InEpsilon(t, 1.0, v[0], 0.1)
-}
-
-var data = []Sample{
-	{[]float64{2.7810836, 2.550537003}, []float64{0}},
-	{[]float64{1.465489372, 2.362125076}, []float64{0}},
-	{[]float64{3.396561688, 4.400293529}, []float64{0}},
-	{[]float64{1.38807019, 1.850220317}, []float64{0}},
-	{[]float64{3.06407232, 3.005305973}, []float64{0}},
-	{[]float64{7.627531214, 2.759262235}, []float64{1}},
-	{[]float64{5.332441248, 2.088626775}, []float64{1}},
-	{[]float64{6.922596716, 1.77106367}, []float64{1}},
-	{[]float64{8.675418651, -0.242068655}, []float64{1}},
-	{[]float64{7.673756466, 3.508563011}, []float64{1}},
-}
-
-func Test_Prediction(t *testing.T) {
-	rand.Seed(0)
-
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     2,
-		Layout:     []int{2, 2, 1},
-		Activation: nn.ActivationSigmoid,
-		Weight:     nn.NewUniform(0.5, 0),
-		Bias:       true,
-	})
-	trainer := NewTrainer(NewSGD(0.5, 0.1, 0, false), 0)
-
-	trainer.Train(n, data, nil, 5000, true)
-
-	for _, d := range data {
-		assert.InEpsilon(t, trainer.Predict(n, d.Input)[0]+1, d.Response[0]+1, 0.1)
-	}
-}
-
-func Test_CrossVal(t *testing.T) {
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     2,
-		Layout:     []int{1, 1},
-		Activation: nn.ActivationTanh,
-		Loss:       nn.LossMeanSquared,
-		Weight:     nn.NewUniform(0.5, 0),
-		Bias:       true,
-	})
-
-	trainer := NewTrainer(NewSGD(0.5, 0.1, 0, false), 0)
-	trainer.Train(n, data, data, 1000, true)
-
-	for _, d := range data {
-		assert.InEpsilon(t, n.Predict(d.Input)[0]+1, d.Response[0]+1, 0.1)
-		assert.InEpsilon(t, 1, crossValidate(n, data)+1, 0.01)
-	}
-}
-
-func Test_MultiClass(t *testing.T) {
-	var data = []Sample{
-		{[]float64{2.7810836, 2.550537003}, []float64{1, 0}},
-		{[]float64{1.465489372, 2.362125076}, []float64{1, 0}},
-		{[]float64{3.396561688, 4.400293529}, []float64{1, 0}},
-		{[]float64{1.38807019, 1.850220317}, []float64{1, 0}},
-		{[]float64{3.06407232, 3.005305973}, []float64{1, 0}},
-		{[]float64{7.627531214, 2.759262235}, []float64{0, 1}},
-		{[]float64{5.332441248, 2.088626775}, []float64{0, 1}},
-		{[]float64{6.922596716, 1.77106367}, []float64{0, 1}},
-		{[]float64{8.675418651, -0.242068655}, []float64{0, 1}},
-		{[]float64{7.673756466, 3.508563011}, []float64{0, 1}},
-	}
-
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     2,
-		Layout:     []int{2, 2},
-		Activation: nn.ActivationReLU,
-		Mode:       nn.ModeMultiClass,
-		Loss:       nn.LossMeanSquared,
-		Weight:     nn.NewUniform(0.1, 0),
-		Bias:       true,
-	})
-
-	trainer := NewTrainer(NewSGD(0.01, 0.1, 0, false), 0)
-	trainer.Train(n, data, data, 1000, true)
-
-	for _, d := range data {
-		est := n.Predict(d.Input)
-		assert.InEpsilon(t, 1.0, nn.Sum(est), 0.00001)
-		if d.Response[0] == 1.0 {
-			assert.InEpsilon(t, n.Predict(d.Input)[0]+1, d.Response[0]+1, 0.1)
-		} else {
-			assert.InEpsilon(t, n.Predict(d.Input)[1]+1, d.Response[1]+1, 0.1)
-		}
-		assert.InEpsilon(t, 1, crossValidate(n, data)+1, 0.01)
-	}
-
-}
-
-func Test_or(t *testing.T) {
-	rand.Seed(0)
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     2,
-		Layout:     []int{1, 1},
-		Activation: nn.ActivationTanh,
-		Mode:       nn.ModeBinary,
-		Weight:     nn.NewUniform(0.5, 0),
-		Bias:       true,
-	})
-	permutations := Samples{
-		{[]float64{0, 0}, []float64{0}},
-		{[]float64{1, 0}, []float64{1}},
-		{[]float64{0, 1}, []float64{1}},
-		{[]float64{1, 1}, []float64{1}},
-	}
-
-	trainer := NewTrainer(NewSGD(0.5, 0, 0, false), 10)
-
-	trainer.Train(n, permutations, permutations, 25, true)
-
-	for _, perm := range permutations {
-		assert.Equal(t, nn.Round(n.Predict(perm.Input)[0]), perm.Response[0])
-	}
-}
-
-func Test_xor(t *testing.T) {
-	rand.Seed(0)
-	n := nn.NewNeural(&nn.Config{
-		Inputs:     2,
-		Layout:     []int{3, 1}, // Sufficient for modeling (AND+OR) - with 5-6 neuron always converges
-		Activation: nn.ActivationSigmoid,
-		Mode:       nn.ModeBinary,
-		Weight:     nn.NewUniform(.25, 0),
-		Bias:       true,
-	})
-	permutations := Samples{
-		{[]float64{0, 0}, []float64{0}},
-		{[]float64{1, 0}, []float64{1}},
-		{[]float64{0, 1}, []float64{1}},
-		{[]float64{1, 1}, []float64{0}},
-	}
-
-	trainer := NewTrainer(NewSGD(1.0, 0.1, 1e-6, false), 50)
-	trainer.Train(n, permutations, permutations, 500, true)
-
-	for _, perm := range permutations {
-		assert.InEpsilon(t, n.Predict(perm.Input)[0]+1, perm.Response[0]+1, 0.2)
-	}
-}
-
-func printResult(ideal, actual []float64) {
-	fmt.Printf("want: %+v have: %+v\n", ideal, actual)
-}
diff --git a/recommend/rcmd.go b/recommend/rcmd.go
index e39e7b6..09eeaf8 100644
--- a/recommend/rcmd.go
+++ b/recommend/rcmd.go
@@ -10,11 +10,10 @@ import (
 	"github.com/auxten/edgeRec/feature/embedding"
 	"github.com/auxten/edgeRec/feature/embedding/model"
 	"github.com/auxten/edgeRec/feature/embedding/model/word2vec"
-	"github.com/auxten/edgeRec/ps"
 	"github.com/auxten/edgeRec/utils"
 	"github.com/karlseguin/ccache/v2"
 	log "github.com/sirupsen/logrus"
-	"gonum.org/v1/gonum/mat"
+	"gorgonia.org/tensor"
 )
 
 const (
@@ -30,7 +29,7 @@ const (
 
 var (
 	itemEmbeddingModel model.Model
-	itemEmbeddingMap   word2vec.EmbeddingMap
+	itemEmbeddingMap   word2vec.EmbeddingMap32
 	//TODO: maybe a switch to control whether to reuse training cache when predict
 	UserFeatureCache  *ccache.Cache
 	ItemFeatureCache  *ccache.Cache
@@ -38,14 +37,14 @@ var (
 
 	// DefaultUserFeature and DefaultItemFeature are backup if not nil
 	//when user or item missing in database, use this to fill
-	DefaultUserFeature []float64
-	DefaultItemFeature []float64
+	DefaultUserFeature []float32
+	DefaultItemFeature []float32
 
 	DebugUserId int
 	DebugItemId int
 )
 
-type Tensor []float64
+type Tensor []float32
 
 type Stage int
 
@@ -55,13 +54,17 @@ const (
 )
 
 type TrainSample struct {
-	Data ps.Samples
+	X     []float32
+	Y     []float32
+	Rows  int
+	XCols int
+
 	Info SampleInfo
 }
 
 type sampleVec struct {
-	vec    []float64
-	label  float64
+	vec    []float32
+	label  float32
 	iWidth int
 	uWidth int
 }
@@ -82,7 +85,7 @@ type BasicFeatureProvider interface {
 }
 
 type PredictAbstract interface {
-	Predict(X mat.Matrix, Y mat.Mutable) *mat.Dense
+	Predict(X tensor.Tensor) tensor.Tensor
 }
 
 type Trainer interface {
@@ -180,13 +183,13 @@ type PreTrainer interface {
 
 type ItemScore struct {
 	ItemId int     `json:"itemId"`
-	Score  float64 `json:"score"`
+	Score  float32 `json:"score"`
 }
 
 type Sample struct {
 	UserId    int     `json:"userId"`
 	ItemId    int     `json:"itemId"`
-	Label     float64 `json:"label"`
+	Label     float32 `json:"label"`
 	Timestamp int64   `json:"timestamp"`
 }
 
@@ -207,7 +210,7 @@ func Train(ctx context.Context, recSys RecSys, mlp Fitter) (model Predictor, err
 			log.Errorf("get item embedding model error: %v", err)
 			return
 		}
-		itemEmbeddingMap, err = itemEmbeddingModel.GenEmbeddingMap()
+		itemEmbeddingMap, err = itemEmbeddingModel.GenEmbeddingMap32()
 		if err != nil {
 			log.Errorf("get item embedding map error: %v", err)
 			return
@@ -215,10 +218,13 @@ func Train(ctx context.Context, recSys RecSys, mlp Fitter) (model Predictor, err
 	}
 
 	trainSample, err := GetSample(recSys, ctx)
-	sampleLen := len(trainSample.Data)
+	if err != nil {
+		log.Errorf("get train sample error: %v", err)
+		return
+	}
 
 	// start training
-	log.Infof("\nstart training with %d samples\n", sampleLen)
+	log.Infof("\nstart training with %d x %d samples\n", trainSample.Rows, trainSample.XCols)
 
 	pred, err := mlp.Fit(trainSample)
 	if err != nil {
@@ -253,17 +259,22 @@ func Rank(ctx context.Context, recSys Predictor, userId int, itemIds []int) (ite
 		return
 	}
 	itemScores = make([]ItemScore, len(itemIds))
+	var score interface{}
 	for i, itemId := range itemIds {
+		if score, err = y.At(i, 0); err != nil {
+			itemScores = nil
+			return
+		}
 		itemScores[i] = ItemScore{
 			ItemId: itemId,
-			Score:  y.At(i, 0),
+			Score:  score.(float32),
 		}
 	}
 
 	return
 }
 
-func BatchPredict(ctx context.Context, recSys Predictor, sampleKeys []Sample) (y *mat.Dense, err error) {
+func BatchPredict(ctx context.Context, recSys Predictor, sampleKeys []Sample) (y tensor.Tensor, err error) {
 	ctx = context.WithValue(ctx, StageKey, PredictStage)
 	if preRanker, ok := recSys.(PreRanker); ok {
 		err = preRanker.PreRank(ctx)
@@ -273,16 +284,16 @@ func BatchPredict(ctx context.Context, recSys Predictor, sampleKeys []Sample) (y
 		}
 	}
 
-	y = mat.NewDense(len(sampleKeys), 1, nil)
 	var (
-		x          *mat.Dense
-		zeroSliceX []float64
+		xData      []float32
+		xWidth     int
+		zeroSliceX []float32
 		debugIds   = make([]int, 0)
 	)
 
 	for i, sKey := range sampleKeys {
 		var (
-			xSlice []float64
+			xSlice []float32
 		)
 		xSlice, _, _, err = GetSampleVector(ctx, UserFeatureCache, ItemFeatureCache, recSys, &sKey)
 		if err != nil {
@@ -290,37 +301,43 @@ func BatchPredict(ctx context.Context, recSys Predictor, sampleKeys []Sample) (y
 				log.Errorf("get sample vector error: %v", err)
 				return
 			} else {
-				_, col := x.Dims()
-				zeroSliceX = make([]float64, col)
+				zeroSliceX = make([]float32, xWidth)
 				xSlice = zeroSliceX
 			}
 		}
 		if i == 0 {
-			x = mat.NewDense(len(sampleKeys), len(xSlice), nil)
+			xWidth = len(xSlice)
+			xData = make([]float32, len(sampleKeys)*xWidth)
 		}
 
-		_, xCol := x.Dims()
-		if len(xSlice) != xCol {
-			log.Errorf("x slice length %d != x col %d", len(xSlice), xCol)
+		if len(xSlice) != xWidth {
+			log.Errorf("x slice length %d != x col %d", len(xSlice), xWidth)
 			return
 		}
-		x.SetRow(i, xSlice)
+		copy(xData[i*xWidth:], xSlice)
+
 		if DebugItemId == sKey.ItemId &&
 			(DebugUserId == 0 || DebugUserId == sKey.UserId) {
 			log.Infof("user %d: item %d: feature %v", sKey.UserId, sKey.ItemId, xSlice)
 			debugIds = append(debugIds, i)
 		}
 	}
-	recSys.Predict(x, y)
+	xDense := tensor.NewDense(tensor.Float32, tensor.Shape{len(sampleKeys), xWidth}, tensor.WithBacking(xData))
+
+	y = recSys.Predict(xDense)
 	for _, i := range debugIds {
-		log.Infof("user %d: item %d: score %v", sampleKeys[i].UserId, sampleKeys[i].ItemId, y.At(i, 0))
+		score, er := y.At(i, 0)
+		if er != nil {
+			log.Errorf("get score of line:%d error: %v", i, er)
+			return
+		}
+		log.Infof("user %d: item %d: score %v", sampleKeys[i].UserId, sampleKeys[i].ItemId, score)
 	}
 	return
 }
 
 func GetSample(recSys RecSys, ctx context.Context) (sample *TrainSample, err error) {
 	var (
-		sampleWidth      int
 		userFeatureWidth int
 		itemFeatureWidth int
 	)
@@ -409,37 +426,46 @@ func GetSample(recSys RecSys, ctx context.Context) (sample *TrainSample, err err
 			return
 		}
 
-		if sampleWidth == 0 {
-			sampleWidth = len(sv.vec)
+		if sample.XCols == 0 {
+			sample.XCols = len(sv.vec)
 		} else {
-			if len(sv.vec) != sampleWidth {
-				err = fmt.Errorf("sample width mismatch: %v:%v", sampleWidth, len(sv.vec))
+			if len(sv.vec) != sample.XCols {
+				err = fmt.Errorf("sample width mismatch: %v:%v", sample.XCols, len(sv.vec))
 				return
 			}
 		}
 
-		sample.Data = append(sample.Data, ps.Sample{
-			Input:    sv.vec,
-			Response: []float64{sv.label},
-		})
-		if len(sample.Data)%1000 == 0 {
-			log.Infof("sample size: %d, uc: %d, ic: %d", len(sample.Data),
+		sample.X = append(sample.X, sv.vec...)
+		sample.Y = append(sample.Y, sv.label)
+		sample.Rows++
+		if sample.Rows%1000 == 0 {
+			log.Infof("sample size: %d, uc: %d, ic: %d", sample.Rows,
 				UserFeatureCache.ItemCount(),
 				ItemFeatureCache.ItemCount(),
 			)
 		}
 	}
 
+	//check x and y dimension
+	if sample.Rows != len(sample.Y) {
+		err = fmt.Errorf("sample rows not match: %v:%v", sample.Rows, len(sample.Y))
+		return
+	}
+	if sample.Rows*sample.XCols != len(sample.X) {
+		err = fmt.Errorf("sample x size not match: %v:%v", sample.Rows*sample.XCols, len(sample.X))
+		return
+	}
+
 	return
 }
 
 func GetSampleVector(ctx context.Context,
 	userFeatureCache *ccache.Cache, itemFeatureCache *ccache.Cache,
 	featureProvider BasicFeatureProvider, sampleKey *Sample,
-) (vec []float64, userFeatureWidth int, itemFeatureWidth int, err error) {
+) (vec []float32, userFeatureWidth int, itemFeatureWidth int, err error) {
 	var (
-		zeroItemEmb       [ItemEmbDim]float64
-		zeroUserBehaviors [ItemEmbDim * UserBehaviorLen]float64
+		zeroItemEmb       [ItemEmbDim]float32
+		zeroUserBehaviors [ItemEmbDim * UserBehaviorLen]float32
 
 		user, item *ccache.Item
 	)
@@ -504,7 +530,7 @@ func GetSampleVector(ctx context.Context,
 		}
 	}
 
-	vec = utils.ConcatSlice(userFeature, userBehaviors, itemEmb, itemFeature)
+	vec = utils.ConcatSlice32(userFeature, userBehaviors, itemEmb, itemFeature)
 
 	return
 }
diff --git a/utils/util.go b/utils/util.go
index efd827f..a44018c 100644
--- a/utils/util.go
+++ b/utils/util.go
@@ -19,6 +19,14 @@ func ConcatSlice(slices ...[]float64) []float64 {
 	return result
 }
 
+func ConcatSlice32(slices ...[]float32) []float32 {
+	result := make([]float32, 0)
+	for _, slice := range slices {
+		result = append(result, slice...)
+	}
+	return result
+}
+
 func Float64toBytes(f float64) []byte {
 	bits := math.Float64bits(f)
 	bytes := make([]byte, 8)

From 27707fd4e0d0deb687aa49a3286605c314f45f73 Mon Sep 17 00:00:00 2001
From: auxten <auxtenwpc@gmail.com>
Date: Tue, 1 Nov 2022 16:25:07 +0800
Subject: [PATCH 2/4] FillTensorRows for train and predict

---
 example/movielens/dinimpl.go      |  10 +--
 example/movielens/dinimpl_test.go |   8 +-
 example/movielens/mlpimpl_test.go |   4 +-
 model/din/model.go                | 120 ++++++++++++++++++------------
 model/din/model_test.go           |   5 +-
 utils/util.go                     |  63 ++++++++++++++--
 utils/util_test.go                |   4 +-
 7 files changed, 147 insertions(+), 67 deletions(-)

diff --git a/example/movielens/dinimpl.go b/example/movielens/dinimpl.go
index d816869..f574da4 100644
--- a/example/movielens/dinimpl.go
+++ b/example/movielens/dinimpl.go
@@ -16,8 +16,8 @@ type dinImpl struct {
 	iFeatureDim   int
 	cFeatureDim   int
 
-	predBatchSize     int
-	batchSize, epochs int
+	PredBatchSize     int
+	BatchSize, epochs int
 	sampleInfo        *rcmd.SampleInfo
 
 	// stop training on earlyStop count of no cost improvement
@@ -30,7 +30,7 @@ type dinImpl struct {
 
 func (d *dinImpl) Predict(X tensor.Tensor) tensor.Tensor {
 	numPred := X.Shape()[0]
-	y, err := din.Predict(d.pred, numPred, d.predBatchSize, d.sampleInfo, X)
+	y, err := din.Predict(d.pred, numPred, d.PredBatchSize, d.sampleInfo, X)
 	if err != nil {
 		log.Errorf("predict din model failed: %v", err)
 		return nil
@@ -60,7 +60,7 @@ func (d *dinImpl) Fit(trainSample *rcmd.TrainSample) (pred rcmd.PredictAbstract,
 	d.learner = din.NewDinNet(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim)
 
 	err = din.Train(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim,
-		trainSample.Rows, d.batchSize, d.epochs, d.earlyStop,
+		trainSample.Rows, d.BatchSize, d.epochs, d.earlyStop,
 		d.sampleInfo,
 		inputs, labels,
 		d.learner,
@@ -80,7 +80,7 @@ func (d *dinImpl) Fit(trainSample *rcmd.TrainSample) (pred rcmd.PredictAbstract,
 		return
 	}
 	err = din.InitForwardOnlyVm(d.uProfileDim, d.uBehaviorSize, d.uBehaviorDim, d.iFeatureDim, d.cFeatureDim,
-		d.predBatchSize, dinPred)
+		d.PredBatchSize, dinPred)
 	if err != nil {
 		log.Errorf("init forward only vm failed: %v", err)
 		return
diff --git a/example/movielens/dinimpl_test.go b/example/movielens/dinimpl_test.go
index 6e63c85..60d6b3b 100644
--- a/example/movielens/dinimpl_test.go
+++ b/example/movielens/dinimpl_test.go
@@ -6,8 +6,8 @@ import (
 	"math/rand"
 	"testing"
 
-	"github.com/auxten/edgeRec/model/din"
 	rcmd "github.com/auxten/edgeRec/recommend"
+	"github.com/auxten/edgeRec/utils"
 	. "github.com/smartystreets/goconvey/convey"
 )
 
@@ -32,8 +32,8 @@ func TestDinOnMovielens(t *testing.T) {
 
 	Convey("Train din model", t, func() {
 		dinModel := &dinImpl{
-			predBatchSize: 100,
-			batchSize:     200,
+			PredBatchSize: 100,
+			BatchSize:     200,
 			epochs:        100,
 			earlyStop:     20,
 		}
@@ -73,7 +73,7 @@ func TestDinOnMovielens(t *testing.T) {
 		}
 		yPred, err := rcmd.BatchPredict(batchPredictCtx, dinPred, sampleKeys)
 		So(err, ShouldBeNil)
-		rocAuc := din.RocAuc32(yPred.Data().([]float32), yTrue)
+		rocAuc := utils.RocAuc32(yPred.Data().([]float32), yTrue)
 		//rocAuc := metrics.ROCAUCScore(yTrue, yPred, "", nil)
 		rowCount := len(yTrue)
 		fmt.Printf("rocAuc on test set %d: %f\n", rowCount, rocAuc)
diff --git a/example/movielens/mlpimpl_test.go b/example/movielens/mlpimpl_test.go
index c203e59..66cce7f 100644
--- a/example/movielens/mlpimpl_test.go
+++ b/example/movielens/mlpimpl_test.go
@@ -6,8 +6,8 @@ import (
 	"math/rand"
 	"testing"
 
-	"github.com/auxten/edgeRec/model/din"
 	rcmd "github.com/auxten/edgeRec/recommend"
+	"github.com/auxten/edgeRec/utils"
 	. "github.com/smartystreets/goconvey/convey"
 )
 
@@ -70,7 +70,7 @@ func TestSimpleMLPOnMovielens(t *testing.T) {
 		}
 		yPred, err := rcmd.BatchPredict(batchPredictCtx, dinPred, sampleKeys)
 		So(err, ShouldBeNil)
-		rocAuc := din.RocAuc32(yPred.Data().([]float32), yTrue)
+		rocAuc := utils.RocAuc32(yPred.Data().([]float32), yTrue)
 		rowCount := len(yTrue)
 		fmt.Printf("rocAuc on test set %d: %f\n", rowCount, rocAuc)
 	})
diff --git a/model/din/model.go b/model/din/model.go
index 3748354..cec27bb 100644
--- a/model/din/model.go
+++ b/model/din/model.go
@@ -4,10 +4,8 @@ import (
 	"fmt"
 	"math"
 
-	"github.com/auxten/edgeRec/nn/metrics"
 	rcmd "github.com/auxten/edgeRec/recommend"
 	log "github.com/sirupsen/logrus"
-	"gonum.org/v1/gonum/mat"
 	"gopkg.in/cheggaaa/pb.v1"
 	G "gorgonia.org/gorgonia"
 	"gorgonia.org/tensor"
@@ -107,6 +105,9 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 	// handlePprof(sigChan, doneChan)
 
 	batches := numExamples / batchSize
+	if numExamples%batchSize != 0 {
+		batches++
+	}
 	log.Printf("Batches %d", batches)
 	bar := pb.New(batches)
 	var (
@@ -139,6 +140,11 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 			if xUserProfileVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.UserProfileRange[0], si.UserProfileRange[1])}...); err != nil {
 				log.Fatalf("Unable to slice xUserProfileVal %v", err)
 			}
+			if xUserProfileVal.Shape()[0] < batchSize {
+				if xUserProfileVal, err = FillTensorRows(batchSize, xUserProfileVal); err != nil {
+					log.Fatalf("Unable to fill sample rows %v", err)
+				}
+			}
 			if err = G.Let(xUserProfile, xUserProfileVal); err != nil {
 				log.Fatalf("Unable to let xUserProfileVal %v", err)
 			}
@@ -146,6 +152,11 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 			if xUserBehaviorsVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.UserBehaviorRange[0], si.UserBehaviorRange[1])}...); err != nil {
 				log.Fatalf("Unable to slice xUserBehaviorsVal %v", err)
 			}
+			if xUserBehaviorsVal.Shape()[0] < batchSize {
+				if xUserBehaviorsVal, err = FillTensorRows(batchSize, xUserBehaviorsVal); err != nil {
+					log.Fatalf("Unable to fill sample rows %v", err)
+				}
+			}
 			if err = G.Let(xUserBehaviorMatrix, xUserBehaviorsVal); err != nil {
 				log.Fatalf("Unable to let xUserBehaviorsVal %v", err)
 			}
@@ -153,6 +164,11 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 			if xItemFeatureVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.ItemFeatureRange[0], si.ItemFeatureRange[1])}...); err != nil {
 				log.Fatalf("Unable to slice xItemFeatureVal %v", err)
 			}
+			if xItemFeatureVal.Shape()[0] < batchSize {
+				if xItemFeatureVal, err = FillTensorRows(batchSize, xItemFeatureVal); err != nil {
+					log.Fatalf("Unable to fill sample rows %v", err)
+				}
+			}
 			if err = G.Let(xItemFeature, xItemFeatureVal); err != nil {
 				log.Fatalf("Unable to let xItemFeatureVal %v", err)
 			}
@@ -160,6 +176,11 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 			if xCtxFeatureVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.CtxFeatureRange[0], si.CtxFeatureRange[1])}...); err != nil {
 				log.Fatalf("Unable to slice xCtxFeatureVal %v", err)
 			}
+			if xCtxFeatureVal.Shape()[0] < batchSize {
+				if xCtxFeatureVal, err = FillTensorRows(batchSize, xCtxFeatureVal); err != nil {
+					log.Fatalf("Unable to fill sample rows %v", err)
+				}
+			}
 			if err = G.Let(xCtxFeature, xCtxFeatureVal); err != nil {
 				log.Fatalf("Unable to let xCtxFeatureVal %v", err)
 			}
@@ -167,6 +188,11 @@ func Train(uProfileDim, uBehaviorSize, uBehaviorDim, iFeatureDim, cFeatureDim in
 			if yVal, err = targets.Slice(G.S(start, end)); err != nil {
 				log.Fatalf("Unable to slice y %v", err)
 			}
+			if yVal.Shape()[0] < batchSize {
+				if yVal, err = FillTensorRows(batchSize, yVal); err != nil {
+					log.Fatalf("Unable to fill sample rows %v", err)
+				}
+			}
 			if err = G.Let(y, yVal); err != nil {
 				log.Fatalf("Unable to let y %v", err)
 			}
@@ -239,8 +265,11 @@ func Predict(m Model, numExamples, batchSize int, si *rcmd.SampleInfo, inputs te
 	vm := m.Vm()
 
 	batches := numExamples / batchSize
+	if numExamples%batchSize != 0 {
+		batches++
+	}
 
-	for b := 0; b <= batches; b++ {
+	for b := 0; b < batches; b++ {
 		start := b * batchSize
 		end := start + batchSize
 		if start >= numExamples {
@@ -257,37 +286,61 @@ func Predict(m Model, numExamples, batchSize int, si *rcmd.SampleInfo, inputs te
 			xCtxFeatureVal    tensor.Tensor
 		)
 
-		if xUserProfileVal, err = inputs.Slice([]tensor.Slice{G.S(start, start+batchSize), G.S(si.UserProfileRange[0], si.UserProfileRange[1])}...); err != nil {
+		if xUserProfileVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.UserProfileRange[0], si.UserProfileRange[1])}...); err != nil {
 			log.Errorf("Unable to slice xUserProfileVal %v", err)
 			return nil, err
 		}
+		if xUserProfileVal.Shape()[0] < batchSize {
+			if xUserProfileVal, err = FillTensorRows(batchSize, xUserProfileVal); err != nil {
+				log.Errorf("Unable to fill input rows %v", err)
+				return nil, err
+			}
+		}
 		if err = G.Let(xUserProfile, xUserProfileVal); err != nil {
 			log.Errorf("Unable to let xUserProfileVal %v", err)
 			return nil, err
 		}
 
-		if xUserBehaviorsVal, err = inputs.Slice([]tensor.Slice{G.S(start, start+batchSize), G.S(si.UserBehaviorRange[0], si.UserBehaviorRange[1])}...); err != nil {
+		if xUserBehaviorsVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.UserBehaviorRange[0], si.UserBehaviorRange[1])}...); err != nil {
 			log.Errorf("Unable to slice xUserBehaviorsVal %v", err)
 			return nil, err
 		}
+		if xUserBehaviorsVal.Shape()[0] < batchSize {
+			if xUserBehaviorsVal, err = FillTensorRows(batchSize, xUserBehaviorsVal); err != nil {
+				log.Errorf("Unable to fill input rows %v", err)
+				return nil, err
+			}
+		}
 		if err = G.Let(xUbMatrix, xUserBehaviorsVal); err != nil {
 			log.Errorf("Unable to let xUserBehaviorsVal %v", err)
 			return nil, err
 		}
 
-		if xItemFeatureVal, err = inputs.Slice([]tensor.Slice{G.S(start, start+batchSize), G.S(si.ItemFeatureRange[0], si.ItemFeatureRange[1])}...); err != nil {
+		if xItemFeatureVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.ItemFeatureRange[0], si.ItemFeatureRange[1])}...); err != nil {
 			log.Errorf("Unable to slice xItemFeatureVal %v", err)
 			return nil, err
 		}
+		if xItemFeatureVal.Shape()[0] < batchSize {
+			if xItemFeatureVal, err = FillTensorRows(batchSize, xItemFeatureVal); err != nil {
+				log.Errorf("Unable to fill input rows %v", err)
+				return nil, err
+			}
+		}
 		if err = G.Let(xItemFeature, xItemFeatureVal); err != nil {
 			log.Errorf("Unable to let xItemFeatureVal %v", err)
 			return nil, err
 		}
 
-		if xCtxFeatureVal, err = inputs.Slice([]tensor.Slice{G.S(start, start+batchSize), G.S(si.CtxFeatureRange[0], si.CtxFeatureRange[1])}...); err != nil {
+		if xCtxFeatureVal, err = inputs.Slice([]tensor.Slice{G.S(start, end), G.S(si.CtxFeatureRange[0], si.CtxFeatureRange[1])}...); err != nil {
 			log.Errorf("Unable to slice xCtxFeatureVal %v", err)
 			return nil, err
 		}
+		if xCtxFeatureVal.Shape()[0] < batchSize {
+			if xCtxFeatureVal, err = FillTensorRows(batchSize, xCtxFeatureVal); err != nil {
+				log.Errorf("Unable to fill input rows %v", err)
+				return nil, err
+			}
+		}
 		if err = G.Let(xCtxFeature, xCtxFeatureVal); err != nil {
 			log.Errorf("Unable to let xCtxFeatureVal %v", err)
 			return nil, err
@@ -309,46 +362,21 @@ func Predict(m Model, numExamples, batchSize int, si *rcmd.SampleInfo, inputs te
 	return
 }
 
-func accuracy(prediction, y []float64) float64 {
-	var ok float64
-	for i := 0; i < len(prediction); i++ {
-		if math.Round(float64(prediction[i]-y[i])) == 0 {
-			ok += 1.0
-		}
-	}
-	return ok / float64(len(y))
-}
-
-func RocAuc(pred, y []float64) float64 {
-	boolY := make([]float64, len(y))
-	for i := 0; i < len(y); i++ {
-		if y[i] > 0.5 {
-			boolY[i] = 1.0
-		} else {
-			boolY[i] = 0.0
-		}
-	}
-	yTrue := mat.NewDense(len(y), 1, boolY)
-	yScore := mat.NewDense(len(pred), 1, pred)
-
-	return metrics.ROCAUCScore(yTrue, yScore, "", nil)
-}
-
-func RocAuc32(pred, y []float32) float32 {
-	boolY := make([]float64, len(y))
-	for i := 0; i < len(y); i++ {
-		if y[i] > 0.5 {
-			boolY[i] = 1.0
-		} else {
-			boolY[i] = 0.0
-		}
+// FillTensorRows fills the batch samples with the zero data to make sample size fit the batch size
+// it sames tensor.Concat is not optimized for large dataset.
+// we should avoid using FillTensorRows while input data is large.
+func FillTensorRows(batchSize int, inputs tensor.Tensor) (x tensor.Tensor, err error) {
+	numExamples := inputs.Shape()[0]
+	fillSize := numExamples % batchSize
+	if fillSize == 0 {
+		return inputs, nil
 	}
-	pred64 := make([]float64, len(pred))
-	for i := 0; i < len(pred); i++ {
-		pred64[i] = float64(pred[i])
+	inputZeros := tensor.New(tensor.WithShape(batchSize-fillSize, inputs.Shape()[1]),
+		tensor.WithBacking(make([]float32, (batchSize-fillSize)*inputs.Shape()[1])))
+	x, err = tensor.Concat(0, inputs, inputZeros)
+	if err != nil {
+		return nil, err
 	}
-	yTrue := mat.NewDense(len(y), 1, boolY)
-	yScore := mat.NewDense(len(pred), 1, pred64)
 
-	return float32(metrics.ROCAUCScore(yTrue, yScore, "", nil))
+	return
 }
diff --git a/model/din/model_test.go b/model/din/model_test.go
index acb3de6..4904616 100644
--- a/model/din/model_test.go
+++ b/model/din/model_test.go
@@ -6,6 +6,7 @@ import (
 	"testing"
 
 	rcmd "github.com/auxten/edgeRec/recommend"
+	"github.com/auxten/edgeRec/utils"
 	log "github.com/sirupsen/logrus"
 	. "github.com/smartystreets/goconvey/convey"
 	"gorgonia.org/tensor"
@@ -103,7 +104,7 @@ func TestMultiModel(t *testing.T) {
 		So(predictions, ShouldNotBeNil)
 		So(predictions, ShouldHaveLength, testSamples)
 		log.Debugf("predictions: %+v", predictions)
-		auc := RocAuc32(predictions, labels.Data().([]float32)[:testSamples])
+		auc := utils.RocAuc32(predictions, labels.Data().([]float32)[:testSamples])
 		log.Debugf("auc: %f", auc)
 		So(auc, ShouldBeGreaterThan, 0.5)
 	})
@@ -136,7 +137,7 @@ func TestMultiModel(t *testing.T) {
 		So(predictions, ShouldNotBeNil)
 		So(predictions, ShouldHaveLength, testSamples)
 		log.Debugf("predictions: %+v", predictions)
-		auc := RocAuc32(predictions, labels.Data().([]float32)[:testSamples])
+		auc := utils.RocAuc32(predictions, labels.Data().([]float32)[:testSamples])
 		log.Debugf("auc: %f", auc)
 		So(auc, ShouldBeGreaterThan, 0.5)
 	})
diff --git a/utils/util.go b/utils/util.go
index a44018c..cd9f64f 100644
--- a/utils/util.go
+++ b/utils/util.go
@@ -7,8 +7,8 @@ import (
 	"strconv"
 	"strings"
 
-	"gonum.org/v1/gonum/integrate"
-	"gonum.org/v1/gonum/stat"
+	"github.com/auxten/edgeRec/nn/metrics"
+	"gonum.org/v1/gonum/mat"
 )
 
 func ConcatSlice(slices ...[]float64) []float64 {
@@ -68,11 +68,6 @@ func TopNOccurrences(s []string, n int) []KeyCnt {
 	return l1[:n]
 }
 
-func RocAuc(label []bool, y []float64) float64 {
-	tpr, fpr, _ := stat.ROC(nil, y, label, nil)
-	return integrate.Trapezoidal(fpr, tpr)
-}
-
 func ParseInt64Seq(s string) []int64 {
 	var (
 		seq []int64
@@ -97,3 +92,57 @@ func Int64SeqToIntSeq(seq []int64) []int {
 	}
 	return result
 }
+
+func Accuracy(prediction, y []float64) float64 {
+	var ok float64
+	for i := 0; i < len(prediction); i++ {
+		if math.Round(float64(prediction[i]-y[i])) == 0 {
+			ok += 1.0
+		}
+	}
+	return ok / float64(len(y))
+}
+
+func Accuracy32(prediction, y []float32) float32 {
+	var ok float32
+	for i := 0; i < len(prediction); i++ {
+		if math.Round(float64(prediction[i]-y[i])) == 0 {
+			ok += 1.0
+		}
+	}
+	return ok / float32(len(y))
+}
+
+func RocAuc(pred, y []float64) float64 {
+	boolY := make([]float64, len(y))
+	for i := 0; i < len(y); i++ {
+		if y[i] > 0.5 {
+			boolY[i] = 1.0
+		} else {
+			boolY[i] = 0.0
+		}
+	}
+	yTrue := mat.NewDense(len(y), 1, boolY)
+	yScore := mat.NewDense(len(pred), 1, pred)
+
+	return metrics.ROCAUCScore(yTrue, yScore, "", nil)
+}
+
+func RocAuc32(pred, y []float32) float32 {
+	boolY := make([]float64, len(y))
+	for i := 0; i < len(y); i++ {
+		if y[i] > 0.5 {
+			boolY[i] = 1.0
+		} else {
+			boolY[i] = 0.0
+		}
+	}
+	pred64 := make([]float64, len(pred))
+	for i := 0; i < len(pred); i++ {
+		pred64[i] = float64(pred[i])
+	}
+	yTrue := mat.NewDense(len(y), 1, boolY)
+	yScore := mat.NewDense(len(pred), 1, pred64)
+
+	return float32(metrics.ROCAUCScore(yTrue, yScore, "", nil))
+}
diff --git a/utils/util_test.go b/utils/util_test.go
index da7b738..79e4188 100644
--- a/utils/util_test.go
+++ b/utils/util_test.go
@@ -24,7 +24,9 @@ func TestUtils(t *testing.T) {
 
 func TestGetAUC(t *testing.T) {
 	Convey("test auc", t, func() {
-		auc := RocAuc([]bool{false, true, false, true}, []float64{0.1, 0.35, 0.4, 0.8})
+		auc := RocAuc([]float64{0.1, 0.35, 0.4, 0.8}, []float64{0, 1, 0, 1})
+		So(auc, ShouldEqual, .75)
+		auc = RocAuc([]float64{0.1, 0.4, 0.35, 0.8}, []float64{0, 0, 1, 1})
 		So(auc, ShouldEqual, .75)
 	})
 }

From 577ff2b1d1f0cc99db77ec590444f3a61a4ec7ac Mon Sep 17 00:00:00 2001
From: auxten <auxtenwpc@gmail.com>
Date: Tue, 1 Nov 2022 16:27:51 +0800
Subject: [PATCH 3/4] Add args for movielens-20m

---
 example/movielens/dinimpl_test.go | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/example/movielens/dinimpl_test.go b/example/movielens/dinimpl_test.go
index 60d6b3b..bbd0be8 100644
--- a/example/movielens/dinimpl_test.go
+++ b/example/movielens/dinimpl_test.go
@@ -22,6 +22,8 @@ func TestDinOnMovielens(t *testing.T) {
 
 	var (
 		movielens = &MovielensRec{
+			//DataPath:  "movielens-20m.db",
+			//SampleCnt: 14400000,
 			DataPath:  "movielens.db",
 			SampleCnt: 79948,
 			//SampleCnt: 10000,
@@ -33,9 +35,10 @@ func TestDinOnMovielens(t *testing.T) {
 	Convey("Train din model", t, func() {
 		dinModel := &dinImpl{
 			PredBatchSize: 100,
-			BatchSize:     200,
-			epochs:        100,
-			earlyStop:     20,
+			//BatchSize:     5000,
+			BatchSize: 200,
+			epochs:    100,
+			earlyStop: 20,
 		}
 		trainCtx := context.Background()
 		model, err = rcmd.Train(trainCtx, movielens, dinModel)
@@ -44,6 +47,7 @@ func TestDinOnMovielens(t *testing.T) {
 	})
 
 	Convey("Predict din model", t, func() {
+		//testCount := 5610000
 		testCount := 20600
 		rows, err := db.Query(
 			"SELECT userId, movieId, rating, timestamp FROM ratings_test ORDER BY timestamp, userId ASC LIMIT ?", testCount)

From 85e96f0ebb71640f9acacb33248f3bd3981271da Mon Sep 17 00:00:00 2001
From: auxten <auxtenwpc@gmail.com>
Date: Tue, 1 Nov 2022 19:08:25 +0800
Subject: [PATCH 4/4] Large predict batch size

---
 example/movielens/dinimpl_test.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/example/movielens/dinimpl_test.go b/example/movielens/dinimpl_test.go
index bbd0be8..09b5c2b 100644
--- a/example/movielens/dinimpl_test.go
+++ b/example/movielens/dinimpl_test.go
@@ -34,10 +34,11 @@ func TestDinOnMovielens(t *testing.T) {
 
 	Convey("Train din model", t, func() {
 		dinModel := &dinImpl{
+			//PredBatchSize: 5000,
 			PredBatchSize: 100,
 			//BatchSize:     5000,
 			BatchSize: 200,
-			epochs:    100,
+			epochs:    200,
 			earlyStop: 20,
 		}
 		trainCtx := context.Background()