-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_helper.py
122 lines (100 loc) · 3.9 KB
/
data_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
"""
Example classifier on Numerai data using a logistic regression classifier.
To get started, install the required packages: pip install pandas numpy scikit-learn
"""
import os.path
import pandas as pd
import numpy as np
#from sklearn import metrics, preprocessing, linear_model
#from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.model_selection import KFold, RepeatedKFold
#from sklearn import svm
#from sklearn.pipeline import Pipeline
# Template for the per-tournament data directory. get_data_fpath() fills the
# ``{:d}`` placeholder with the tournament number, which must be an integer.
data_dir_tmplt = '/home/stone/data/numerai/tour{:d}'
def get_feature_names(training_data):
    """Return the labels in *training_data* that contain the text "feature".

    *training_data* is iterated directly; for a pandas DataFrame this yields
    the column labels. The original order is preserved in the returned list.
    """
    names = []
    for label in training_data:
        if "feature" in label:
            names.append(label)
    return names
def get_random_data(nrows=None, nfeat=50):
    """Generate a synthetic data set with random features and a binary target.

    Features are drawn from a standard normal distribution. The target is 1
    where a random linear combination of the features plus a random bias
    exceeds 0.5, and 0 otherwise.

    Args:
        nrows: Number of rows to generate; ``None`` selects 393613.
        nfeat: Number of feature columns.

    Returns:
        A DataFrame with float columns ``feature1`` .. ``feature<nfeat>`` and
        an integer ``target`` column.
    """
    import numpy.random as r

    if nrows is None:
        nrows = 393613

    # Keep the draw order (X, w, b) so results for a given global NumPy seed
    # stay the same as before.
    X = r.randn(nrows, nfeat)
    w = r.rand(nfeat)
    b = r.rand(1)
    y = (np.matmul(X, w) + b) > 0.5

    columns = ['feature' + str(i) for i in range(1, nfeat + 1)] + ['target']
    # np.c_ promotes the boolean target to float alongside the features.
    df = pd.DataFrame(np.c_[X, y], columns=columns)
    # np.int was only an alias of the builtin int and was removed in
    # NumPy 1.24; the builtin works on every NumPy version.
    df['target'] = df['target'].astype(int)
    return df
def load_training_data(tour, nrows=None):
    """Read the training CSV of tournament *tour* into a DataFrame.

    Args:
        tour: Tournament number used to locate the data directory.
        nrows: Maximum number of rows to read; ``None`` reads the whole file.

    Returns:
        The loaded DataFrame. Its shape and deep memory usage are printed.
    """
    csv_path = get_data_fpath(tour, 'numerai_training_data.csv')
    print("Loading training data from {}".format(csv_path))

    frame = pd.read_csv(csv_path, header=0, nrows=nrows)

    total_bytes = frame.memory_usage(index=True, deep=True).sum()
    print('data size: {}, memory usage: {:,}'.format(frame.shape, total_bytes))
    return frame
def load_training_Xy(tour, nrows=None, onehot=False):
    """Load the training data of tournament *tour* and split it into X and y.

    Args:
        tour: Tournament number used to locate the data directory.
        nrows: Maximum number of rows to read; ``None`` reads the whole file.
        onehot: Passed through to get_Xy() to request one-hot encoded targets.

    Returns:
        Whatever get_Xy() returns for the loaded data: an ``(X, y)`` pair.
    """
    return get_Xy(load_training_data(tour, nrows), onehot=onehot)
def get_Xy(ds, onehot=False):
    """Split a data set into a feature matrix and a target array.

    Args:
        ds: DataFrame holding the feature columns (those whose name contains
            "feature") and a ``target`` column.
        onehot: When true, the target is converted to a one-hot matrix with
            ``max(target) + 1`` columns; this assumes the labels are
            non-negative integers.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays.
    """
    features = get_feature_names(ds)
    # DataFrame.as_matrix() was removed in pandas 1.0. ``.values`` returns the
    # same ndarray and is available in both old and current pandas releases.
    X = ds[features].values
    y = ds["target"].values
    if onehot:
        y = dense_to_one_hot(y, np.max(y) + 1)
    return X, y
def load_testing_data(tour, nrows=None):
    """Read the tournament (testing) CSV of tournament *tour* into a DataFrame.

    Args:
        tour: Tournament number used to locate the data directory.
        nrows: Maximum number of rows to read; ``None`` reads the whole file.

    Returns:
        The loaded DataFrame. Its shape and deep memory usage are printed.
    """
    csv_path = get_data_fpath(tour, 'numerai_tournament_data.csv')
    print("Loading testing data from {}".format(csv_path))

    frame = pd.read_csv(csv_path, header=0, nrows=nrows)

    total_bytes = frame.memory_usage(index=True, deep=True).sum()
    print('data size: {}, memory usage: {:,}'.format(frame.shape, total_bytes))
    return frame
def load_testing_Xy(tour, nrows=None, onehot=False):
    """Load the testing data of tournament *tour* and split it into X and y.

    Args:
        tour: Tournament number used to locate the data directory.
        nrows: Maximum number of rows to read; ``None`` reads the whole file.
        onehot: Passed through to get_Xy() to request one-hot encoded targets.

    Returns:
        The ``(X, y)`` pair produced by get_Xy() for the loaded data.
    """
    # The loader is named load_testing_data; the previous call to
    # "loading_testing_data" raised NameError on every invocation.
    ds = load_testing_data(tour, nrows)
    return get_Xy(ds, onehot=onehot)
def get_data_fpath(tour, fname):
    """Return the path of file *fname* inside the data directory of *tour*.

    The directory comes from formatting the module-level ``data_dir_tmplt``
    with the tournament number.
    """
    return os.path.join(data_dir_tmplt.format(tour), fname)
def dense_to_one_hot(labels_dense, num_classes):
    """Turn an array of integer class labels into a one-hot matrix.

    Args:
        labels_dense: NumPy array of class indices, one per sample.
        num_classes: Number of columns in the result.

    Returns:
        Float array of shape ``(num_samples, num_classes)`` with a single 1
        per row, in the column given by that sample's label.
    """
    sample_count = labels_dense.shape[0]
    encoded = np.zeros((sample_count, num_classes))
    # In the flattened matrix, row i starts at offset i * num_classes; adding
    # the label picks the column within that row.
    row_starts = np.arange(sample_count) * num_classes
    flat_positions = row_starts + labels_dense.ravel()
    encoded.flat[flat_positions] = 1
    return encoded
def eval_one(y_true, y_prob, thr=0.5):
    """Print and return log loss, ROC AUC and accuracy for predictions.

    Args:
        y_true: True binary labels.
        y_prob: Predicted probabilities; must support elementwise ``>``
            comparison (e.g. a NumPy array).
        thr: Decision threshold. Probabilities strictly greater than it count
            as the positive class when computing accuracy. It used to be an
            undefined global, so any call raised NameError.

    Returns:
        Tuple ``(log_loss, auc, accuracy)``.
    """
    # Imported locally because the module-level sklearn imports are commented
    # out, which left ``metrics`` undefined here.
    from sklearn import metrics

    print('probability predicted: min {}, max {}'
          .format(np.min(y_prob), np.max(y_prob)))
    y_pred = y_prob > thr

    ll = metrics.log_loss(y_true, y_prob)
    print('log loss: {}'.format(ll))

    auc = metrics.roc_auc_score(y_true, y_prob)
    print('AUC: {}'.format(auc))

    acc = metrics.accuracy_score(y_true, y_pred)
    print('accuracy: {}'.format(acc))

    return (ll, auc, acc)
def write_result(results, fpath="predictions.csv"):
    """Save predictions to a CSV file, omitting the index column.

    Args:
        results: Object with a pandas-style ``to_csv`` method (e.g. a
            DataFrame) holding the predictions.
        fpath: Destination file. Defaults to ``predictions.csv`` in the
            current working directory, which was previously the only option.
    """
    print("Writing predictions to {}".format(fpath))
    # index=False keeps the row index out of the file.
    results.to_csv(fpath, index=False)
# Script entry point: runs main() when the file is executed directly.
# NOTE(review): no main() is defined in the visible part of this file, so
# running it as a script would raise NameError - confirm and either add
# main() or remove this guard.
if __name__ == '__main__':
    main()