Commit ec47a63

changes in light of PR #41 comments

Jordan Stomps committed Jan 16, 2023
1 parent 1a85591 commit ec47a63
Showing 3 changed files with 43 additions and 53 deletions.
6 changes: 3 additions & 3 deletions models/LogReg.py
@@ -17,14 +17,14 @@ class LogReg:
     Add multinomial functions and unit tests.
     Add functionality for regression(?)
     Inputs:
-    params: dictionary of logistic regression input functions.
-        keys max_iter, tol, and C supported.
+    kwargs: logistic regression input functions.
+        keys random_state, max_iter, tol, and C supported.
     random_state: int/float for reproducible initialization.
     '''

     # only binary so far
     def __init__(self, **kwargs):
-        # supported keys = ['max_iter', 'tol', 'C']
+        # supported keys = ['max_iter', 'tol', 'C', 'random_state']
         # defaults to a fixed value for reproducibility
         self.random_state = kwargs.pop('random_state', 0)
         # parameters for logistic regression model:
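As context for the change above, a minimal usage sketch of the new keyword-argument interface (illustrative only; the supported keys come from the diff, and the import path is assumed from the file location):

```python
# Hypothetical usage sketch -- not part of the commit.
# Import path assumed from models/LogReg.py; adjust to the project's packaging.
from models.LogReg import LogReg

# Any subset of the supported keys may be passed; omitted keys fall back to
# the defaults popped in __init__ (random_state defaults to 0 per the diff).
model = LogReg(max_iter=2022, tol=0.5, C=5.0, random_state=0)
```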
59 changes: 27 additions & 32 deletions models/SSML/CoTraining.py
@@ -19,37 +19,35 @@ class CoTraining:
     Add multinomial functions and unit tests.
     Add functionality for regression(?)
     Inputs:
-    params: dictionary of logistic regression input functions.
-        keys max_iter, tol, and C supported.
+    kwargs: logistic regression input functions.
+        keys random_state, max_iter, tol, and C supported.
     random_state: int/float for reproducible initialization.
     '''

     # only binary so far
-    def __init__(self, params=None, random_state=0):
+    def __init__(self, **kwargs):
+        # supported keys = ['max_iter', 'tol', 'C', 'random_state']
         # defaults to a fixed value for reproducibility
-        self.random_state = random_state
-        # dictionary of parameters for logistic regression model
-        self.params = params
-        if self.params is None:
-            self.model1 = linear_model.LogisticRegression(
-                random_state=self.random_state)
-            self.model2 = linear_model.LogisticRegression(
-                random_state=self.random_state)
-            # default needed for training
-            self.params = {'n_samples': 1}
-        else:
-            self.model1 = linear_model.LogisticRegression(
-                random_state=self.random_state,
-                max_iter=params['max_iter'],
-                tol=params['tol'],
-                C=params['C']
-            )
-            self.model2 = linear_model.LogisticRegression(
-                random_state=self.random_state,
-                max_iter=params['max_iter'],
-                tol=params['tol'],
-                C=params['C']
-            )
+        self.random_state = kwargs.pop('random_state', 0)
+        self.seed = kwargs.pop('seed', 0)
+        # parameters for cotraining logistic regression models:
+        # defaults to sklearn.linear_model.LogisticRegression default vals
+        self.max_iter = kwargs.pop('max_iter', 100)
+        self.tol = kwargs.pop('tol', 0.0001)
+        self.C = kwargs.pop('C', 1.0)
+        self.n_samples = kwargs.pop('n_samples', 1)
+        self.model1 = linear_model.LogisticRegression(
+            random_state=self.random_state,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            C=self.C
+        )
+        self.model2 = linear_model.LogisticRegression(
+            random_state=self.random_state,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            C=self.C
+        )

     def training_loop(self, slr1, slr2, L_lr1, L_lr2,
                       Ly_lr1, Ly_lr2, U_lr, n_samples,
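A short sketch of how the rewritten constructor behaves (an interpretation of the diff above; the defaults shown are the ones popped in `__init__`, and the import path is assumed from the file location):

```python
# Hypothetical sketch of the new kwargs behavior -- not part of the commit.
# Import path assumed from models/SSML/CoTraining.py.
from models.SSML.CoTraining import CoTraining

# Every hyperparameter is popped from kwargs with an explicit default, so a
# bare constructor call is equivalent to spelling those defaults out:
clf_default = CoTraining()
clf_explicit = CoTraining(max_iter=100, tol=0.0001, C=1.0,
                          n_samples=1, seed=0, random_state=0)
assert clf_default.max_iter == clf_explicit.max_iter  # both resolve to 100
```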
@@ -155,7 +153,7 @@ def fresh_start(self, params, data_dict):
         # unlabeled co-training data
         Ux = data_dict['Ux']

-        clf = CoTraining(params=params, random_state=self.random_state)
+        clf = CoTraining(**params, random_state=self.random_state)
         # training and testing
         model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy)
         # uses balanced_accuracy accounts for class imbalanced data
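The fresh_start change swaps the params dictionary for keyword unpacking; a hedged sketch of what that call expands to (dictionary contents and the random_state value are illustrative placeholders):

```python
# Illustrative only -- shows the dict-unpacking equivalence, not repo code.
from models.SSML.CoTraining import CoTraining  # import path assumed

params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0, 'n_samples': 1}

# CoTraining(**params, random_state=self.random_state) expands to:
clf = CoTraining(max_iter=2022, tol=0.5, C=5.0, n_samples=1,
                 random_state=0)  # random_state supplied separately; 0 as a stand-in
```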
@@ -239,10 +237,7 @@ def train(self, trainx, trainy, Ux,
         U_lr = Ux.copy()

         # set the random seed of training splits for reproducibility
-        # This can be ignored by excluding params['seed']
-        # in the hyperopt space dictionary
-        if 'seed' in self.params.keys():
-            np.random.seed(self.params['seed'])
+        np.random.seed(self.seed)

         # TODO: allow a user to specify uneven splits between the two models
         split_frac = 0.5
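One way to read this change: random_state still controls the sklearn models, while the new seed attribute (popped in `__init__` above) seeds numpy's global RNG before the training splits. A hedged sketch of the separation:

```python
# Illustrative only: the two randomness controls are now separate attributes.
import numpy as np
from models.SSML.CoTraining import CoTraining  # import path assumed

clf = CoTraining(random_state=0,  # forwarded to both LogisticRegression models
                 seed=42)         # consumed by np.random.seed(self.seed) in train()
np.random.seed(clf.seed)  # mirrors the unconditional call train() now makes
```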
@@ -262,7 +257,7 @@ def train(self, trainx, trainy, Ux,
             self.model1, self.model2,
             L_lr1, L_lr2,
             Ly_lr1, Ly_lr2,
-            U_lr, self.params['n_samples'],
+            U_lr, self.n_samples,
             testx, testy,
         )

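Putting the pieces together, a hedged end-to-end sketch of the train/predict calls as they appear in fresh_start() and the tests (the data below is placeholder numpy input, not the project's spectra):

```python
# Illustrative only -- placeholder data standing in for real spectra/labels.
import numpy as np
from sklearn.model_selection import train_test_split
from models.SSML.CoTraining import CoTraining  # import path assumed

rng = np.random.default_rng(0)
spectra = rng.random((200, 10))
labels = rng.integers(0, 2, 200)

X, Ux, y, Uy = train_test_split(spectra, labels, test_size=0.5, random_state=0)
trainx, testx, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=0)

clf = CoTraining(max_iter=2022, tol=0.5, C=5.0, n_samples=1, seed=0, random_state=0)
model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy)  # as in fresh_start()
pred, acc, *_ = clf.predict(testx, testy)                               # as in the tests
```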
31 changes: 13 additions & 18 deletions tests/test_models.py
@@ -67,16 +67,6 @@ def test_cross_validation():
     # therefore its accuracy should be less than all other folds
     assert (accs[-1] < accs[:-1]).all()

-    # test cross validation for supervised data and StratifiedKFold with LogReg
-    # params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
-    # model = LogReg(params=params)
-    # max_acc_model = utils.cross_validation(model=model,
-    #                                        X=X,
-    #                                        y=y,
-    #                                        params=params,
-    #                                        stratified=True)
-    # assert max_acc_model['accuracy'] >= 0.5
-
     # test cross validation for SSML with LabelProp
     # params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
     # model = LabelProp(params=params)
@@ -106,9 +96,10 @@ def test_pca():
     utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 2)
     os.remove(filename+'.png')

-    # filename = 'test_multiD_pca'
-    # utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5)
-    # os.remove(filename+'.png')
+    filename = 'test_multiD_pca'
+    pcs = utils.pca(X_train, Ux, 5)
+    utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 5)
+    os.remove(filename+'.png')

     # normalization
     normalizer = StandardScaler()
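For orientation on the re-enabled block above, a hedged sketch of the plotting pattern it exercises (names are reused from the surrounding test; the utils.pca / utils.plot_pca signatures are inferred from that test, not verified against utils.py):

```python
import os
import numpy as np

# Inferred usage only.
filename = 'example_pca'
pcs = utils.pca(X_train, Ux, 5)  # project labeled + unlabeled data onto 5 principal components
# unlabeled samples get a placeholder label of -1 so the plot can distinguish them
utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 5)
os.remove(filename + '.png')  # the test cleans up the figure it generates
```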
@@ -197,7 +188,9 @@ def test_LogReg():
 def test_CoTraining():
     # test saving model input parameters
     params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
-    model = CoTraining(params=params)
+    model = CoTraining(max_iter=params['max_iter'],
+                       tol=params['tol'],
+                       C=params['C'])

     assert model.model1.max_iter == params['max_iter']
     assert model.model1.tol == params['tol']
@@ -207,8 +200,8 @@ def test_CoTraining():
     assert model.model2.tol == params['tol']
     assert model.model2.C == params['C']

-    X, Ux, y, Uy = train_test_split(spectra,
-                                    labels,
+    X, Ux, y, Uy = train_test_split(pytest.spectra,
+                                    pytest.labels,
                                     test_size=0.5,
                                     random_state=0)
     X_train, X_test, y_train, y_test = train_test_split(X,
@@ -231,8 +224,10 @@
     # testing train and predict methods
     pred, acc, *_ = model.predict(X_test, y_test)

-    assert acc > 0.7
-    np.testing.assert_equal(pred, y_test)
+    # since the test data used here is synthetic/toy data (i.e. uninteresting),
+    # the trained model should be at least better than a 50-50 guess
+    # if it was worse, something would be wrong with the ML class
+    assert acc > 0.5

     # testing hyperopt optimize methods
     space = {'max_iter': scope.int(hp.quniform('max_iter',
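The space dictionary above is truncated in the diff; a hedged sketch of what such a hyperopt search space typically looks like (the bounds below are placeholders, not the repository's values):

```python
# Illustrative hyperopt search space only -- the actual bounds are not shown above.
import numpy as np
from hyperopt import hp
from hyperopt.pyll import scope

space = {'max_iter': scope.int(hp.quniform('max_iter', 10, 10000, 10)),
         'tol': hp.loguniform('tol', np.log(1e-5), np.log(1e-1)),
         'C': hp.uniform('C', 0.1, 1000.0)}
```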
