From 95fc695a89ae209139787e1dfa7cdd7e9de426a6 Mon Sep 17 00:00:00 2001
From: Jordan Stomps
Date: Wed, 18 Jan 2023 09:33:35 -0500
Subject: [PATCH] adjusting numpy.random.seed usage in cotraining

---
 models/SSML/CoTraining.py | 15 +++++++--------
 tests/test_models.py      | 16 ++++++++++++----
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py
index 16eac7d..9fda2db 100644
--- a/models/SSML/CoTraining.py
+++ b/models/SSML/CoTraining.py
@@ -16,20 +16,22 @@ class CoTraining:
     regression implementation with hyperparameter optimization.
     Data agnostic (i.e. user supplied data inputs).
     TODO: Currently only supports binary classification.
-        Add multinomial functions and unit tests.
-        Add functionality for regression(?)
+    - Add multinomial functions and unit tests.
+    - Add functionality for regression(?)
     Inputs:
     kwargs: logistic regression input functions.
-        keys random_state, max_iter, tol, and C supported.
-    random_state: int/float for reproducible intiailization.
+        keys seed, random_state, max_iter, tol, and C supported.
+    seed/random_state: int/float for reproducible intiailization.
     '''
 
     # only binary so far
     def __init__(self, **kwargs):
-        # supported keys = ['max_iter', 'tol', 'C', 'random_state']
+        # supported keys = ['max_iter', 'tol', 'C', 'random_state', 'seed']
         # defaults to a fixed value for reproducibility
         self.random_state = kwargs.pop('random_state', 0)
+        # set the random seed of training splits for reproducibility
         self.seed = kwargs.pop('seed', 0)
+        np.random.seed(self.seed)
         # parameters for cotraining logistic regression models:
         # defaults to sklearn.linear_model.LogisticRegression default vals
         self.max_iter = kwargs.pop('max_iter', 100)
@@ -236,9 +238,6 @@ def train(self, trainx, trainy, Ux,
         # avoid overwriting when deleting in co-training loop
         U_lr = Ux.copy()
 
-        # set the random seed of training splits for reproducibility
-        np.random.seed(self.seed)
-
         # TODO: allow a user to specify uneven splits between the two models
         split_frac = 0.5
         # labeled training data
diff --git a/tests/test_models.py b/tests/test_models.py
index b7bb087..334fc19 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -122,14 +122,16 @@ def test_pca():
 
 def test_LogReg():
     # test saving model input parameters
-    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0, 'random_state': 0}
     model = LogReg(max_iter=params['max_iter'],
                    tol=params['tol'],
-                   C=params['C'])
+                   C=params['C'],
+                   random_state=params['random_state'])
 
     assert model.model.max_iter == params['max_iter']
     assert model.model.tol == params['tol']
     assert model.model.C == params['C']
+    assert model.random_state == params['random_state']
 
     X_train, X_test, y_train, y_test = train_test_split(pytest.spectra,
                                                         pytest.labels,
@@ -187,10 +189,13 @@ def test_LogReg():
 
 def test_CoTraining():
     # test saving model input parameters
-    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0,
+              'random_state': 0, 'seed': 1}
     model = CoTraining(max_iter=params['max_iter'],
                        tol=params['tol'],
-                       C=params['C'])
+                       C=params['C'],
+                       random_state=params['random_state'],
+                       seed=params['seed'])
 
     assert model.model1.max_iter == params['max_iter']
     assert model.model1.tol == params['tol']
@@ -200,6 +205,9 @@ def test_CoTraining():
     assert model.model2.tol == params['tol']
     assert model.model2.C == params['C']
 
+    assert model.random_state == params['random_state']
+    assert model.seed == params['seed']
+
     X, Ux, y, Uy = train_test_split(pytest.spectra,
                                     pytest.labels,
                                     test_size=0.5,
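
The change above relies on how NumPy's legacy global RNG works: np.random.seed() sets process-wide state, so seeding once when a CoTraining object is constructed makes every later draw from np.random (such as the labeled-data split in train()) repeatable, rather than re-seeding at the top of each train() call. The sketch below is not part of the patch; TinyCoTrainer is a hypothetical stand-in that reduces CoTraining to just this seeding pattern.

import numpy as np


class TinyCoTrainer:
    """Hypothetical stand-in for CoTraining, reduced to the seeding pattern."""

    def __init__(self, **kwargs):
        # mirror the patched __init__: pop the seed kwarg and seed the
        # global NumPy RNG once, at construction time
        self.seed = kwargs.pop('seed', 0)
        np.random.seed(self.seed)

    def split(self, x):
        # stand-in for the 50/50 labeled-data split performed in train()
        idx = np.random.permutation(len(x))
        half = len(x) // 2
        return x[idx[:half]], x[idx[half:]]


x = np.arange(10)
a1, b1 = TinyCoTrainer(seed=1).split(x)
a2, b2 = TinyCoTrainer(seed=1).split(x)
# constructing with the same seed reproduces the same split
assert (a1 == a2).all() and (b1 == b2).all()

Because np.random.seed() is global, note that two objects constructed in the same process share (and reset) the same RNG state, with the most recent constructor call taking effect.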