From 3fcdb3afe86f26537faae14e2756a15af0a5940c Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Sun, 12 Nov 2023 20:58:01 +0900 Subject: [PATCH 1/5] refactor: simplify logic in datamodule (#295) * refactor: simplify logic in datamodule * black --- src/pytorch_tabular/tabular_datamodule.py | 160 +++++++++++----------- 1 file changed, 81 insertions(+), 79 deletions(-) diff --git a/src/pytorch_tabular/tabular_datamodule.py b/src/pytorch_tabular/tabular_datamodule.py index 9d639969..3c756ff2 100644 --- a/src/pytorch_tabular/tabular_datamodule.py +++ b/src/pytorch_tabular/tabular_datamodule.py @@ -160,29 +160,26 @@ def _encode_date_columns(self, data: pd.DataFrame) -> pd.DataFrame: return data, added_features def _encode_categorical_columns(self, data: pd.DataFrame, stage: str) -> pd.DataFrame: - if stage == "fit": - if self.do_leave_one_out_encoder(): - logger.debug("Encoding Categorical Columns using LeavOneOutEncoder") - self.categorical_encoder = ce.LeaveOneOutEncoder( - cols=self.config.categorical_cols, random_state=self.seed - ) - # Multi-Target Regression uses the first target to encode the categorical columns - if len(self.config.target) > 1: - logger.warning( - f"Multi-Target Regression: using the first target({self.config.target[0]})" - f" to encode the categorical columns" - ) - data = self.categorical_encoder.fit_transform(data, data[self.config.target[0]]) - else: - logger.debug("Encoding Categorical Columns using OrdinalEncoder") - self.categorical_encoder = OrdinalEncoder( - cols=self.config.categorical_cols, - handle_unseen="impute" if self.config.handle_unknown_categories else "error", - handle_missing="impute" if self.config.handle_missing_values else "error", + if stage != "fit": + return self.categorical_encoder.transform(data) + if self.do_leave_one_out_encoder(): + logger.debug("Encoding Categorical Columns using LeavOneOutEncoder") + self.categorical_encoder = 
ce.LeaveOneOutEncoder(cols=self.config.categorical_cols, random_state=self.seed) + # Multi-Target Regression uses the first target to encode the categorical columns + if len(self.config.target) > 1: + logger.warning( + f"Multi-Target Regression: using the first target({self.config.target[0]})" + f" to encode the categorical columns" ) - data = self.categorical_encoder.fit_transform(data) + data = self.categorical_encoder.fit_transform(data, data[self.config.target[0]]) else: - data = self.categorical_encoder.transform(data) + logger.debug("Encoding Categorical Columns using OrdinalEncoder") + self.categorical_encoder = OrdinalEncoder( + cols=self.config.categorical_cols, + handle_unseen="impute" if self.config.handle_unknown_categories else "error", + handle_missing="impute" if self.config.handle_missing_values else "error", + ) + data = self.categorical_encoder.fit_transform(data) return data def _transform_continuous_columns(self, data: pd.DataFrame, stage: str) -> pd.DataFrame: @@ -212,30 +209,33 @@ def _normalize_continuous_columns(self, data: pd.DataFrame, stage: str) -> pd.Da return data def _label_encode_target(self, data: pd.DataFrame, stage: str) -> pd.DataFrame: - if self.config.task == "classification": - if stage == "fit": - self.label_encoder = LabelEncoder() - data[self.config.target[0]] = self.label_encoder.fit_transform(data[self.config.target[0]]) - else: - if self.config.target[0] in data.columns: - data[self.config.target[0]] = self.label_encoder.transform(data[self.config.target[0]]) + if self.config.task != "classification": + return data + if stage == "fit": + self.label_encoder = LabelEncoder() + data[self.config.target[0]] = self.label_encoder.fit_transform(data[self.config.target[0]]) + else: + if self.config.target[0] in data.columns: + data[self.config.target[0]] = self.label_encoder.transform(data[self.config.target[0]]) return data def _target_transform(self, data: pd.DataFrame, stage: str) -> pd.DataFrame: - if self.config.task == 
"regression": - # target transform only for regression - if all(col in data.columns for col in self.config.target): - if self.do_target_transform: - if stage == "fit": - target_transforms = [] - for col in self.config.target: - _target_transform = copy.deepcopy(self.target_transform_template) - data[col] = _target_transform.fit_transform(data[col].values.reshape(-1, 1)) - target_transforms.append(_target_transform) - self.target_transforms = target_transforms - else: - for col, _target_transform in zip(self.config.target, self.target_transforms): - data[col] = _target_transform.transform(data[col].values.reshape(-1, 1)) + if self.config.task != "regression": + return data + # target transform only for regression + if not all(col in data.columns for col in self.config.target): + return data + if self.do_target_transform: + if stage == "fit": + target_transforms = [] + for col in self.config.target: + _target_transform = copy.deepcopy(self.target_transform_template) + data[col] = _target_transform.fit_transform(data[col].values.reshape(-1, 1)) + target_transforms.append(_target_transform) + self.target_transforms = target_transforms + else: + for col, _target_transform in zip(self.config.target, self.target_transforms): + data[col] = _target_transform.transform(data[col].values.reshape(-1, 1)) return data def preprocess_data(self, data: pd.DataFrame, stage: str = "inference") -> Tuple[pd.DataFrame, list]: @@ -286,27 +286,28 @@ def setup(self, stage: Optional[str] = None) -> None: stage (Optional[str], optional): Internal parameter to distinguish between fit and inference. Defaults to None. """ - if stage == "fit" or stage is None: - logger.info(f"Setting up the datamodule for {self.config.task} task") - if self.validation is None: - logger.debug( - f"No validation data provided." 
- f" Using {self.config.validation_split*100}% of train data as validation" - ) - val_idx = self.train.sample( - int(self.config.validation_split * len(self.train)), - random_state=self.seed, - ).index - self.validation = self.train[self.train.index.isin(val_idx)] - self.train = self.train[~self.train.index.isin(val_idx)] - else: - self.validation = self.validation.copy() - # Preprocessing Train, Validation - self.train, _ = self.preprocess_data(self.train, stage="fit") - self.validation, _ = self.preprocess_data(self.validation, stage="inference") - if self.test is not None: - self.test, _ = self.preprocess_data(self.test, stage="inference") - self._fitted = True + if not (stage is None or stage == "fit"): + return + logger.info(f"Setting up the datamodule for {self.config.task} task") + if self.validation is None: + logger.debug( + f"No validation data provided." + f" Using {self.config.validation_split*100}% of train data as validation" + ) + val_idx = self.train.sample( + int(self.config.validation_split * len(self.train)), + random_state=self.seed, + ).index + self.validation = self.train[self.train.index.isin(val_idx)] + self.train = self.train[~self.train.index.isin(val_idx)] + else: + self.validation = self.validation.copy() + # Preprocessing Train, Validation + self.train, _ = self.preprocess_data(self.train, stage="fit") + self.validation, _ = self.preprocess_data(self.validation, stage="inference") + if self.test is not None: + self.test, _ = self.preprocess_data(self.test, stage="inference") + self._fitted = True # adapted from gluonts @classmethod @@ -561,22 +562,23 @@ def test_dataloader(self, batch_size: Optional[int] = None) -> DataLoader: Returns: DataLoader: Test dataloader """ - if self.test is not None: - dataset = TabularDataset( - task=self.config.task, - data=self.test, - categorical_cols=self.config.categorical_cols, - continuous_cols=self.config.continuous_cols, - embed_categorical=(not self.do_leave_one_out_encoder()), - 
target=self.target, - ) - return DataLoader( - dataset, - batch_size if batch_size is not None else self.batch_size, - shuffle=False, - num_workers=self.config.num_workers, - pin_memory=self.config.pin_memory, - ) + if self.test is None: + raise RuntimeError("Undefined test attribute.") + dataset = TabularDataset( + task=self.config.task, + data=self.test, + categorical_cols=self.config.categorical_cols, + continuous_cols=self.config.continuous_cols, + embed_categorical=(not self.do_leave_one_out_encoder()), + target=self.target, + ) + return DataLoader( + dataset, + batch_size if batch_size is not None else self.batch_size, + shuffle=False, + num_workers=self.config.num_workers, + pin_memory=self.config.pin_memory, + ) def _prepare_inference_data(self, df: pd.DataFrame) -> pd.DataFrame: """Prepare data for inference.""" From 95d56405aa04e2b04304ecd2d49d862b3281608a Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Sun, 12 Nov 2023 21:00:33 +0900 Subject: [PATCH 2/5] tests: prune meaningless `assert True` (#294) * tests: prune meaningless `assert True` * ... 
--- tests/test_autoint.py | 213 ++++++++++----------- tests/test_categorical_embedding.py | 180 +++++++++--------- tests/test_datamodule.py | 97 +++++----- tests/test_ft_transformer.py | 156 ++++++++-------- tests/test_gandalf.py | 136 +++++++------- tests/test_gate.py | 156 ++++++++-------- tests/test_mdn.py | 128 ++++++------- tests/test_node.py | 156 ++++++++-------- tests/test_ssl.py | 278 ++++++++++++++-------------- tests/test_tabnet.py | 136 +++++++------- tests/test_tabtransformer.py | 156 ++++++++-------- 11 files changed, 868 insertions(+), 924 deletions(-) diff --git a/tests/test_autoint.py b/tests/test_autoint.py index f9c6bb32..3626ce77 100644 --- a/tests/test_autoint.py +++ b/tests/test_autoint.py @@ -32,53 +32,50 @@ def test_regression( attention_pooling, ): (train, test, target) = regression_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "regression"} - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "regression"} + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - model_config_params["deep_layers"] = 
deep_layers - model_config_params["batch_norm_continuous_input"] = batch_norm_continuous_input - model_config_params["attention_pooling"] = attention_pooling - model_config = AutoIntConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=3, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + ) + model_config_params["target_range"] = _target_range + model_config_params["deep_layers"] = deep_layers + model_config_params["batch_norm_continuous_input"] = batch_norm_continuous_input + model_config_params["attention_pooling"] = attention_pooling + model_config = AutoIntConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=3, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - # print(result[0]["valid_loss"]) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + # print(result[0]["valid_loss"]) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -102,42 +99,39 @@ def test_classification( batch_norm_continuous_input, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target, - 
continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "classification"} - model_config_params["deep_layers"] = deep_layers - model_config_params["batch_norm_continuous_input"] = batch_norm_continuous_input - model_config = AutoIntConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=3, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "classification"} + model_config_params["deep_layers"] = deep_layers + model_config_params["batch_norm_continuous_input"] = batch_norm_continuous_input + model_config = AutoIntConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=3, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - # print(result[0]["valid_loss"]) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + # print(result[0]["valid_loss"]) + assert "test_accuracy" in 
result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] # @pytest.mark.parametrize( @@ -174,36 +168,33 @@ def test_classification( # aug_task, # ): # (train, test, target) = regression_data -# if len(continuous_cols) + len(categorical_cols) == 0: -# assert True -# else: -# data_config = DataConfig( -# target=target, -# continuous_cols=continuous_cols, -# categorical_cols=categorical_cols, -# continuous_feature_transform=continuous_feature_transform, -# normalize_continuous_features=normalize_continuous_features, -# ) -# model_config_params = dict(task="ssl", ssl_task=ssl_task, aug_task=aug_task) -# model_config_params["deep_layers"] = deep_layers -# model_config_params["batch_norm_continuous_input"] = batch_norm_continuous_input -# model_config_params["attention_pooling"] = attention_pooling -# model_config = AutoIntConfig(**model_config_params) -# trainer_config = TrainerConfig( -# max_epochs=3, -# checkpoints=None, -# early_stopping=None, -# fast_dev_run=True, -# ) -# optimizer_config = OptimizerConfig() - -# tabular_model = TabularModel( -# data_config=data_config, -# model_config=model_config, -# optimizer_config=optimizer_config, -# trainer_config=trainer_config, -# ) -# tabular_model.fit(train=train, test=test) - -# result = tabular_model.evaluate(test) -# assert "test_mean_squared_error" in result[0].keys() +# data_config = DataConfig( +# target=target, +# continuous_cols=continuous_cols, +# categorical_cols=categorical_cols, +# continuous_feature_transform=continuous_feature_transform, +# normalize_continuous_features=normalize_continuous_features, +# ) +# model_config_params = dict(task="ssl", ssl_task=ssl_task, aug_task=aug_task) +# model_config_params["deep_layers"] = deep_layers +# model_config_params["batch_norm_continuous_input"] = batch_norm_continuous_input +# model_config_params["attention_pooling"] = attention_pooling +# model_config = AutoIntConfig(**model_config_params) +# trainer_config = 
TrainerConfig( +# max_epochs=3, +# checkpoints=None, +# early_stopping=None, +# fast_dev_run=True, +# ) +# optimizer_config = OptimizerConfig() +# +# tabular_model = TabularModel( +# data_config=data_config, +# model_config=model_config, +# optimizer_config=optimizer_config, +# trainer_config=trainer_config, +# ) +# tabular_model.fit(train=train, test=test) +# +# result = tabular_model.evaluate(test) +# assert "test_mean_squared_error" in result[0].keys() diff --git a/tests/test_categorical_embedding.py b/tests/test_categorical_embedding.py index c48467d5..3917c589 100644 --- a/tests/test_categorical_embedding.py +++ b/tests/test_categorical_embedding.py @@ -64,64 +64,64 @@ def test_regression( (train, test, target) = regression_data (custom_metrics, custom_metrics_prob_input, custom_loss, custom_optimizer) = custom_args if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "regression"} - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + return + + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "regression"} + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - if custom_head_config 
is not None: - model_config_params["head"] = "LinearHead" - model_config_params["head_config"] = {"layers": custom_head_config} - model_config = CategoryEmbeddingModelConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=3, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() - - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit( - train=train, - test=test, - metrics=custom_metrics, - metrics_prob_inputs=custom_metrics_prob_input, - target_transform=target_transform, - loss=custom_loss, - optimizer=custom_optimizer, - optimizer_params={}, - ) - - result = tabular_model.evaluate(test) - # print(result[0]["valid_loss"]) - if custom_metrics is None: - assert "test_mean_squared_error" in result[0].keys() - else: - assert "test_fake_metric" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + ) + model_config_params["target_range"] = _target_range + if custom_head_config is not None: + model_config_params["head"] = "LinearHead" + model_config_params["head_config"] = {"layers": custom_head_config} + model_config = CategoryEmbeddingModelConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=3, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() + + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit( + train=train, + test=test, + metrics=custom_metrics, + metrics_prob_inputs=custom_metrics_prob_input, + target_transform=target_transform, + loss=custom_loss, + optimizer=custom_optimizer, + optimizer_params={}, + ) + + result = tabular_model.evaluate(test) + # 
print(result[0]["valid_loss"]) + if custom_metrics is None: + assert "test_mean_squared_error" in result[0].keys() + else: + assert "test_fake_metric" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -143,39 +143,39 @@ def test_classification( ): (train, test, target) = classification_data if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "classification"} - model_config = CategoryEmbeddingModelConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=3, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() - - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) - - result = tabular_model.evaluate(test) - # print(result[0]["valid_loss"]) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + return + + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "classification"} + model_config = CategoryEmbeddingModelConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=3, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() + + tabular_model = TabularModel( + 
data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) + + result = tabular_model.evaluate(test) + # print(result[0]["valid_loss"]) + assert "test_accuracy" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] def test_embedding_transformer(regression_data): diff --git a/tests/test_datamodule.py b/tests/test_datamodule.py index c88c0fe5..edbdecd0 100644 --- a/tests/test_datamodule.py +++ b/tests/test_datamodule.py @@ -53,54 +53,54 @@ def test_dataloader( (train, test, target) = regression_data train, valid = train_test_split(train, random_state=42) if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - validation_split=validation_split, - ) - model_config_params = {"task": "regression", "embedding_dims": embedding_dims} - model_config = CategoryEmbeddingModelConfig(**model_config_params) - trainer_config = TrainerConfig(max_epochs=1, checkpoints=None, early_stopping=None) - optimizer_config = OptimizerConfig() + return - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - config = tabular_model.config - datamodule = TabularDatamodule( - train=train, - validation=valid, - config=config, - test=test, - target_transform=target_transform, - ) - datamodule.prepare_data() - datamodule.setup("fit") - inferred_config = datamodule.update_config(config) - if len(categorical_cols) > 0: - assert inferred_config.categorical_cardinality[0] == 5 - if embedding_dims is None: - assert 
inferred_config.embedding_dims[0][-1] == 3 - else: - assert inferred_config.embedding_dims[0][-1] == embedding_dims[0][-1] - if normalize_continuous_features and len(continuous_cols) > 0: - assert round(datamodule.train[config.continuous_cols[0]].mean()) == 0 - assert round(datamodule.train[config.continuous_cols[0]].std()) == 1 - # assert round(datamodule.validation[config.continuous_cols[0]].mean()) == 0 - # assert round(datamodule.validation[config.continuous_cols[0]].std()) == 1 - val_loader = datamodule.val_dataloader() - _val_loader = datamodule.prepare_inference_dataloader(valid) - chk_1 = next(iter(val_loader))["continuous"] - chk_2 = next(iter(_val_loader))["continuous"] - assert np.not_equal(chk_1, chk_2).sum().item() == 0 + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + validation_split=validation_split, + ) + model_config_params = {"task": "regression", "embedding_dims": embedding_dims} + model_config = CategoryEmbeddingModelConfig(**model_config_params) + trainer_config = TrainerConfig(max_epochs=1, checkpoints=None, early_stopping=None) + optimizer_config = OptimizerConfig() + + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + config = tabular_model.config + datamodule = TabularDatamodule( + train=train, + validation=valid, + config=config, + test=test, + target_transform=target_transform, + ) + datamodule.prepare_data() + datamodule.setup("fit") + inferred_config = datamodule.update_config(config) + if len(categorical_cols) > 0: + assert inferred_config.categorical_cardinality[0] == 5 + if embedding_dims is None: + assert inferred_config.embedding_dims[0][-1] == 3 + else: + assert 
inferred_config.embedding_dims[0][-1] == embedding_dims[0][-1] + if normalize_continuous_features and len(continuous_cols) > 0: + assert round(datamodule.train[config.continuous_cols[0]].mean()) == 0 + assert round(datamodule.train[config.continuous_cols[0]].std()) == 1 + # assert round(datamodule.validation[config.continuous_cols[0]].mean()) == 0 + # assert round(datamodule.validation[config.continuous_cols[0]].std()) == 1 + val_loader = datamodule.val_dataloader() + _val_loader = datamodule.prepare_inference_dataloader(valid) + chk_1 = next(iter(val_loader))["continuous"] + chk_2 = next(iter(_val_loader))["continuous"] + assert np.not_equal(chk_1, chk_2).sum().item() == 0 @pytest.mark.parametrize( @@ -148,6 +148,5 @@ def test_date_encoding(timeseries_data, freq): elif freq == "S": try: datamodule.setup("fit") - assert False except RuntimeError: - assert True + pass diff --git a/tests/test_ft_transformer.py b/tests/test_ft_transformer.py index 1f41fa32..567f86d9 100644 --- a/tests/test_ft_transformer.py +++ b/tests/test_ft_transformer.py @@ -37,53 +37,52 @@ def test_regression( ): (train, test, target) = regression_data if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "regression", - "input_embed_dim": 8, - "num_attn_blocks": 1, - "num_heads": 2, - } - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + return + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + 
continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "regression", + "input_embed_dim": 8, + "num_attn_blocks": 1, + "num_heads": 2, + } + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - model_config = FTTransformerConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + ) + model_config_params["target_range"] = _target_range + model_config = FTTransformerConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -103,44 +102,41 @@ def test_classification( normalize_continuous_features, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - 
data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "classification", - "input_embed_dim": 8, - "num_attn_blocks": 1, - "num_heads": 2, - } - model_config = FTTransformerConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "classification", + "input_embed_dim": 8, + "num_attn_blocks": 1, + "num_heads": 2, + } + model_config = FTTransformerConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_accuracy" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] def 
test_embedding_transformer(regression_data): diff --git a/tests/test_gandalf.py b/tests/test_gandalf.py index 88d84f5f..eb6c02ff 100644 --- a/tests/test_gandalf.py +++ b/tests/test_gandalf.py @@ -37,49 +37,46 @@ def test_regression( target_range, ): (train, test, target) = regression_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "regression", "gflu_stages": 1} - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "regression", "gflu_stages": 1} + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - model_config = GANDALFConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + ) + model_config_params["target_range"] = _target_range + model_config = GANDALFConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = 
TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -99,39 +96,36 @@ def test_classification( normalize_continuous_features, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "classification", "gflu_stages": 1} - model_config = GANDALFConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "classification", "gflu_stages": 1} + model_config = GANDALFConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + 
accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_accuracy" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] # def test_embedding_transformer(regression_data): diff --git a/tests/test_gate.py b/tests/test_gate.py index b08165ff..c56bc4ca 100644 --- a/tests/test_gate.py +++ b/tests/test_gate.py @@ -37,54 +37,51 @@ def test_regression( target_range, ): (train, test, target) = regression_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "regression", - "gflu_stages": 1, - "tree_depth": 1, - "num_trees": 2, - } - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + 
normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "regression", + "gflu_stages": 1, + "tree_depth": 1, + "num_trees": 2, + } + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - model_config = GatedAdditiveTreeEnsembleConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + ) + model_config_params["target_range"] = _target_range + model_config = GatedAdditiveTreeEnsembleConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -104,44 +101,41 @@ def test_classification( normalize_continuous_features, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target, - 
continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "classification", - "gflu_stages": 1, - "tree_depth": 1, - "num_trees": 2, - } - model_config = GatedAdditiveTreeEnsembleConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "classification", + "gflu_stages": 1, + "tree_depth": 1, + "num_trees": 2, + } + model_config = GatedAdditiveTreeEnsembleConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_accuracy" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] # def test_embedding_transformer(regression_data): diff --git 
a/tests/test_mdn.py b/tests/test_mdn.py index 36a85860..7329d04c 100644 --- a/tests/test_mdn.py +++ b/tests/test_mdn.py @@ -38,45 +38,42 @@ def test_regression( num_gaussian, ): (train, test, target) = regression_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "regression"} - mdn_config = {"num_gaussian": num_gaussian} - model_config_params["head_config"] = mdn_config - model_config_params["backbone_config_class"] = variant - model_config_params["backbone_config_params"] = {"task": "backbone"} + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "regression"} + mdn_config = {"num_gaussian": num_gaussian} + model_config_params["head_config"] = mdn_config + model_config_params["backbone_config_class"] = variant + model_config_params["backbone_config_params"] = {"task": "backbone"} - model_config = MDNConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=3, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + model_config = MDNConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=3, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - 
trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - # print(result[0]["valid_loss"]) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + # print(result[0]["valid_loss"]) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -99,36 +96,33 @@ def test_classification( num_gaussian, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "classification"} - mdn_config = {"num_gaussian": num_gaussian} - model_config_params["head_config"] = mdn_config - model_config_params["backbone_config_class"] = "CategoryEmbeddingMDNConfig" - model_config_params["backbone_config_params"] = {"task": "backbone"} + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "classification"} + mdn_config = {"num_gaussian": num_gaussian} + model_config_params["head_config"] = mdn_config + model_config_params["backbone_config_class"] = "CategoryEmbeddingMDNConfig" + model_config_params["backbone_config_params"] = {"task": 
"backbone"} - model_config = MDNConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=3, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, + model_config = MDNConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=3, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() + with pytest.raises(AssertionError): + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, ) - optimizer_config = OptimizerConfig() - with pytest.raises(AssertionError): - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model.fit(train=train, test=test) diff --git a/tests/test_node.py b/tests/test_node.py index fa04fae0..f07494d1 100644 --- a/tests/test_node.py +++ b/tests/test_node.py @@ -38,54 +38,51 @@ def test_regression( target_range, ): (train, test, target) = regression_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "regression", - "depth": 2, - "num_trees": 50, - "embed_categorical": embed_categorical, - } - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + 
categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "regression", + "depth": 2, + "num_trees": 50, + "embed_categorical": embed_categorical, + } + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - model_config = NodeConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + ) + model_config_params["target_range"] = _target_range + model_config = NodeConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -107,44 +104,41 @@ def test_classification( normalize_continuous_features, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 
0: - assert True - else: - data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "classification", - "depth": 2, - "num_trees": 50, - "embed_categorical": embed_categorical, - } - model_config = NodeConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "classification", + "depth": 2, + "num_trees": 50, + "embed_categorical": embed_categorical, + } + model_config = NodeConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_accuracy" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] def 
test_embedding_transformer(regression_data): diff --git a/tests/test_ssl.py b/tests/test_ssl.py index 1a732e5a..4ff6a338 100644 --- a/tests/test_ssl.py +++ b/tests/test_ssl.py @@ -58,90 +58,87 @@ def test_regression( ssl, finetune = train_test_split(train, random_state=42) ssl_train, ssl_val = train_test_split(ssl, random_state=42) finetune_train, finetune_val = train_test_split(finetune, random_state=42) - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - handle_missing_values=False, - handle_unknown_categories=False, - ) - encoder_config = CategoryEmbeddingModelConfig( - task="backbone", - layers="4096-2048-512", # Number of nodes in each layer - activation="LeakyReLU", # Activation between each layers - ) - decoder_config = CategoryEmbeddingModelConfig( - task="backbone", - layers="512-2048-4096", # Number of nodes in each layer - activation="LeakyReLU", # Activation between each layers - ) + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + handle_missing_values=False, + handle_unknown_categories=False, + ) + encoder_config = CategoryEmbeddingModelConfig( + task="backbone", + layers="4096-2048-512", # Number of nodes in each layer + activation="LeakyReLU", # Activation between each layers + ) + decoder_config = CategoryEmbeddingModelConfig( + task="backbone", + layers="512-2048-4096", # Number of nodes in each layer + activation="LeakyReLU", # Activation between each layers + ) - model_config_params = { - "encoder_config": 
encoder_config, - "decoder_config": decoder_config, - } - model_config = DenoisingAutoEncoderConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + model_config_params = { + "encoder_config": encoder_config, + "decoder_config": decoder_config, + } + model_config = DenoisingAutoEncoderConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.pretrain(train=ssl_train, validation=ssl_val) - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.pretrain(train=ssl_train, validation=ssl_val) + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - else: - _target_range = None - finetune_model = tabular_model.create_finetune_model( - task="regression", - head="LinearHead", - head_config={ - "layers": "64-32-16", - "activation": "LeakyReLU", - }, - trainer_config=trainer_config, - optimizer_config=optimizer_config, - target_range=_target_range, - loss=custom_loss, - metrics=custom_metrics, - metrics_params=[{}], - metrics_prob_input=metrics_prob_input, - optimizer=custom_optimizer, - ) - finetune_model.finetune( - train=finetune_train, - validation=finetune_val, - freeze_backbone=freeze_backbone, - 
target_transform=target_transform, - ) - result = finetune_model.evaluate(test) - if custom_metrics is None: - assert "test_mean_squared_error" in result[0].keys() - else: - assert "test_fake_metric" in result[0].keys() - pred_df = finetune_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + ) + else: + _target_range = None + finetune_model = tabular_model.create_finetune_model( + task="regression", + head="LinearHead", + head_config={ + "layers": "64-32-16", + "activation": "LeakyReLU", + }, + trainer_config=trainer_config, + optimizer_config=optimizer_config, + target_range=_target_range, + loss=custom_loss, + metrics=custom_metrics, + metrics_params=[{}], + metrics_prob_input=metrics_prob_input, + optimizer=custom_optimizer, + ) + finetune_model.finetune( + train=finetune_train, + validation=finetune_val, + freeze_backbone=freeze_backbone, + target_transform=target_transform, + ) + result = finetune_model.evaluate(test) + if custom_metrics is None: + assert "test_mean_squared_error" in result[0].keys() + else: + assert "test_fake_metric" in result[0].keys() + pred_df = finetune_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -166,65 +163,62 @@ def test_classification( ssl, finetune = train_test_split(train, random_state=42) ssl_train, ssl_val = train_test_split(ssl, random_state=42) finetune_train, finetune_val = train_test_split(finetune, random_state=42) - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - handle_missing_values=False, - handle_unknown_categories=False, - ) - encoder_config = CategoryEmbeddingModelConfig( - task="backbone", - layers="4096-2048-512", # Number of nodes in each layer - activation="LeakyReLU", # Activation 
between each layers - ) - decoder_config = CategoryEmbeddingModelConfig( - task="backbone", - layers="512-2048-4096", # Number of nodes in each layer - activation="LeakyReLU", # Activation between each layers - ) - model_config_params = { - "encoder_config": encoder_config, - "decoder_config": decoder_config, - } - model_config = DenoisingAutoEncoderConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + handle_missing_values=False, + handle_unknown_categories=False, + ) + encoder_config = CategoryEmbeddingModelConfig( + task="backbone", + layers="4096-2048-512", # Number of nodes in each layer + activation="LeakyReLU", # Activation between each layers + ) + decoder_config = CategoryEmbeddingModelConfig( + task="backbone", + layers="512-2048-4096", # Number of nodes in each layer + activation="LeakyReLU", # Activation between each layers + ) + model_config_params = { + "encoder_config": encoder_config, + "decoder_config": decoder_config, + } + model_config = DenoisingAutoEncoderConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.pretrain(train=ssl_train, validation=ssl_val) - finetune_model = tabular_model.create_finetune_model( - task="classification", - head="LinearHead", - head_config={ - "layers": "64-32-16", - "activation": "LeakyReLU", - }, - 
trainer_config=trainer_config, - optimizer_config=optimizer_config, - ) - finetune_model.finetune( - train=finetune_train, - validation=finetune_val, - freeze_backbone=freeze_backbone, - ) - result = finetune_model.evaluate(test) - assert "test_accuracy" in result[0].keys() - pred_df = finetune_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.pretrain(train=ssl_train, validation=ssl_val) + finetune_model = tabular_model.create_finetune_model( + task="classification", + head="LinearHead", + head_config={ + "layers": "64-32-16", + "activation": "LeakyReLU", + }, + trainer_config=trainer_config, + optimizer_config=optimizer_config, + ) + finetune_model.finetune( + train=finetune_train, + validation=finetune_val, + freeze_backbone=freeze_backbone, + ) + result = finetune_model.evaluate(test) + assert "test_accuracy" in result[0].keys() + pred_df = finetune_model.predict(test) + assert pred_df.shape[0] == test.shape[0] diff --git a/tests/test_tabnet.py b/tests/test_tabnet.py index 514f497c..14bd3401 100644 --- a/tests/test_tabnet.py +++ b/tests/test_tabnet.py @@ -35,49 +35,46 @@ def test_regression( target_range, ): (train, test, target) = regression_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "regression"} - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + data_config = DataConfig( + target=target + ["MedInc"] if 
multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "regression"} + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - model_config = TabNetModelConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + ) + model_config_params["target_range"] = _target_range + model_config = TabNetModelConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -95,36 +92,33 @@ def test_classification( normalize_continuous_features, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 0: - 
assert True - else: - data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = {"task": "classification"} - model_config = TabNetModelConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = {"task": "classification"} + model_config = TabNetModelConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_accuracy" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] diff --git a/tests/test_tabtransformer.py b/tests/test_tabtransformer.py index a88cb470..188df3aa 100644 --- a/tests/test_tabtransformer.py +++ 
b/tests/test_tabtransformer.py @@ -36,54 +36,51 @@ def test_regression( target_range, ): (train, test, target) = regression_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target + ["MedInc"] if multi_target else target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "regression", - "input_embed_dim": 8, - "num_attn_blocks": 1, - "num_heads": 2, - } - if target_range: - _target_range = [] - for target in data_config.target: - _target_range.append( - ( - float(train[target].min()), - float(train[target].max()), - ) + data_config = DataConfig( + target=target + ["MedInc"] if multi_target else target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "regression", + "input_embed_dim": 8, + "num_attn_blocks": 1, + "num_heads": 2, + } + if target_range: + _target_range = [] + for target in data_config.target: + _target_range.append( + ( + float(train[target].min()), + float(train[target].max()), ) - model_config_params["target_range"] = _target_range - model_config = TabTransformerConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + ) + model_config_params["target_range"] = _target_range + model_config = TabTransformerConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - 
data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_mean_squared_error" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_mean_squared_error" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] @pytest.mark.parametrize( @@ -103,44 +100,41 @@ def test_classification( normalize_continuous_features, ): (train, test, target) = classification_data - if len(continuous_cols) + len(categorical_cols) == 0: - assert True - else: - data_config = DataConfig( - target=target, - continuous_cols=continuous_cols, - categorical_cols=categorical_cols, - continuous_feature_transform=continuous_feature_transform, - normalize_continuous_features=normalize_continuous_features, - ) - model_config_params = { - "task": "classification", - "input_embed_dim": 8, - "num_attn_blocks": 1, - "num_heads": 2, - } - model_config = TabTransformerConfig(**model_config_params) - trainer_config = TrainerConfig( - max_epochs=1, - checkpoints=None, - early_stopping=None, - accelerator="cpu", - fast_dev_run=True, - ) - optimizer_config = OptimizerConfig() + data_config = DataConfig( + target=target, + continuous_cols=continuous_cols, + categorical_cols=categorical_cols, + continuous_feature_transform=continuous_feature_transform, + normalize_continuous_features=normalize_continuous_features, + ) + model_config_params = { + "task": "classification", + "input_embed_dim": 8, + "num_attn_blocks": 1, + "num_heads": 2, + } + model_config = 
TabTransformerConfig(**model_config_params) + trainer_config = TrainerConfig( + max_epochs=1, + checkpoints=None, + early_stopping=None, + accelerator="cpu", + fast_dev_run=True, + ) + optimizer_config = OptimizerConfig() - tabular_model = TabularModel( - data_config=data_config, - model_config=model_config, - optimizer_config=optimizer_config, - trainer_config=trainer_config, - ) - tabular_model.fit(train=train, test=test) + tabular_model = TabularModel( + data_config=data_config, + model_config=model_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + ) + tabular_model.fit(train=train, test=test) - result = tabular_model.evaluate(test) - assert "test_accuracy" in result[0].keys() - pred_df = tabular_model.predict(test) - assert pred_df.shape[0] == test.shape[0] + result = tabular_model.evaluate(test) + assert "test_accuracy" in result[0].keys() + pred_df = tabular_model.predict(test) + assert pred_df.shape[0] == test.shape[0] def test_embedding_transformer(regression_data): From 317a172950f089cff880394cdd8e0b92a628bb19 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Sun, 12 Nov 2023 21:02:01 +0900 Subject: [PATCH 3/5] ci: add action to label PR conflicts (#293) --- .github/workflows/label-conflicts.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/label-conflicts.yml diff --git a/.github/workflows/label-conflicts.yml b/.github/workflows/label-conflicts.yml new file mode 100644 index 00000000..01868ef0 --- /dev/null +++ b/.github/workflows/label-conflicts.yml @@ -0,0 +1,22 @@ +name: Label merge conflicts + +on: + push: + branches: ["main"] + pull_request_target: + types: ["synchronize", "reopened", "opened"] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + triage-conflicts: + runs-on: ubuntu-latest + steps: + - uses: 
mschilde/auto-label-merge-conflicts@8c6faa8a252e35ba5e15703b3d747bf726cdb95c # Oct 25, 2021 + with: + CONFLICT_LABEL_NAME: "has conflicts" + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + MAX_RETRIES: 3 + WAIT_MS: 5000 From 26e5055b97fe460f8b56d4ff61e539128f3d80cd Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Sun, 12 Nov 2023 21:03:20 +0900 Subject: [PATCH 4/5] docs: fix args in DataConfig (#292) --- docs/data.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/data.md b/docs/data.md index ef93353b..359de1fc 100644 --- a/docs/data.md +++ b/docs/data.md @@ -5,16 +5,16 @@ Pytorch Tabular handles this using a `DataConfig` object. ## Basic Usage - `target`: List\[str\]: A list of strings with the names of the target column(s) -- `continuous_columns`: List\[str\]: Column names of the numeric fields. Defaults to \[\] -- `categorical_columns`: List\[str\]: Column names of the categorical fields to treat differently +- `continuous_cols`: List\[str\]: Column names of the numeric fields. 
Defaults to \[\] +- `categorical_cols`: List\[str\]: Column names of the categorical fields to treat differently ### Usage Example ```python data_config = DataConfig( target=["label"], - continuous_columns=["feature_1", "feature_2"], - categorical_columns=["cat_feature_1", "cat_feature_2"], + continuous_cols=["feature_1", "feature_2"], + categorical_cols=["cat_feature_1", "cat_feature_2"], ) ``` From 1914c9164403bdd89bc2d97ec11a597cbb5a440b Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Mon, 13 Nov 2023 21:33:35 +0900 Subject: [PATCH 5/5] ci: cache tests dataset (#301) * ci: cache tests dataset * makedirs --- .github/workflows/testing.yml | 17 ++++++++++++++++- tests/conftest.py | 19 ++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 99cf8e63..f3b8ee0f 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -16,6 +16,9 @@ jobs: - {os: 'ubuntu-20.04', python-version: "3.7", requires: 'oldest'} - {os: 'ubuntu-20.04', python-version: "3.8", requires: 'oldest'} + env: + TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" + steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -34,8 +37,20 @@ jobs: - name: Install main package & dependencies run: | - pip install -e .[extra] -r requirements_dev.txt -f https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e .[extra] -r requirements_dev.txt -f ${TORCH_URL} pip list + - name: Restore test's datasets + uses: actions/cache/restore@v3 + with: + path: tests/.datasets + key: test-datasets + - name: Run test-suite run: python -m pytest -v + + - name: Save test's datasets + uses: actions/cache/save@v3 + with: + path: tests/.datasets + key: test-datasets diff --git a/tests/conftest.py b/tests/conftest.py index 4b891249..69f0e07a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,4 @@
-from io import BytesIO -from urllib.request import urlopen +import os.path from zipfile import ZipFile import numpy as np @@ -7,6 +6,18 @@ import pytest from sklearn.datasets import fetch_california_housing, fetch_covtype +_PATH_TEST = os.path.dirname(__file__) +PATH_DATASETS = os.path.join(_PATH_TEST, ".datasets") +os.makedirs(PATH_DATASETS, exist_ok=True) + +DATASET_ZIP_OCCUPANCY = os.path.join(PATH_DATASETS, "occupancy_data.zip") +if not os.path.isfile(DATASET_ZIP_OCCUPANCY): + import urllib.request + + urllib.request.urlretrieve( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip", DATASET_ZIP_OCCUPANCY + ) + def load_regression_data(): dataset = fetch_california_housing(data_home="data", as_frame=True) @@ -34,9 +45,7 @@ def load_classification_data(): def load_timeseries_data(): - url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip" - resp = urlopen(url) - zipfile = ZipFile(BytesIO(resp.read())) + zipfile = ZipFile(DATASET_ZIP_OCCUPANCY) train = pd.read_csv(zipfile.open("datatraining.txt"), sep=",") val = pd.read_csv(zipfile.open("datatest.txt"), sep=",") test = pd.read_csv(zipfile.open("datatest2.txt"), sep=",")