Skip to content

Commit

Permalink
[update] Update the PR according to the latest version and disable sh…
Browse files Browse the repository at this point in the history
…uffle

Since shuffling is performed inside the split functions, we do not need
to shuffle before splitting.
For this reason, I removed the shuffle argument from BaseDataset and
added a shuffle key to resampling_strategy_args.
  • Loading branch information
nabenabe0928 committed May 10, 2021
1 parent 36cef27 commit 1e00413
Show file tree
Hide file tree
Showing 13 changed files with 70 additions and 71 deletions.
6 changes: 3 additions & 3 deletions autoPyTorch/api/base_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
STRING_TO_TASK_TYPES,
)
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutTypes
from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection
from autoPyTorch.ensemble.singlebest_ensemble import SingleBest
Expand Down Expand Up @@ -138,7 +138,7 @@ def __init__(
include_components: Optional[Dict] = None,
exclude_components: Optional[Dict] = None,
backend: Optional[Backend] = None,
resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy: Union[CrossValTypes, HoldoutTypes] = HoldoutTypes.holdout,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
task_type: Optional[str] = None
Expand Down Expand Up @@ -1171,7 +1171,7 @@ def predict(
assert self.ensemble_ is not None, "Load models should error out if no ensemble"
self.ensemble_ = cast(Union[SingleBest, EnsembleSelection], self.ensemble_)

if isinstance(self.resampling_strategy, HoldoutValTypes):
if isinstance(self.resampling_strategy, HoldoutTypes):
models = self.models_
elif isinstance(self.resampling_strategy, CrossValTypes):
models = self.cv_models_
Expand Down
4 changes: 2 additions & 2 deletions autoPyTorch/api/tabular_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
HoldoutTypes,
)
from autoPyTorch.datasets.tabular_dataset import TabularDataset
from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
Expand Down Expand Up @@ -72,7 +72,7 @@ def __init__(
delete_output_folder_after_terminate: bool = True,
include_components: Optional[Dict] = None,
exclude_components: Optional[Dict] = None,
resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy: Union[CrossValTypes, HoldoutTypes] = HoldoutTypes.holdout,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
backend: Optional[Backend] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
Expand Down
4 changes: 2 additions & 2 deletions autoPyTorch/api/tabular_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
HoldoutTypes,
)
from autoPyTorch.datasets.tabular_dataset import TabularDataset
from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline
Expand Down Expand Up @@ -64,7 +64,7 @@ def __init__(
delete_output_folder_after_terminate: bool = True,
include_components: Optional[Dict] = None,
exclude_components: Optional[Dict] = None,
resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy: Union[CrossValTypes, HoldoutTypes] = HoldoutTypes.holdout,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
backend: Optional[Backend] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
Expand Down
10 changes: 4 additions & 6 deletions autoPyTorch/datasets/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def __init__(
test_tensors: Optional[BaseDatasetInputType] = None,
resampling_strategy: Union[CrossValTypes, HoldoutTypes] = HoldoutTypes.holdout,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
shuffle: Optional[bool] = True,
seed: Optional[int] = 42,
train_transforms: Optional[torchvision.transforms.Compose] = None,
val_transforms: Optional[torchvision.transforms.Compose] = None,
Expand All @@ -89,10 +88,9 @@ def __init__(
resampling_strategy (Union[CrossValTypes, HoldoutTypes]),
(default=HoldoutTypes.holdout):
strategy to split the training data.
resampling_strategy_args (Optional[Dict[str, Any]]): arguments
required for the chosen resampling strategy. If None, uses
the default values provided in DEFAULT_RESAMPLING_PARAMETERS
in ```datasets/resampling_strategy.py```.
resampling_strategy_args (Optional[Dict[str, Any]]):
arguments required for the chosen resampling strategy.
The details are provided in autoPyTorch/datasets/resampling_strategy.py
shuffle: Whether to shuffle the data when performing splits
seed (int), (default=42): seed to be used for reproducibility.
train_transforms (Optional[torchvision.transforms.Compose]):
Expand All @@ -109,9 +107,9 @@ def __init__(
type_check(train_tensors, val_tensors)
self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
self.random_state = np.random.RandomState(seed=seed)
self.shuffle = shuffle
self.resampling_strategy = resampling_strategy
self.resampling_strategy_args = resampling_strategy_args
# resampling_strategy_args defaults to None, so read the optional keys from
# an empty mapping in that case instead of subscripting None.
_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else {}
# A missing key falls back to False, the same default that
# _ResamplingStrategyArgs in datasets/resampling_strategy.py declares.
self.shuffle = _strategy_args.get('shuffle', False)
# 'stratify' is a key of the args dict; resampling_strategy is a
# CrossValTypes/HoldoutTypes value, not a mapping, so it must not be queried here.
self.is_stratify = _strategy_args.get('stratify', False)

self.task_type: Optional[str] = None
Expand Down
18 changes: 8 additions & 10 deletions autoPyTorch/datasets/image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
HoldoutTypes,
)

IMAGE_DATASET_INPUT = Union[Dataset, Tuple[Union[np.ndarray, List[str]], np.ndarray]]
Expand All @@ -39,13 +39,12 @@ class ImageDataset(BaseDataset):
validation data
test (Union[Dataset, Tuple[Union[np.ndarray, List[str]], np.ndarray]]):
testing data
resampling_strategy (Union[CrossValTypes, HoldoutValTypes]),
(default=HoldoutValTypes.holdout_validation):
resampling_strategy (Union[CrossValTypes, HoldoutTypes]),
(default=HoldoutTypes.holdout):
strategy to split the training data.
resampling_strategy_args (Optional[Dict[str, Any]]): arguments
required for the chosen resampling strategy. If None, uses
the default values provided in DEFAULT_RESAMPLING_PARAMETERS
in ```datasets/resampling_strategy.py```.
resampling_strategy_args (Optional[Dict[str, Any]]):
arguments required for the chosen resampling strategy.
The details are provided in autoPyTorch/datasets/resampling_strategy.py
shuffle: Whether to shuffle the data before performing splits
seed (int), (default=42): seed to be used for reproducibility.
train_transforms (Optional[torchvision.transforms.Compose]):
Expand All @@ -57,9 +56,8 @@ def __init__(self,
train: IMAGE_DATASET_INPUT,
val: Optional[IMAGE_DATASET_INPUT] = None,
test: Optional[IMAGE_DATASET_INPUT] = None,
resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy: Union[CrossValTypes, HoldoutTypes] = HoldoutTypes.holdout,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
shuffle: Optional[bool] = True,
seed: Optional[int] = 42,
train_transforms: Optional[torchvision.transforms.Compose] = None,
val_transforms: Optional[torchvision.transforms.Compose] = None,
Expand All @@ -72,7 +70,7 @@ def __init__(self,
test = _create_image_dataset(data=test)
self.mean, self.std = _calc_mean_std(train=train)

super().__init__(train_tensors=train, val_tensors=val, test_tensors=test, shuffle=shuffle,
super().__init__(train_tensors=train, val_tensors=val, test_tensors=test,
resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args,
seed=seed,
train_transforms=train_transforms,
Expand Down
9 changes: 8 additions & 1 deletion autoPyTorch/datasets/resampling_strategy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from enum import Enum
from functools import partial
from typing import List, Optional, Tuple, Union
from typing import List, NamedTuple, Optional, Tuple, Union

import numpy as np

Expand All @@ -16,6 +16,13 @@
from torch.utils.data import Dataset


class _ResamplingStrategyArgs(NamedTuple):
val_share: float = 0.33
num_splits: int = 5
shuffle: bool = False
stratify: bool = False


class HoldoutFuncs():
@staticmethod
def holdout(
Expand Down
18 changes: 8 additions & 10 deletions autoPyTorch/datasets/tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
HoldoutTypes,
)


Expand All @@ -44,13 +44,12 @@ class TabularDataset(BaseDataset):
Y (Union[np.ndarray, pd.Series]): training data targets.
X_test (Optional[Union[np.ndarray, pd.DataFrame]]): input testing data.
Y_test (Optional[Union[np.ndarray, pd.DataFrame]]): testing data targets
resampling_strategy (Union[CrossValTypes, HoldoutValTypes]),
(default=HoldoutValTypes.holdout_validation):
resampling_strategy (Union[CrossValTypes, HoldoutTypes]),
(default=HoldoutTypes.holdout):
strategy to split the training data.
resampling_strategy_args (Optional[Dict[str, Any]]): arguments
required for the chosen resampling strategy. If None, uses
the default values provided in DEFAULT_RESAMPLING_PARAMETERS
in ```datasets/resampling_strategy.py```.
resampling_strategy_args (Optional[Dict[str, Any]]):
arguments required for the chosen resampling strategy.
The details are provided in autoPyTorch/datasets/resampling_strategy.py
shuffle: Whether to shuffle the data before performing splits
seed (int), (default=42): seed to be used for reproducibility.
train_transforms (Optional[torchvision.transforms.Compose]):
Expand All @@ -67,9 +66,8 @@ def __init__(self,
Y: Union[np.ndarray, pd.Series],
X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy: Union[CrossValTypes, HoldoutTypes] = HoldoutTypes.holdout,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
shuffle: Optional[bool] = True,
seed: Optional[int] = 42,
train_transforms: Optional[torchvision.transforms.Compose] = None,
val_transforms: Optional[torchvision.transforms.Compose] = None,
Expand All @@ -92,7 +90,7 @@ def __init__(self,
self.num_features = validator.feature_validator.num_features
self.categories = validator.feature_validator.categories

super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle,
super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test),
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
seed=seed, train_transforms=train_transforms,
Expand Down
9 changes: 5 additions & 4 deletions autoPyTorch/datasets/time_series_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def __init__(self,
val: Optional[TIME_SERIES_FORECASTING_INPUT] = None,
resampling_strategy: Union[CrossValTypes, HoldoutTypes] = HoldoutTypes.holdout,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
shuffle: Optional[bool] = False,
seed: Optional[int] = 42,
train_transforms: Optional[torchvision.transforms.Compose] = None,
val_transforms: Optional[torchvision.transforms.Compose] = None,
Expand Down Expand Up @@ -69,7 +68,7 @@ def __init__(self,
target_variables=target_variables,
sequence_length=sequence_length,
n_steps=n_steps)
super().__init__(train_tensors=train, val_tensors=val, shuffle=shuffle,
super().__init__(train_tensors=train, val_tensors=val,
resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args,
seed=seed,
train_transforms=train_transforms,
Expand Down Expand Up @@ -129,15 +128,17 @@ def __init__(self,
_check_time_series_inputs(train=train,
val=val,
task_type="time_series_classification")
super().__init__(train_tensors=train, val_tensors=val, shuffle=True)
resampling_strategy_args = {'shuffle': True}
super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args)


class TimeSeriesRegressionDataset(BaseDataset):
def __init__(self, train: Tuple[np.ndarray, np.ndarray], val: Optional[Tuple[np.ndarray, np.ndarray]] = None):
_check_time_series_inputs(train=train,
val=val,
task_type="time_series_regression")
super().__init__(train_tensors=train, val_tensors=val, shuffle=True)
resampling_strategy_args = {'shuffle': True}
super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args)


def _check_time_series_inputs(task_type: str,
Expand Down
9 changes: 3 additions & 6 deletions autoPyTorch/optimizer/smbo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
DEFAULT_RESAMPLING_PARAMETERS,
HoldoutValTypes,
HoldoutTypes,
)
from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
Expand Down Expand Up @@ -93,7 +92,7 @@ def __init__(self,
pipeline_config: typing.Dict[str, typing.Any],
start_num_run: int = 1,
seed: int = 1,
resampling_strategy: typing.Union[HoldoutValTypes, CrossValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy: typing.Union[HoldoutTypes, CrossValTypes] = HoldoutTypes.holdout,
resampling_strategy_args: typing.Optional[typing.Dict[str, typing.Any]] = None,
include: typing.Optional[typing.Dict[str, typing.Any]] = None,
exclude: typing.Optional[typing.Dict[str, typing.Any]] = None,
Expand Down Expand Up @@ -173,9 +172,7 @@ def __init__(self,

# Evaluation
self.resampling_strategy = resampling_strategy
if resampling_strategy_args is None:
resampling_strategy_args = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy]
self.resampling_strategy_args = resampling_strategy_args
# Keep the caller's arguments when they are given; only a missing (None)
# value is replaced by an empty dict. The previous condition was inverted:
# it stored None when nothing was passed and discarded any dict the
# caller supplied.
self.resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else {}

# and a bunch of useful limits
self.worst_possible_result = get_cost_of_crash(self.metric)
Expand Down
14 changes: 7 additions & 7 deletions examples/tabular/40_advanced/example_resampling_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import sklearn.model_selection

from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutTypes


if __name__ == '__main__':
Expand All @@ -48,11 +48,11 @@
# To maintain logs of the run, set the next two as False
delete_tmp_folder_after_terminate=True,
delete_output_folder_after_terminate=True,
# 'HoldoutValTypes.holdout_validation' with 'val_share': 0.33
# 'HoldoutTypes.holdout' with 'val_share': 0.33
# is the default argument setting for TabularClassificationTask.
# It is explicitly specified in this example for demonstrational
# purpose.
resampling_strategy=HoldoutValTypes.holdout_validation,
resampling_strategy=HoldoutTypes.holdout,
resampling_strategy_args={'val_share': 0.33}
)

Expand Down Expand Up @@ -90,7 +90,7 @@
# To maintain logs of the run, set the next two as False
delete_tmp_folder_after_terminate=True,
delete_output_folder_after_terminate=True,
resampling_strategy=CrossValTypes.k_fold_cross_validation,
resampling_strategy=CrossValTypes.k_fold,
resampling_strategy_args={'num_splits': 3}
)

Expand Down Expand Up @@ -130,9 +130,9 @@
delete_output_folder_after_terminate=True,
# For demonstration purposes, we use
# Stratified hold out validation. However,
# one can also use CrossValTypes.stratified_k_fold_cross_validation.
resampling_strategy=HoldoutValTypes.stratified_holdout_validation,
resampling_strategy_args={'val_share': 0.33}
# one can also use CrossValTypes.k_fold.
resampling_strategy=HoldoutTypes.holdout,
resampling_strategy_args={'val_share': 0.33, 'stratify': True}
)

############################################################################
Expand Down
Loading

0 comments on commit 1e00413

Please sign in to comment.