From b8037703d82172c7c485a0dcb3e9d514718c5874 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Mon, 10 May 2021 12:29:49 +0900 Subject: [PATCH 01/12] [refactor] Update the split functions to be able to call function directly --- autoPyTorch/datasets/resampling_strategy.py | 330 ++++++++++---------- 1 file changed, 159 insertions(+), 171 deletions(-) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index a1e599dd6..f6e6ae570 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -1,5 +1,6 @@ -from enum import IntEnum -from typing import Any, Dict, List, Optional, Tuple, Union +from enum import Enum +from functools import partial +from typing import List, NamedTuple, Optional, Tuple, Union import numpy as np @@ -12,187 +13,69 @@ train_test_split ) -from typing_extensions import Protocol +from torch.utils.data import Dataset -# Use callback protocol as workaround, since callable with function fields count 'self' as argument -class CrossValFunc(Protocol): - def __call__(self, - random_state: np.random.RandomState, - num_splits: int, - indices: np.ndarray, - stratify: Optional[Any]) -> List[Tuple[np.ndarray, np.ndarray]]: - ... +class _ResamplingStrategyArgs(NamedTuple): + val_share: float = 0.33 + num_splits: int = 5 + shuffle: bool = False + stratify: bool = False -class HoldOutFunc(Protocol): - def __call__(self, random_state: np.random.RandomState, val_share: float, - indices: np.ndarray, stratify: Optional[Any] - ) -> Tuple[np.ndarray, np.ndarray]: - ... - - -class CrossValTypes(IntEnum): - """The type of cross validation - - This class is used to specify the cross validation function - and is not supposed to be instantiated. 
- - Examples: This class is supposed to be used as follows - >>> cv_type = CrossValTypes.k_fold_cross_validation - >>> print(cv_type.name) - - k_fold_cross_validation - - >>> for cross_val_type in CrossValTypes: - print(cross_val_type.name, cross_val_type.value) - - stratified_k_fold_cross_validation 1 - k_fold_cross_validation 2 - stratified_shuffle_split_cross_validation 3 - shuffle_split_cross_validation 4 - time_series_cross_validation 5 - """ - stratified_k_fold_cross_validation = 1 - k_fold_cross_validation = 2 - stratified_shuffle_split_cross_validation = 3 - shuffle_split_cross_validation = 4 - time_series_cross_validation = 5 - - def is_stratified(self) -> bool: - stratified = [self.stratified_k_fold_cross_validation, - self.stratified_shuffle_split_cross_validation] - return getattr(self, self.name) in stratified - - -class HoldoutValTypes(IntEnum): - """TODO: change to enum using functools.partial""" - """The type of hold out validation (refer to CrossValTypes' doc-string)""" - holdout_validation = 6 - stratified_holdout_validation = 7 - - def is_stratified(self) -> bool: - stratified = [self.stratified_holdout_validation] - return getattr(self, self.name) in stratified - - -# TODO: replace it with another way -RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes] - -DEFAULT_RESAMPLING_PARAMETERS = { - HoldoutValTypes.holdout_validation: { - 'val_share': 0.33, - }, - HoldoutValTypes.stratified_holdout_validation: { - 'val_share': 0.33, - }, - CrossValTypes.k_fold_cross_validation: { - 'num_splits': 5, - }, - CrossValTypes.stratified_k_fold_cross_validation: { - 'num_splits': 5, - }, - CrossValTypes.shuffle_split_cross_validation: { - 'num_splits': 5, - }, - CrossValTypes.time_series_cross_validation: { - 'num_splits': 5, - }, -} # type: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]] - - -class HoldOutFuncs(): +class HoldoutFuncs(): @staticmethod - def holdout_validation(random_state: np.random.RandomState, - val_share: float, - indices: 
np.ndarray, - **kwargs: Any - ) -> Tuple[np.ndarray, np.ndarray]: - shuffle = kwargs.get('shuffle', True) - train, val = train_test_split(indices, test_size=val_share, - shuffle=shuffle, - random_state=random_state if shuffle else None, - ) + def holdout_validation( + random_state: np.random.RandomState, + val_share: float, + indices: np.ndarray, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ): + + train, val = train_test_split( + indices, test_size=val_share, shuffle=shuffle, + random_state=random_state if shuffle else None, + stratify=labels_to_stratify + ) return train, val - @staticmethod - def stratified_holdout_validation(random_state: np.random.RandomState, - val_share: float, - indices: np.ndarray, - **kwargs: Any - ) -> Tuple[np.ndarray, np.ndarray]: - train, val = train_test_split(indices, test_size=val_share, shuffle=True, stratify=kwargs["stratify"], - random_state=random_state) - return train, val - - @classmethod - def get_holdout_validators(cls, *holdout_val_types: HoldoutValTypes) -> Dict[str, HoldOutFunc]: - - holdout_validators = { - holdout_val_type.name: getattr(cls, holdout_val_type.name) - for holdout_val_type in holdout_val_types - } - return holdout_validators - class CrossValFuncs(): - @staticmethod - def shuffle_split_cross_validation(random_state: np.random.RandomState, - num_splits: int, - indices: np.ndarray, - **kwargs: Any - ) -> List[Tuple[np.ndarray, np.ndarray]]: - cv = ShuffleSplit(n_splits=num_splits, random_state=random_state) - splits = list(cv.split(indices)) - return splits - - @staticmethod - def stratified_shuffle_split_cross_validation(random_state: np.random.RandomState, - num_splits: int, - indices: np.ndarray, - **kwargs: Any - ) -> List[Tuple[np.ndarray, np.ndarray]]: - cv = StratifiedShuffleSplit(n_splits=num_splits, random_state=random_state) - splits = list(cv.split(indices, kwargs["stratify"])) - return splits - - @staticmethod - def 
stratified_k_fold_cross_validation(random_state: np.random.RandomState, - num_splits: int, - indices: np.ndarray, - **kwargs: Any - ) -> List[Tuple[np.ndarray, np.ndarray]]: - cv = StratifiedKFold(n_splits=num_splits, random_state=random_state) - splits = list(cv.split(indices, kwargs["stratify"])) - return splits + # (shuffle, is_stratify) -> split_fn + _args2split_fn = { + (True, True): StratifiedShuffleSplit, + (True, False): ShuffleSplit, + (False, True): StratifiedKFold, + (False, False): KFold, + } @staticmethod - def k_fold_cross_validation(random_state: np.random.RandomState, - num_splits: int, - indices: np.ndarray, - **kwargs: Any - ) -> List[Tuple[np.ndarray, np.ndarray]]: + def k_fold_cross_validation( + random_state: np.random.RandomState, + num_splits: int, + indices: np.ndarray, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ) -> List[Tuple[np.ndarray, np.ndarray]]: """ - Standard k fold cross validation. - - Args: - indices (np.ndarray): array of indices to be split - num_splits (int): number of cross validation splits - Returns: splits (List[Tuple[List, List]]): list of tuples of training and validation indices """ - shuffle = kwargs.get('shuffle', True) - cv = KFold(n_splits=num_splits, random_state=random_state if shuffle else None, shuffle=shuffle) + + split_fn = CrossValFuncs._args2split_fn[(shuffle, labels_to_stratify is not None)] + cv = split_fn(n_splits=num_splits, random_state=random_state) splits = list(cv.split(indices)) return splits @staticmethod - def time_series_cross_validation(random_state: np.random.RandomState, - num_splits: int, - indices: np.ndarray, - **kwargs: Any - ) -> List[Tuple[np.ndarray, np.ndarray]]: + def time_series( + random_state: np.random.RandomState, + num_splits: int, + indices: np.ndarray, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ) -> List[Tuple[np.ndarray, np.ndarray]]: """ 
Returns train and validation indices respecting the temporal ordering of the data. @@ -215,10 +98,115 @@ def time_series_cross_validation(random_state: np.random.RandomState, splits = list(cv.split(indices)) return splits - @classmethod - def get_cross_validators(cls, *cross_val_types: CrossValTypes) -> Dict[str, CrossValFunc]: - cross_validators = { - cross_val_type.name: getattr(cls, cross_val_type.name) - for cross_val_type in cross_val_types - } - return cross_validators + +class CrossValTypes(Enum): + """The type of cross validation + + This class is used to specify the cross validation function + and is not supposed to be instantiated. + + Examples: This class is supposed to be used as follows + >>> cv_type = CrossValTypes.k_fold_cross_validation + >>> print(cv_type.name) + + k_fold_cross_validation + + >>> for cross_val_type in CrossValTypes: + print(cross_val_type.name, cross_val_type.value) + + k_fold_cross_validation functools.partial() + time_series + """ + k_fold_cross_validation = partial(CrossValFuncs.k_fold_cross_validation) + time_series = partial(CrossValFuncs.time_series) + + def __call__( + self, + random_state: np.random.RandomState, + indices: np.ndarray, + num_splits: int = 5, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ) -> List[Tuple[np.ndarray, np.ndarray]]: + """ + This function allows calling and type-checking the specified function. + + Args: + random_state (np.random.RandomState): random number generator for reproducibility + num_splits (int): The number of splits in cross validation + indices (np.ndarray): The indices of data points in a dataset + shuffle (bool): Whether to shuffle the indices or not + labels_to_stratify (Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]]): + The labels of the corresponding data points. It is used for the stratification. 
+ + Returns: + splits (List[Tuple[np.ndarray, np.ndarray]]): + splits[a split identifier][0: train, 1: val][a data point identifier] + + """ + return self.value( + random_state=random_state, + num_splits=num_splits, + indices=indices, + shuffle=shuffle, + labels_to_stratify=labels_to_stratify + ) + + +class HoldoutValTypes(Enum): + """The type of holdout validation + + This class is used to specify the holdout validation function + and is not supposed to be instantiated. + + Examples: This class is supposed to be used as follows + >>> holdout_type = HoldoutValTypes.holdout_validation + >>> print(holdout_type.name) + + holdout_validation + + >>> print(holdout_type.value) + + functools.partial() + + >>> for holdout_type in HoldoutValTypes: + print(holdout_type.name) + + holdout_validation + + Additionally, each HoldoutValTypes member can be called directly. + """ + + holdout = partial(HoldoutFuncs.holdout_validation) + + def __call__( + self, + random_state: np.random.RandomState, + indices: np.ndarray, + val_share: float = 0.33, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ) -> List[Tuple[np.ndarray, np.ndarray]]: + """ + This function allows calling and type-checking the specified function. + + Args: + random_state (np.random.RandomState): random number generator for reproducibility + val_share (float): The ratio of validation dataset vs the given dataset + indices (np.ndarray): The indices of data points in a dataset + shuffle (bool): Whether to shuffle the indices or not + labels_to_stratify (Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]]): + The labels of the corresponding data points. It is used for the stratification. 
+ + Returns: + splits (List[Tuple[np.ndarray, np.ndarray]]): + splits[a split identifier][0: train, 1: val][a data point identifier] + + """ + return self.value( + random_state=random_state, + val_share=val_share, + indices=indices, + shuffle=shuffle, + labels_to_stratify=labels_to_stratify + ) From 4d901f9b6c2e8bcbefa998a90c097f35f8ab7be7 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Mon, 10 May 2021 12:37:16 +0900 Subject: [PATCH 02/12] [feat] Deprecate shuffle inside BaseDataset and enable only in split funcs --- autoPyTorch/datasets/base_dataset.py | 165 ++++++------------ autoPyTorch/datasets/image_dataset.py | 10 +- autoPyTorch/datasets/tabular_dataset.py | 10 +- autoPyTorch/datasets/time_series_dataset.py | 55 +++--- autoPyTorch/optimizer/smbo.py | 4 +- .../example_resampling_strategy.py | 6 +- 6 files changed, 92 insertions(+), 158 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 9955e706f..de2ede902 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -1,7 +1,7 @@ import os import uuid from abc import ABCMeta -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import numpy as np @@ -14,15 +14,7 @@ import torchvision from autoPyTorch.constants import CLASSIFICATION_OUTPUTS, STRING_TO_OUTPUT_TYPES -from autoPyTorch.datasets.resampling_strategy import ( - CrossValFunc, - CrossValFuncs, - CrossValTypes, - DEFAULT_RESAMPLING_PARAMETERS, - HoldOutFunc, - HoldOutFuncs, - HoldoutValTypes -) +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes from autoPyTorch.utils.common import FitRequirement BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset] @@ -79,7 +71,6 @@ def __init__( test_tensors: Optional[BaseDatasetInputType] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = 
HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -97,11 +88,10 @@ def __init__( resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses - the default values provided in DEFAULT_RESAMPLING_PARAMETERS - in ```datasets/resampling_strategy.py```. - shuffle: Whether to shuffle the data before performing splits + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. + The details are provided in autoPytorch/datasets/resampling_strategy.py + shuffle: Whether to shuffle the data when performing splits seed (int), (default=1): seed to be used for reproducibility. 
train_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the training data @@ -116,12 +106,12 @@ def __init__( if not hasattr(train_tensors[0], 'shape'): type_check(train_tensors, val_tensors) self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors - self.cross_validators: Dict[str, CrossValFunc] = {} - self.holdout_validators: Dict[str, HoldOutFunc] = {} self.random_state = np.random.RandomState(seed=seed) - self.shuffle = shuffle self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args + self.shuffle = self.resampling_strategy_args['shuffle'] + self.is_stratify = self.resampling_strategy.get('stratify', False) + self.task_type: Optional[str] = None self.issparse: bool = issparse(self.train_tensors[0]) self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:] @@ -137,9 +127,6 @@ def __init__( # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = True - # Make sure cross validation splits are created once - self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes) - self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes) self.splits = self.get_splits_from_resampling_strategy() # We also need to be able to transform the data, be it for pre-processing @@ -205,7 +192,30 @@ def __len__(self) -> int: return self.train_tensors[0].shape[0] def _get_indices(self) -> np.ndarray: - return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self)) + return np.arange(len(self)) + + def _process_resampling_strategy_args(self) -> None: + if not any(isinstance(self.resampling_strategy, val_type) + for val_type in [HoldoutValTypes, CrossValTypes]): + raise ValueError(f"resampling_strategy {self.resampling_strategy} is not supported.") + + if self.resampling_strategy_args is not None and \ + not isinstance(self.resampling_strategy_args, dict): + 
+ raise TypeError("resampling_strategy_args must be dict or None," + f" but got {type(self.resampling_strategy_args)}") + + val_share = self.resampling_strategy_args.get('val_share', None) + num_splits = self.resampling_strategy_args.get('num_splits', None) + + if val_share is not None and (val_share < 0 or val_share > 1): + raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") + + if num_splits is not None: + if num_splits <= 0: + raise ValueError(f"`num_splits` must be a positive integer, got {num_splits}.") + elif not isinstance(num_splits, int): + raise ValueError(f"`num_splits` must be an integer, got {num_splits}.") def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: """ @@ -214,100 +224,33 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] Returns (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format """ - splits = [] + # check if the requirements are met and if we can get splits + self._process_resampling_strategy_args() + + labels_to_stratify = self.train_tensors[-1] if self.is_stratify else None + if isinstance(self.resampling_strategy, HoldoutValTypes): - val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( - 'val_share', None) - if self.resampling_strategy_args is not None: - val_share = self.resampling_strategy_args.get('val_share', val_share) - splits.append( - self.create_holdout_val_split( - holdout_val_type=self.resampling_strategy, - val_share=val_share, - ) + val_share = self.resampling_strategy_args['val_share'] + + return self.resampling_strategy( + random_state=self.random_state, + val_share=val_share, + shuffle=self.shuffle, + indices=self._get_indices(), + labels_to_stratify=labels_to_stratify ) elif isinstance(self.resampling_strategy, CrossValTypes): - num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( - 'num_splits', None) - if self.resampling_strategy_args is not None: - 
num_splits = self.resampling_strategy_args.get('num_splits', num_splits) - # Create the split if it was not created before - splits.extend( - self.create_cross_val_splits( - cross_val_type=self.resampling_strategy, - num_splits=cast(int, num_splits), - ) + num_splits = self.resampling_strategy_args['num_splits'] + + return self.create_cross_val_splits( + random_state=self.random_state, + num_splits=int(num_splits), + shuffle=self.shuffle, + indices=self._get_indices(), + labels_to_stratify=labels_to_stratify ) else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") - return splits - - def create_cross_val_splits( - self, - cross_val_type: CrossValTypes, - num_splits: int - ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: - """ - This function creates the cross validation split for the given task. - - It is done once per dataset to have comparable results among pipelines - Args: - cross_val_type (CrossValTypes): - num_splits (int): number of splits to be created - - Returns: - (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]): - list containing 'num_splits' splits. - """ - # Create just the split once - # This is gonna be called multiple times, because the current dataset - # is being used for multiple pipelines. That is, to be efficient with memory - # we dump the dataset to memory and read it on a need basis. 
So this function - # should be robust against multiple calls, and it does so by remembering the splits - if not isinstance(cross_val_type, CrossValTypes): - raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') - kwargs = {} - if cross_val_type.is_stratified(): - # we need additional information about the data for stratification - kwargs["stratify"] = self.train_tensors[-1] - splits = self.cross_validators[cross_val_type.name]( - self.random_state, num_splits, self._get_indices(), **kwargs) - return splits - - def create_holdout_val_split( - self, - holdout_val_type: HoldoutValTypes, - val_share: float, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - This function creates the holdout split for the given task. - - It is done once per dataset to have comparable results among pipelines - Args: - holdout_val_type (HoldoutValTypes): - val_share (float): share of the validation data - - Returns: - (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) - """ - if holdout_val_type is None: - raise ValueError( - '`val_share` specified, but `holdout_val_type` not specified.' 
- ) - if self.val_tensors is not None: - raise ValueError( - '`val_share` specified, but the Dataset was a given a pre-defined split at initialization already.') - if val_share < 0 or val_share > 1: - raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") - if not isinstance(holdout_val_type, HoldoutValTypes): - raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') - kwargs = {} - if holdout_val_type.is_stratified(): - # we need additional information about the data for stratification - kwargs["stratify"] = self.train_tensors[-1] - train, val = self.holdout_validators[holdout_val_type.name]( - self.random_state, val_share, self._get_indices(), **kwargs) - return train, val def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]: """ diff --git a/autoPyTorch/datasets/image_dataset.py b/autoPyTorch/datasets/image_dataset.py index 4664dbaf5..5abeb6d8a 100644 --- a/autoPyTorch/datasets/image_dataset.py +++ b/autoPyTorch/datasets/image_dataset.py @@ -42,10 +42,9 @@ class ImageDataset(BaseDataset): resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses - the default values provided in DEFAULT_RESAMPLING_PARAMETERS - in ```datasets/resampling_strategy.py```. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. + The details are provided in autoPytorch/datasets/resampling_strategy.py shuffle: Whether to shuffle the data before performing splits seed (int), (default=1): seed to be used for reproducibility. 
train_transforms (Optional[torchvision.transforms.Compose]): @@ -59,7 +58,6 @@ def __init__(self, test: Optional[IMAGE_DATASET_INPUT] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -72,7 +70,7 @@ def __init__(self, test = _create_image_dataset(data=test) self.mean, self.std = _calc_mean_std(train=train) - super().__init__(train_tensors=train, val_tensors=val, test_tensors=test, shuffle=shuffle, + super().__init__(train_tensors=train, val_tensors=val, test_tensors=test, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, train_transforms=train_transforms, diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index 19e483612..1e2b677e5 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -47,10 +47,9 @@ class TabularDataset(BaseDataset): resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses - the default values provided in DEFAULT_RESAMPLING_PARAMETERS - in ```datasets/resampling_strategy.py```. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. + The details are provided in autoPytorch/datasets/resampling_strategy.py shuffle: Whether to shuffle the data before performing splits seed (int), (default=1): seed to be used for reproducibility. 
train_transforms (Optional[torchvision.transforms.Compose]): @@ -69,7 +68,6 @@ def __init__(self, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -92,7 +90,7 @@ def __init__(self, self.num_features = validator.feature_validator.num_features self.categories = validator.feature_validator.categories - super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle, + super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, train_transforms=train_transforms, diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index edd07a80e..d5a21c550 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -5,18 +5,33 @@ import torchvision.transforms from autoPyTorch.datasets.base_dataset import BaseDataset -from autoPyTorch.datasets.resampling_strategy import ( - CrossValFuncs, - CrossValTypes, - HoldOutFuncs, - HoldoutValTypes -) +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] +def _check_prohibited_resampling() -> None: + """Check if resampling strategy is suitable for a given task + + Args: + task_name (str): Typically the Dataset class name + resampling_strategy (Union[CrossValTypes, HoldoutValTypes]): + The splitting 
function + args (Union[CrossValTypes, HoldoutValTypes]): + The list of cross validation functions and + holdout validation functions that are suitable for the given task + + Returns: + None + + TODO: Especially, reject shuffle splits + """ + + pass + + class TimeSeriesForecastingDataset(BaseDataset): def __init__(self, target_variables: Tuple[int], @@ -26,7 +41,6 @@ def __init__(self, val: Optional[TIME_SERIES_FORECASTING_INPUT] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: Optional[bool] = False, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -54,14 +68,12 @@ def __init__(self, target_variables=target_variables, sequence_length=sequence_length, n_steps=n_steps) - super().__init__(train_tensors=train, val_tensors=val, shuffle=shuffle, + super().__init__(train_tensors=train, val_tensors=val, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, train_transforms=train_transforms, val_transforms=val_transforms, ) - self.cross_validators = CrossValFuncs.get_cross_validators(CrossValTypes.time_series_cross_validation) - self.holdout_validators = HoldOutFuncs.get_holdout_validators(HoldoutValTypes.holdout_validation) def _check_time_series_forecasting_inputs(target_variables: Tuple[int], @@ -116,17 +128,8 @@ def __init__(self, _check_time_series_inputs(train=train, val=val, task_type="time_series_classification") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = CrossValFuncs.get_cross_validators( - CrossValTypes.stratified_k_fold_cross_validation, - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation, - CrossValTypes.stratified_shuffle_split_cross_validation - ) - self.holdout_validators = 
HoldOutFuncs.get_holdout_validators( - HoldoutValTypes.holdout_validation, - HoldoutValTypes.stratified_holdout_validation - ) + resampling_strategy_args = {'shuffle': True} + super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args) class TimeSeriesRegressionDataset(BaseDataset): @@ -134,14 +137,8 @@ def __init__(self, train: Tuple[np.ndarray, np.ndarray], val: Optional[Tuple[np. _check_time_series_inputs(train=train, val=val, task_type="time_series_regression") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = CrossValFuncs.get_cross_validators( - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation - ) - self.holdout_validators = HoldOutFuncs.get_holdout_validators( - HoldoutValTypes.holdout_validation - ) + resampling_strategy_args = {'shuffle': True} + super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args) def _check_time_series_inputs(task_type: str, diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index ddd6e95a1..45e9546e4 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -173,9 +173,7 @@ def __init__(self, # Evaluation self.resampling_strategy = resampling_strategy - if resampling_strategy_args is None: - resampling_strategy_args = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy] - self.resampling_strategy_args = resampling_strategy_args + self.resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else {} # and a bunch of useful limits self.worst_possible_result = get_cost_of_crash(self.metric) diff --git a/examples/40_advanced/example_resampling_strategy.py b/examples/40_advanced/example_resampling_strategy.py index 270f518c8..e0b1ec77a 100644 --- a/examples/40_advanced/example_resampling_strategy.py +++ b/examples/40_advanced/example_resampling_strategy.py @@ -115,9 +115,9 @@ api = 
TabularClassificationTask( # For demonstration purposes, we use # Stratified hold out validation. However, - # one can also use CrossValTypes.stratified_k_fold_cross_validation. - resampling_strategy=HoldoutValTypes.stratified_holdout_validation, - resampling_strategy_args={'val_share': 0.33} + # one can also use CrossValTypes.k_fold_cross_validation. + resampling_strategy=HoldoutValTypes.holdout_validation, + resampling_strategy_args={'val_share': 0.33, 'stratify': True} ) ############################################################################ From 6c31f6144ffcbda0e4f3f1a4121175d0ade7386c Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Mon, 10 May 2021 12:49:13 +0900 Subject: [PATCH 03/12] [fix] Fix flake8 and mypy issues --- autoPyTorch/datasets/base_dataset.py | 6 +++--- autoPyTorch/datasets/image_dataset.py | 2 +- autoPyTorch/datasets/resampling_strategy.py | 6 +++--- autoPyTorch/datasets/tabular_dataset.py | 2 +- autoPyTorch/optimizer/smbo.py | 1 - 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index de2ede902..69ac74663 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -108,9 +108,9 @@ def __init__( self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors self.random_state = np.random.RandomState(seed=seed) self.resampling_strategy = resampling_strategy - self.resampling_strategy_args = resampling_strategy_args - self.shuffle = self.resampling_strategy_args['shuffle'] - self.is_stratify = self.resampling_strategy.get('stratify', False) + self.resampling_strategy_args = resampling_strategy_args if resampling_strategy is not None else {} + self.shuffle = self.resampling_strategy_args.get('shuffle', False) + self.is_stratify = self.resampling_strategy_args.get('stratify', False) self.task_type: Optional[str] = None self.issparse: bool = issparse(self.train_tensors[0]) diff --git 
a/autoPyTorch/datasets/image_dataset.py b/autoPyTorch/datasets/image_dataset.py index 5abeb6d8a..96898fc4f 100644 --- a/autoPyTorch/datasets/image_dataset.py +++ b/autoPyTorch/datasets/image_dataset.py @@ -45,7 +45,7 @@ class ImageDataset(BaseDataset): resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. The details are provided in autoPytorch/datasets/resampling_strategy.py - shuffle: Whether to shuffle the data before performing splits + shuffle: Whether to shuffle the data when performing splits seed (int), (default=1): seed to be used for reproducibility. train_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the training data diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index f6e6ae570..85553fe61 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -31,14 +31,14 @@ def holdout_validation( indices: np.ndarray, shuffle: bool = False, labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None - ): + ) -> List[Tuple[np.ndarray, np.ndarray]]: train, val = train_test_split( indices, test_size=val_share, shuffle=shuffle, random_state=random_state if shuffle else None, stratify=labels_to_stratify ) - return train, val + return [train, val] class CrossValFuncs(): @@ -177,7 +177,7 @@ class HoldoutValTypes(Enum): Additionally, HoldoutValTypes. can be called directly. 
""" - holdout = partial(HoldoutFuncs.holdout_validation) + holdout_validation = partial(HoldoutFuncs.holdout_validation) def __call__( self, diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index 1e2b677e5..83d733280 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -50,7 +50,7 @@ class TabularDataset(BaseDataset): resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. The details are provided in autoPytorch/datasets/resampling_strategy.py - shuffle: Whether to shuffle the data before performing splits + shuffle: Whether to shuffle the data when performing splits seed (int), (default=1): seed to be used for reproducibility. train_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the training data. diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 45e9546e4..e1e070228 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -20,7 +20,6 @@ from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, - DEFAULT_RESAMPLING_PARAMETERS, HoldoutValTypes, ) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager From b7d35314a8a8af2973188021c39c8c70aac18a00 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Mon, 10 May 2021 12:55:41 +0900 Subject: [PATCH 04/12] [fix] Fix mypy issues --- autoPyTorch/datasets/base_dataset.py | 7 +++++-- autoPyTorch/datasets/time_series_dataset.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 69ac74663..f29d1cf33 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -108,7 +108,10 @@ def __init__( self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, 
val_tensors, test_tensors self.random_state = np.random.RandomState(seed=seed) self.resampling_strategy = resampling_strategy - self.resampling_strategy_args = resampling_strategy_args if resampling_strategy is not None else {} + self.resampling_strategy_args: Dict[str, Any] = {} + if resampling_strategy_args is not None: + self.resampling_strategy_args = resampling_strategy_args + self.shuffle = self.resampling_strategy_args.get('shuffle', False) self.is_stratify = self.resampling_strategy_args.get('stratify', False) @@ -242,7 +245,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = self.resampling_strategy_args['num_splits'] - return self.create_cross_val_splits( + return self.resampling_strategy( random_state=self.random_state, num_splits=int(num_splits), shuffle=self.shuffle, diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index d5a21c550..f7de96025 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -107,8 +107,8 @@ def _prepare_time_series_forecasting_tensor(tensor: TIME_SERIES_FORECASTING_INPU population_size, time_series_length, num_features = tensor[0].shape num_targets = len(target_variables) num_datapoints = time_series_length - sequence_length - n_steps + 1 - x_tensor = np.zeros((num_datapoints, population_size, sequence_length, num_features), dtype=np.float) - y_tensor = np.zeros((num_datapoints, population_size, num_targets), dtype=np.float) + x_tensor = np.zeros((num_datapoints, population_size, sequence_length, num_features), dtype=np.float64) + y_tensor = np.zeros((num_datapoints, population_size, num_targets), dtype=np.float64) for p in range(population_size): for i in range(num_datapoints): From eee3b1c49a823f6242b9c624e7e87fcab1f2dba0 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Mon, 10 May 2021 16:42:20 +0900 Subject: [PATCH 
05/12] [fix] Fix mypy issues and modify the test accordingly Since the previous codes had the default shuffle = True and the indices shuffle before splitting, the test cases for CV and Holdout did not match. More specifically, when I bring back the followings, I could reproduce the original outputs: 1. Bring back _get_indices in BaseDataset 2. Make the default value of self.shuffle in BaseDataset True 3. Input shuffle = True in KFold instead of using ShuffleSplit These reproduce the original outputs. Note that KFold(shuffle=True) and ShuffleSplit() are not identical and even when we input the same random_state, the results do not reproduce. --- autoPyTorch/datasets/base_dataset.py | 6 +-- autoPyTorch/datasets/resampling_strategy.py | 47 ++++++++++---------- test/test_evaluation/test_train_evaluator.py | 4 +- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index f29d1cf33..16e8b8a9d 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -233,7 +233,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] labels_to_stratify = self.train_tensors[-1] if self.is_stratify else None if isinstance(self.resampling_strategy, HoldoutValTypes): - val_share = self.resampling_strategy_args['val_share'] + val_share = self.resampling_strategy_args.get('val_share', None) return self.resampling_strategy( random_state=self.random_state, @@ -243,11 +243,11 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] labels_to_stratify=labels_to_stratify ) elif isinstance(self.resampling_strategy, CrossValTypes): - num_splits = self.resampling_strategy_args['num_splits'] + num_splits = self.resampling_strategy_args.get('num_splits', None) return self.resampling_strategy( random_state=self.random_state, - num_splits=int(num_splits), + num_splits=num_splits, shuffle=self.shuffle, 
indices=self._get_indices(), labels_to_stratify=labels_to_stratify diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 85553fe61..64c502475 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -26,19 +26,19 @@ class _ResamplingStrategyArgs(NamedTuple): class HoldoutFuncs(): @staticmethod def holdout_validation( - random_state: np.random.RandomState, - val_share: float, indices: np.ndarray, + random_state: Optional[np.random.RandomState] = None, + val_share: Optional[float] = None, shuffle: bool = False, labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None ) -> List[Tuple[np.ndarray, np.ndarray]]: train, val = train_test_split( - indices, test_size=val_share, shuffle=shuffle, - random_state=random_state if shuffle else None, + indices, test_size=val_share, + shuffle=shuffle, random_state=random_state, stratify=labels_to_stratify ) - return [train, val] + return [(train, val)] class CrossValFuncs(): @@ -52,9 +52,9 @@ class CrossValFuncs(): @staticmethod def k_fold_cross_validation( - random_state: np.random.RandomState, - num_splits: int, indices: np.ndarray, + random_state: Optional[np.random.RandomState] = None, + num_splits: Optional[int] = None, shuffle: bool = False, labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None ) -> List[Tuple[np.ndarray, np.ndarray]]: @@ -70,22 +70,15 @@ def k_fold_cross_validation( @staticmethod def time_series( - random_state: np.random.RandomState, - num_splits: int, indices: np.ndarray, + random_state: Optional[np.random.RandomState] = None, + num_splits: Optional[int] = None, shuffle: bool = False, labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None ) -> List[Tuple[np.ndarray, np.ndarray]]: """ Returns train and validation indices respecting the temporal ordering of the data. 
- Args: - indices (np.ndarray): array of indices to be split - num_splits (int): number of cross validation splits - - Returns: - splits (List[Tuple[List, List]]): list of tuples of training and validation indices - Examples: >>> indices = np.array([0, 1, 2, 3]) >>> CrossValFuncs.time_series_cross_validation(3, indices) @@ -94,7 +87,7 @@ def time_series( ([0, 1, 2], [3])] """ - cv = TimeSeriesSplit(n_splits=num_splits, random_state=random_state) + cv = TimeSeriesSplit(n_splits=num_splits) splits = list(cv.split(indices)) return splits @@ -122,9 +115,9 @@ class CrossValTypes(Enum): def __call__( self, - random_state: np.random.RandomState, indices: np.ndarray, - num_splits: int = 5, + random_state: Optional[np.random.RandomState] = None, + num_splits: Optional[int] = None, shuffle: bool = False, labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None ) -> List[Tuple[np.ndarray, np.ndarray]]: @@ -144,8 +137,12 @@ def __call__( splits[a split identifier][0: train, 1: val][a data point identifier] """ + + default_num_splits = _ResamplingStrategyArgs().num_splits + num_splits = num_splits if num_splits is not None else default_num_splits + return self.value( - random_state=random_state, + random_state=random_state if shuffle else None, num_splits=num_splits, indices=indices, shuffle=shuffle, @@ -181,9 +178,9 @@ class HoldoutValTypes(Enum): def __call__( self, - random_state: np.random.RandomState, indices: np.ndarray, - val_share: float = 0.33, + random_state: Optional[np.random.RandomState] = None, + val_share: Optional[float] = None, shuffle: bool = False, labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None ) -> List[Tuple[np.ndarray, np.ndarray]]: @@ -203,8 +200,12 @@ def __call__( splits[a split identifier][0: train, 1: val][a data point identifier] """ + + default_val_share = _ResamplingStrategyArgs().val_share + val_share = val_share if val_share is not None else default_val_share + return self.value( - 
random_state=random_state, + random_state=random_state if shuffle else None, val_share=val_share, indices=indices, shuffle=shuffle, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index ae35c097b..2282d0954 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -112,7 +112,7 @@ def test_holdout(self, pipeline_mock): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) - self.assertEqual(result, 0.5652173913043479) + self.assertEqual(result, 0.30434782608695654) self.assertEqual(pipeline_mock.fit.call_count, 1) # 3 calls because of train, holdout and test set self.assertEqual(pipeline_mock.predict_proba.call_count, 3) @@ -150,7 +150,7 @@ def test_cv(self, pipeline_mock): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) - self.assertEqual(result, 0.46235467431119603) + self.assertEqual(result, 0.4651019270584489) self.assertEqual(pipeline_mock.fit.call_count, 5) # 9 calls because of the training, holdout and # test set (3 sets x 5 folds = 15) From 910e7d461402b965ea2ec688a5d7f89665dc3a6f Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Mon, 10 May 2021 19:43:29 +0900 Subject: [PATCH 06/12] [fix] Fix most test cases --- autoPyTorch/datasets/resampling_strategy.py | 25 +++++++++++---------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 64c502475..b016efe57 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -1,5 +1,4 @@ -from enum import Enum -from functools import partial +from enum import IntEnum from typing import List, NamedTuple, Optional, Tuple, Union import numpy as np @@ -92,7 +91,7 @@ def time_series( return splits -class CrossValTypes(Enum): +class 
CrossValTypes(IntEnum): """The type of cross validation This class is used to specify the cross validation function @@ -107,11 +106,11 @@ class CrossValTypes(Enum): >>> for cross_val_type in CrossValTypes: print(cross_val_type.name, cross_val_type.value) - k_fold_cross_validation functools.partial() - time_series + k_fold_cross_validation 100 + time_series 101 """ - k_fold_cross_validation = partial(CrossValFuncs.k_fold_cross_validation) - time_series = partial(CrossValFuncs.time_series) + k_fold_cross_validation = 100 + time_series = 101 def __call__( self, @@ -140,8 +139,9 @@ def __call__( default_num_splits = _ResamplingStrategyArgs().num_splits num_splits = num_splits if num_splits is not None else default_num_splits + split_fn = getattr(CrossValFuncs, self.name) - return self.value( + return split_fn( random_state=random_state if shuffle else None, num_splits=num_splits, indices=indices, @@ -150,7 +150,7 @@ def __call__( ) -class HoldoutValTypes(Enum): +class HoldoutValTypes(IntEnum): """The type of holdout validation This class is used to specify the holdout validation function @@ -164,7 +164,7 @@ class HoldoutValTypes(Enum): >>> print(holdout_type.value) - functools.partial() + 0 >>> for holdout_type in HoldoutValTypes: print(holdout_type.name) @@ -174,7 +174,7 @@ class HoldoutValTypes(Enum): Additionally, HoldoutValTypes. can be called directly. 
""" - holdout_validation = partial(HoldoutFuncs.holdout_validation) + holdout_validation = 0 def __call__( self, @@ -203,8 +203,9 @@ def __call__( default_val_share = _ResamplingStrategyArgs().val_share val_share = val_share if val_share is not None else default_val_share + split_fn = getattr(HoldoutFuncs, self.name) - return self.value( + return split_fn( random_state=random_state if shuffle else None, val_share=val_share, indices=indices, From 8c9b89568b410fb7e10702882ecdc75c1bf1101b Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 11 May 2021 00:44:52 +0900 Subject: [PATCH 07/12] [fix] Bring back the data generator shuffle --- autoPyTorch/datasets/base_dataset.py | 13 ++++++++----- autoPyTorch/datasets/image_dataset.py | 5 +++-- autoPyTorch/datasets/tabular_dataset.py | 5 +++-- autoPyTorch/datasets/time_series_dataset.py | 9 ++++----- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 16e8b8a9d..410f9f490 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -71,6 +71,7 @@ def __init__( test_tensors: Optional[BaseDatasetInputType] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -91,7 +92,7 @@ def __init__( resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. The details are provided in autoPytorch/datasets/resampling_strategy.py - shuffle: Whether to shuffle the data when performing splits + shuffle: Whether to shuffle the data before performing splits seed (int), (default=1): seed to be used for reproducibility. 
train_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the training data @@ -107,12 +108,14 @@ def __init__( type_check(train_tensors, val_tensors) self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors self.random_state = np.random.RandomState(seed=seed) + self.shuffle = shuffle + self.resampling_strategy = resampling_strategy self.resampling_strategy_args: Dict[str, Any] = {} if resampling_strategy_args is not None: self.resampling_strategy_args = resampling_strategy_args - self.shuffle = self.resampling_strategy_args.get('shuffle', False) + self.shuffle_split = self.resampling_strategy_args.get('shuffle', False) self.is_stratify = self.resampling_strategy_args.get('stratify', False) self.task_type: Optional[str] = None @@ -195,7 +198,7 @@ def __len__(self) -> int: return self.train_tensors[0].shape[0] def _get_indices(self) -> np.ndarray: - return np.arange(len(self)) + return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self)) def _process_resampling_strategy_args(self) -> None: if not any(isinstance(self.resampling_strategy, val_type) @@ -238,7 +241,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] return self.resampling_strategy( random_state=self.random_state, val_share=val_share, - shuffle=self.shuffle, + shuffle=self.shuffle_split, indices=self._get_indices(), labels_to_stratify=labels_to_stratify ) @@ -248,7 +251,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] return self.resampling_strategy( random_state=self.random_state, num_splits=num_splits, - shuffle=self.shuffle, + shuffle=self.shuffle_split, indices=self._get_indices(), labels_to_stratify=labels_to_stratify ) diff --git a/autoPyTorch/datasets/image_dataset.py b/autoPyTorch/datasets/image_dataset.py index 96898fc4f..6d915f513 100644 --- a/autoPyTorch/datasets/image_dataset.py +++ 
b/autoPyTorch/datasets/image_dataset.py @@ -45,7 +45,7 @@ class ImageDataset(BaseDataset): resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. The details are provided in autoPytorch/datasets/resampling_strategy.py - shuffle: Whether to shuffle the data when performing splits + shuffle: Whether to shuffle the data before performing splits seed (int), (default=1): seed to be used for reproducibility. train_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the training data @@ -58,6 +58,7 @@ def __init__(self, test: Optional[IMAGE_DATASET_INPUT] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -70,7 +71,7 @@ def __init__(self, test = _create_image_dataset(data=test) self.mean, self.std = _calc_mean_std(train=train) - super().__init__(train_tensors=train, val_tensors=val, test_tensors=test, + super().__init__(train_tensors=train, val_tensors=val, test_tensors=test, shuffle=shuffle, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, train_transforms=train_transforms, diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index 83d733280..3516e585e 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -50,7 +50,7 @@ class TabularDataset(BaseDataset): resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. 
The details are provided in autoPytorch/datasets/resampling_strategy.py - shuffle: Whether to shuffle the data when performing splits + shuffle: Whether to shuffle the data before performing splits seed (int), (default=1): seed to be used for reproducibility. train_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the training data. @@ -68,6 +68,7 @@ def __init__(self, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -90,7 +91,7 @@ def __init__(self, self.num_features = validator.feature_validator.num_features self.categories = validator.feature_validator.categories - super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), + super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, train_transforms=train_transforms, diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f7de96025..2e143bf20 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -41,6 +41,7 @@ def __init__(self, val: Optional[TIME_SERIES_FORECASTING_INPUT] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + shuffle: Optional[bool] = False, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -68,7 +69,7 @@ def __init__(self, 
target_variables=target_variables, sequence_length=sequence_length, n_steps=n_steps) - super().__init__(train_tensors=train, val_tensors=val, + super().__init__(train_tensors=train, val_tensors=val, shuffle=shuffle, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, train_transforms=train_transforms, @@ -128,8 +129,7 @@ def __init__(self, _check_time_series_inputs(train=train, val=val, task_type="time_series_classification") - resampling_strategy_args = {'shuffle': True} - super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args) + super().__init__(train_tensors=train, val_tensors=val, shuffle=True) class TimeSeriesRegressionDataset(BaseDataset): @@ -137,8 +137,7 @@ def __init__(self, train: Tuple[np.ndarray, np.ndarray], val: Optional[Tuple[np. _check_time_series_inputs(train=train, val=val, task_type="time_series_regression") - resampling_strategy_args = {'shuffle': True} - super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args) + super().__init__(train_tensors=train, val_tensors=val, shuffle=True) def _check_time_series_inputs(task_type: str, From 93e68628c89d00b68d27d7e0eb8a960a87b6d895 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 11 May 2021 00:57:41 +0900 Subject: [PATCH 08/12] [fix] Fix the test value caused by putting back the shuffle generator --- test/test_evaluation/test_train_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 2282d0954..e0b0b74f0 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -112,7 +112,7 @@ def test_holdout(self, pipeline_mock): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) - self.assertEqual(result, 0.30434782608695654) 
+ self.assertEqual(result, 0.4782608695652174) self.assertEqual(pipeline_mock.fit.call_count, 1) # 3 calls because of train, holdout and test set self.assertEqual(pipeline_mock.predict_proba.call_count, 3) From bef43231a592a5527f123c9da37394fb16c3844f Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 11 May 2021 21:50:14 +0900 Subject: [PATCH 09/12] [fix] Fix pytest errors --- autoPyTorch/datasets/resampling_strategy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index b016efe57..198c2318c 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -32,9 +32,11 @@ def holdout_validation( labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None ) -> List[Tuple[np.ndarray, np.ndarray]]: + """ SKLearn requires shuffle=True for stratify """ train, val = train_test_split( indices, test_size=val_share, - shuffle=shuffle, random_state=random_state, + shuffle=shuffle if labels_to_stratify is None else True, + random_state=random_state, stratify=labels_to_stratify ) return [(train, val)] From 2d2ebb89fae85b5de4347fd4f482a5781c827faf Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 11 May 2021 22:22:53 +0900 Subject: [PATCH 10/12] [refactor] Change files so that we can see the difference easier --- autoPyTorch/datasets/base_dataset.py | 4 +- autoPyTorch/datasets/resampling_strategy.py | 142 ++++++++++---------- 2 files changed, 73 insertions(+), 73 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 410f9f490..19e9c360e 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -200,7 +200,7 @@ def __len__(self) -> int: def _get_indices(self) -> np.ndarray: return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self)) - def 
_process_resampling_strategy_args(self) -> None: + def _check_resampling_strategy_args(self) -> None: if not any(isinstance(self.resampling_strategy, val_type) for val_type in [HoldoutValTypes, CrossValTypes]): raise ValueError(f"resampling_strategy {self.resampling_strategy} is not supported.") @@ -231,7 +231,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format """ # check if the requirements are met and if we can get splits - self._process_resampling_strategy_args() + self._check_resampling_strategy_args() labels_to_stratify = self.train_tensors[-1] if self.is_stratify else None diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 198c2318c..f031cd443 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -22,77 +22,6 @@ class _ResamplingStrategyArgs(NamedTuple): stratify: bool = False -class HoldoutFuncs(): - @staticmethod - def holdout_validation( - indices: np.ndarray, - random_state: Optional[np.random.RandomState] = None, - val_share: Optional[float] = None, - shuffle: bool = False, - labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None - ) -> List[Tuple[np.ndarray, np.ndarray]]: - - """ SKLearn requires shuffle=True for stratify """ - train, val = train_test_split( - indices, test_size=val_share, - shuffle=shuffle if labels_to_stratify is None else True, - random_state=random_state, - stratify=labels_to_stratify - ) - return [(train, val)] - - -class CrossValFuncs(): - # (shuffle, is_stratify) -> split_fn - _args2split_fn = { - (True, True): StratifiedShuffleSplit, - (True, False): ShuffleSplit, - (False, True): StratifiedKFold, - (False, False): KFold, - } - - @staticmethod - def k_fold_cross_validation( - indices: np.ndarray, - random_state: Optional[np.random.RandomState] = None, - num_splits: 
Optional[int] = None, - shuffle: bool = False, - labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None - ) -> List[Tuple[np.ndarray, np.ndarray]]: - """ - Returns: - splits (List[Tuple[List, List]]): list of tuples of training and validation indices - """ - - split_fn = CrossValFuncs._args2split_fn[(shuffle, labels_to_stratify is not None)] - cv = split_fn(n_splits=num_splits, random_state=random_state) - splits = list(cv.split(indices)) - return splits - - @staticmethod - def time_series( - indices: np.ndarray, - random_state: Optional[np.random.RandomState] = None, - num_splits: Optional[int] = None, - shuffle: bool = False, - labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None - ) -> List[Tuple[np.ndarray, np.ndarray]]: - """ - Returns train and validation indices respecting the temporal ordering of the data. - - Examples: - >>> indices = np.array([0, 1, 2, 3]) - >>> CrossValFuncs.time_series_cross_validation(3, indices) - [([0], [1]), - ([0, 1], [2]), - ([0, 1, 2], [3])] - - """ - cv = TimeSeriesSplit(n_splits=num_splits) - splits = list(cv.split(indices)) - return splits - - class CrossValTypes(IntEnum): """The type of cross validation @@ -214,3 +143,74 @@ def __call__( shuffle=shuffle, labels_to_stratify=labels_to_stratify ) + + +class HoldoutFuncs(): + @staticmethod + def holdout_validation( + indices: np.ndarray, + random_state: Optional[np.random.RandomState] = None, + val_share: Optional[float] = None, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ) -> List[Tuple[np.ndarray, np.ndarray]]: + + """ SKLearn requires shuffle=True for stratify """ + train, val = train_test_split( + indices, test_size=val_share, + shuffle=shuffle if labels_to_stratify is None else True, + random_state=random_state, + stratify=labels_to_stratify + ) + return [(train, val)] + + +class CrossValFuncs(): + # (shuffle, is_stratify) -> split_fn + _args2split_fn 
= { + (True, True): StratifiedShuffleSplit, + (True, False): ShuffleSplit, + (False, True): StratifiedKFold, + (False, False): KFold, + } + + @staticmethod + def k_fold_cross_validation( + indices: np.ndarray, + random_state: Optional[np.random.RandomState] = None, + num_splits: Optional[int] = None, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ) -> List[Tuple[np.ndarray, np.ndarray]]: + """ + Returns: + splits (List[Tuple[List, List]]): list of tuples of training and validation indices + """ + + split_fn = CrossValFuncs._args2split_fn[(shuffle, labels_to_stratify is not None)] + cv = split_fn(n_splits=num_splits, random_state=random_state) + splits = list(cv.split(indices)) + return splits + + @staticmethod + def time_series( + indices: np.ndarray, + random_state: Optional[np.random.RandomState] = None, + num_splits: Optional[int] = None, + shuffle: bool = False, + labels_to_stratify: Optional[Union[Tuple[np.ndarray, np.ndarray], Dataset]] = None + ) -> List[Tuple[np.ndarray, np.ndarray]]: + """ + Returns train and validation indices respecting the temporal ordering of the data. 
+ + Examples: + >>> indices = np.array([0, 1, 2, 3]) + >>> CrossValFuncs.time_series_cross_validation(3, indices) + [([0], [1]), + ([0, 1], [2]), + ([0, 1, 2], [3])] + + """ + cv = TimeSeriesSplit(n_splits=num_splits) + splits = list(cv.split(indices)) + return splits From 8d90b85aad8d8cfaa0350ce674801fa5a9d591b8 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Thu, 13 May 2021 09:48:36 +0900 Subject: [PATCH 11/12] [refactor] Gather kwargs for get splits for CV and Holdout --- autoPyTorch/datasets/base_dataset.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 19e9c360e..60b8b0d41 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -234,27 +234,22 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] self._check_resampling_strategy_args() labels_to_stratify = self.train_tensors[-1] if self.is_stratify else None + kwargs = {} + kwargs.update( + random_state=self.random_state, + shuffle=self.shuffle_split, + indices=self._get_indices(), + labels_to_stratify=labels_to_stratify + ) if isinstance(self.resampling_strategy, HoldoutValTypes): val_share = self.resampling_strategy_args.get('val_share', None) + return self.resampling_strategy(val_share=val_share, **kwargs) - return self.resampling_strategy( - random_state=self.random_state, - val_share=val_share, - shuffle=self.shuffle_split, - indices=self._get_indices(), - labels_to_stratify=labels_to_stratify - ) elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = self.resampling_strategy_args.get('num_splits', None) + return self.resampling_strategy(num_splits=num_splits, **kwargs) - return self.resampling_strategy( - random_state=self.random_state, - num_splits=num_splits, - shuffle=self.shuffle_split, - indices=self._get_indices(), - labels_to_stratify=labels_to_stratify - ) else: raise
ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") From 6ef981d5eb5c71dcd3c76c4a2fc882fb5904113a Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Thu, 13 May 2021 09:58:26 +0900 Subject: [PATCH 12/12] [fix] Fix a mypy issue --- autoPyTorch/datasets/base_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 60b8b0d41..0c46522f5 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -234,7 +234,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] self._check_resampling_strategy_args() labels_to_stratify = self.train_tensors[-1] if self.is_stratify else None - kwargs = {} + kwargs: Dict[str, Any] = {} kwargs.update( random_state=self.random_state, shuffle=self.shuffle_split,