Adds more examples to customise AutoPyTorch. (#124)

ravinkohli · web-flow · commit ef6acf2cc1f8 · 2021-03-16T16:06:43.000+01:00
* 3 examples plus doc update

* Forgot the examples

* Added example for resampling strategy

* Update example workflow

* Fixed bugs in example and resampling strategies

* Addressed comments

* Addressed comments

* Addressed comments from shuhei, better documentation
diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
@@ -30,6 +30,8 @@ jobs:
         echo "::set-output name=BEFORE::$(git status --porcelain -b)"
     - name: Run tests
       run: |
-        python examples/example_tabular_classification.py
-        python examples/example_tabular_regression.py
+        python examples/tabular/20_basics/example_tabular_classification.py
+        python examples/tabular/20_basics/example_tabular_regression.py
+        python examples/tabular/40_advanced/example_custom_configuration_space.py
+        python examples/tabular/40_advanced/example_resampling_strategy.py
         python examples/example_image_classification.py
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
@@ -27,27 +27,36 @@ class TabularClassificationTask(BaseTask):
     """
     Tabular Classification API to the pipelines.
     Args:
-        seed (int): seed to be used for reproducibility.
-        n_jobs (int), (default=1): number of consecutive processes to spawn.
-        logging_config (Optional[Dict]): specifies configuration
-            for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int), (default=50): Number of models added to the ensemble built by
+        seed (int):
+            seed to be used for reproducibility.
+        n_jobs (int), (default=1):
+            number of consecutive processes to spawn.
+        logging_config (Optional[Dict]):
+            specifies configuration for logging, if None, it is loaded from the logging.yaml
+        ensemble_size (int), (default=50):
+            Number of models added to the ensemble built by
             Ensemble selection from libraries of models.
             Models are drawn with replacement.
-        ensemble_nbest (int), (default=50): only consider the ensemble_nbest
+        ensemble_nbest (int), (default=50):
+            only consider the ensemble_nbest
             models to build the ensemble
-        max_models_on_disc (int), (default=50): maximum number of models saved to disc.
+        max_models_on_disc (int), (default=50):
+            maximum number of models saved to disc.
             Also, controls the size of the ensemble as any additional models will be deleted.
             Must be greater than or equal to 1.
-        temporary_directory (str): folder to store configuration output and log file
-        output_directory (str): folder to store predictions for optional test set
-        delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory,
-            when finished
-        include_components (Optional[Dict]): If None, all possible components are used.
-            Otherwise specifies set of components to use.
-        exclude_components (Optional[Dict]): If None, all possible components are used.
-            Otherwise specifies set of components not to use. Incompatible with include
-            components
+        temporary_directory (str):
+            folder to store configuration output and log file
+        output_directory (str):
+            folder to store predictions for optional test set
+        delete_tmp_folder_after_terminate (bool):
+            determines whether to delete the temporary directory, when finished
+        include_components (Optional[Dict]):
+            If None, all possible components are used. Otherwise
+            specifies set of components to use.
+        exclude_components (Optional[Dict]):
+            If None, all possible components are used. Otherwise
+            specifies set of components not to use. Incompatible
+            with include components
     """
     def __init__(
         self,
diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py
@@ -97,7 +97,7 @@ def holdout_validation(val_share: float, indices: np.ndarray, **kwargs: Any) ->
 
 def stratified_holdout_validation(val_share: float, indices: np.ndarray, **kwargs: Any) \
         -> Tuple[np.ndarray, np.ndarray]:
-    train, val = train_test_split(indices, test_size=val_share, shuffle=False, stratify=kwargs["stratify"])
+    train, val = train_test_split(indices, test_size=val_share, shuffle=True, stratify=kwargs["stratify"])
     return train, val
 
 
diff --git a/autoPyTorch/utils/hyperparameter_search_space_update.py b/autoPyTorch/utils/hyperparameter_search_space_update.py
@@ -6,7 +6,25 @@
 from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent
 
 
-class HyperparameterSearchSpaceUpdate():
+class HyperparameterSearchSpaceUpdate:
+    """
+    Allows specifying an update to the search space of a
+    particular hyperparameter.
+
+    Args:
+        node_name (str):
+            The name of the node in the pipeline
+        hyperparameter (str):
+            The name of the hyperparameter
+        value_range (Union[List, Tuple]):
+            In case of categorical hyperparameter, defines the new categorical choices.
+            In case of numerical hyperparameter, defines the new range
+            in the form of (LOWER, UPPER)
+        default_value (Union[int, float, str]):
+            New default value for the hyperparameter
+        log (bool) (default=False):
+            In case of numerical hyperparameters, whether to sample on a log scale
+    """
     def __init__(self, node_name: str, hyperparameter: str, value_range: Union[List, Tuple],
                  default_value: Union[int, float, str], log: bool = False) -> None:
         self.node_name = node_name
@@ -16,6 +34,15 @@ def __init__(self, node_name: str, hyperparameter: str, value_range: Union[List,
         self.default_value = default_value
 
     def apply(self, pipeline: List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]) -> None:
+        """
+        Applies the update to the appropriate hyperparameter of the pipeline
+        Args:
+            pipeline (List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]):
+                The named steps of the current autoPyTorch pipeline
+
+        Returns:
+            None
+        """
         [node[1]._apply_search_space_update(name=self.hyperparameter,
                                             new_value_range=self.value_range,
                                             log=self.log,
@@ -29,30 +56,69 @@ def __str__(self) -> str:
                                            (" log" if self.log else ""))
 
 
-class HyperparameterSearchSpaceUpdates():
+class HyperparameterSearchSpaceUpdates:
+    """ Contains a collection of HyperparameterSearchSpaceUpdate """
     def __init__(self, updates: Optional[List[HyperparameterSearchSpaceUpdate]] = None) -> None:
         self.updates = updates if updates is not None else []
 
     def apply(self, pipeline: List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]) -> None:
+        """
+        Iteratively applies updates to the pipeline
+
+        Args:
+            pipeline (List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]):
+                The named steps of the current autoPyTorch pipeline
+
+        Returns:
+            None
+        """
         for update in self.updates:
             update.apply(pipeline)
 
     def append(self, node_name: str, hyperparameter: str, value_range: Union[List, Tuple],
                default_value: Union[int, float, str], log: bool = False) -> None:
+        """
+        Add a new update
+
+        Args:
+            node_name (str):
+                The name of the node in the pipeline
+            hyperparameter (str):
+                The name of the hyperparameter
+            value_range (Union[List, Tuple]):
+                In case of categorical hyperparameter, defines the new categorical choices.
+                In case of numerical hyperparameter, defines the new range
+                in the form of (LOWER, UPPER)
+            default_value (Union[int, float, str]):
+                New default value for the hyperparameter
+            log (bool) (default=False):
+                In case of numerical hyperparameters, whether to sample on a log scale
+
+        Returns:
+            None
+        """
         self.updates.append(HyperparameterSearchSpaceUpdate(node_name=node_name,
                                                             hyperparameter=hyperparameter,
                                                             value_range=value_range,
                                                             default_value=default_value,
                                                             log=log))
 
     def save_as_file(self, path: str) -> None:
+        """
+        Save the updates as a file to reuse later
+
+        Args:
+            path (str): path of the file
+
+        Returns:
+            None
+        """
         with open(path, "w") as f:
-            with open(path, "w") as f:
-                for update in self.updates:
-                    print(update.node_name, update.hyperparameter,  # noqa: T001
-                          str(update.value_range), "'{}'".format(update.default_value)
-                          if isinstance(update.default_value, str) else update.default_value,
-                          (" log" if update.log else ""), file=f)
+            for update in self.updates:
+                print(update.node_name, update.hyperparameter,  # noqa: T001
+                      str(update.value_range), "'{}'".format(update.default_value)
+                      if isinstance(update.default_value, str) else update.default_value,
+                      (" log" if update.log else ""), file=f)
 
 
 def parse_hyperparameter_search_space_updates(updates_file: Optional[str]
diff --git a/docs/conf.py b/docs/conf.py
@@ -68,9 +68,9 @@
 
 sphinx_gallery_conf = {
     # path to the examples
-    'examples_dirs': '../examples',
+    'examples_dirs': ['../examples/tabular/20_basics', '../examples/tabular/40_advanced'],
     # path where to save gallery generated examples
-    'gallery_dirs': 'examples',
+    'gallery_dirs': ['basics_tabular', 'advanced_tabular'],
     #TODO: fix back/forward references for the examples.
     #'doc_module': ('autoPyTorch'),
     #'reference_url': {
diff --git a/examples/tabular/20_basics/README.txt b/examples/tabular/20_basics/README.txt
@@ -0,0 +1,8 @@
+.. _examples_tabular_basics:
+
+
+==============================
+Basic Tabular Dataset Examples
+==============================
+
+Basic examples for using *Auto-PyTorch* on tabular datasets
diff --git a/examples/tabular/20_basics/example_tabular_classification.py b/examples/tabular/20_basics/example_tabular_classification.py
@@ -22,32 +22,10 @@
 import sklearn.model_selection
 
 from autoPyTorch.api.tabular_classification import TabularClassificationTask
-from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
-
-
-def get_search_space_updates():
-    """
-    Search space updates to the task can be added using HyperparameterSearchSpaceUpdates
-    Returns:
-        HyperparameterSearchSpaceUpdates
-    """
-    updates = HyperparameterSearchSpaceUpdates()
-    updates.append(node_name="data_loader",
-                   hyperparameter="batch_size",
-                   value_range=[16, 512],
-                   default_value=32)
-    updates.append(node_name="lr_scheduler",
-                   hyperparameter="CosineAnnealingLR:T_max",
-                   value_range=[50, 60],
-                   default_value=55)
-    updates.append(node_name='network_backbone',
-                   hyperparameter='ResNetBackbone:dropout',
-                   value_range=[0, 0.5],
-                   default_value=0.2)
-    return updates
 
 
 if __name__ == '__main__':
+
     ############################################################################
     # Data Loading
     # ============
@@ -62,16 +40,23 @@ def get_search_space_updates():
     # Build and fit a classifier
     # ==========================
     api = TabularClassificationTask(
-        delete_tmp_folder_after_terminate=False,
-        search_space_updates=get_search_space_updates()
+        temporary_directory='./tmp/autoPyTorch_example_tmp_01',
+        output_directory='./tmp/autoPyTorch_example_out_01',
+        # To maintain logs of the run, set the next two as False
+        delete_tmp_folder_after_terminate=True,
+        delete_output_folder_after_terminate=True
     )
+
+    ############################################################################
+    # Search for an ensemble of machine learning algorithms
+    # =====================================================
     api.search(
         X_train=X_train,
         y_train=y_train,
         X_test=X_test.copy(),
         y_test=y_test.copy(),
         optimize_metric='accuracy',
-        total_walltime_limit=500,
+        total_walltime_limit=300,
         func_eval_time_limit=50
     )
 
@@ -82,4 +67,5 @@ def get_search_space_updates():
     y_pred = api.predict(X_test)
     score = api.score(y_pred, y_test)
     print(score)
+    # Print the final ensemble built by AutoPyTorch
     print(api.show_models())
diff --git a/examples/tabular/20_basics/example_tabular_regression.py b/examples/tabular/20_basics/example_tabular_regression.py
@@ -3,17 +3,15 @@
 Tabular Regression
 ======================
 
-The following example shows how to fit a sample classification model
+The following example shows how to fit a sample regression model
 with AutoPyTorch
 """
 import os
 import tempfile as tmp
-import typing
 import warnings
 
-from sklearn.datasets import make_regression
-
-from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
+import sklearn.datasets
+import sklearn.model_selection
 
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
 os.environ['OMP_NUM_THREADS'] = '1'
@@ -23,54 +21,16 @@
 warnings.simplefilter(action='ignore', category=UserWarning)
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-from sklearn import model_selection, preprocessing
-
 from autoPyTorch.api.tabular_regression import TabularRegressionTask
-from autoPyTorch.datasets.tabular_dataset import TabularDataset
-from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
-
-
-def get_search_space_updates():
-    """
-    Search space updates to the task can be added using HyperparameterSearchSpaceUpdates
-    Returns:
-        HyperparameterSearchSpaceUpdates
-    """
-    updates = HyperparameterSearchSpaceUpdates()
-    updates.append(node_name="data_loader",
-                   hyperparameter="batch_size",
-                   value_range=[16, 512],
-                   default_value=32)
-    updates.append(node_name="lr_scheduler",
-                   hyperparameter="CosineAnnealingLR:T_max",
-                   value_range=[50, 60],
-                   default_value=55)
-    updates.append(node_name='network_backbone',
-                   hyperparameter='ResNetBackbone:dropout',
-                   value_range=[0, 0.5],
-                   default_value=0.2)
-    return updates
 
 
 if __name__ == '__main__':
+
     ############################################################################
     # Data Loading
     # ============
-
-    # Get the training data for tabular regression
-    # X, y = datasets.fetch_openml(name="cholesterol", return_X_y=True)
-
-    # Use dummy data for now since there are problems with categorical columns
-    X, y = make_regression(
-        n_samples=5000,
-        n_features=4,
-        n_informative=3,
-        n_targets=1,
-        shuffle=True,
-        random_state=0
-    )
-
-    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+    X, y = sklearn.datasets.fetch_openml(name='boston', return_X_y=True, as_frame=True)
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
         X,
         y,
         random_state=1,
@@ -89,16 +49,23 @@ def get_search_space_updates():
     # Build and fit a regressor
     # ==========================
     api = TabularRegressionTask(
-        delete_tmp_folder_after_terminate=False,
-        search_space_updates=get_search_space_updates()
+        temporary_directory='./tmp/autoPyTorch_example_tmp_02',
+        output_directory='./tmp/autoPyTorch_example_out_02',
+        # To maintain logs of the run, set the next two as False
+        delete_tmp_folder_after_terminate=True,
+        delete_output_folder_after_terminate=True
     )
+
+    ############################################################################
+    # Search for an ensemble of machine learning algorithms
+    # =====================================================
     api.search(
         X_train=X_train,
         y_train=y_train_scaled,
         X_test=X_test.copy(),
         y_test=y_test_scaled.copy(),
         optimize_metric='r2',
-        total_walltime_limit=500,
+        total_walltime_limit=300,
         func_eval_time_limit=50,
         traditional_per_total_budget=0
     )
@@ -114,3 +81,5 @@ def get_search_space_updates():
     score = api.score(y_pred, y_test)
 
     print(score)
+    # Print the final ensemble built by AutoPyTorch
+    print(api.show_models())
diff --git a/examples/tabular/40_advanced/README.txt b/examples/tabular/40_advanced/README.txt
diff --git a/examples/tabular/40_advanced/example_custom_configuration_space.py b/examples/tabular/40_advanced/example_custom_configuration_space.py
diff --git a/examples/tabular/40_advanced/example_resampling_strategy.py b/examples/tabular/40_advanced/example_resampling_strategy.py