nn.Embedding to avoid OneHotEncoding all categorical columns #425
Changes from 3 commits
@@ -0,0 +1,81 @@
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformIntegerHyperparameter,
)

import numpy as np

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
    autoPyTorchTabularPreprocessingComponent
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, ispandas
class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):
    """
    Splits the categorical columns into embed columns (to be passed to an
    embedding module) and encode columns (to be one-hot encoded), based on
    a threshold on the number of categories per column.
    """
    def __init__(
        self,
        min_categories_for_embedding: float = 5,
        random_state: Optional[np.random.RandomState] = None
    ):
        self.min_categories_for_embedding = min_categories_for_embedding
        # store the accepted random_state (assumed intent; the original
        # diff accepts the argument without assigning it)
        self.random_state = random_state

        self.special_feature_types = dict(encode_columns=[], embed_columns=[])
        self.num_categories_per_col: Optional[List] = None
        super().__init__()
    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'ColumnSplitter':

        self.check_requirements(X, y)

        if len(X['dataset_properties']['categorical_columns']) > 0:
            self.num_categories_per_col = []
            for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'],
                                                     X['dataset_properties']['categorical_columns']):
                if categories_per_column >= self.min_categories_for_embedding:
                    self.special_feature_types['embed_columns'].append(column)
                    # we only care about the categories for columns to be embedded
                    self.num_categories_per_col.append(categories_per_column)
                else:
                    self.special_feature_types['encode_columns'].append(column)

        return self
    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        if self.num_categories_per_col is not None:
            # pass on only the category counts of the columns to be embedded
            X['dataset_properties']['num_categories_per_col'] = self.num_categories_per_col
        X.update(self.special_feature_types)
        return X
    @staticmethod
    def get_properties(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
    ) -> Dict[str, Union[str, bool]]:

        return {
            'shortname': 'ColumnSplitter',
            'name': 'Column Splitter',
            'handles_sparse': False,
        }
    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter="min_categories_for_embedding",
            value_range=(3, 7),
            default_value=3,
            log=True),
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        add_hyperparameter(cs, min_categories_for_embedding, UniformIntegerHyperparameter)

        return cs
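For intuition, the splitting rule that `fit` applies can be exercised standalone. A minimal sketch with made-up column indices and category counts (not part of the diff):

# Hypothetical stand-ins for the X['dataset_properties'] entries used above.
num_categories_per_col = [2, 12, 4, 30]  # categories per categorical column
categorical_columns = [0, 1, 2, 3]       # indices of the categorical columns
min_categories_for_embedding = 5

encode_columns, embed_columns, embed_categories = [], [], []
for n_categories, column in zip(num_categories_per_col, categorical_columns):
    if n_categories >= min_categories_for_embedding:
        embed_columns.append(column)
        # only columns routed to embedding keep their category counts
        embed_categories.append(n_categories)
    else:
        encode_columns.append(column)

print(embed_columns)   # [1, 3]  -> handled by nn.Embedding downstream
print(encode_columns)  # [0, 2]  -> one-hot encoded as before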
@@ -1,3 +1,4 @@
+from math import ceil
 from typing import Any, Dict, List, Optional, Union

 from ConfigSpace.configuration_space import ConfigurationSpace
@@ -19,69 +20,59 @@
 class _LearnedEntityEmbedding(nn.Module):
     """ Learned entity embedding module for categorical features"""

-    def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int):
+    def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_features_excl_embed: int):
         """
         Args:
             config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer
-            num_input_features (np.ndarray): column wise information of number of output columns after transformation
-                for each categorical column and 0 for numerical columns
-            num_numerical_features (int): number of numerical features in X
+            num_features_excl_embed (int): number of features in X excluding the features that need to be embedded
         """
         super().__init__()
         self.config = config

-        self.num_numerical = num_numerical_features
-        # list of number of categories of categorical data
-        # or 0 for numerical data
-        self.num_input_features = num_input_features
-        categorical_features = self.num_input_features > 0
-
-        self.num_categorical_features = self.num_input_features[categorical_features]
-
-        self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in
-                               self.num_input_features]
-        self.num_output_dimensions = [0] * num_numerical_features
-        self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in
-                                           enumerate(self.num_categorical_features)])
-        self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in
-                                      zip(self.num_output_dimensions, self.num_input_features)]
-        self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in
-                                      zip(self.num_output_dimensions, self.embed_features,
-                                          self.num_input_features)]
-        self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions)
+        self.num_categories_per_col = num_categories_per_col
+        self.embed_features = self.num_categories_per_col > 0
+
+        self.num_embed_features = self.num_categories_per_col[self.embed_features]
+
+        self.num_output_dimensions = [1] * num_features_excl_embed
+        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
+                                           enumerate(self.num_embed_features)])
+
+        self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)

         self.ee_layers = self._create_ee_layers()

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # pass the columns of each categorical feature through entity embedding layer
         # before passing it through the model
         concat_seq = []
-        last_concat = 0
-        x_pointer = 0
         layer_pointer = 0
-        for num_in, embed in zip(self.num_input_features, self.embed_features):
+        for x_pointer, embed in enumerate(self.embed_features):
+            current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
+                concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
-            if x_pointer > last_concat:
-                concat_seq.append(x[:, last_concat: x_pointer])
-            categorical_feature_slice = x[:, x_pointer: x_pointer + num_in]
-            concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice))
+            current_feature_slice = current_feature_slice.to(torch.int)
+            concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice))
             layer_pointer += 1
-            x_pointer += num_in
-            last_concat = x_pointer

-        concat_seq.append(x[:, last_concat:])
         return torch.cat(concat_seq, dim=1)

     def _create_ee_layers(self) -> nn.ModuleList:
         # entity embedding layers
         layers = nn.ModuleList()
-        for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features,
-                                                         self.num_output_dimensions)):
+        for num_cat, embed, num_out in zip(self.num_categories_per_col,
+                                           self.embed_features,
+                                           self.num_output_dimensions):
             if not embed:
                 continue
-            layers.append(nn.Linear(num_in, num_out))
+            layers.append(nn.Embedding(num_cat, num_out))
         return layers
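To see why swapping `nn.Linear` on one-hot slices for `nn.Embedding` on integer indices preserves the computation: an embedding lookup selects the same weight row that a bias-free linear layer extracts from a one-hot vector, without ever materializing the one-hot columns. A self-contained sketch (sizes are made up, not from the PR):

import torch
import torch.nn as nn

num_cat, num_out = 10, 3
embedding = nn.Embedding(num_cat, num_out)

# Tie a bias-free linear layer to the embedding weights: applied to a
# one-hot vector, it selects a single weight row, exactly like the lookup.
linear = nn.Linear(num_cat, num_out, bias=False)
with torch.no_grad():
    linear.weight.copy_(embedding.weight.T)

idx = torch.tensor([4])
one_hot = nn.functional.one_hot(idx, num_classes=num_cat).float()
assert torch.allclose(embedding(idx), linear(one_hot))

The embedding route only needs the integer column (hence the `.to(torch.int)` cast in `forward`), which is what lets the pipeline skip one-hot encoding high-cardinality columns.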
@@ -94,28 +85,30 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwargs
         super().__init__(random_state=random_state)
         self.config = kwargs

-    def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
+    def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module:
         return _LearnedEntityEmbedding(config=self.config,
-                                       num_input_features=num_input_features,
-                                       num_numerical_features=num_numerical_features)
+                                       num_categories_per_col=num_categories_per_col,
+                                       num_features_excl_embed=num_features_excl_embed)
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
-            hyperparameter="min_unique_values_for_embedding",
-            value_range=(3, 7),
-            default_value=5,
-            log=True),
         dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
                                                                                    value_range=(0, 1),
                                                                                    default_value=0.5),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
-        add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter)
         if dataset_properties is not None:
             for i in range(len(dataset_properties['categorical_columns'])
                            if isinstance(dataset_properties['categorical_columns'], List) else 0):
+                # Currently, as we don't have information about the embedding columns,
+                # we search over more dimension_reduction hyperparameters than necessary.
+                # This can be solved by not having `min_unique_values_for_embedding` as a
+                # hyperparameter and instead passing it as a parameter to the feature
+                # validator, which allows us to pass embed_columns to the dataset properties.
+                # TODO: test the trade-off.
+                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
+                # in one custom transformer. This will also allow users to use this
+                # transformer outside the pipeline.
                 ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
                                                                        value_range=dimension_reduction.value_range,
                                                                        default_value=dimension_reduction.default_value,
                                                                        log=dimension_reduction.log)
                 add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
         return cs
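As a worked example of the new output-dimension rule in `__init__` (values invented): each embedded column i gets `ceil(dimension_reduction_i * num_categories)` output dimensions, replacing the old clip-to-`[1, num_in - 1]` logic:

from math import ceil

dimension_reduction = 0.5         # example value of a sampled dimension_reduction_i
num_embed_features = [12, 3, 30]  # hypothetical category counts of embed columns

print([ceil(dimension_reduction * n) for n in num_embed_features])  # [6, 2, 15]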
Review comment: `self.num_categories_per_col` is initialized as an empty list, which means that it will not be None for the encoded columns either. Maybe this condition should be changed.
Author reply: It will be None when there are no categorical columns; see line 38.
Reviewer reply: Hm, but line 38 initializes `self.num_categories_per_col` to an empty list if there are categorical columns, and `[] is not None` returns `True`. I'm mentioning this because I thought the check in line 53 was meant to test whether there are columns to be embedded; currently the condition evaluates to true for both embedded and encoded columns.