nn.Embedding to avoid OneHotEncoding all categorical columns #425

Closed
Changes from 3 commits
2 changes: 1 addition & 1 deletion autoPyTorch/data/base_feature_validator.py
@@ -46,10 +46,10 @@ def __init__(

# Required for dataset properties
self.num_features: Optional[int] = None
self.categories: List[List[int]] = []
self.categorical_columns: List[int] = []
self.numerical_columns: List[int] = []

self.num_categories_per_col: Optional[List[int]] = []
self.all_nan_columns: Optional[Set[Union[int, str]]] = None

self._is_fitted = False
6 changes: 2 additions & 4 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -193,10 +193,8 @@ def _fit(
encoded_categories = self.column_transformer.\
named_transformers_['categorical_pipeline'].\
named_steps['ordinalencoder'].categories_
self.categories = [
list(range(len(cat)))
for cat in encoded_categories
]

self.num_categories_per_col = [len(cat) for cat in encoded_categories]

# differently to categorical_columns and numerical_columns,
# this saves the index of the column.
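As a quick illustration (not part of the diff) of what the new line computes: sklearn's OrdinalEncoder stores the unique values of each fitted column in categories_, so the length of each entry is that column's category count. The toy DataFrame below is made up:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Toy data: two categorical columns with 3 categories each
X = pd.DataFrame({
    "color": ["red", "green", "red", "blue"],
    "size": ["S", "M", "L", "M"],
})

encoder = OrdinalEncoder().fit(X)

# categories_ holds the unique values seen per column
num_categories_per_col = [len(cat) for cat in encoder.categories_]
print(num_categories_per_col)  # [3, 3]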
2 changes: 1 addition & 1 deletion autoPyTorch/datasets/tabular_dataset.py
@@ -81,7 +81,7 @@ def __init__(self,
self.categorical_columns = validator.feature_validator.categorical_columns
self.numerical_columns = validator.feature_validator.numerical_columns
self.num_features = validator.feature_validator.num_features
self.categories = validator.feature_validator.categories
self.num_categories_per_col = validator.feature_validator.num_categories_per_col

super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle,
resampling_strategy=resampling_strategy,
27 changes: 0 additions & 27 deletions autoPyTorch/pipeline/base_pipeline.py
@@ -310,33 +310,6 @@ def _add_forbidden_conditions(self, cs):

"""

# Learned Entity Embedding is only valid when encoder is one hot encoder
if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys():
embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices
if 'LearnedEntityEmbedding' in embeddings:
encoders = cs.get_hyperparameter('encoder:__choice__').choices
possible_default_embeddings = copy(list(embeddings))
del possible_default_embeddings[possible_default_embeddings.index('LearnedEntityEmbedding')]

for encoder in encoders:
if encoder == 'OneHotEncoder':
continue
while True:
try:
cs.add_forbidden_clause(ForbiddenAndConjunction(
ForbiddenEqualsClause(cs.get_hyperparameter(
'network_embedding:__choice__'), 'LearnedEntityEmbedding'),
ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder)
))
break
except ValueError:
# change the default and try again
try:
default = possible_default_embeddings.pop()
except IndexError:
raise ValueError("Cannot find a legal default configuration")
cs.get_hyperparameter('network_embedding:__choice__').default_value = default

# Disable CyclicLR until todo is completed.
if 'lr_scheduler' in self.named_steps.keys() and 'trainer' in self.named_steps.keys():
trainers = cs.get_hyperparameter('trainer:__choice__').choices
@@ -0,0 +1,81 @@
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
UniformIntegerHyperparameter,
)

import numpy as np


from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
autoPyTorchTabularPreprocessingComponent
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, ispandas


class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):
"""
Splits categorical columns into embed columns and encode columns, based on the number of categories per column.
"""
def __init__(
self,
min_categories_for_embedding: float = 5,
random_state: Optional[np.random.RandomState] = None
):
self.min_categories_for_embedding = min_categories_for_embedding

self.special_feature_types = dict(encode_columns=[], embed_columns=[])
self.num_categories_per_col: Optional[List] = None
super().__init__()

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'ColumnSplitter':

self.check_requirements(X, y)

if len(X['dataset_properties']['categorical_columns']) > 0:
self.num_categories_per_col = []
for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'], X['dataset_properties']['categorical_columns']):
if (
categories_per_column >= self.min_categories_for_embedding
):
self.special_feature_types['embed_columns'].append(column)
# we only care about the categories for columns to be embedded
self.num_categories_per_col.append(categories_per_column)
else:
self.special_feature_types['encode_columns'].append(column)

return self

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
if self.num_categories_per_col is not None:
Collaborator

self.num_categories_per_col is initialized as an empty list, which means it will not be None even when all categorical columns end up being encoded rather than embedded. Maybe this condition should be changed to:

if self.num_categories_per_col:
    ...

Contributor Author

It will be None when there are no categorical columns; see line 38.

Collaborator

@theodorju theodorju Jul 18, 2022

Hm, but line 38 initializes self.num_categories_per_col to an empty list if there are categorical columns, and [] is not None returns True.

I'm mentioning this because I thought line 53 checks whether there are columns to be embedded; currently the if condition evaluates to True for both embedded and encoded columns.
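A minimal sketch (not part of the PR) of the distinction being discussed: an empty list is falsy but is not None, so the two checks diverge exactly in the case where categorical columns exist but none qualify for embedding:

num_categories_per_col = []  # categorical columns present, but none reach the embedding threshold
print(num_categories_per_col is not None)  # True  -> the current check still enters the branch
print(bool(num_categories_per_col))        # False -> a truthiness check would skip it

num_categories_per_col = None  # no categorical columns at all
print(num_categories_per_col is not None)  # False -> both checks skip the branch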

# update such that only the number of categories for embedding columns is passed
X['dataset_properties']['num_categories_per_col'] = self.num_categories_per_col
X.update(self.special_feature_types)
return X

@staticmethod
def get_properties(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
) -> Dict[str, Union[str, bool]]:

return {
'shortname': 'ColumnSplitter',
'name': 'Column Splitter',
'handles_sparse': False,
}

@staticmethod
def get_hyperparameter_search_space(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter="min_categories_for_embedding",
value_range=(3, 7),
default_value=3,
log=True),
) -> ConfigurationSpace:
cs = ConfigurationSpace()

add_hyperparameter(cs, min_categories_for_embedding, UniformIntegerHyperparameter)

return cs
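A minimal sketch (illustrative, not from the PR) of the splitting rule the component applies in fit; the column indices and category counts below are invented:

# Stand-ins for the values normally found in X['dataset_properties']
categorical_columns = [0, 1, 2]       # indices of the categorical columns
num_categories_per_col = [2, 6, 12]   # category count per categorical column, same order
min_categories_for_embedding = 5      # the hyperparameter tuned above

encode_columns, embed_columns, kept_categories = [], [], []
for n_cat, col in zip(num_categories_per_col, categorical_columns):
    if n_cat >= min_categories_for_embedding:
        embed_columns.append(col)      # high-cardinality -> handled by nn.Embedding
        kept_categories.append(n_cat)  # only embedded columns keep their category counts
    else:
        encode_columns.append(col)     # low-cardinality -> one-hot encoded downstream

print(encode_columns)    # [0]
print(embed_columns)     # [1, 2]
print(kept_categories)   # [6, 12]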
@@ -22,8 +22,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder:

self.preprocessor['categorical'] = OHE(
# It is safer to have the OHE produce a 0 array than to crash a good configuration
categories=X['dataset_properties']['categories']
if len(X['dataset_properties']['categories']) > 0 else 'auto',
categories='auto',
sparse=False,
handle_unknown='ignore')
return self
@@ -14,7 +14,7 @@ def __init__(self) -> None:
super().__init__()
self.add_fit_requirements([
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categories', (List,), user_defined=True, dataset_property=True)])
])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
@@ -40,7 +40,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
X['X_train'] = preprocess(dataset=X_train, transforms=transforms)

# We need to also save the preprocess transforms for inference
X.update({'preprocess_transforms': transforms})
X.update({
'preprocess_transforms': transforms,
'shape_after_preprocessing': X['X_train'].shape[1:]
})
return X

@staticmethod
@@ -25,7 +25,9 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...]
:param input_shape: shape of the input
:return: output_shape
"""
placeholder = torch.randn((2, *input_shape), dtype=torch.float)
# As we are using nn.Embedding, 2 is a safe upper limit for the placeholder
# values, since 3 is the lowest value `min_categories_for_embedding` can take
placeholder = torch.randint(high=2, size=(2, *input_shape), dtype=torch.float)
with torch.no_grad():
output = network(placeholder)
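A short illustration (not from the PR) of why the placeholder switches to small integers: nn.Embedding is a table lookup, so inputs must be integer indices strictly below num_embeddings, while the old torch.randn placeholder produced arbitrary floats. In the pipeline the placeholder stays float and the embedding module casts each slice to int before the lookup.

import torch
import torch.nn as nn

# Every embedded column has at least 3 categories (min_categories_for_embedding >= 3),
# so indices drawn from {0, 1} are always valid.
emb = nn.Embedding(num_embeddings=3, embedding_dim=4)

idx = torch.randint(high=2, size=(2, 5))  # integer placeholder, values in {0, 1}
print(emb(idx).shape)                     # torch.Size([2, 5, 4])

# emb(torch.randn(2, 5)) would fail: nn.Embedding needs integer indices in [0, num_embeddings)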

@@ -1,3 +1,4 @@
from math import ceil
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
@@ -19,69 +20,59 @@
class _LearnedEntityEmbedding(nn.Module):
""" Learned entity embedding module for categorical features"""

def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int):
def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_features_excl_embed: int):
"""
Args:
config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer
num_input_features (np.ndarray): column wise information of number of output columns after transformation
Collaborator

I think num_input_features should be replaced with num_categories_per_col (np.ndarray): number of categories for categorical columns that will be embedded

for each categorical column and 0 for numerical columns
num_numerical_features (int): number of numerical features in X
num_features_excl_embed (int): number of features in X excluding the features that need to be embedded
"""
super().__init__()
self.config = config

self.num_numerical = num_numerical_features
# list of number of categories of categorical data
# or 0 for numerical data
self.num_input_features = num_input_features
categorical_features = self.num_input_features > 0

self.num_categorical_features = self.num_input_features[categorical_features]

self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in
self.num_input_features]
self.num_output_dimensions = [0] * num_numerical_features
self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in
enumerate(self.num_categorical_features)])
self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in
zip(self.num_output_dimensions, self.num_input_features)]
self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in
zip(self.num_output_dimensions, self.embed_features,
self.num_input_features)]
self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions)
self.num_categories_per_col = num_categories_per_col
self.embed_features = self.num_categories_per_col > 0

self.num_embed_features = self.num_categories_per_col[self.embed_features]

self.num_output_dimensions = [1] * num_features_excl_embed
self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
enumerate(self.num_embed_features)])

self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)

self.ee_layers = self._create_ee_layers()

def forward(self, x: torch.Tensor) -> torch.Tensor:
# pass the columns of each categorical feature through entity embedding layer
# before passing it through the model
concat_seq = []
last_concat = 0

x_pointer = 0
layer_pointer = 0
for num_in, embed in zip(self.num_input_features, self.embed_features):
for x_pointer, embed in enumerate(self.embed_features):
current_feature_slice = x[:, x_pointer]
if not embed:
x_pointer += 1
concat_seq.append(current_feature_slice.view(-1, 1))
continue
if x_pointer > last_concat:
concat_seq.append(x[:, last_concat: x_pointer])
categorical_feature_slice = x[:, x_pointer: x_pointer + num_in]
concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice))
current_feature_slice = current_feature_slice.to(torch.int)
concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice))
layer_pointer += 1
x_pointer += num_in
last_concat = x_pointer

concat_seq.append(x[:, last_concat:])
return torch.cat(concat_seq, dim=1)

def _create_ee_layers(self) -> nn.ModuleList:
# entity embedding layers are nn.Embedding layers
layers = nn.ModuleList()
for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features,
self.num_output_dimensions)):
for num_cat, embed, num_out in zip(self.num_categories_per_col,
self.embed_features,
self.num_output_dimensions):
if not embed:
continue
layers.append(nn.Linear(num_in, num_out))
layers.append(nn.Embedding(num_cat, num_out))
return layers
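A condensed sketch (illustrative, not from the PR) of the substitution this module now makes: a one-hot encoding followed by nn.Linear is equivalent to selecting rows of the weight matrix, which is exactly what nn.Embedding does directly on the integer category codes, without materialising the one-hot matrix:

import torch
import torch.nn as nn
import torch.nn.functional as F

num_categories, embed_dim = 6, 3
codes = torch.tensor([0, 4, 2, 5])  # one integer-coded categorical column, batch of 4

# Old style: one-hot encode, then a bias-free Linear layer
linear = nn.Linear(num_categories, embed_dim, bias=False)
one_hot = F.one_hot(codes, num_classes=num_categories).float()
out_linear = one_hot @ linear.weight.T  # shape (4, 3)

# New style: nn.Embedding performs the same row lookup on the raw codes
embedding = nn.Embedding(num_categories, embed_dim)
out_embedded = embedding(codes)         # shape (4, 3)

print(out_linear.shape, out_embedded.shape)

Per the code above, the output width for each embedded column is ceil(dimension_reduction_i * num_categories).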


@@ -94,28 +85,30 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg
super().__init__(random_state=random_state)
self.config = kwargs

def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module:
return _LearnedEntityEmbedding(config=self.config,
num_input_features=num_input_features,
num_numerical_features=num_numerical_features)
num_categories_per_col=num_categories_per_col,
num_features_excl_embed=num_features_excl_embed)

@staticmethod
def get_hyperparameter_search_space(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter="min_unique_values_for_embedding",
value_range=(3, 7),
default_value=5,
log=True),
dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
value_range=(0, 1),
default_value=0.5),
) -> ConfigurationSpace:
cs = ConfigurationSpace()
add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter)
if dataset_properties is not None:
for i in range(len(dataset_properties['categorical_columns'])
if isinstance(dataset_properties['categorical_columns'], List) else 0):
# currently as we don't have information about the embedding columns
# we search for more dimensions than necessary. This can be solved by
# not having `min_unique_values_for_embedding` as a hyperparameter and
# instead passing it as a parameter to the feature validator, which
# allows us to pass embed_columns to the dataset properties.
# TODO: test the trade off
# Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding` in one custom transformer.
# this will also allow users to use this transformer outside the pipeline
Contributor Author

@ravinkohli ravinkohli Mar 31, 2022

Suggested change
# this will also allow users to use this transformer outside the pipeline
# this will also allow users to use this transformer outside the pipeline, see [this](https://github.com/manujosephv/pytorch_tabular/blob/main/pytorch_tabular/categorical_encoders.py#L132)

ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
value_range=dimension_reduction.value_range,
default_value=dimension_reduction.default_value,
@@ -24,7 +24,7 @@ class NoEmbedding(NetworkEmbeddingComponent):
def __init__(self, random_state: Optional[np.random.RandomState] = None):
super().__init__(random_state=random_state)

def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module:
return _NoEmbedding()

@staticmethod