nn.Embedding to avoid OneHotEncoding all categorical columns #425
Changes from 3 commits
@@ -0,0 +1,81 @@
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformIntegerHyperparameter,
)

import numpy as np

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
    autoPyTorchTabularPreprocessingComponent
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, ispandas
class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):
    """
    Splits the categorical columns into embed columns (to be passed to an
    embedding module) and encode columns (to be one-hot encoded), based on
    a threshold on the number of categories per column.
    """
    def __init__(
        self,
        min_categories_for_embedding: float = 5,
        random_state: Optional[np.random.RandomState] = None
    ):
        self.min_categories_for_embedding = min_categories_for_embedding
        # store the accepted random_state (assumed intent; the original
        # diff accepts the argument without assigning it)
        self.random_state = random_state

        self.special_feature_types = dict(encode_columns=[], embed_columns=[])
        self.num_categories_per_col: Optional[List] = None
        super().__init__()
    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'ColumnSplitter':

        self.check_requirements(X, y)

        if len(X['dataset_properties']['categorical_columns']) > 0:
            self.num_categories_per_col = []
            for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'],
                                                     X['dataset_properties']['categorical_columns']):
                if categories_per_column >= self.min_categories_for_embedding:
                    self.special_feature_types['embed_columns'].append(column)
                    # we only care about the categories for columns to be embedded
                    self.num_categories_per_col.append(categories_per_column)
                else:
                    self.special_feature_types['encode_columns'].append(column)

        return self
    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        if self.num_categories_per_col is not None:
            # pass on only the category counts of the columns to be embedded
            X['dataset_properties']['num_categories_per_col'] = self.num_categories_per_col
        X.update(self.special_feature_types)
        return X
    @staticmethod
    def get_properties(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
    ) -> Dict[str, Union[str, bool]]:

        return {
            'shortname': 'ColumnSplitter',
            'name': 'Column Splitter',
            'handles_sparse': False,
        }
    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter="min_categories_for_embedding",
            value_range=(3, 7),
            default_value=3,
            log=True),
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        add_hyperparameter(cs, min_categories_for_embedding, UniformIntegerHyperparameter)

        return cs
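For intuition, the splitting rule that `fit` applies can be exercised standalone. A minimal sketch with made-up column indices and category counts (not part of the diff):

# Hypothetical stand-ins for the X['dataset_properties'] entries used above.
num_categories_per_col = [2, 12, 4, 30]  # categories per categorical column
categorical_columns = [0, 1, 2, 3]       # indices of the categorical columns
min_categories_for_embedding = 5

encode_columns, embed_columns, embed_categories = [], [], []
for n_categories, column in zip(num_categories_per_col, categorical_columns):
    if n_categories >= min_categories_for_embedding:
        embed_columns.append(column)
        # only columns routed to embedding keep their category counts
        embed_categories.append(n_categories)
    else:
        encode_columns.append(column)

print(embed_columns)   # [1, 3]  -> handled by nn.Embedding downstream
print(encode_columns)  # [0, 2]  -> one-hot encoded as before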
@@ -1,3 +1,4 @@
+from math import ceil
 from typing import Any, Dict, List, Optional, Union

 from ConfigSpace.configuration_space import ConfigurationSpace
@@ -19,69 +20,59 @@
 class _LearnedEntityEmbedding(nn.Module):
     """ Learned entity embedding module for categorical features"""

-    def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int):
+    def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_features_excl_embed: int):
         """
         Args:
             config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer
-            num_input_features (np.ndarray): column wise information of number of output columns after transformation
-                for each categorical column and 0 for numerical columns
-            num_numerical_features (int): number of numerical features in X
+            num_features_excl_embed (int): number of features in X excluding the features that need to be embedded
         """
         super().__init__()
         self.config = config

-        self.num_numerical = num_numerical_features
-        # list of number of categories of categorical data
-        # or 0 for numerical data
-        self.num_input_features = num_input_features
-        categorical_features = self.num_input_features > 0
-
-        self.num_categorical_features = self.num_input_features[categorical_features]
-
-        self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in
-                               self.num_input_features]
-        self.num_output_dimensions = [0] * num_numerical_features
-        self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in
-                                           enumerate(self.num_categorical_features)])
-        self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in
-                                      zip(self.num_output_dimensions, self.num_input_features)]
-        self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in
-                                      zip(self.num_output_dimensions, self.embed_features,
-                                          self.num_input_features)]
-        self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions)
+        self.num_categories_per_col = num_categories_per_col
+        self.embed_features = self.num_categories_per_col > 0
+
+        self.num_embed_features = self.num_categories_per_col[self.embed_features]
+
+        self.num_output_dimensions = [1] * num_features_excl_embed
+        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
+                                           enumerate(self.num_embed_features)])
+
+        self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)

         self.ee_layers = self._create_ee_layers()

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # pass the columns of each categorical feature through entity embedding layer
         # before passing it through the model
         concat_seq = []
-        last_concat = 0
-        x_pointer = 0
         layer_pointer = 0
-        for num_in, embed in zip(self.num_input_features, self.embed_features):
+        for x_pointer, embed in enumerate(self.embed_features):
+            current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
+                concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
-            if x_pointer > last_concat:
-                concat_seq.append(x[:, last_concat: x_pointer])
-            categorical_feature_slice = x[:, x_pointer: x_pointer + num_in]
-            concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice))
+            current_feature_slice = current_feature_slice.to(torch.int)
+            concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice))
             layer_pointer += 1
-            x_pointer += num_in
-            last_concat = x_pointer

-        concat_seq.append(x[:, last_concat:])
         return torch.cat(concat_seq, dim=1)

     def _create_ee_layers(self) -> nn.ModuleList:
         # entity embedding layers
         layers = nn.ModuleList()
-        for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features,
-                                                         self.num_output_dimensions)):
+        for num_cat, embed, num_out in zip(self.num_categories_per_col,
+                                           self.embed_features,
+                                           self.num_output_dimensions):
             if not embed:
                 continue
-            layers.append(nn.Linear(num_in, num_out))
+            layers.append(nn.Embedding(num_cat, num_out))
         return layers
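To see why swapping `nn.Linear` on one-hot slices for `nn.Embedding` on integer indices preserves the computation: an embedding lookup selects the same weight row that a bias-free linear layer extracts from a one-hot vector, without ever materializing the one-hot columns. A self-contained sketch (sizes are made up, not from the PR):

import torch
import torch.nn as nn

num_cat, num_out = 10, 3
embedding = nn.Embedding(num_cat, num_out)

# Tie a bias-free linear layer to the embedding weights: applied to a
# one-hot vector, it selects a single weight row, exactly like the lookup.
linear = nn.Linear(num_cat, num_out, bias=False)
with torch.no_grad():
    linear.weight.copy_(embedding.weight.T)

idx = torch.tensor([4])
one_hot = nn.functional.one_hot(idx, num_classes=num_cat).float()
assert torch.allclose(embedding(idx), linear(one_hot))

The embedding route only needs the integer column (hence the `.to(torch.int)` cast in `forward`), which is what lets the pipeline skip one-hot encoding high-cardinality columns.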
@@ -94,28 +85,30 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwargs
         super().__init__(random_state=random_state)
         self.config = kwargs

-    def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
+    def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module:
         return _LearnedEntityEmbedding(config=self.config,
-                                       num_input_features=num_input_features,
-                                       num_numerical_features=num_numerical_features)
+                                       num_categories_per_col=num_categories_per_col,
+                                       num_features_excl_embed=num_features_excl_embed)
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
-            hyperparameter="min_unique_values_for_embedding",
-            value_range=(3, 7),
-            default_value=5,
-            log=True),
         dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
                                                                                    value_range=(0, 1),
                                                                                    default_value=0.5),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
-        add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter)
         if dataset_properties is not None:
             for i in range(len(dataset_properties['categorical_columns'])
                            if isinstance(dataset_properties['categorical_columns'], List) else 0):
+                # Currently, as we don't have information about the embedding columns,
+                # we search over more dimension_reduction hyperparameters than necessary.
+                # This can be solved by not having `min_unique_values_for_embedding` as a
+                # hyperparameter and instead passing it as a parameter to the feature
+                # validator, which allows us to pass embed_columns to the dataset properties.
+                # TODO: test the trade-off.
+                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
+                # in one custom transformer. This will also allow users to use this
+                # transformer outside the pipeline.
                 ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
                                                                        value_range=dimension_reduction.value_range,
                                                                        default_value=dimension_reduction.default_value,
                                                                        log=dimension_reduction.log)
                 add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
         return cs
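As a worked example of the new output-dimension rule in `__init__` (values invented): each embedded column i gets `ceil(dimension_reduction_i * num_categories)` output dimensions, replacing the old clip-to-`[1, num_in - 1]` logic:

from math import ceil

dimension_reduction = 0.5         # example value of a sampled dimension_reduction_i
num_embed_features = [12, 3, 30]  # hypothetical category counts of embed columns

print([ceil(dimension_reduction * n) for n in num_embed_features])  # [6, 2, 15]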
Review comment: `self.num_categories_per_col` is initialized as an empty list, which means that it will not be None for the encoded columns either. Maybe this condition should be changed.
Author reply: It will be None when there are no categorical columns; see line 38.
Reviewer reply: Hm, but line 38 initializes `self.num_categories_per_col` to an empty list if there are categorical columns, and `[] is not None` returns `True`. I'm mentioning this because I thought the check in line 53 was meant to test whether there are columns to be embedded; currently the condition evaluates to true for both embedded and encoded columns.