-
-
Notifications
You must be signed in to change notification settings - Fork 18.9k
DEPR: Categorical with values not present in categories #62142
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
9a76ad6
a89dee5
b82fdb1
fcd2773
85d4980
c46dee9
ea40e3d
e732fb9
faddfde
de9bec0
ddab79c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
cast, | ||
overload, | ||
) | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
|
@@ -23,6 +24,7 @@ | |
) | ||
from pandas._libs.arrays import NDArrayBacked | ||
from pandas.compat.numpy import function as nv | ||
from pandas.util._exceptions import find_stack_level | ||
from pandas.util._validators import validate_bool_kwarg | ||
|
||
from pandas.core.dtypes.cast import ( | ||
|
@@ -479,7 +481,11 @@ def __init__( | |
elif isinstance(values.dtype, CategoricalDtype): | ||
old_codes = extract_array(values)._codes | ||
codes = recode_for_categories( | ||
old_codes, values.dtype.categories, dtype.categories, copy=copy | ||
old_codes, | ||
values.dtype.categories, | ||
dtype.categories, | ||
copy=copy, | ||
warn=True, | ||
) | ||
|
||
else: | ||
|
@@ -535,7 +541,13 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: | |
# The _from_scalars strictness doesn't make much sense in this case. | ||
raise NotImplementedError | ||
|
||
res = cls._from_sequence(scalars, dtype=dtype) | ||
with warnings.catch_warnings(): | ||
warnings.filterwarnings( | ||
"ignore", | ||
"Constructing a Categorical with a dtype and values", | ||
FutureWarning, | ||
) | ||
res = cls._from_sequence(scalars, dtype=dtype) | ||
|
||
# if there are any non-category elements in scalars, these will be | ||
# converted to NAs in res. | ||
|
@@ -576,6 +588,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: | |
dtype = self.dtype.update_dtype(dtype) | ||
self = self.copy() if copy else self | ||
result = self._set_dtype(dtype, copy=False) | ||
wrong = result.isna() & ~self.isna() | ||
if wrong.any(): | ||
warnings.warn( | ||
"Constructing a Categorical with a dtype and values containing " | ||
"non-null entries not in that dtype's categories is deprecated " | ||
"and will raise in a future version.", | ||
FutureWarning, | ||
stacklevel=find_stack_level(), | ||
) | ||
|
||
elif isinstance(dtype, ExtensionDtype): | ||
return super().astype(dtype, copy=copy) | ||
|
@@ -670,14 +691,16 @@ def _from_inferred_categories( | |
if known_categories: | ||
# Recode from observation order to dtype.categories order. | ||
categories = dtype.categories | ||
codes = recode_for_categories(inferred_codes, cats, categories, copy=False) | ||
codes = recode_for_categories( | ||
inferred_codes, cats, categories, copy=False, warn=True | ||
) | ||
elif not cats.is_monotonic_increasing: | ||
# Sort categories and recode for unknown categories. | ||
unsorted = cats.copy() | ||
categories = cats.sort_values() | ||
|
||
codes = recode_for_categories( | ||
inferred_codes, unsorted, categories, copy=False | ||
inferred_codes, unsorted, categories, copy=False, warn=True | ||
) | ||
dtype = CategoricalDtype(categories, ordered=False) | ||
else: | ||
|
@@ -798,7 +821,7 @@ def categories(self) -> Index: | |
>>> ser.cat.categories | ||
Index(['a', 'b', 'c'], dtype='str') | ||
|
||
>>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"]) | ||
>>> raw_cat = pd.Categorical([None, "b", "c", "a"], categories=["b", "c", "d"]) | ||
>>> ser = pd.Series(raw_cat) | ||
>>> ser.cat.categories | ||
Index(['b', 'c', 'd'], dtype='str') | ||
|
@@ -1106,7 +1129,7 @@ def set_categories( | |
For :class:`pandas.Series`: | ||
|
||
>>> raw_cat = pd.Categorical( | ||
... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True | ||
... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True | ||
... ) | ||
>>> ser = pd.Series(raw_cat) | ||
>>> ser | ||
|
@@ -1156,7 +1179,7 @@ def set_categories( | |
codes = cat._codes | ||
else: | ||
codes = recode_for_categories( | ||
cat.codes, cat.categories, new_dtype.categories, copy=False | ||
cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False | ||
Comment on lines
1171
to
+1172
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When the deprecation is enforced, do we still need to pass the flag on whether to allow values outside the categories? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The flag will become a "should we raise" flag, and will still be needed because the answer will be "no" for |
||
) | ||
NDArrayBacked.__init__(cat, codes, new_dtype) | ||
return cat | ||
|
@@ -3004,11 +3027,25 @@ def _get_codes_for_values( | |
If `values` is known to be a Categorical, use recode_for_categories instead. | ||
""" | ||
codes = categories.get_indexer_for(values) | ||
wrong = (codes == -1) & ~isna(values) | ||
if wrong.any(): | ||
warnings.warn( | ||
"Constructing a Categorical with a dtype and values containing " | ||
"non-null entries not in that dtype's categories is deprecated " | ||
"and will raise in a future version.", | ||
FutureWarning, | ||
stacklevel=find_stack_level(), | ||
) | ||
return coerce_indexer_dtype(codes, categories) | ||
|
||
|
||
def recode_for_categories( | ||
codes: np.ndarray, old_categories, new_categories, *, copy: bool | ||
codes: np.ndarray, | ||
old_categories, | ||
new_categories, | ||
*, | ||
copy: bool = True, | ||
warn: bool = False, | ||
) -> np.ndarray: | ||
""" | ||
Convert a set of codes for to a new set of categories | ||
|
@@ -3019,6 +3056,8 @@ def recode_for_categories( | |
old_categories, new_categories : Index | ||
copy: bool, default True | ||
Whether to copy if the codes are unchanged. | ||
warn : bool, default False | ||
Whether to warn on silent-NA mapping. | ||
|
||
Returns | ||
------- | ||
|
@@ -3043,9 +3082,18 @@ def recode_for_categories( | |
return codes.copy() | ||
return codes | ||
|
||
indexer = coerce_indexer_dtype( | ||
new_categories.get_indexer_for(old_categories), new_categories | ||
) | ||
codes_in_old_cats = new_categories.get_indexer_for(old_categories) | ||
if warn: | ||
wrong = codes_in_old_cats == -1 | ||
if wrong.any(): | ||
warnings.warn( | ||
"Constructing a Categorical with a dtype and values containing " | ||
"non-null entries not in that dtype's categories is deprecated " | ||
"and will raise in a future version.", | ||
FutureWarning, | ||
stacklevel=find_stack_level(), | ||
) | ||
indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories) | ||
new_codes = take_nd(indexer, codes, fill_value=-1) | ||
return new_codes | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -258,6 +258,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical: | |
else: | ||
values = other | ||
|
||
codes = self.categories.get_indexer(values) | ||
if ((codes == -1) & ~values.isna()).any(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we still need the check on L269 with this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think not; will check and update. |
||
# GH#37667 see test_equals_non_category | ||
raise TypeError( | ||
"categories must match existing categories when appending" | ||
) | ||
cat = Categorical(other, dtype=self.dtype) | ||
other = CategoricalIndex(cat) | ||
if not other.isin(values).all(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens here once the deprecation is enforced?