Skip to content
2 changes: 1 addition & 1 deletion doc/source/user_guide/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it
.. ipython:: python

raw_cat = pd.Categorical(
["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False
[None, "b", "c", "a"], categories=["b", "c", "d"], ordered=False
)
s = pd.Series(raw_cat)
s
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ Other Deprecations
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
- Deprecated ``pd.core.internals.api.maybe_infer_ndim`` (:issue:`40226`)
- Deprecated allowing constructing or casting to :class:`Categorical` with non-NA values that are not present in the specified ``dtype.categories`` (:issue:`40996`)
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)
Expand Down
70 changes: 59 additions & 11 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
cast,
overload,
)
import warnings

import numpy as np

Expand All @@ -23,6 +24,7 @@
)
from pandas._libs.arrays import NDArrayBacked
from pandas.compat.numpy import function as nv
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -479,7 +481,11 @@ def __init__(
elif isinstance(values.dtype, CategoricalDtype):
old_codes = extract_array(values)._codes
codes = recode_for_categories(
old_codes, values.dtype.categories, dtype.categories, copy=copy
old_codes,
values.dtype.categories,
dtype.categories,
copy=copy,
warn=True,
)

else:
Expand Down Expand Up @@ -535,7 +541,13 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
# The _from_scalars strictness doesn't make much sense in this case.
raise NotImplementedError

res = cls._from_sequence(scalars, dtype=dtype)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Constructing a Categorical with a dtype and values",
FutureWarning,
)
res = cls._from_sequence(scalars, dtype=dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens here once the deprecation is enforced?


# if there are any non-category elements in scalars, these will be
# converted to NAs in res.
Expand Down Expand Up @@ -576,6 +588,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
dtype = self.dtype.update_dtype(dtype)
self = self.copy() if copy else self
result = self._set_dtype(dtype, copy=False)
wrong = result.isna() & ~self.isna()
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)

elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
Expand Down Expand Up @@ -670,14 +691,16 @@ def _from_inferred_categories(
if known_categories:
# Recode from observation order to dtype.categories order.
categories = dtype.categories
codes = recode_for_categories(inferred_codes, cats, categories, copy=False)
codes = recode_for_categories(
inferred_codes, cats, categories, copy=False, warn=True
)
elif not cats.is_monotonic_increasing:
# Sort categories and recode for unknown categories.
unsorted = cats.copy()
categories = cats.sort_values()

codes = recode_for_categories(
inferred_codes, unsorted, categories, copy=False
inferred_codes, unsorted, categories, copy=False, warn=True
)
dtype = CategoricalDtype(categories, ordered=False)
else:
Expand Down Expand Up @@ -798,7 +821,7 @@ def categories(self) -> Index:
>>> ser.cat.categories
Index(['a', 'b', 'c'], dtype='str')

>>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"])
>>> raw_cat = pd.Categorical([None, "b", "c", "a"], categories=["b", "c", "d"])
>>> ser = pd.Series(raw_cat)
>>> ser.cat.categories
Index(['b', 'c', 'd'], dtype='str')
Expand Down Expand Up @@ -1106,7 +1129,7 @@ def set_categories(
For :class:`pandas.Series`:

>>> raw_cat = pd.Categorical(
... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True
... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True
... )
>>> ser = pd.Series(raw_cat)
>>> ser
Expand Down Expand Up @@ -1156,7 +1179,7 @@ def set_categories(
codes = cat._codes
else:
codes = recode_for_categories(
cat.codes, cat.categories, new_dtype.categories, copy=False
cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False
Comment on lines 1171 to +1172
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the deprecation is enforced, do we still need to pass the flag on whether to allow values outside the categories?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The flag will become a "should we raise" flag, and it will still be needed because the answer will be "no" for set_categories.

)
NDArrayBacked.__init__(cat, codes, new_dtype)
return cat
Expand Down Expand Up @@ -3004,11 +3027,25 @@ def _get_codes_for_values(
If `values` is known to be a Categorical, use recode_for_categories instead.
"""
codes = categories.get_indexer_for(values)
wrong = (codes == -1) & ~isna(values)
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
return coerce_indexer_dtype(codes, categories)


def recode_for_categories(
codes: np.ndarray, old_categories, new_categories, *, copy: bool
codes: np.ndarray,
old_categories,
new_categories,
*,
copy: bool = True,
warn: bool = False,
) -> np.ndarray:
"""
Convert a set of codes to a new set of categories
Expand All @@ -3019,6 +3056,8 @@ def recode_for_categories(
old_categories, new_categories : Index
copy : bool, default True
Whether to copy if the codes are unchanged.
warn : bool, default False
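Whether to issue a FutureWarning when any of ``old_categories`` is missing
from ``new_categories``, so that codes pointing at it are recoded to -1 (NA).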

Returns
-------
Expand All @@ -3043,9 +3082,18 @@ def recode_for_categories(
return codes.copy()
return codes

indexer = coerce_indexer_dtype(
new_categories.get_indexer_for(old_categories), new_categories
)
codes_in_old_cats = new_categories.get_indexer_for(old_categories)
if warn:
wrong = codes_in_old_cats == -1
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories)
new_codes = take_nd(indexer, codes, fill_value=-1)
return new_codes

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def groups(self) -> dict[Hashable, Index]:
return self.groupings[0].groups
result_index, ids = self.result_index_and_ids
values = result_index._values
categories = Categorical(ids, categories=range(len(result_index)))
categories = Categorical.from_codes(ids, categories=range(len(result_index)))
result = {
# mypy is not aware that group has to be an integer
values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload]
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
else:
values = other

codes = self.categories.get_indexer(values)
if ((codes == -1) & ~values.isna()).any():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need the check on L269 with this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think not; I will check and update.

# GH#37667 see test_equals_non_category
raise TypeError(
"categories must match existing categories when appending"
)
cat = Categorical(other, dtype=self.dtype)
other = CategoricalIndex(cat)
if not other.isin(values).all():
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/arrays/categorical/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,16 @@ def test_set_categories(self):
],
)
def test_set_categories_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
msg = "Constructing a Categorical with a dtype and values containing"

warn1 = FutureWarning if set(values).difference(categories) else None
with tm.assert_produces_warning(warn1, match=msg):
c = Categorical(values, categories)

warn2 = FutureWarning if set(values).difference(new_categories) else None
with tm.assert_produces_warning(warn2, match=msg):
expected = Categorical(values, new_categories, ordered)

result = c.set_categories(new_categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)

Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/arrays/categorical/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,11 @@ def test_astype_category(self, dtype_ordered, ordered):

# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = cat.astype(dtype)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)

if dtype_ordered is False:
Expand Down
65 changes: 44 additions & 21 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,14 +228,15 @@ def test_constructor(self):
# two arrays
# - when the first is an integer dtype and the second is not
# - when the resulting codes are all -1/NaN
with tm.assert_produces_warning(None):
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])

with tm.assert_produces_warning(None):
with tm.assert_produces_warning(FutureWarning, match=msg):
Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])

# the next ones are from the old docs
with tm.assert_produces_warning(None):
with tm.assert_produces_warning(FutureWarning, match=msg):
Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
cat = Categorical([1, 2], categories=[1, 2, 3])

Expand All @@ -247,12 +248,16 @@ def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])

c2 = Categorical(c0, categories=c1.categories)
with tm.assert_produces_warning(FutureWarning, match=msg):
c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)

c3 = Categorical(Series(c0), categories=c1.categories)
with tm.assert_produces_warning(FutureWarning, match=msg):
c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)

def test_constructor_not_sequence(self):
Expand Down Expand Up @@ -430,10 +435,13 @@ def test_constructor_dtype_and_others_raises(self):

@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
def test_constructor_str_category(self, categories, ordered):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
warn = FutureWarning if categories == ["a", "c"] else None
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(warn, match=msg):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)

def test_constructor_str_unknown(self):
Expand All @@ -450,10 +458,12 @@ def test_constructor_np_strs(self):
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = Categorical(values, dtype=dtype)
# We use dtype.categories, not values.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
["a", "b", None], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)

Expand All @@ -470,16 +480,19 @@ def test_constructor_from_categorical_with_unknown_dtype(self):
def test_constructor_from_categorical_string(self):
values = Categorical(["a", "b", "d"])
# use categories, ordered
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
["a", "b", None], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)

# No string
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(result, expected)

def test_constructor_with_categorical_categories(self):
Expand Down Expand Up @@ -661,17 +674,25 @@ def test_from_inferred_categories_dtype(self):
cats = ["a", "b", "d"]
codes = np.array([0, 1, 0, 2], dtype="i8")
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
result = Categorical._from_inferred_categories(cats, codes, dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(
FutureWarning, match=msg, check_stacklevel=False
):
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical(
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
["a", "b", "a", None], categories=["c", "b", "a"], ordered=True
)
tm.assert_categorical_equal(result, expected)

def test_from_inferred_categories_coerces(self):
cats = ["1", "2", "bad"]
codes = np.array([0, 0, 1, 2], dtype="i8")
dtype = CategoricalDtype([1, 2])
result = Categorical._from_inferred_categories(cats, codes, dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(
FutureWarning, match=msg, check_stacklevel=False
):
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)

Expand Down Expand Up @@ -722,7 +743,9 @@ def test_interval(self):

# extra
values = pd.interval_range(8, 11, periods=3)
cat = Categorical(values, categories=idx)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
cat = Categorical(values, categories=idx)
expected_codes = np.array([8, 9, -1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
Expand Down
Loading
Loading