From ddbc8ec92bd52f6b8f56852490a3b3d45ac3e1e8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Jul 2025 20:24:05 +0200 Subject: [PATCH 1/4] BUG: fix fill value for gouped sum in case of unobserved categories for string dtype (empty string instead of 0) --- pandas/_libs/groupby.pyi | 1 + pandas/_libs/groupby.pyx | 5 +++++ pandas/core/arrays/base.py | 3 +++ pandas/core/groupby/ops.py | 8 ++++++++ pandas/tests/groupby/test_categorical.py | 14 +++++++++----- 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 163fc23535022..e1b438c2427e2 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -67,6 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + is_string: bool = ..., skipna: bool = ..., ) -> None: ... def group_prod( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f65fa2368967a..76c0c19850081 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -707,6 +707,7 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + bint is_string=False, bint skipna=True, ) -> None: """ @@ -729,6 +730,10 @@ def group_sum( sumx = np.zeros((out).shape, dtype=(out).base.dtype) compensation = np.zeros((out).shape, dtype=(out).base.dtype) + if is_string: + # for strings start with empty string instead of 0 as `initial` value + sumx = np.full((out).shape, "", dtype=object) + N, K = (values).shape if uses_mask: nan_val = 0 diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d0048e122051a..3f9a75c60ec52 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2608,6 +2608,7 @@ def _groupby_op( kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + is_string = False # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray @@ -2632,6 +2633,7 @@ def _groupby_op( arr = self if op.how == "sum": + is_string = True # https://github.com/pandas-dev/pandas/issues/60229 # All NA should result in the empty string. assert "skipna" in kwargs @@ -2649,6 +2651,7 @@ def _groupby_op( ngroups=ngroups, comp_ids=ids, mask=None, + is_string=is_string, **kwargs, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 75f3495041917..59b897b526e26 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -319,6 +319,7 @@ def _cython_op_ndim_compat( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None = None, result_mask: npt.NDArray[np.bool_] | None = None, + is_string: bool = False, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -335,6 +336,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, + is_string=is_string, **kwargs, ) if res.shape[0] == 1: @@ -350,6 +352,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, + is_string=is_string, **kwargs, ) @@ -363,6 +366,7 @@ def _call_cython_op( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None, result_mask: npt.NDArray[np.bool_] | None, + is_string: bool = False, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values @@ -420,6 +424,10 @@ def _call_cython_op( "sum", "median", ]: + if self.how == "sum": + # pass in through kwargs only for sum (other functions don't have + # the keyword) + kwargs["is_string"] = is_string func( out=result, counts=counts, diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cae3013642739..2f2a70a02a730 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -32,6 +32,14 @@ def f(a): return a index = MultiIndex.from_product(map(f, args), names=names) + if isinstance(fill_value, dict): + # fill_value is a dict mapping column names to fill values + # -> reindex column by column (reindex itself does not support this) + res = {} + for col in result.columns: + res[col] = result[col].reindex(index, fill_value=fill_value[col]) + return DataFrame(res, index=index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() @@ -325,10 +333,6 @@ def test_observed(request, using_infer_string, observed): # gh-8138 (back-compat) # gh-8869 - if using_infer_string and not observed: - # TODO(infer_string) this fails with filling the string column with 0 - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) @@ -356,7 +360,7 @@ def test_observed(request, using_infer_string, observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2], list("AB"), fill_value=0 + expected, [cat1, cat2], list("AB"), fill_value={"values": 0, "C": ""} ) tm.assert_frame_equal(result, expected) From 6a32c836a0d9c784e9927af1263f94914687ec32 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Jul 2025 23:06:37 +0200 Subject: [PATCH 2/4] fix one more test --- pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 2f2a70a02a730..1b27a98658821 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -325,7 +325,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(request, using_infer_string, observed): +def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a64b15c211908..5ef36331a20fa 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -108,7 +108,7 @@ def test_groupby_with_timegrouper(self, using_infer_string): unit=df.index.unit, ) expected = DataFrame( - {"Buyer": 0, "Quantity": 0}, + {"Buyer": "" if using_infer_string else 0, "Quantity": 0}, index=exp_dti, ) # Cast to object/str to avoid implicit cast when setting From 88501c6fe02d8798961ef452fd3b547d7b00ac19 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 20 Jul 2025 12:47:07 +0200 Subject: [PATCH 3/4] use initial instead + fix test for non-infer mode --- pandas/_libs/groupby.pyi | 2 +- pandas/_libs/groupby.pyx | 18 ++++++++++-------- pandas/core/arrays/base.py | 6 +++--- pandas/core/groupby/ops.py | 11 ++++++----- pandas/tests/groupby/test_categorical.py | 7 +++++-- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index e1b438c2427e2..803c2cb0b0d19 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -67,7 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., - is_string: bool = ..., + initial: object = ..., skipna: bool = ..., ) -> None: ... def group_prod( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 76c0c19850081..1ec4dc1ffb482 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -707,7 +707,7 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, - bint is_string=False, + object initial=0, bint skipna=True, ) -> None: """ @@ -726,13 +726,15 @@ def group_sum( raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - # the below is equivalent to `np.zeros_like(out)` but faster - sumx = np.zeros((out).shape, dtype=(out).base.dtype) - compensation = np.zeros((out).shape, dtype=(out).base.dtype) - - if is_string: - # for strings start with empty string instead of 0 as `initial` value - sumx = np.full((out).shape, "", dtype=object) + if initial == 0: + # the below is equivalent to `np.zeros_like(out)` but faster + sumx = np.zeros((out).shape, dtype=(out).base.dtype) + compensation = np.zeros((out).shape, dtype=(out).base.dtype) + else: + # in practice this path is only taken for strings to use empty string as initial + assert sum_t is object + sumx = np.full((out).shape, initial, dtype=object) + # object code path does not use `compensation` N, K = (values).shape if uses_mask: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3f9a75c60ec52..c7cc983fc1586 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2608,7 +2608,7 @@ def _groupby_op( kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) - is_string = False + initial = 0 # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray @@ -2633,7 +2633,7 @@ def _groupby_op( arr = self if op.how == "sum": - is_string = True + initial = "" # https://github.com/pandas-dev/pandas/issues/60229 # All NA should result in the empty string. assert "skipna" in kwargs @@ -2651,7 +2651,7 @@ def _groupby_op( ngroups=ngroups, comp_ids=ids, mask=None, - is_string=is_string, + initial=initial, **kwargs, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 59b897b526e26..e8393cb6aee1a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,6 +12,7 @@ import functools from typing import ( TYPE_CHECKING, + Any, Generic, final, ) @@ -319,7 +320,7 @@ def _cython_op_ndim_compat( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None = None, result_mask: npt.NDArray[np.bool_] | None = None, - is_string: bool = False, + initial: Any = 0, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -336,7 +337,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, - is_string=is_string, + initial=initial, **kwargs, ) if res.shape[0] == 1: @@ -352,7 +353,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, - is_string=is_string, + initial=initial, **kwargs, ) @@ -366,7 +367,7 @@ def _call_cython_op( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None, result_mask: npt.NDArray[np.bool_] | None, - is_string: bool = False, + initial: Any = 0, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values @@ -427,7 +428,7 @@ def _call_cython_op( if self.how == "sum": # pass in through kwargs only for sum (other functions don't have # the keyword) - kwargs["is_string"] = is_string + kwargs["initial"] = initial func( out=result, counts=counts, diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1b27a98658821..0b1ae516ba843 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -325,7 +325,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(observed): +def test_observed(observed, using_infer_string): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -360,7 +360,10 @@ def test_observed(observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2], list("AB"), fill_value={"values": 0, "C": ""} + expected, + [cat1, cat2], + list("AB"), + fill_value={"values": 0, "C": ""} if using_infer_string else 0, ) tm.assert_frame_equal(result, expected) From b0013bd34586f21a9a9f3e752ef0b1a6a3267184 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 20 Jul 2025 13:04:49 +0200 Subject: [PATCH 4/4] fix typing --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c7cc983fc1586..d11e2271f9574 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2608,7 +2608,7 @@ def _groupby_op( kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) - initial = 0 + initial: Any = 0 # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray