diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 163fc23535022..803c2cb0b0d19 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -67,6 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + initial: object = ..., skipna: bool = ..., ) -> None: ... def group_prod( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f65fa2368967a..1ec4dc1ffb482 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -707,6 +707,7 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + object initial=0, bint skipna=True, ) -> None: """ @@ -725,9 +726,15 @@ def group_sum( raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - # the below is equivalent to `np.zeros_like(out)` but faster - sumx = np.zeros((out).shape, dtype=(out).base.dtype) - compensation = np.zeros((out).shape, dtype=(out).base.dtype) + if initial == 0: + # the below is equivalent to `np.zeros_like(out)` but faster + sumx = np.zeros((out).shape, dtype=(out).base.dtype) + compensation = np.zeros((out).shape, dtype=(out).base.dtype) + else: + # in practice this path is only taken for strings to use empty string as initial + assert sum_t is object + sumx = np.full((out).shape, initial, dtype=object) + # object code path does not use `compensation` N, K = (values).shape if uses_mask: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d0048e122051a..d11e2271f9574 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2608,6 +2608,7 @@ def _groupby_op( kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + initial: Any = 0 # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray @@ -2632,6 +2633,7 @@ def _groupby_op( arr = self if op.how == "sum": + initial = "" # https://github.com/pandas-dev/pandas/issues/60229 # All NA should result in the empty string. assert "skipna" in kwargs @@ -2649,6 +2651,7 @@ def _groupby_op( ngroups=ngroups, comp_ids=ids, mask=None, + initial=initial, **kwargs, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 75f3495041917..e8393cb6aee1a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,6 +12,7 @@ import functools from typing import ( TYPE_CHECKING, + Any, Generic, final, ) @@ -319,6 +320,7 @@ def _cython_op_ndim_compat( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None = None, result_mask: npt.NDArray[np.bool_] | None = None, + initial: Any = 0, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -335,6 +337,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, + initial=initial, **kwargs, ) if res.shape[0] == 1: @@ -350,6 +353,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, + initial=initial, **kwargs, ) @@ -363,6 +367,7 @@ def _call_cython_op( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None, result_mask: npt.NDArray[np.bool_] | None, + initial: Any = 0, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values @@ -420,6 +425,10 @@ def _call_cython_op( "sum", "median", ]: + if self.how == "sum": + # pass in through kwargs only for sum (other functions don't have + # the keyword) + kwargs["initial"] = initial func( out=result, counts=counts, diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cae3013642739..0b1ae516ba843 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -32,6 +32,14 @@ def f(a): return a index = MultiIndex.from_product(map(f, args), names=names) + if isinstance(fill_value, dict): + # fill_value is a dict mapping column names to fill values + # -> reindex column by column (reindex itself does not support this) + res = {} + for col in result.columns: + res[col] = result[col].reindex(index, fill_value=fill_value[col]) + return DataFrame(res, index=index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() @@ -317,7 +325,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(request, using_infer_string, observed): +def test_observed(observed, using_infer_string): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -325,10 +333,6 @@ def test_observed(request, using_infer_string, observed): # gh-8138 (back-compat) # gh-8869 - if using_infer_string and not observed: - # TODO(infer_string) this fails with filling the string column with 0 - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) @@ -356,7 +360,10 @@ def test_observed(request, using_infer_string, observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2], list("AB"), fill_value=0 + expected, + [cat1, cat2], + list("AB"), + fill_value={"values": 0, "C": ""} if using_infer_string else 0, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a64b15c211908..5ef36331a20fa 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -108,7 +108,7 @@ def test_groupby_with_timegrouper(self, using_infer_string): unit=df.index.unit, ) expected = DataFrame( - {"Buyer": 0, "Quantity": 0}, + {"Buyer": "" if using_infer_string else 0, "Quantity": 0}, index=exp_dti, ) # Cast to object/str to avoid implicit cast when setting