From ddbc8ec92bd52f6b8f56852490a3b3d45ac3e1e8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 19 Jul 2025 20:24:05 +0200
Subject: [PATCH 1/4] BUG: fix fill value for gouped sum in case of unobserved
 categories for string dtype (empty string instead of 0)

---
 pandas/_libs/groupby.pyi                 |  1 +
 pandas/_libs/groupby.pyx                 |  5 +++++
 pandas/core/arrays/base.py               |  3 +++
 pandas/core/groupby/ops.py               |  8 ++++++++
 pandas/tests/groupby/test_categorical.py | 14 +++++++++-----
 5 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 163fc23535022..e1b438c2427e2 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -67,6 +67,7 @@ def group_sum(
     result_mask: np.ndarray | None = ...,
     min_count: int = ...,
     is_datetimelike: bool = ...,
+    is_string: bool = ...,
     skipna: bool = ...,
 ) -> None: ...
 def group_prod(
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index f65fa2368967a..76c0c19850081 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -707,6 +707,7 @@ def group_sum(
     uint8_t[:, ::1] result_mask=None,
     Py_ssize_t min_count=0,
     bint is_datetimelike=False,
+    bint is_string=False,
     bint skipna=True,
 ) -> None:
     """
@@ -729,6 +730,10 @@ def group_sum(
     sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
     compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
 
+    if is_string:
+        # for strings start with empty string instead of 0 as `initial` value
+        sumx = np.full((<object>out).shape, "", dtype=object)
+
     N, K = (<object>values).shape
     if uses_mask:
         nan_val = 0
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index d0048e122051a..3f9a75c60ec52 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2608,6 +2608,7 @@ def _groupby_op(
         kind = WrappedCythonOp.get_kind_from_how(how)
         op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)
 
+        is_string = False
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray
@@ -2632,6 +2633,7 @@ def _groupby_op(
 
             arr = self
             if op.how == "sum":
+                is_string = True
                 # https://github.com/pandas-dev/pandas/issues/60229
                 # All NA should result in the empty string.
                 assert "skipna" in kwargs
@@ -2649,6 +2651,7 @@ def _groupby_op(
             ngroups=ngroups,
             comp_ids=ids,
             mask=None,
+            is_string=is_string,
             **kwargs,
         )
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 75f3495041917..59b897b526e26 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -319,6 +319,7 @@ def _cython_op_ndim_compat(
         comp_ids: np.ndarray,
         mask: npt.NDArray[np.bool_] | None = None,
         result_mask: npt.NDArray[np.bool_] | None = None,
+        is_string: bool = False,
         **kwargs,
     ) -> np.ndarray:
         if values.ndim == 1:
@@ -335,6 +336,7 @@ def _cython_op_ndim_compat(
                 comp_ids=comp_ids,
                 mask=mask,
                 result_mask=result_mask,
+                is_string=is_string,
                 **kwargs,
             )
             if res.shape[0] == 1:
@@ -350,6 +352,7 @@ def _cython_op_ndim_compat(
             comp_ids=comp_ids,
             mask=mask,
             result_mask=result_mask,
+            is_string=is_string,
             **kwargs,
         )
 
@@ -363,6 +366,7 @@ def _call_cython_op(
         comp_ids: np.ndarray,
         mask: npt.NDArray[np.bool_] | None,
         result_mask: npt.NDArray[np.bool_] | None,
+        is_string: bool = False,
         **kwargs,
     ) -> np.ndarray:  # np.ndarray[ndim=2]
         orig_values = values
@@ -420,6 +424,10 @@ def _call_cython_op(
                 "sum",
                 "median",
             ]:
+                if self.how == "sum":
+                    # pass in through kwargs only for sum (other functions don't have
+                    # the keyword)
+                    kwargs["is_string"] = is_string
                 func(
                     out=result,
                     counts=counts,
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index cae3013642739..2f2a70a02a730 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -32,6 +32,14 @@ def f(a):
         return a
 
     index = MultiIndex.from_product(map(f, args), names=names)
+    if isinstance(fill_value, dict):
+        # fill_value is a dict mapping column names to fill values
+        # -> reindex column by column (reindex itself does not support this)
+        res = {}
+        for col in result.columns:
+            res[col] = result[col].reindex(index, fill_value=fill_value[col])
+        return DataFrame(res, index=index).sort_index()
+
     return result.reindex(index, fill_value=fill_value).sort_index()
 
 
@@ -325,10 +333,6 @@ def test_observed(request, using_infer_string, observed):
     # gh-8138 (back-compat)
     # gh-8869
 
-    if using_infer_string and not observed:
-        # TODO(infer_string) this fails with filling the string column with 0
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
-
     cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
     cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
     df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
@@ -356,7 +360,7 @@ def test_observed(request, using_infer_string, observed):
     result = gb.sum()
     if not observed:
         expected = cartesian_product_for_groupers(
-            expected, [cat1, cat2], list("AB"), fill_value=0
+            expected, [cat1, cat2], list("AB"), fill_value={"values": 0, "C": ""}
         )
 
     tm.assert_frame_equal(result, expected)

From 6a32c836a0d9c784e9927af1263f94914687ec32 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 19 Jul 2025 23:06:37 +0200
Subject: [PATCH 2/4] fix one more test

---
 pandas/tests/groupby/test_categorical.py | 2 +-
 pandas/tests/groupby/test_timegrouper.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 2f2a70a02a730..1b27a98658821 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -325,7 +325,7 @@ def test_apply(ordered):
     tm.assert_series_equal(result, expected)
 
 
-def test_observed(request, using_infer_string, observed):
+def test_observed(observed):
     # multiple groupers, don't re-expand the output space
     # of the grouper
     # gh-14942 (implement)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index a64b15c211908..5ef36331a20fa 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -108,7 +108,7 @@ def test_groupby_with_timegrouper(self, using_infer_string):
                 unit=df.index.unit,
             )
             expected = DataFrame(
-                {"Buyer": 0, "Quantity": 0},
+                {"Buyer": "" if using_infer_string else 0, "Quantity": 0},
                 index=exp_dti,
             )
             # Cast to object/str to avoid implicit cast when setting

From 88501c6fe02d8798961ef452fd3b547d7b00ac19 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sun, 20 Jul 2025 12:47:07 +0200
Subject: [PATCH 3/4] use initial instead + fix test for non-infer mode

---
 pandas/_libs/groupby.pyi                 |  2 +-
 pandas/_libs/groupby.pyx                 | 18 ++++++++++--------
 pandas/core/arrays/base.py               |  6 +++---
 pandas/core/groupby/ops.py               | 11 ++++++-----
 pandas/tests/groupby/test_categorical.py |  7 +++++--
 5 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index e1b438c2427e2..803c2cb0b0d19 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -67,7 +67,7 @@ def group_sum(
     result_mask: np.ndarray | None = ...,
     min_count: int = ...,
     is_datetimelike: bool = ...,
-    is_string: bool = ...,
+    initial: object = ...,
     skipna: bool = ...,
 ) -> None: ...
 def group_prod(
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 76c0c19850081..1ec4dc1ffb482 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -707,7 +707,7 @@ def group_sum(
     uint8_t[:, ::1] result_mask=None,
     Py_ssize_t min_count=0,
     bint is_datetimelike=False,
-    bint is_string=False,
+    object initial=0,
     bint skipna=True,
 ) -> None:
     """
@@ -726,13 +726,15 @@ def group_sum(
         raise ValueError("len(index) != len(labels)")
 
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    # the below is equivalent to `np.zeros_like(out)` but faster
-    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
-    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
-
-    if is_string:
-        # for strings start with empty string instead of 0 as `initial` value
-        sumx = np.full((<object>out).shape, "", dtype=object)
+    if initial == 0:
+        # the below is equivalent to `np.zeros_like(out)` but faster
+        sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+        compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+    else:
+        # in practice this path is only taken for strings to use empty string as initial
+        assert sum_t is object
+        sumx = np.full((<object>out).shape, initial, dtype=object)
+        # object code path does not use `compensation`
 
     N, K = (<object>values).shape
     if uses_mask:
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 3f9a75c60ec52..c7cc983fc1586 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2608,7 +2608,7 @@ def _groupby_op(
         kind = WrappedCythonOp.get_kind_from_how(how)
         op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)
 
-        is_string = False
+        initial = 0
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray
@@ -2633,7 +2633,7 @@ def _groupby_op(
 
             arr = self
             if op.how == "sum":
-                is_string = True
+                initial = ""
                 # https://github.com/pandas-dev/pandas/issues/60229
                 # All NA should result in the empty string.
                 assert "skipna" in kwargs
@@ -2651,7 +2651,7 @@ def _groupby_op(
             ngroups=ngroups,
             comp_ids=ids,
             mask=None,
-            is_string=is_string,
+            initial=initial,
             **kwargs,
         )
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 59b897b526e26..e8393cb6aee1a 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -12,6 +12,7 @@
 import functools
 from typing import (
     TYPE_CHECKING,
+    Any,
     Generic,
     final,
 )
@@ -319,7 +320,7 @@ def _cython_op_ndim_compat(
         comp_ids: np.ndarray,
         mask: npt.NDArray[np.bool_] | None = None,
         result_mask: npt.NDArray[np.bool_] | None = None,
-        is_string: bool = False,
+        initial: Any = 0,
         **kwargs,
     ) -> np.ndarray:
         if values.ndim == 1:
@@ -336,7 +337,7 @@ def _cython_op_ndim_compat(
                 comp_ids=comp_ids,
                 mask=mask,
                 result_mask=result_mask,
-                is_string=is_string,
+                initial=initial,
                 **kwargs,
             )
             if res.shape[0] == 1:
@@ -352,7 +353,7 @@ def _cython_op_ndim_compat(
             comp_ids=comp_ids,
             mask=mask,
             result_mask=result_mask,
-            is_string=is_string,
+            initial=initial,
             **kwargs,
         )
 
@@ -366,7 +367,7 @@ def _call_cython_op(
         comp_ids: np.ndarray,
         mask: npt.NDArray[np.bool_] | None,
         result_mask: npt.NDArray[np.bool_] | None,
-        is_string: bool = False,
+        initial: Any = 0,
         **kwargs,
     ) -> np.ndarray:  # np.ndarray[ndim=2]
         orig_values = values
@@ -427,7 +428,7 @@ def _call_cython_op(
                 if self.how == "sum":
                     # pass in through kwargs only for sum (other functions don't have
                     # the keyword)
-                    kwargs["is_string"] = is_string
+                    kwargs["initial"] = initial
                 func(
                     out=result,
                     counts=counts,
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 1b27a98658821..0b1ae516ba843 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -325,7 +325,7 @@ def test_apply(ordered):
     tm.assert_series_equal(result, expected)
 
 
-def test_observed(observed):
+def test_observed(observed, using_infer_string):
     # multiple groupers, don't re-expand the output space
     # of the grouper
     # gh-14942 (implement)
@@ -360,7 +360,10 @@ def test_observed(observed):
     result = gb.sum()
     if not observed:
         expected = cartesian_product_for_groupers(
-            expected, [cat1, cat2], list("AB"), fill_value={"values": 0, "C": ""}
+            expected,
+            [cat1, cat2],
+            list("AB"),
+            fill_value={"values": 0, "C": ""} if using_infer_string else 0,
         )
 
     tm.assert_frame_equal(result, expected)

From b0013bd34586f21a9a9f3e752ef0b1a6a3267184 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sun, 20 Jul 2025 13:04:49 +0200
Subject: [PATCH 4/4] fix typing

---
 pandas/core/arrays/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index c7cc983fc1586..d11e2271f9574 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2608,7 +2608,7 @@ def _groupby_op(
         kind = WrappedCythonOp.get_kind_from_how(how)
         op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)
 
-        initial = 0
+        initial: Any = 0
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray