diff --git a/docs/release_notes/release_notes-0.15.0.rst b/docs/release_notes/release_notes-0.15.0.rst index d273aaf6ae0..8b72d06faa1 100644 --- a/docs/release_notes/release_notes-0.15.0.rst +++ b/docs/release_notes/release_notes-0.15.0.rst @@ -29,6 +29,7 @@ Key Features and Updates * FIX-#4503: Stop the memory logging thread after session exit (#4515) * FIX-#4531: Fix a makedirs race condition in to_parquet (#4533) * FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490) + * FIX-#4522: Correct multiindex metadata with groupby (#4523) * FIX-#4436: Fix to_pydatetime dtype for timezone None (#4437) * Performance enhancements * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346) diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py index 78bb5dbc9dd..c2b7e38ad37 100644 --- a/modin/core/dataframe/algebra/groupby.py +++ b/modin/core/dataframe/algebra/groupby.py @@ -133,6 +133,14 @@ def map( axis=1, ) other = list(other.columns) + # GH#4522: Vile as this may be, it is necessary to avoid the case where we are + # grouping by columns that were recently added to the data via + # `from_labels`. The internal dataframe doesn't know what to do when + # the label matches a column name. + # We ensure that the columns, index, and by don't intersect in the API level, + # so if we hit this if statement, we know its a result of a deferred re-index. + if len(df.columns.intersection(df.index.names)) > 0: + df = df.reset_index(drop=True) by_part = other else: by_part = by diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 30f43189eac..c220b7291cb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3461,7 +3461,9 @@ def value_counts( Count unique values in the `BasePandasDataset`. """ if subset is None: - subset = self._query_compiler.columns + # Need to get column names as array rather than as Index, since `groupby` does not + # treat `Index` arguments to `by` as a list of labels. + subset = self._query_compiler.columns.values counted_values = self.groupby(by=subset, dropna=dropna, observed=True).size() if sort: counted_values.sort_values(ascending=ascending, inplace=True) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 6d3cc6812b4..887ac139af3 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -424,9 +424,22 @@ def groupby( # strings is passed in, the data used for the groupby is dropped before the # groupby takes place. drop = False - + # Check that there is no ambiguity in the parameter we were given. + # We don't need to check if `by` is a Series or Index, since those + # won't be referencing labels + if not isinstance(by, (pandas.Series, Series, pandas.Index)): + _by_list = by if is_list_like(by) else [by] + for k in _by_list: + if not isinstance(k, (Series, pandas.Series, pandas.Index)): + if k in self.index.names and k in self.axes[axis ^ 1]: + level_name, index_name = "an index", "a column" + if axis == 1: + level_name, index_name = index_name, level_name + raise ValueError( + f"{k} is both {level_name} level and {index_name} label, which is ambiguous." + ) if ( - not isinstance(by, (pandas.Series, Series)) + not isinstance(by, (pandas.Series, Series, pandas.Index)) and is_list_like(by) and len(by) == 1 ): @@ -443,6 +456,11 @@ def groupby( level, by = by, None elif level is None: by = self.__getitem__(by)._query_compiler + elif isinstance(by, (pandas.Series, pandas.Index)): + if isinstance(by, pandas.Index) and len(by) != len(self.axes[axis]): + raise ValueError("Grouper and axis must be same length") + idx_name = by.name + by = Series(by)._query_compiler elif isinstance(by, Series): drop = by._parent is self idx_name = by.name diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index bac6b02ee6b..5dc438c79d3 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -1570,7 +1570,7 @@ def test_agg_exceptions(operation): }, ], ) -def test_to_pandas_convertion(kwargs): +def test_to_pandas_conversion(kwargs): data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]} by = ["a", "b"] @@ -2032,3 +2032,126 @@ def test_sum_with_level(): } modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum()) + + +def test_reset_index_groupby(): + # Due to `reset_index` deferring the actual reindexing of partitions, + # when we call groupby after a `reset_index` with a `by` column name + # that was moved from the index to the columns via `from_labels` the + # algebra layer incorrectly thinks that the `by` key is duplicated + # across both the columns and labels, and fails, when it should + # succeed. We have this test to ensure that that case is correctly + # handled, and passes. For more details, checkout + # https://github.com/modin-project/modin/issues/4522. + frame_data = np.random.randint(97, 198, size=(2**6, 2**4)) + pandas_df = pandas.DataFrame( + frame_data, + index=pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(2**6)] + ), + ).add_prefix("col") + pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))] + # Convert every even column to string + for col in pandas_df.iloc[ + :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0] + ]: + pandas_df[col] = [str(chr(i)) for i in pandas_df[col]] + # The `pandas_df` contains a multi-index with 3 levels, named `index_0`, `index_1`, + # and `index_2`, and 16 columns, named `col0` through `col15`. Every even column + # has dtype `str`, while odd columns have dtype `int64`. + modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(), + ) + + +def test_by_in_index_and_columns(): + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"] + ) + modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by="a").count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=["a", "b"]).count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=[df["b"], "a"]).count(), + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", "b"]), columns=["a", "b", "c"] + ) + modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by="a").count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=["a", "c"]).count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=["a", "b"]).count(), + ) + + +def test_by_series(): + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"] + ) + modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=pandas.Series(["a"])).count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=pandas.Series(["a", "b"])).count(), + ) + + def make_appropriately_typed_series(df, values=["a"]): + """Return a Series from either pandas or modin.pandas depending on type of `df`.""" + if isinstance(df, pd.DataFrame): + return pd.Series(values) + return pandas.Series(values) + + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=make_appropriately_typed_series(df)).count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby( + by=make_appropriately_typed_series(df, ["a", "b"]) + ).count(), + ) + + +def test_by_index(): + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"] + ) + modin_df = from_pandas(pandas_df) + eval_general(modin_df, pandas_df, lambda df: df.groupby(by=pd.Index(["a"])).count()) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=pd.Index(["a", "b"])).count(), + )