Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.15.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Key Features and Updates
* FIX-#4504: Support na_action in applymap (#4505)
* FIX-#4503: Stop the memory logging thread after session exit (#4515)
* FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490)
* FIX-#4522: Correct multiindex metadata with groupby (#4523)
* Performance enhancements
* FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)
* PERF-#4493: Use partition size caches more in Modin dataframe (#4495)
Expand Down
8 changes: 8 additions & 0 deletions modin/core/dataframe/algebra/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ def map(
axis=1,
)
other = list(other.columns)
# GH#4522: Vile as this may be, it is necessary to avoid the case where we are
# grouping by columns that were recently added to the data via
# `from_labels`. The internal dataframe doesn't know what to do when
# the label matches a column name.
# We ensure that the columns, index, and by don't intersect in the API level,
# so if we hit this if statement, we know its a result of a deferred re-index.
if len(df.columns.intersection(df.index.names)) > 0:
df = df.reset_index(drop=True)
by_part = other
else:
by_part = by
Expand Down
12 changes: 10 additions & 2 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,6 @@ def groupby(
)
else:
squeeze = False

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: why are we removing the empty line?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will remove!

axis = self._get_axis_number(axis)
idx_name = None
# Drop here indicates whether or not to drop the data column before doing the
Expand All @@ -424,7 +423,16 @@ def groupby(
# strings is passed in, the data used for the groupby is dropped before the
# groupby takes place.
drop = False

# Check that there is no ambiguity in the parameter we were given.
_by_check = by if is_list_like(by) else [by]
for k in _by_check:
if k in self.index.names and k in self.axes[axis]:
level_name, index_name = "an index", "a column"
if axis == 1:
level_name, index_name = index_name, level_name
raise ValueError(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: it's possible for multiple k to be invalid and cause us to raise a ValueError, however currently only one such k will be raised. Is it worth collecting all invalid k and raising a ValueError if any k are invalid?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked, and in pandas, only the first invalid k is raised!

f"{k} is both {level_name} level and {index_name} label, which is ambiguous."
)
if (
not isinstance(by, (pandas.Series, Series))
and is_list_like(by)
Expand Down
69 changes: 68 additions & 1 deletion modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1570,7 +1570,7 @@ def test_agg_exceptions(operation):
},
],
)
def test_to_pandas_convertion(kwargs):
def test_to_pandas_conversion(kwargs):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick fix for a typo I noticed when updating the testing suite!

data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]}
by = ["a", "b"]

Expand Down Expand Up @@ -2032,3 +2032,70 @@ def test_sum_with_level():
}
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum())


def test_reset_index_groupby():

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

without context, this test is unclear. Could you add a brief description of what error condition this is testing or link to the gh issue?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, can do!

frame_data = np.random.randint(97, 198, size=(2**6, 2**4))
pandas_df = pandas.DataFrame(
frame_data,
index=pandas.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(2**6)]
),
).add_prefix("col")
pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))]
# Convert every other column to string

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: convert every even column to string

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will change!

for col in pandas_df.iloc[
:, [i for i in range(len(pandas_df.columns)) if i % 2 == 0]
]:
pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be helpful if you could describe the schema for pandas_df here, to make the results of the above computation clearer.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense!

modin_df = from_pandas(pandas_df)
eval_general(
modin_df,
pandas_df,
lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(),
)

def test_by_in_index_and_columns():
pandas_df = pandas.DataFrame(
[[1, 2, 3]], index=pd.Index([0], name="a"), columns=['a', 'b', 'c']
)
modin_df = from_pandas(pandas_df)
eval_general(
modin_df,
pandas_df,
lambda df: df.groupby(by='a').count(),
raising_exceptions=True,
check_exception_type=True,
)
eval_general(
modin_df,
pandas_df,
lambda df: df.groupby(by=['a', 'b']).count(),
raising_exceptions=True,
check_exception_type=True,
)
pandas_df = pandas.DataFrame(
[[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", 'b']), columns=['a', 'b', 'c']
)
modin_df = from_pandas(pandas_df)
eval_general(
modin_df,
pandas_df,
lambda df: df.groupby(by='a').count(),
raising_exceptions=True,
check_exception_type=True,
)
eval_general(
modin_df,
pandas_df,
lambda df: df.groupby(by=['a', 'c']).count(),
raising_exceptions=True,
check_exception_type=True,
)
eval_general(
modin_df,
pandas_df,
lambda df: df.groupby(by=['a', 'b']).count(),
raising_exceptions=True,
check_exception_type=True,
)