-
Notifications
You must be signed in to change notification settings - Fork 653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FIX-#4522: Correct multiindex metadata with groupby #4523
base: master
Are you sure you want to change the base?
Changes from 7 commits
13395f0
8040915
6e8466e
95f5510
d2e8c5f
a5bb81e
5a11af0
75c6089
177bfe2
a547d32
3ce19ea
0e4a521
639a201
d2c7295
55ec3a1
2e9a056
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -415,7 +415,6 @@ def groupby( | |
) | ||
else: | ||
squeeze = False | ||
|
||
axis = self._get_axis_number(axis) | ||
idx_name = None | ||
# Drop here indicates whether or not to drop the data column before doing the | ||
|
@@ -424,7 +423,16 @@ def groupby( | |
# strings is passed in, the data used for the groupby is dropped before the | ||
# groupby takes place. | ||
drop = False | ||
|
||
# Check that there is no ambiguity in the parameter we were given. | ||
_by_check = by if is_list_like(by) else [by] | ||
for k in _by_check: | ||
if k in self.index.names and k in self.axes[axis]: | ||
level_name, index_name = "an index", "a column" | ||
if axis == 1: | ||
level_name, index_name = index_name, level_name | ||
raise ValueError( | ||
RehanSD marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: it's possible for multiple There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I checked, and in pandas, only the first invalid |
||
f"{k} is both {level_name} level and {index_name} label, which is ambiguous." | ||
) | ||
if ( | ||
not isinstance(by, (pandas.Series, Series)) | ||
and is_list_like(by) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1570,7 +1570,7 @@ def test_agg_exceptions(operation): | |
}, | ||
], | ||
) | ||
def test_to_pandas_convertion(kwargs): | ||
def test_to_pandas_conversion(kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Quick fix for a typo I noticed when updating the testing suite! |
||
data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]} | ||
by = ["a", "b"] | ||
|
||
|
@@ -2032,3 +2032,70 @@ def test_sum_with_level(): | |
} | ||
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) | ||
eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum()) | ||
|
||
|
||
def test_reset_index_groupby(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. without context, this test is unclear. Could you add a brief description of what error condition this is testing or link to the gh issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, can do! |
||
frame_data = np.random.randint(97, 198, size=(2**6, 2**4)) | ||
pandas_df = pandas.DataFrame( | ||
frame_data, | ||
index=pandas.MultiIndex.from_tuples( | ||
[(i // 4, i // 2, i) for i in range(2**6)] | ||
), | ||
).add_prefix("col") | ||
pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))] | ||
# Convert every other column to string | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: convert every even column to string There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will change! |
||
for col in pandas_df.iloc[ | ||
:, [i for i in range(len(pandas_df.columns)) if i % 2 == 0] | ||
]: | ||
pandas_df[col] = [str(chr(i)) for i in pandas_df[col]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would be helpful if you could describe the schema for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense! |
||
modin_df = from_pandas(pandas_df) | ||
eval_general( | ||
modin_df, | ||
pandas_df, | ||
lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(), | ||
) | ||
|
||
def test_by_in_index_and_columns(): | ||
RehanSD marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pandas_df = pandas.DataFrame( | ||
[[1, 2, 3]], index=pd.Index([0], name="a"), columns=['a', 'b', 'c'] | ||
) | ||
modin_df = from_pandas(pandas_df) | ||
eval_general( | ||
modin_df, | ||
pandas_df, | ||
lambda df: df.groupby(by='a').count(), | ||
raising_exceptions=True, | ||
check_exception_type=True, | ||
) | ||
eval_general( | ||
modin_df, | ||
pandas_df, | ||
lambda df: df.groupby(by=['a', 'b']).count(), | ||
raising_exceptions=True, | ||
check_exception_type=True, | ||
) | ||
pandas_df = pandas.DataFrame( | ||
[[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", 'b']), columns=['a', 'b', 'c'] | ||
) | ||
modin_df = from_pandas(pandas_df) | ||
eval_general( | ||
modin_df, | ||
pandas_df, | ||
lambda df: df.groupby(by='a').count(), | ||
raising_exceptions=True, | ||
check_exception_type=True, | ||
) | ||
eval_general( | ||
modin_df, | ||
pandas_df, | ||
lambda df: df.groupby(by=['a', 'c']).count(), | ||
raising_exceptions=True, | ||
check_exception_type=True, | ||
) | ||
eval_general( | ||
modin_df, | ||
pandas_df, | ||
lambda df: df.groupby(by=['a', 'b']).count(), | ||
raising_exceptions=True, | ||
check_exception_type=True, | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: why are we removing the empty line?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will remove!