modin-project · devin-petersohn · Jun 2, 2022 · Jun 2, 2022 · Jun 2, 2022 · Jun 3, 2022
@@ -28,6 +28,7 @@ Key Features and Updates
   * FIX-#4504: Support na_action in applymap (#4505)
   * FIX-#4503: Stop the memory logging thread after session exit (#4515)
   * FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490)
+  * FIX-#4522: Correct multiindex metadata with groupby (#4523)
 * Performance enhancements
   * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)
   * PERF-#4493: Use partition size caches more in Modin dataframe (#4495)

@@ -133,6 +133,14 @@ def map(
                     axis=1,
                 )
                 other = list(other.columns)
+                # GH#4522: Vile as this may be, it is necessary to avoid the case where we are
+                # grouping by columns that were recently added to the data via
+                # `from_labels`. The internal dataframe doesn't know what to do when
+                # the label matches a column name.
+                # We ensure that the columns, index, and by don't intersect in the API level,
+                # so if we hit this if statement, we know its a result of a deferred re-index.
+                if len(df.columns.intersection(df.index.names)) > 0:
+                    df = df.reset_index(drop=True)
             by_part = other
         else:
             by_part = by

@@ -415,7 +415,6 @@ def groupby(
             )
         else:
             squeeze = False
-
         axis = self._get_axis_number(axis)
         idx_name = None
         # Drop here indicates whether or not to drop the data column before doing the
@@ -424,7 +423,16 @@ def groupby(
         # strings is passed in, the data used for the groupby is dropped before the
         # groupby takes place.
         drop = False
-
+        # Check that there is no ambiguity in the parameter we were given.
+        _by_check = by if is_list_like(by) else [by]
+        for k in _by_check:
+            if k in self.index.names and k in self.axes[axis]:
+                level_name, index_name = "an index", "a column"
+                if axis == 1:
+                    level_name, index_name = index_name, level_name
+                raise ValueError(
+                    f"{k} is both {level_name} level and {index_name} label, which is ambiguous."
+                )
         if (
             not isinstance(by, (pandas.Series, Series))
             and is_list_like(by)

@@ -1570,7 +1570,7 @@ def test_agg_exceptions(operation):
         },
     ],
 )
-def test_to_pandas_convertion(kwargs):
+def test_to_pandas_conversion(kwargs):
     data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]}
     by = ["a", "b"]
 
@@ -2032,3 +2032,70 @@ def test_sum_with_level():
     }
     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
     eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum())
+
+
+def test_reset_index_groupby():
+    frame_data = np.random.randint(97, 198, size=(2**6, 2**4))
+    pandas_df = pandas.DataFrame(
+        frame_data,
+        index=pandas.MultiIndex.from_tuples(
+            [(i // 4, i // 2, i) for i in range(2**6)]
+        ),
+    ).add_prefix("col")
+    pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))]
+    # Convert every other column to string
+    for col in pandas_df.iloc[
+        :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0]
+    ]:
+        pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(),
+    )
+
+def test_by_in_index_and_columns():
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([0], name="a"), columns=['a', 'b', 'c']
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by='a').count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=['a', 'b']).count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", 'b']), columns=['a', 'b', 'c']
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by='a').count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=['a', 'c']).count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=['a', 'b']).count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )