modin-project · devin-petersohn · Jun 2, 2022 · Jun 2, 2022 · Jun 2, 2022 · Jun 3, 2022
@@ -28,6 +28,7 @@ Key Features and Updates
   * FIX-#4504: Support na_action in applymap (#4505)
   * FIX-#4503: Stop the memory logging thread after session exit (#4515)
   * FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490)
+  * FIX-#4522: Correct multiindex metadata with groupby (#4523)
 * Performance enhancements
   * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)
   * PERF-#4493: Use partition size caches more in Modin dataframe (#4495)

@@ -133,6 +133,14 @@ def map(
                     axis=1,
                 )
                 other = list(other.columns)
+                # GH#4522: Vile as this may be, it is necessary to avoid the case where we are
+                # grouping by columns that were recently added to the data via
+                # `from_labels`. The internal dataframe doesn't know what to do when
+                # an index level name matches a column name.
+                # We ensure that the columns, index, and `by` don't intersect at the API level,
+                # so if we hit this `if` statement, we know it's a result of a deferred re-index.
+                if len(df.columns.intersection(df.index.names)) > 0:
+                    df = df.reset_index(drop=True)
             by_part = other
         else:
             by_part = by

@@ -424,7 +424,16 @@ def groupby(
         # strings is passed in, the data used for the groupby is dropped before the
         # groupby takes place.
         drop = False
-
+        # Check that there is no ambiguity in the parameter we were given.
+        _by_check = by if is_list_like(by) else [by]
+        for k in _by_check:
+            if k in self.index.names and k in self.axes[axis]:
+                level_name, index_name = "an index", "a column"
+                if axis == 1:
+                    level_name, index_name = index_name, level_name
+                raise ValueError(
+                    f"{k} is both {level_name} level and {index_name} label, which is ambiguous."
+                )
         if (
             not isinstance(by, (pandas.Series, Series))
             and is_list_like(by)

@@ -1570,7 +1570,7 @@ def test_agg_exceptions(operation):
         },
     ],
 )
-def test_to_pandas_convertion(kwargs):
+def test_to_pandas_conversion(kwargs):
     data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]}
     by = ["a", "b"]
 
@@ -2032,3 +2032,82 @@ def test_sum_with_level():
     }
     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
     eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum())
+
+
+def test_reset_index_groupby():
+    # Because `reset_index` defers the actual reindexing of partitions,
+    # when we call groupby after a `reset_index` with a `by` column name
+    # that was moved from the index to the columns via `from_labels`, the
+    # algebra layer incorrectly thinks that the `by` key is duplicated
+    # across both the columns and the index labels, and fails when it
+    # should succeed. This test ensures that this case is handled
+    # correctly and passes. For more details, check out
+    # https://github.com/modin-project/modin/issues/4522.
+    frame_data = np.random.randint(97, 198, size=(2**6, 2**4))
+    pandas_df = pandas.DataFrame(
+        frame_data,
+        index=pandas.MultiIndex.from_tuples(
+            [(i // 4, i // 2, i) for i in range(2**6)]
+        ),
+    ).add_prefix("col")
+    pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))]
+    # Convert every even column to string
+    for col in pandas_df.iloc[
+        :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0]
+    ]:
+        pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]
+    # The `pandas_df` contains a multi-index with 3 levels, named `index_0`, `index_1`,
+    # and `index_2`, and 16 columns, named `col0` through `col15`. Every even column
+    # has dtype `str`, while odd columns have dtype `int64`.
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(),
+    )
+
+
+def test_by_in_index_and_columns():
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by="a").count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=["a", "b"]).count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", "b"]), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by="a").count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=["a", "c"]).count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=["a", "b"]).count(),
+        raising_exceptions=True,
+        check_exception_type=True,
+    )