PR comments + changes

modin-project · Jun 10, 2024 · da02e5f · da02e5f
1 parent 625da37
commit da02e5f
Show file tree

Hide file tree

Showing 10 changed files with 58 additions and 123 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -667,7 +667,7 @@ jobs:
       - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py
       - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py
       - run: python -m pytest modin/tests/pandas/dataframe/test_window.py
-      - uses: codecov/codecov-action@v2
+      - uses: ./.github/actions/upload-coverage
 
   merge-coverage-artifacts:
     needs: [test-internals, test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity]

diff --git a/modin/config/envvars.py b/modin/config/envvars.py
@@ -850,16 +850,15 @@ def _check_vars() -> None:
 
 class NativeDataframeMode(EnvironmentVariable, type=str):
     """
-    The mode of execution used for handling dataframes in Modin
+    The mode of execution used for handling dataframes in Modin.
 
     When the env variable is set to None the PandasQueryCompiler would be used
-    which would lead to modin executing dataframes in distributed fashion.
+    which would lead to Modin executing dataframes in distributed fashion.
     When set to Native_pandas NativeQueryCompiler is used which handles the
     dataframes without distributing, falling back to native pandas functions.
 
     In future more execution modes can be added for single node execution so
     keeping the parameter as string.
-
     """
 
     varname = "MODIN_NATIVE_DATAFRAME_MODE"

diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
@@ -4574,17 +4574,6 @@ def frame_has_dtypes_cache(self) -> bool:
         """
         return self._modin_frame.has_dtypes_cache
 
-    def has_dtypes_cache(self) -> bool:
-        """
-        Check if the dtypes cache exists for the underlying modin frame.
-
-        Returns
-        -------
-        bool
-            True for base class as dtypes are always present
-        """
-        return True
-
     def get_index_name(self, axis=0):
         """
         Get index name of specified axis.

diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
@@ -365,37 +365,6 @@ def copy(self):
 
     # END Copy
 
-    def has_materialized_dtypes(self):
-        """
-        Check if the undelying modin frame has materialized dtypes
-
-        Returns
-        -------
-        bool
-            True if if the undelying modin frame and False otherwise.
-        """
-        return self._modin_frame.has_materialized_dtypes
-
-    def set_frame_dtypes_cache(self, dtypes):
-        """
-        Set dtypes cache for the underlying modin frame.
-
-        Parameters
-        ----------
-        dtypes : pandas.Series, ModinDtypes, callable or None
-        """
-        self._modin_frame.set_dtypes_cache(dtypes)
-
-    def has_dtypes_cache(self) -> bool:
-        """
-        Check if the dtypes cache exists for the underlying modin frame.
-
-        Returns
-        -------
-        bool
-        """
-        return self._modin_frame.has_dtypes_cache
-
     # Append/Concat/Join (Not Merge)
     # The append/concat/join operations should ideally never trigger remote
     # compute. These operations should only ever be manipulations of the

diff --git a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py
@@ -294,8 +294,6 @@ def groupby_callable(
         groupby_obj = df.groupby(by=by, axis=axis, **groupby_kwargs)
         if agg_name == "agg":
             if isinstance(agg_func, dict):
-                # Related to pandas issue when dict with list of funcs as value is passed in agg_func
-                # https://github.com/pandas-dev/pandas/issues/39103
                 agg_func = {
                     k: v[0] if isinstance(v, list) and len(v) == 1 else v
                     for k, v in agg_func.items()
@@ -314,12 +312,6 @@ def groupby_callable(
     return groupby_callable
 
 
-def _take_2d(df, index=None, columns=None):  # noqa: GL08
-    columns = columns if columns is not None else slice(None)
-    index = index if index is not None else slice(None)
-    return df.iloc[index, columns]
-
-
 def _register_binary(op):
     """
     Build function that apply specified binary method of the passed frame.
@@ -346,7 +338,6 @@ def binary_operator(df, other, **kwargs):
 
         if squeeze_self:
             df = df.squeeze(axis=1)
-
         result = getattr(df, op)(other, **kwargs)
         if (
             not isinstance(result, pandas.Series)
@@ -727,6 +718,7 @@ def setitem_bool(self, row_loc, col_loc, item):
         pandas.DataFrame.update, in_place=True, df_copy=True
     )
     diff = _register_default_pandas(pandas.DataFrame.diff)
+    dot = _register_default_pandas(_register_binary("dot"))
     drop = _register_default_pandas(_drop)
     dropna = _register_default_pandas(pandas.DataFrame.dropna)  # axis values switched?
     dt_ceil = _register_default_pandas(_dt_func_map("ceil"))
@@ -859,7 +851,6 @@ def setitem_bool(self, row_loc, col_loc, item):
     groupby_quantile = _register_default_pandas(_groupby("quantile"))
     groupby_rank = _register_default_pandas(_groupby("rank"))
     groupby_shift = _register_default_pandas(_groupby("shift"))
-    groupby_size = _register_default_pandas(_groupby("size"))
     groupby_skew = _register_default_pandas(_groupby("skew"))
     groupby_std = _register_default_pandas(_groupby("std"))
     groupby_sum = _register_default_pandas(_groupby("sum"))
@@ -988,9 +979,6 @@ def setitem_bool(self, row_loc, col_loc, item):
     rtruediv = _register_default_pandas(_register_binary("rtruediv"))
     searchsorted = _register_default_pandas(pandas.Series.searchsorted, is_series=True)
     sem = _register_default_pandas(pandas.DataFrame.sem)
-    series_update = _register_default_pandas(
-        pandas.Series.update, is_series=True, in_place=True, df_copy=True
-    )
     series_view = _register_default_pandas(pandas.Series.view, is_series=True)
     set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index)
     setitem = _register_default_pandas(_setitem)
@@ -1054,7 +1042,6 @@ def setitem_bool(self, row_loc, col_loc, item):
     sub = _register_default_pandas(_register_binary("sub"))
     sum = _register_default_pandas(pandas.DataFrame.sum)
     sum_min_count = _register_default_pandas(pandas.DataFrame.sum)
-    take_2d = _register_default_pandas(_take_2d)
     to_datetime = _register_default_pandas(_to_datetime)
     to_numeric = _register_default_pandas(_to_numeric)
     to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_modin=False)
@@ -1094,24 +1081,14 @@ def describe(self, percentiles: np.ndarray):
             include="all",
         )
 
-    def dot(self, other, squeeze_self=None, squeeze_other=None):
-        other = try_cast_to_pandas(other)
-        if squeeze_other:
-            other = other.squeeze()
-        if squeeze_self:
-            result = self._modin_frame.squeeze(axis=1).dot(other)
-        else:
-            result = self._modin_frame.dot(other)
-        if isinstance(result, pandas.Series):
-            if result.name is None:
-                result.name = "__reduced__"
-            result = result.to_frame()
-        if is_list_like(result):
-            result = pandas.DataFrame(result)
-        else:
-            result = pandas.DataFrame([result])
-
-        return self.__constructor__(result)
+    def series_update(self, other, **kwargs):
+        return _register_default_pandas(_register_binary("update"), in_place=True)(
+            self,
+            other=other,
+            squeeze_self=True,
+            squeeze_other=True,
+            **kwargs,
+        )
 
     def expanding_cov(
         self,
@@ -1134,8 +1111,6 @@ def expanding_cov(
                 else other.to_pandas()
             )
         )
-        # expanding_rank = _register_default_pandas(_register_expanding(pandas.core.window.expanding.Expanding.rank))
-
         return _register_default_pandas(
             _register_expanding(pandas.core.window.expanding.Expanding.cov)
         )(
@@ -1185,6 +1160,31 @@ def expanding_corr(
             **kwargs,
         )
 
+    def groupby_size(
+        self,
+        by,
+        axis,
+        groupby_kwargs,
+        agg_args,
+        agg_kwargs,
+        drop=False,
+    ):
+        result = _register_default_pandas(_groupby("size"))(
+            self,
+            by=by,
+            axis=axis,
+            groupby_kwargs=groupby_kwargs,
+            agg_args=agg_args,
+            agg_kwargs=agg_kwargs,
+            drop=drop,
+            method="size",
+        )
+        if not groupby_kwargs.get("as_index", False):
+            # Renaming 'MODIN_UNNAMED_SERIES_LABEL' to a proper name
+
+            result.columns = result.columns[:-1].append(pandas.Index(["size"]))
+        return result
+
     def get_axis(self, axis):
         return self._modin_frame.index if axis == 0 else self._modin_frame.columns
 

diff --git a/modin/pandas/series.py b/modin/pandas/series.py
@@ -144,6 +144,7 @@ def __init__(
                 name = MODIN_UNNAMED_SERIES_LABEL
                 if isinstance(data, pandas.Series) and data.name is not None:
                     name = data.name
+
             query_compiler = from_pandas(
                 pandas.DataFrame(
                     pandas.Series(

diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py
@@ -11,7 +11,6 @@
 # ANY KIND, either express or implied. See the License for the specific language
 # governing permissions and limitations under the License.
 
-import contextlib
 import io
 import warnings
 
@@ -88,11 +87,7 @@
 )
 def test_ops_defaulting_to_pandas(op, make_args):
     modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1)
-    with (
-        warns_that_defaulting_to_pandas()
-        if not NativeDataframeMode.get()
-        else contextlib.nullcontext()
-    ):
+    with warns_that_defaulting_to_pandas():
         operation = getattr(modin_df, op)
         if make_args is not None:
             operation(**make_args(modin_df))
@@ -106,23 +101,15 @@ def test_ops_defaulting_to_pandas(op, make_args):
 
 def test_style():
     data = test_data_values[0]
-    with (
-        warns_that_defaulting_to_pandas()
-        if not NativeDataframeMode.get()
-        else contextlib.nullcontext()
-    ):
+    with warns_that_defaulting_to_pandas():
         pd.DataFrame(data).style
 
 
 def test_to_timestamp():
     idx = pd.date_range("1/1/2012", periods=5, freq="M")
     df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx)
 
-    with (
-        warns_that_defaulting_to_pandas()
-        if not NativeDataframeMode.get()
-        else contextlib.nullcontext()
-    ):
+    with warns_that_defaulting_to_pandas():
         df.to_period().to_timestamp()
 
 
@@ -151,11 +138,7 @@ def test_asfreq():
     index = pd.date_range("1/1/2000", periods=4, freq="min")
     series = pd.Series([0.0, None, 2.0, 3.0], index=index)
     df = pd.DataFrame({"s": series})
-    with (
-        warns_that_defaulting_to_pandas()
-        if not NativeDataframeMode.get()
-        else contextlib.nullcontext()
-    ):
+    with warns_that_defaulting_to_pandas():
         # We are only testing that this defaults to pandas, so we will just check for
         # the warning
         df.asfreq(freq="30S")

diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py
@@ -11,7 +11,6 @@
 # ANY KIND, either express or implied. See the License for the specific language
 # governing permissions and limitations under the License.
 
-import contextlib
 import warnings
 
 import matplotlib
@@ -611,11 +610,7 @@ def test_sort_multiindex(sort_remaining):
             setattr(df, index, new_index)
 
     for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]:
-        with (
-            warns_that_defaulting_to_pandas()
-            if not NativeDataframeMode.get()
-            else contextlib.nullcontext()
-        ):
+        with warns_that_defaulting_to_pandas():
             df_equals(
                 modin_df.sort_index(sort_remaining=sort_remaining, **kwargs),
                 pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs),

diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py
@@ -11,14 +11,12 @@
 # ANY KIND, either express or implied. See the License for the specific language
 # governing permissions and limitations under the License.
 
-import contextlib
-
 import numpy as np
 import pandas
 import pytest
 
 import modin.pandas as pd
-from modin.config import NativeDataframeMode, NPartitions
+from modin.config import NPartitions
 from modin.tests.test_utils import warns_that_defaulting_to_pandas
 
 from .utils import (
@@ -69,11 +67,7 @@ def test_dataframe(data, min_periods, axis, method, kwargs):
 @pytest.mark.parametrize("axis", [0, 1])
 @pytest.mark.parametrize("method", ["corr", "cov"])
 def test_dataframe_corr_cov(data, min_periods, axis, method):
-    with (
-        warns_that_defaulting_to_pandas()
-        if not NativeDataframeMode.get()
-        else contextlib.nullcontext()
-    ):
+    with warns_that_defaulting_to_pandas():
         eval_general(
             *create_test_dfs(data),
             lambda df: getattr(
@@ -85,11 +79,7 @@ def test_dataframe_corr_cov(data, min_periods, axis, method):
 @pytest.mark.parametrize("method", ["corr", "cov"])
 def test_dataframe_corr_cov_with_self(method):
     mdf, pdf = create_test_dfs(test_data["float_nan_data"])
-    with (
-        warns_that_defaulting_to_pandas()
-        if not NativeDataframeMode.get()
-        else contextlib.nullcontext()
-    ):
+    with warns_that_defaulting_to_pandas():
         eval_general(
             mdf,
             pdf,

diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py
@@ -11,6 +11,7 @@
 # ANY KIND, either express or implied. See the License for the specific language
 # governing permissions and limitations under the License.
 
+import contextlib
 import json
 from textwrap import dedent, indent
 from unittest.mock import Mock, patch
@@ -21,6 +22,7 @@
 
 import modin.pandas as pd
 import modin.utils
+from modin.config import NativeDataframeMode
 from modin.error_message import ErrorMessage
 from modin.tests.pandas.utils import create_test_dfs
 
@@ -263,10 +265,17 @@ def warns_that_defaulting_to_pandas(prefix=None, suffix=None):
 
     Returns
     -------
-    pytest.recwarn.WarningsChecker
-        A WarningsChecker checking for a UserWarning saying that Modin is
-        defaulting to Pandas.
+    pytest.recwarn.WarningsChecker or contextlib.nullcontext
+        If Modin is not operating in MODIN_NATIVE_DATAFRAME_MODE,a WarningsChecker
+        is returned whic will check for a UserWarning indicating that Modin
+        is defaulting to Pandas. If MODIN_NATIVE_DATAFRAME_MODE is set, a
+        nullcontext is returned to avoid warning about the default to Pandas,
+        as this occurs due  user selecting of MODIN_NATIVE_DATAFRAME_MODE.
+
     """
+    if NativeDataframeMode.get():
+        return contextlib.nullcontext()
+
     match = "[Dd]efaulting to pandas"
     if prefix:
         # Message may be separated by newlines