diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ecedb33b929..3125a1d094e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -667,7 +667,7 @@ jobs: - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py - run: python -m pytest modin/tests/pandas/dataframe/test_window.py - - uses: codecov/codecov-action@v2 + - uses: ./.github/actions/upload-coverage merge-coverage-artifacts: needs: [test-internals, test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity] diff --git a/modin/config/envvars.py b/modin/config/envvars.py index c940c89f6c4..407a4ddd038 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -850,16 +850,15 @@ def _check_vars() -> None: class NativeDataframeMode(EnvironmentVariable, type=str): """ - The mode of execution used for handling dataframes in Modin + The mode of execution used for handling dataframes in Modin. When the env variable is set to None the PandasQueryCompiler would be used - which would lead to modin executing dataframes in distributed fashion. + which would lead to Modin executing dataframes in distributed fashion. When set to Native_pandas NativeQueryCompiler is used which handles the dataframes without distributing, falling back to native pandas functions. In future more execution modes can be added for single node execution so keeping the parameter as string. 
- """ varname = "MODIN_NATIVE_DATAFRAME_MODE" diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 83aed3a5647..f239f1b46ae 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4574,17 +4574,6 @@ def frame_has_dtypes_cache(self) -> bool: """ return self._modin_frame.has_dtypes_cache - def has_dtypes_cache(self) -> bool: - """ - Check if the dtypes cache exists for the underlying modin frame. - - Returns - ------- - bool - True for base class as dtypes are always present - """ - return True - def get_index_name(self, axis=0): """ Get index name of specified axis. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 04e2d96fad2..d29901b8fdb 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -365,37 +365,6 @@ def copy(self): # END Copy - def has_materialized_dtypes(self): - """ - Check if the undelying modin frame has materialized dtypes - - Returns - ------- - bool - True if if the undelying modin frame and False otherwise. - """ - return self._modin_frame.has_materialized_dtypes - - def set_frame_dtypes_cache(self, dtypes): - """ - Set dtypes cache for the underlying modin frame. - - Parameters - ---------- - dtypes : pandas.Series, ModinDtypes, callable or None - """ - self._modin_frame.set_dtypes_cache(dtypes) - - def has_dtypes_cache(self) -> bool: - """ - Check if the dtypes cache exists for the underlying modin frame. - - Returns - ------- - bool - """ - return self._modin_frame.has_dtypes_cache - # Append/Concat/Join (Not Merge) # The append/concat/join operations should ideally never trigger remote # compute. 
These operations should only ever be manipulations of the diff --git a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py index 8679dc318dc..f010207adbc 100644 --- a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py @@ -294,8 +294,6 @@ def groupby_callable( groupby_obj = df.groupby(by=by, axis=axis, **groupby_kwargs) if agg_name == "agg": if isinstance(agg_func, dict): - # Related to pandas issue when dict with list of funcs as value is passed in agg_func - # https://github.com/pandas-dev/pandas/issues/39103 agg_func = { k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in agg_func.items() @@ -314,12 +312,6 @@ def groupby_callable( return groupby_callable -def _take_2d(df, index=None, columns=None): # noqa: GL08 - columns = columns if columns is not None else slice(None) - index = index if index is not None else slice(None) - return df.iloc[index, columns] - - def _register_binary(op): """ Build function that apply specified binary method of the passed frame. @@ -346,7 +338,6 @@ def binary_operator(df, other, **kwargs): if squeeze_self: df = df.squeeze(axis=1) - result = getattr(df, op)(other, **kwargs) if ( not isinstance(result, pandas.Series) @@ -727,6 +718,7 @@ def setitem_bool(self, row_loc, col_loc, item): pandas.DataFrame.update, in_place=True, df_copy=True ) diff = _register_default_pandas(pandas.DataFrame.diff) + dot = _register_default_pandas(_register_binary("dot")) drop = _register_default_pandas(_drop) dropna = _register_default_pandas(pandas.DataFrame.dropna) # axis values switched? 
dt_ceil = _register_default_pandas(_dt_func_map("ceil")) @@ -859,7 +851,6 @@ def setitem_bool(self, row_loc, col_loc, item): groupby_quantile = _register_default_pandas(_groupby("quantile")) groupby_rank = _register_default_pandas(_groupby("rank")) groupby_shift = _register_default_pandas(_groupby("shift")) - groupby_size = _register_default_pandas(_groupby("size")) groupby_skew = _register_default_pandas(_groupby("skew")) groupby_std = _register_default_pandas(_groupby("std")) groupby_sum = _register_default_pandas(_groupby("sum")) @@ -988,9 +979,6 @@ def setitem_bool(self, row_loc, col_loc, item): rtruediv = _register_default_pandas(_register_binary("rtruediv")) searchsorted = _register_default_pandas(pandas.Series.searchsorted, is_series=True) sem = _register_default_pandas(pandas.DataFrame.sem) - series_update = _register_default_pandas( - pandas.Series.update, is_series=True, in_place=True, df_copy=True - ) series_view = _register_default_pandas(pandas.Series.view, is_series=True) set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index) setitem = _register_default_pandas(_setitem) @@ -1054,7 +1042,6 @@ def setitem_bool(self, row_loc, col_loc, item): sub = _register_default_pandas(_register_binary("sub")) sum = _register_default_pandas(pandas.DataFrame.sum) sum_min_count = _register_default_pandas(pandas.DataFrame.sum) - take_2d = _register_default_pandas(_take_2d) to_datetime = _register_default_pandas(_to_datetime) to_numeric = _register_default_pandas(_to_numeric) to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_modin=False) @@ -1094,24 +1081,14 @@ def describe(self, percentiles: np.ndarray): include="all", ) - def dot(self, other, squeeze_self=None, squeeze_other=None): - other = try_cast_to_pandas(other) - if squeeze_other: - other = other.squeeze() - if squeeze_self: - result = self._modin_frame.squeeze(axis=1).dot(other) - else: - result = self._modin_frame.dot(other) - if isinstance(result, pandas.Series): - if 
result.name is None: - result.name = "__reduced__" - result = result.to_frame() - if is_list_like(result): - result = pandas.DataFrame(result) - else: - result = pandas.DataFrame([result]) - - return self.__constructor__(result) + def series_update(self, other, **kwargs): + return _register_default_pandas(_register_binary("update"), in_place=True)( + self, + other=other, + squeeze_self=True, + squeeze_other=True, + **kwargs, + ) def expanding_cov( self, @@ -1134,8 +1111,6 @@ def expanding_cov( else other.to_pandas() ) ) - # expanding_rank = _register_default_pandas(_register_expanding(pandas.core.window.expanding.Expanding.rank)) - return _register_default_pandas( _register_expanding(pandas.core.window.expanding.Expanding.cov) )( @@ -1185,6 +1160,31 @@ def expanding_corr( **kwargs, ) + def groupby_size( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + result = _register_default_pandas(_groupby("size"))( + self, + by=by, + axis=axis, + groupby_kwargs=groupby_kwargs, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + drop=drop, + method="size", + ) + if not groupby_kwargs.get("as_index", False): + # Renaming 'MODIN_UNNAMED_SERIES_LABEL' to a proper name + + result.columns = result.columns[:-1].append(pandas.Index(["size"])) + return result + def get_axis(self, axis): return self._modin_frame.index if axis == 0 else self._modin_frame.columns diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 4df41168b5a..7818c52654d 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -144,6 +144,7 @@ def __init__( name = MODIN_UNNAMED_SERIES_LABEL if isinstance(data, pandas.Series) and data.name is not None: name = data.name + query_compiler = from_pandas( pandas.DataFrame( pandas.Series( diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 016ee2e7ac9..a794aeb6446 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ 
b/modin/tests/pandas/dataframe/test_default.py @@ -11,7 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -import contextlib import io import warnings @@ -88,11 +87,7 @@ ) def test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): operation = getattr(modin_df, op) if make_args is not None: operation(**make_args(modin_df)) @@ -106,11 +101,7 @@ def test_ops_defaulting_to_pandas(op, make_args): def test_style(): data = test_data_values[0] - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): pd.DataFrame(data).style @@ -118,11 +109,7 @@ def test_to_timestamp(): idx = pd.date_range("1/1/2012", periods=5, freq="M") df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): df.to_period().to_timestamp() @@ -151,11 +138,7 @@ def test_asfreq(): index = pd.date_range("1/1/2000", periods=4, freq="min") series = pd.Series([0.0, None, 2.0, 3.0], index=index) df = pd.DataFrame({"s": series}) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): # We are only testing that this defaults to pandas, so we will just check for # the warning df.asfreq(freq="30S") diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 46983ffd45c..cebe1194c6c 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ 
b/modin/tests/pandas/dataframe/test_join_sort.py @@ -11,7 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -import contextlib import warnings import matplotlib @@ -611,11 +610,7 @@ def test_sort_multiindex(sort_remaining): setattr(df, index, new_index) for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): df_equals( modin_df.sort_index(sort_remaining=sort_remaining, **kwargs), pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs), diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py index d96a38bc21e..fe184dbd249 100644 --- a/modin/tests/pandas/test_expanding.py +++ b/modin/tests/pandas/test_expanding.py @@ -11,14 +11,12 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
-import contextlib - import numpy as np import pandas import pytest import modin.pandas as pd -from modin.config import NativeDataframeMode, NPartitions +from modin.config import NPartitions from modin.tests.test_utils import warns_that_defaulting_to_pandas from .utils import ( @@ -69,11 +67,7 @@ def test_dataframe(data, min_periods, axis, method, kwargs): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov(data, min_periods, axis, method): - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): eval_general( *create_test_dfs(data), lambda df: getattr( @@ -85,11 +79,7 @@ def test_dataframe_corr_cov(data, min_periods, axis, method): @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov_with_self(method): mdf, pdf = create_test_dfs(test_data["float_nan_data"]) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): eval_general( mdf, pdf, diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py index bc478d957f9..9e5589314cc 100644 --- a/modin/tests/test_utils.py +++ b/modin/tests/test_utils.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
+import contextlib import json from textwrap import dedent, indent from unittest.mock import Mock, patch @@ -21,6 +22,7 @@ import modin.pandas as pd import modin.utils +from modin.config import NativeDataframeMode from modin.error_message import ErrorMessage from modin.tests.pandas.utils import create_test_dfs @@ -263,10 +265,17 @@ def warns_that_defaulting_to_pandas(prefix=None, suffix=None): Returns ------- - pytest.recwarn.WarningsChecker - A WarningsChecker checking for a UserWarning saying that Modin is - defaulting to Pandas. + pytest.recwarn.WarningsChecker or contextlib.nullcontext + If Modin is not operating in MODIN_NATIVE_DATAFRAME_MODE, a WarningsChecker + is returned which will check for a UserWarning indicating that Modin + is defaulting to Pandas. If MODIN_NATIVE_DATAFRAME_MODE is set, a + nullcontext is returned to avoid warning about the default to Pandas, + as this occurs due to the user selecting MODIN_NATIVE_DATAFRAME_MODE. + """ + if NativeDataframeMode.get(): + return contextlib.nullcontext() + match = "[Dd]efaulting to pandas" if prefix: # Message may be separated by newlines