From e8925cb0a2aef81a7830b94631835082bf3cca52 Mon Sep 17 00:00:00 2001 From: Arun Jose <40291569+arunjose696@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:30:03 +0200 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Iaroslav Igoshev Signed-off-by: arunjose696 --- .github/workflows/ci.yml | 4 +- modin/config/envvars.py | 21 ++-- .../dispatching/factories/factories.py | 6 +- .../pandas/native_query_compiler.py | 109 ++++++++---------- modin/tests/pandas/dataframe/test_binary.py | 2 +- modin/tests/pandas/dataframe/test_default.py | 13 ++- modin/tests/pandas/dataframe/test_indexing.py | 8 +- modin/tests/pandas/dataframe/test_iter.py | 5 +- .../tests/pandas/dataframe/test_join_sort.py | 6 +- .../pandas/dataframe/test_map_metadata.py | 11 +- modin/tests/test_utils.py | 17 +-- 11 files changed, 98 insertions(+), 104 deletions(-) rename modin/{experimental => }/core/storage_formats/pandas/native_query_compiler.py (94%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3125a1d094e..5077323df9d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -222,7 +222,7 @@ jobs: with: filters: | test-small-query-compiler: - - 'modin/experimental/core/storage_formats/pandas/small_query_compiler.py' + - 'modin/experimental/core/storage_formats/pandas/native_query_compiler.py' - 'modin/core/storage_formats/pandas/query_compiler.py' - 'modin/core/storage_formats/base/query_compiler.py' shared: &shared @@ -647,7 +647,7 @@ jobs: matrix: python-version: ["3.9"] env: - MODIN_NATIVE_DATAFRAME_MODE: "Native_pandas" + MODIN_NATIVE_DATAFRAME_MODE: "Pandas" name: test-small-query-compiler python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 1d3f4260f8f..59f3dab1d03 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -915,20 +915,19 @@ def _check_vars() -> None: class NativeDataframeMode(EnvironmentVariable, type=str): """ - The 
mode of execution used for handling dataframes in Modin. - - When the env variable is set to None the PandasQueryCompiler would be used - which would lead to Modin executing dataframes in distributed fashion. - When set to Native_pandas NativeQueryCompiler is used which handles the - dataframes without distributing, falling back to native pandas functions. - - In future more execution modes can be added for single node execution so - keeping the parameter as string. + When this config is set to ``Default``, ``PandasQueryCompiler`` is used, + which leads to Modin executing dataframes in distributed fashion. + When set to a string (e.g., ``Pandas``), ``NativeQueryCompiler`` is used, + which handles the dataframes without distributing, + falling back to native library functions (e.g., ``Pandas``). + + This could be beneficial for handling relatively small dataframes + without involving additional overhead of communication between processes. """ varname = "MODIN_NATIVE_DATAFRAME_MODE" - choices = ("Native_pandas",) - default = None + choices = ("Pandas",) + default = "Default" _check_vars() diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index ee1b68b2dee..deda5113287 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -28,9 +28,7 @@ from modin.config import NativeDataframeMode from modin.core.io import BaseIO -from modin.experimental.core.storage_formats.pandas.native_query_compiler import ( - NativeQueryCompiler, -) +from modin.core.storage_formats.pandas.native_query_compiler import NativeQueryCompiler from modin.utils import get_current_execution _doc_abstract_factory_class = """ @@ -172,7 +170,7 @@ def prepare(cls): method="io.from_pandas", ) def _from_pandas(cls, df): - if NativeDataframeMode.get(): + if NativeDataframeMode.get() == "Pandas": df_copy = df.copy() return NativeQueryCompiler(df_copy) return 
cls.io_cls.from_pandas(df) diff --git a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py similarity index 94% rename from modin/experimental/core/storage_formats/pandas/native_query_compiler.py rename to modin/core/storage_formats/pandas/native_query_compiler.py index f010207adbc..bd89ec4775e 100644 --- a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -406,8 +406,6 @@ def _fillna(df, value, **kwargs): # noqa: GL08 df = df.squeeze(axis=1) if squeeze_value and isinstance(value, pandas.DataFrame): value = value.squeeze(axis=1) - # if len(df.columns) == 1 and df.columns[0] == "__reduced__": - # df = df["__reduced__"] return df.fillna(value, **kwargs) @@ -495,12 +493,10 @@ def _get_dummies(df, columns, **kwargs): # noqa: GL08 def _register_default_pandas( func, is_series=False, - squeeze_series=False, squeeze_args=False, squeeze_kwargs=False, - return_modin=True, + return_raw=False, in_place=False, - df_copy=False, filter_kwargs=[], ): """ @@ -512,18 +508,14 @@ def _register_default_pandas( Function to apply. is_series : bool, default: False If True, the passed frame will always be squeezed to a series. - squeeze_series : bool, default: False - If True, the passed frame will always be squeezed to a series if there is a single column named "__reduced__". squeeze_args : bool, default: False If True, all passed arguments will be squeezed. squeeze_kwargs : bool, default: False If True, all passed key word arguments will be squeezed. - return_modin : bool, default: True - If True, the result will always try to convert to DataFrame or Series. + return_raw : bool, default: False + If True and the result is not a DataFrame or Series, it is returned as is, without wrapping it in a query compiler. in_place : bool, default: False If True, the specified function will be applied on the passed frame in place. 
- df_copy : bool, default: False - If True, the specified function will be applied to a copy of the passed frame. filter_kwargs : list, default: [] List of key word argument names to remove. @@ -535,17 +527,9 @@ def _register_default_pandas( def caller(query_compiler, *args, **kwargs): df = query_compiler._modin_frame - if df_copy: - df = df.copy() if is_series: df = df.squeeze(axis=1) - exclude_names = [ - # "broadcast", - "fold_axis", - # "squeeze_self", - # "squeeze_value", - "ignore_indices", - ] + filter_kwargs + exclude_names = ["fold_axis"] + filter_kwargs kwargs = kwargs.copy() for name in exclude_names: kwargs.pop(name, None) @@ -553,12 +537,11 @@ def caller(query_compiler, *args, **kwargs): kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs) result = func(df, *args, **kwargs) inplace_method = kwargs.get("inplace", False) - if in_place: inplace_method = in_place if inplace_method: result = df - if not (return_modin or isinstance(result, (pandas.Series, pandas.DataFrame))): + if return_raw and not isinstance(result, (pandas.Series, pandas.DataFrame)): return result if isinstance(result, pandas.Series): if result.name is None: @@ -576,8 +559,8 @@ class NativeQueryCompiler(BaseQueryCompiler): Query compiler for the pandas storage format. This class translates common query compiler API into - plain pandas to execute operations on small data - depending on the threshold. + native library functions (e.g., pandas) to execute operations + on small data depending on the threshold. Parameters ---------- @@ -585,8 +568,11 @@ class NativeQueryCompiler(BaseQueryCompiler): Pandas frame to query with the compiled queries. 
""" - def __init__(self, pandas_frame): - assert NativeDataframeMode.get() == "Native_Pandas" + _modin_frame: pandas.DataFrame + _shape_hint: Optional[str] + + def __init__(self, pandas_frame, shape_hint: Optional[str] = None): + assert NativeDataframeMode.get() == "Pandas" if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): @@ -595,6 +581,7 @@ def __init__(self, pandas_frame): pandas_frame = pandas.DataFrame(pandas_frame) self._modin_frame = pandas_frame + self._shape_hint = shape_hint def execute(self): pass @@ -617,6 +604,10 @@ def set_frame_dtypes_cache(self, dtypes): Parameters ---------- dtypes : pandas.Series, ModinDtypes, callable or None + + Notes + ----- + This function is for consistency with other QCs; dtypes should be assigned directly on the frame. """ pass @@ -627,6 +618,10 @@ def set_frame_index_cache(self, index): Parameters ---------- index : sequence, callable or None + + Notes + ----- + This function is for consistency with other QCs; index should be assigned directly on the frame. 
""" pass @@ -665,27 +660,25 @@ def setitem_bool(self, row_loc, col_loc, item): self._modin_frame.loc[row_loc._modin_frame.squeeze(axis=1), col_loc] = item return self.__constructor__(self._modin_frame) - __and__ = _register_default_pandas(pandas.DataFrame.__and__, squeeze_series=True) + __and__ = _register_default_pandas(pandas.DataFrame.__and__) __dir__ = _register_default_pandas(pandas.DataFrame.__dir__) - __eq__ = _register_default_pandas(pandas.DataFrame.__eq__, squeeze_series=True) + __eq__ = _register_default_pandas(pandas.DataFrame.__eq__) __format__ = _register_default_pandas(pandas.DataFrame.__format__) - __ge__ = _register_default_pandas(pandas.DataFrame.__ge__, squeeze_series=True) - __gt__ = _register_default_pandas(pandas.DataFrame.__gt__, squeeze_series=True) - __le__ = _register_default_pandas(pandas.DataFrame.__le__, squeeze_series=True) - __lt__ = _register_default_pandas(pandas.DataFrame.__lt__, squeeze_series=True) - __ne__ = _register_default_pandas(pandas.DataFrame.__ne__, squeeze_series=True) - __or__ = _register_default_pandas(pandas.DataFrame.__or__, squeeze_series=True) - __rand__ = _register_default_pandas(pandas.DataFrame.__rand__, squeeze_series=True) - __reduce__ = _register_default_pandas( - pandas.DataFrame.__reduce__, return_modin=False - ) + __ge__ = _register_default_pandas(pandas.DataFrame.__ge__) + __gt__ = _register_default_pandas(pandas.DataFrame.__gt__) + __le__ = _register_default_pandas(pandas.DataFrame.__le__) + __lt__ = _register_default_pandas(pandas.DataFrame.__lt__) + __ne__ = _register_default_pandas(pandas.DataFrame.__ne__) + __or__ = _register_default_pandas(pandas.DataFrame.__or__) + __rand__ = _register_default_pandas(pandas.DataFrame.__rand__) + __reduce__ = _register_default_pandas(pandas.DataFrame.__reduce__, return_raw=True) __reduce_ex__ = _register_default_pandas( - pandas.DataFrame.__reduce_ex__, return_modin=False + pandas.DataFrame.__reduce_ex__, return_raw=True ) - __ror__ = 
_register_default_pandas(pandas.DataFrame.__ror__, squeeze_series=True) - __rxor__ = _register_default_pandas(pandas.DataFrame.__rxor__, squeeze_series=True) + __ror__ = _register_default_pandas(pandas.DataFrame.__ror__) + __rxor__ = _register_default_pandas(pandas.DataFrame.__rxor__) __sizeof__ = _register_default_pandas(pandas.DataFrame.__sizeof__) - __xor__ = _register_default_pandas(pandas.DataFrame.__xor__, squeeze_series=True) + __xor__ = _register_default_pandas(pandas.DataFrame.__xor__) abs = _register_default_pandas(pandas.DataFrame.abs) add = _register_default_pandas(_register_binary("add")) all = _register_default_pandas(pandas.DataFrame.all) @@ -696,10 +689,8 @@ def setitem_bool(self, row_loc, col_loc, item): astype = _register_default_pandas(pandas.DataFrame.astype) case_when = _register_default_pandas(pandas.Series.case_when) cat_codes = _register_default_pandas(lambda ser: ser.cat.codes, is_series=True) - combine = _register_default_pandas(_combine, squeeze_series=True) - combine_first = _register_default_pandas( - lambda df, other: df.combine_first(other), squeeze_series=True - ) + combine = _register_default_pandas(_combine) + combine_first = _register_default_pandas(lambda df, other: df.combine_first(other)) compare = _register_default_pandas(pandas.DataFrame.compare) concat = _register_default_pandas(_concat) conj = _register_default_pandas( @@ -714,9 +705,7 @@ def setitem_bool(self, row_loc, col_loc, item): cumprod = _register_default_pandas(pandas.DataFrame.cumprod) cumsum = _register_default_pandas(pandas.DataFrame.cumsum) delitem = _register_default_pandas(_delitem) - df_update = _register_default_pandas( - pandas.DataFrame.update, in_place=True, df_copy=True - ) + df_update = _register_default_pandas(pandas.DataFrame.update, in_place=True) diff = _register_default_pandas(pandas.DataFrame.diff) dot = _register_default_pandas(_register_binary("dot")) drop = _register_default_pandas(_drop) @@ -825,7 +814,7 @@ def setitem_bool(self, row_loc, 
col_loc, item): fillna = _register_default_pandas(_fillna) first_valid_index = _register_default_pandas( - pandas.DataFrame.first_valid_index, return_modin=False + pandas.DataFrame.first_valid_index, return_raw=True ) floordiv = _register_default_pandas(_register_binary("floordiv")) ge = _register_default_pandas(_register_binary("ge"), filter_kwargs=["dtypes"]) @@ -859,7 +848,7 @@ def setitem_bool(self, row_loc, col_loc, item): idxmax = _register_default_pandas(pandas.DataFrame.idxmax) idxmin = _register_default_pandas(pandas.DataFrame.idxmin) infer_objects = _register_default_pandas( - pandas.DataFrame.infer_objects, return_modin=False + pandas.DataFrame.infer_objects, return_raw=True ) insert = _register_default_pandas( pandas.DataFrame.insert, in_place=True, squeeze_args=True @@ -876,9 +865,9 @@ def setitem_bool(self, row_loc, col_loc, item): ) isna = _register_default_pandas(pandas.DataFrame.isna) join = _register_default_pandas(pandas.DataFrame.join) - kurt = _register_default_pandas(pandas.DataFrame.kurt, return_modin=False) + kurt = _register_default_pandas(pandas.DataFrame.kurt, return_raw=True) last_valid_index = _register_default_pandas( - pandas.DataFrame.last_valid_index, return_modin=False + pandas.DataFrame.last_valid_index, return_raw=True ) le = _register_default_pandas(_register_binary("le"), filter_kwargs=["dtypes"]) lt = _register_default_pandas(_register_binary("lt"), filter_kwargs=["dtypes"]) @@ -886,8 +875,8 @@ def setitem_bool(self, row_loc, col_loc, item): mask = _register_default_pandas(pandas.DataFrame.mask) max = _register_default_pandas(pandas.DataFrame.max) map = _register_default_pandas(pandas.DataFrame.map) - mean = _register_default_pandas(pandas.DataFrame.mean, return_modin=False) - median = _register_default_pandas(pandas.DataFrame.median, return_modin=False) + mean = _register_default_pandas(pandas.DataFrame.mean, return_raw=True) + median = _register_default_pandas(pandas.DataFrame.median, return_raw=True) melt = 
_register_default_pandas(pandas.DataFrame.melt) memory_usage = _register_default_pandas(pandas.DataFrame.memory_usage) merge = _register_default_pandas(pandas.DataFrame.merge) @@ -899,9 +888,7 @@ def setitem_bool(self, row_loc, col_loc, item): negative = _register_default_pandas(pandas.DataFrame.__neg__) nlargest = _register_default_pandas(pandas.DataFrame.nlargest) notna = _register_default_pandas(pandas.DataFrame.notna) - nsmallest = _register_default_pandas( - lambda df, **kwargs: df.nsmallest(**kwargs), squeeze_series=True - ) + nsmallest = _register_default_pandas(lambda df, **kwargs: df.nsmallest(**kwargs)) nunique = _register_default_pandas(pandas.DataFrame.nunique) pivot = _register_default_pandas(pandas.DataFrame.pivot) pivot_table = _register_default_pandas(pandas.DataFrame.pivot_table) @@ -982,7 +969,7 @@ def setitem_bool(self, row_loc, col_loc, item): series_view = _register_default_pandas(pandas.Series.view, is_series=True) set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index) setitem = _register_default_pandas(_setitem) - skew = _register_default_pandas(pandas.DataFrame.skew, return_modin=False) + skew = _register_default_pandas(pandas.DataFrame.skew, return_raw=True) sort_index = _register_default_pandas(_sort_index) sort_columns_by_row_values = _register_default_pandas( lambda df, columns, **kwargs: df.sort_values(by=columns, axis=1, **kwargs) @@ -1044,13 +1031,13 @@ def setitem_bool(self, row_loc, col_loc, item): sum_min_count = _register_default_pandas(pandas.DataFrame.sum) to_datetime = _register_default_pandas(_to_datetime) to_numeric = _register_default_pandas(_to_numeric) - to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_modin=False) + to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_raw=True) to_timedelta = _register_default_pandas( lambda ser, *args, **kwargs: pandas.to_timedelta(ser, *args, **kwargs), is_series=True, ) transpose = 
_register_default_pandas(pandas.DataFrame.transpose) - truediv = _register_default_pandas(_register_binary("truediv"), squeeze_series=True) + truediv = _register_default_pandas(_register_binary("truediv")) unstack = _register_default_pandas(pandas.DataFrame.unstack) var = _register_default_pandas(pandas.DataFrame.var) where = _register_default_pandas(pandas.DataFrame.where) diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 1b643cfcdba..2f614d5958d 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -211,7 +211,7 @@ def operation(df): reason="Modin on this engine doesn't create virtual partitions.", ) @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize( diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index a794aeb6446..da6c034d674 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -124,7 +124,7 @@ def test_to_numpy(data): @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -298,7 +298,7 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - if not NativeDataframeMode.get(): + if NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) @@ -318,7 +318,7 @@ def test_corr_non_numeric(self, numeric_only): reason="doesn't make sense for non-partitioned executions", ) 
@pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) def test_corr_nans_in_different_partitions(self): @@ -611,7 +611,10 @@ def test_pivot(data, index, columns, values, request): or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id or ( - (current_execution in ("BaseOnPython",) or NativeDataframeMode.get()) + ( + current_execution in ("BaseOnPython",) + or NativeDataframeMode.get() == "Pandas" + ) and index is lib.no_default ) ): @@ -992,7 +995,7 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") and StorageFormat.get() != "Base" - and NativeDataframeMode.get() is None, + and NativeDataframeMode.get() == "Default", reason="https://github.com/modin-project/modin/issues/6399", ), ), diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 0f38eaa5ebe..935b49cd318 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -67,6 +67,7 @@ def eval_setitem(md_df, pd_df, value, col=None, loc=None, expected_exception=Non col = pd_df.columns[loc] value_getter = value if callable(value) else (lambda *args, **kwargs: value) + eval_general( md_df, pd_df, @@ -83,6 +84,7 @@ def eval_loc(md_df, pd_df, value, key): md_value, pd_value = value else: md_value, pd_value = value, value + eval_general( md_df, pd_df, @@ -525,6 +527,7 @@ def test_loc_4456( if reverse_value_columns: pdf_value = pdf_value.reindex(columns=pdf_value.columns[::-1]) mdf_value = mdf_value.reindex(columns=mdf_value.columns[::-1]) + eval_loc(modin_df, pandas_df, pdf_value, key) eval_loc(modin_df, pandas_df, (mdf_value, pdf_value), key) @@ -588,7 +591,7 @@ def test_loc_setting_single_categorical_column(): 
@pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not currently support IO functions.", ) def test_loc_multi_index(): @@ -1487,7 +1490,6 @@ def test_reset_index(data, test_async_reset_index): pd_df_cp = pandas_df.copy() if test_async_reset_index: modin_df._query_compiler.set_frame_index_cache(None) - modin_df_cp.reset_index(inplace=True) pd_df_cp.reset_index(inplace=True) df_equals(modin_df_cp, pd_df_cp) @@ -2245,7 +2247,7 @@ def test___setitem__partitions_aligning(): @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not currently support IO functions.", ) def test___setitem__with_mismatched_partitions(): diff --git a/modin/tests/pandas/dataframe/test_iter.py b/modin/tests/pandas/dataframe/test_iter.py index b00ae056920..ccd6e632d10 100644 --- a/modin/tests/pandas/dataframe/test_iter.py +++ b/modin/tests/pandas/dataframe/test_iter.py @@ -142,7 +142,8 @@ def test_display_options_for___repr__(max_rows_columns, expand_frame_repr, frame def test___finalize__(): data = test_data_values[0] - with warns_that_defaulting_to_pandas(): + # Using force for warns_that_defaulting_to_pandas as the warnings are raised in the DataFrame layer, before getting into the QueryCompiler layer. 
+ with warns_that_defaulting_to_pandas(force=True): pd.DataFrame(data).__finalize__(None) @@ -230,7 +231,7 @@ def test___repr__(): "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" pandas_df = pandas.read_csv(io.StringIO(string_data)) - with warns_that_defaulting_to_pandas(): + with warns_that_defaulting_to_pandas(force=True): modin_df = pd.read_csv(io.StringIO(string_data)) assert repr(pandas_df) == repr(modin_df) diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index cebe1194c6c..06ee419e6ec 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -732,7 +732,7 @@ def test_sort_values_descending_with_only_two_bins(): modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() - if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( @@ -772,7 +772,7 @@ def test_sort_values_with_one_partition(ascending): np.array([["hello", "goodbye"], ["hello", "Hello"]]) ) - if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( @@ -893,7 +893,7 @@ def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_positio @pytest.mark.skipif( Engine.get() not in ("Ray", "Unidist", "Dask") - or NativeDataframeMode.get() is not None, + or NativeDataframeMode.get() == "Pandas", reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): diff --git 
a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index 40c910ed4cc..cc3d6753ea0 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -304,7 +304,10 @@ def test_copy(data): assert new_modin_df.columns is not modin_df.columns assert new_modin_df.dtypes is not modin_df.dtypes - if get_current_execution() != "BaseOnPython" and not NativeDataframeMode.get(): + if ( + get_current_execution() != "BaseOnPython" + and NativeDataframeMode.get() == "Default" + ): assert np.array_equal( new_modin_df._query_compiler._modin_frame._partitions, modin_df._query_compiler._modin_frame._partitions, @@ -571,7 +574,7 @@ def test_astype_int64_to_astype_category_github_issue_6259(): reason="BaseOnPython doesn't have proxy categories", ) @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler doesn't have proxy categories", ) class TestCategoricalProxyDtype: @@ -797,7 +800,7 @@ def comparator(df1, df2): @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) def test_convert_dtypes_multiple_row_partitions(): @@ -824,7 +827,7 @@ def test_convert_dtypes_5653(): modin_part1 = pd.DataFrame({"col1": ["a", "b", "c", "d"]}) modin_part2 = pd.DataFrame({"col1": [None, None, None, None]}) modin_df = pd.concat([modin_part1, modin_part2]) - if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) modin_df = modin_df.convert_dtypes() assert len(modin_df.dtypes) == 1 diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py index 9e5589314cc..55075f3a743 100644 --- a/modin/tests/test_utils.py +++ b/modin/tests/test_utils.py @@ 
-250,7 +250,7 @@ def test_format_string(): assert answer == expected -def warns_that_defaulting_to_pandas(prefix=None, suffix=None): +def warns_that_defaulting_to_pandas(prefix=None, suffix=None, force=False): """ Assert that code warns that it's defaulting to pandas. @@ -262,18 +262,19 @@ def warns_that_defaulting_to_pandas(prefix=None, suffix=None): suffix : Optional[str] If specified, checks that the end of the warning message matches this argument after "[Dd]efaulting to pandas". + force : Optional[bool] + If True, return the ``pytest.recwarn.WarningsChecker`` irrespective of ``NativeDataframeMode``. Returns ------- pytest.recwarn.WarningsChecker or contextlib.nullcontext - If Modin is not operating in MODIN_NATIVE_DATAFRAME_MODE,a WarningsChecker - is returned whic will check for a UserWarning indicating that Modin - is defaulting to Pandas. If MODIN_NATIVE_DATAFRAME_MODE is set, a - nullcontext is returned to avoid warning about the default to Pandas, - as this occurs due user selecting of MODIN_NATIVE_DATAFRAME_MODE. - + If Modin is not operating in ``NativeDataframeMode``, a ``WarningsChecker`` + is returned, which will check for a ``UserWarning`` indicating that Modin + is defaulting to Pandas. If ``NativeDataframeMode`` is set to ``Pandas`` and ``force`` is False, a + ``nullcontext`` is returned to avoid the warning about defaulting to Pandas, + as this occurs due to the user's setting of ``NativeDataframeMode``. """ - if NativeDataframeMode.get(): + if NativeDataframeMode.get() == "Pandas" and not force: return contextlib.nullcontext() match = "[Dd]efaulting to pandas"