From cf5d638ec7a69d2d851a7d43f23c96640eaab9dd Mon Sep 17 00:00:00 2001 From: Arun Jose <40291569+arunjose696@users.noreply.github.com> Date: Mon, 2 Sep 2024 14:29:23 +0200 Subject: [PATCH] FEAT-#7308: Interoperability between query compilers (#7376) Co-authored-by: Anatoly Myachev Co-authored-by: Igoshev, Iaroslav Signed-off-by: arunjose696 --- .github/workflows/ci.yml | 8 + .../pandas/native_query_compiler.py | 5 +- .../storage_formats/pandas/query_compiler.py | 3 +- .../pandas/query_compiler_caster.py | 159 +++++ modin/pandas/dataframe.py | 5 +- modin/tests/pandas/native_df_mode/__init__.py | 12 + .../pandas/native_df_mode/test_binary.py | 198 ++++++ .../pandas/native_df_mode/test_default.py | 338 +++++++++ .../pandas/native_df_mode/test_indexing.py | 668 ++++++++++++++++++ .../tests/pandas/native_df_mode/test_iter.py | 137 ++++ .../pandas/native_df_mode/test_join_sort.py | 411 +++++++++++ .../native_df_mode/test_map_metadata.py | 258 +++++++ .../pandas/native_df_mode/test_pickle.py | 73 ++ .../pandas/native_df_mode/test_window.py | 101 +++ modin/tests/pandas/native_df_mode/utils.py | 133 ++++ 15 files changed, 2502 insertions(+), 7 deletions(-) create mode 100644 modin/core/storage_formats/pandas/query_compiler_caster.py create mode 100644 modin/tests/pandas/native_df_mode/__init__.py create mode 100644 modin/tests/pandas/native_df_mode/test_binary.py create mode 100644 modin/tests/pandas/native_df_mode/test_default.py create mode 100644 modin/tests/pandas/native_df_mode/test_indexing.py create mode 100644 modin/tests/pandas/native_df_mode/test_iter.py create mode 100644 modin/tests/pandas/native_df_mode/test_join_sort.py create mode 100644 modin/tests/pandas/native_df_mode/test_map_metadata.py create mode 100644 modin/tests/pandas/native_df_mode/test_pickle.py create mode 100644 modin/tests/pandas/native_df_mode/test_window.py create mode 100644 modin/tests/pandas/native_df_mode/utils.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 
9186500682a..8fb26225613 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -698,6 +698,14 @@ jobs: - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py - run: python -m pytest modin/tests/pandas/dataframe/test_window.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_binary.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_default.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_indexing.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_iter.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_join_sort.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_map_metadata.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_pickle.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_window.py - uses: ./.github/actions/upload-coverage merge-coverage-artifacts: diff --git a/modin/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py index bfe331cfc6e..12f9da6ef46 100644 --- a/modin/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -24,8 +24,8 @@ import pandas from pandas.core.dtypes.common import is_list_like, is_scalar -from modin.config.envvars import NativeDataframeMode from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.core.storage_formats.pandas.query_compiler_caster import QueryCompilerCaster from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, @@ -565,7 +565,7 @@ def caller(query_compiler, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class NativeQueryCompiler(BaseQueryCompiler): +class NativeQueryCompiler(BaseQueryCompiler, QueryCompilerCaster): """ Query compiler for the pandas storage format. 
@@ -585,7 +585,6 @@ class NativeQueryCompiler(BaseQueryCompiler): _shape_hint: Optional[str] def __init__(self, pandas_frame, shape_hint: Optional[str] = None): - assert NativeDataframeMode.get() == "Pandas" if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 410bd2b50d8..c7fb0bae21b 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -66,6 +66,7 @@ extract_dtype, ) from modin.core.storage_formats import BaseQueryCompiler +from modin.core.storage_formats.pandas.query_compiler_caster import QueryCompilerCaster from modin.error_message import ErrorMessage from modin.logging import get_logger from modin.utils import ( @@ -253,7 +254,7 @@ def caller(df, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class PandasQueryCompiler(BaseQueryCompiler): +class PandasQueryCompiler(BaseQueryCompiler, QueryCompilerCaster): """ Query compiler for the pandas storage format. diff --git a/modin/core/storage_formats/pandas/query_compiler_caster.py b/modin/core/storage_formats/pandas/query_compiler_caster.py new file mode 100644 index 00000000000..211860a8427 --- /dev/null +++ b/modin/core/storage_formats/pandas/query_compiler_caster.py @@ -0,0 +1,159 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+"""
+Module contains ``QueryCompilerCaster`` class.
+
+``QueryCompilerCaster`` is used for automatically casting query compiler
+arguments to the type of the current query compiler for query compiler class functions.
+This ensures compatibility between different query compiler classes.
+"""
+
+import functools
+import inspect
+from types import FunctionType, MethodType
+from typing import Any, Dict, Tuple, TypeVar
+
+from pandas.core.indexes.frozen import FrozenList
+
+from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
+
+Fn = TypeVar("Fn", bound=Any)
+
+
+class QueryCompilerCaster:
+    """Cast all query compiler arguments of the member function to current query compiler."""
+
+    @classmethod
+    def __init_subclass__(cls, **kwargs: Dict) -> None:
+        """
+        Apply type casting to all children of ``QueryCompilerCaster``.
+
+        This method is called automatically when a class inherits from
+        ``QueryCompilerCaster``. It ensures that all member functions within the
+        subclass have their arguments automatically cast to the current query
+        compiler type.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Additional keyword arguments forwarded to ``super().__init_subclass__``.
+        """
+        super().__init_subclass__(**kwargs)
+        apply_argument_cast(cls)
+
+
+def cast_nested_args_to_current_qc_type(arguments, current_qc):
+    """
+    Cast all arguments in nested fashion to current query compiler.
+
+    Parameters
+    ----------
+    arguments : tuple or dict
+        Positional or keyword arguments; nested lists and dicts are updated in place.
+    current_qc : BaseQueryCompiler
+        Query compiler whose type the other query compilers are converted to.
+
+    Returns
+    -------
+    tuple or dict
+        Returns args and kwargs with all query compilers cast to current_qc.
+    """
+
+    def cast_arg_to_current_qc(arg):
+        current_qc_type = type(current_qc)
+        if isinstance(arg, BaseQueryCompiler) and not isinstance(arg, current_qc_type):
+            data_cls = current_qc._modin_frame
+            return current_qc_type.from_pandas(arg.to_pandas(), data_cls)
+        return arg
+
+    immutable_types = (FrozenList, tuple)
+    if isinstance(arguments, immutable_types):
+        args_type = type(arguments)
+        arguments = list(arguments)
+        arguments = cast_nested_args_to_current_qc_type(arguments, current_qc)
+        return args_type(arguments)
+    if isinstance(arguments, list):
+        for i in range(len(arguments)):
+            if isinstance(arguments[i], (list, dict)):
+                cast_nested_args_to_current_qc_type(arguments[i], current_qc)
+            else:
+                arguments[i] = cast_arg_to_current_qc(arguments[i])
+    elif isinstance(arguments, dict):
+        for key in arguments:
+            if isinstance(arguments[key], (list, dict)):
+                cast_nested_args_to_current_qc_type(arguments[key], current_qc)
+            else:
+                arguments[key] = cast_arg_to_current_qc(arguments[key])
+    return arguments
+
+
+def apply_argument_cast(obj: Fn) -> Fn:
+    """
+    Cast all arguments that are query compilers to the current query compiler.
+
+    Parameters
+    ----------
+    obj : function
+        Class, function, ``classmethod`` or ``staticmethod`` to decorate.
+
+    Returns
+    -------
+    function
+        Returns decorated function which does argument casting.
+    """
+    if isinstance(obj, type):
+        all_attrs = dict(inspect.getmembers(obj))
+        # Only ABC-derived classes expose this attribute, so it may be absent.
+        all_attrs.pop("__abstractmethods__", None)
+
+        # This is required because inspect converts class methods to member functions
+        all_attrs.update(vars(obj))
+
+        for attr_name, attr_value in all_attrs.items():
+            if isinstance(
+                attr_value, (FunctionType, MethodType, classmethod, staticmethod)
+            ):
+                wrapped = apply_argument_cast(attr_value)
+                setattr(obj, attr_name, wrapped)
+        return obj  # type: ignore [return-value]
+    elif isinstance(obj, classmethod):
+        return classmethod(apply_argument_cast(obj.__func__))  # type: ignore [return-value, arg-type]
+    elif isinstance(obj, staticmethod):
+        return staticmethod(apply_argument_cast(obj.__func__))
+
+    @functools.wraps(obj)
+    def cast_args(*args: Tuple, **kwargs: Dict) -> Any:
+        """
+        Add casting for query compiler arguments.
+
+        Parameters
+        ----------
+        *args : tuple
+            The function arguments.
+        **kwargs : dict
+            The function keyword arguments.
+
+        Returns
+        -------
+        Any
+            Result of the wrapped function.
+        """
+        # Keyword-only calls have no positional argument to act as the reference.
+        current_qc = args[0] if args else None
+        if isinstance(current_qc, BaseQueryCompiler):
+            kwargs = cast_nested_args_to_current_qc_type(kwargs, current_qc)
+            args = cast_nested_args_to_current_qc_type(args, current_qc)
+        return obj(*args, **kwargs)
+
+    return cast_args
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 3d97efb4af4..de96ea0ab26 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -2993,9 +2993,8 @@ def _create_or_update_from_compiler(
         DataFrame or None
             None if update was done, ``DataFrame`` otherwise.
""" - assert ( - isinstance(new_query_compiler, type(self._query_compiler)) - or type(new_query_compiler) in self._query_compiler.__class__.__bases__ + assert isinstance( + new_query_compiler, self._query_compiler.__class__.__bases__ ), "Invalid Query Compiler object: {}".format(type(new_query_compiler)) if not inplace: return self.__constructor__(query_compiler=new_query_compiler) diff --git a/modin/tests/pandas/native_df_mode/__init__.py b/modin/tests/pandas/native_df_mode/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/tests/pandas/native_df_mode/test_binary.py b/modin/tests/pandas/native_df_mode/test_binary.py new file mode 100644 index 00000000000..82c837b6416 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_binary.py @@ -0,0 +1,198 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. 
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from itertools import product + +import matplotlib +import pytest + +from modin.config import NativeDataframeMode, NPartitions +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + default_to_pandas_ignore_string, + df_equals, + test_data, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. 
+pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + + +@pytest.mark.parametrize( + "other", + [ + lambda df, axis: 4, + lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]), + lambda df, axis: { + label: idx + 1 + for idx, label in enumerate(df.axes[0 if axis == "rows" else 1]) + }, + lambda df, axis: { + label if idx % 2 else f"random_key{idx}": idx + 1 + for idx, label in enumerate(df.axes[0 if axis == "rows" else 1][::-1]) + }, + ], + ids=[ + "scalar", + "series_or_list", + "dictionary_keys_equal_columns", + "dictionary_keys_unequal_columns", + ], +) +@pytest.mark.parametrize("axis", ["rows", "columns"]) +@pytest.mark.parametrize( + "op", + [ + *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), + *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +@pytest.mark.parametrize("backend", [None, "pyarrow"]) +def test_math_functions(other, axis, op, backend, df_mode_pair): + data = test_data["float_nan_data"] + if (op == "floordiv" or op == "rfloordiv") and axis == "rows": + # lambda == "series_or_list" + pytest.xfail(reason="different behavior") + + if op == "rmod" and axis == "rows": + # lambda == "series_or_list" + pytest.xfail(reason="different behavior") + + if op in ("mod", "rmod") and backend == "pyarrow": + pytest.skip(reason="These functions are not implemented in pandas itself") + + eval_general_interop( + data, + backend, + lambda df1, df2: getattr(df1, op)(other(df2, axis), axis=axis), + df_mode_pair, + ) + + +@pytest.mark.parametrize("other", [lambda df: 2, lambda df: df]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test___divmod__(other, df_mode_pair): + data = test_data["float_nan_data"] + eval_general_interop( + data, None, lambda df1, df2: divmod(df1, other(df2)), df_mode_pair + ) + + 
+@pytest.mark.parametrize("other", ["as_left", 4]) +@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_comparison(data, op, other, request, df_mode_pair): + def operation(df1, df2): + return getattr(df1, op)(df2 if other == "as_left" else other) + + expected_exception = None + if "int_data" in request.node.callspec.id and other == "a": + pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019") + elif "float_nan_data" in request.node.callspec.id and other == "a": + expected_exception = TypeError( + "Invalid comparison between dtype=float64 and str" + ) + eval_general_interop( + data, + None, + operation, + df_mode_pair, + expected_exception=expected_exception, + ) + + +@pytest.mark.parametrize( + "frame1_data,frame2_data,expected_pandas_equals", + [ + pytest.param({}, {}, True, id="two_empty_dataframes"), + pytest.param([[1]], [[0]], False, id="single_unequal_values"), + pytest.param([[None]], [[None]], True, id="single_none_values"), + pytest.param( + [[1, 2], [3, 4]], + [[1, 2], [3, 4]], + True, + id="equal_two_by_two_dataframes", + ), + pytest.param( + [[1, 2], [3, 4]], + [[5, 2], [3, 4]], + False, + id="unequal_two_by_two_dataframes", + ), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_equals(frame1_data, frame2_data, expected_pandas_equals, df_mode_pair): + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + frame1_data, df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + frame2_data, df_mode=df_mode_pair[1] + ) + + pandas_equals = pandas_df1.equals(pandas_df2) + assert pandas_equals == expected_pandas_equals, ( + "Test expected pandas to say the dataframes were" + + f"{'' if expected_pandas_equals else ' not'} equal, but they were" + 
+ f"{' not' if expected_pandas_equals else ''} equal." + ) + + assert modin_df1.equals(modin_df2) == pandas_equals + assert modin_df1.equals(pandas_df2) == pandas_equals + + +@pytest.mark.parametrize("empty_operand", ["right", "left", "both"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_empty_df(empty_operand, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + [0, 1, 2, 0, 1, 2], df_mode=df_mode_pair[0] + ) + modin_df_empty, pandas_df_empty = create_test_df_in_defined_mode( + df_mode=df_mode_pair[1] + ) + + if empty_operand == "right": + modin_res = modin_df + modin_df_empty + pandas_res = pandas_df + pandas_df_empty + elif empty_operand == "left": + modin_res = modin_df_empty + modin_df + pandas_res = pandas_df_empty + pandas_df + else: + modin_res = modin_df_empty + modin_df_empty + pandas_res = pandas_df_empty + pandas_df_empty + + df_equals(modin_res, pandas_res) diff --git a/modin/tests/pandas/native_df_mode/test_default.py b/modin/tests/pandas/native_df_mode/test_default.py new file mode 100644 index 00000000000..03d6d372fd4 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_default.py @@ -0,0 +1,338 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific language +# governing permissions and limitations under the License. + + +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest +from numpy.testing import assert_array_equal + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.pandas.io import to_pandas +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + default_to_pandas_ignore_string, + df_equals, + test_data, + test_data_diff_dtype, + test_data_keys, + test_data_large_categorical_dataframe, + test_data_values, +) +from modin.tests.test_utils import warns_that_defaulting_to_pandas + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. 
+pytestmark = [ + pytest.mark.filterwarnings(default_to_pandas_ignore_string), + # IGNORE FUTUREWARNINGS MARKS TO CLEANUP OUTPUT + pytest.mark.filterwarnings( + "ignore:.*bool is now deprecated and will be removed:FutureWarning" + ), + pytest.mark.filterwarnings( + "ignore:first is deprecated and will be removed:FutureWarning" + ), + pytest.mark.filterwarnings( + "ignore:last is deprecated and will be removed:FutureWarning" + ), +] + + +@pytest.mark.parametrize( + "op, make_args", + [ + ("align", lambda df: {"other": df}), + ("corrwith", lambda df: {"other": df}), + ("ewm", lambda df: {"com": 0.5}), + ("from_dict", lambda df: {"data": None}), + ("from_records", lambda df: {"data": to_pandas(df)}), + ("hist", lambda df: {"column": "int_col"}), + ("interpolate", None), + ("mask", lambda df: {"cond": df != 0}), + ("pct_change", None), + ("to_xarray", None), + ("flags", None), + ("set_flags", lambda df: {"allows_duplicate_labels": False}), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_ops_defaulting_to_pandas(op, make_args, df_mode_pair): + modin_df1, _ = create_test_df_in_defined_mode( + test_data_diff_dtype, + post_fn=lambda df: df.drop(["str_col", "bool_col"], axis=1), + df_mode=df_mode_pair[0], + ) + modin_df2, _ = create_test_df_in_defined_mode( + test_data_diff_dtype, + post_fn=lambda df: df.drop(["str_col", "bool_col"], axis=1), + df_mode=df_mode_pair[1], + ) + with warns_that_defaulting_to_pandas(): + operation = getattr(modin_df1, op) + if make_args is not None: + operation(**make_args(modin_df2)) + else: + try: + operation() + # `except` for non callable attributes + except TypeError: + pass + + +@pytest.mark.parametrize( + "data", + test_data_values + [test_data_large_categorical_dataframe], + ids=test_data_keys + ["categorical_ints"], +) +def test_to_numpy(data): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + assert_array_equal(modin_df.values, pandas_df.values) + 
+ +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_asfreq(df_mode_pair): + index = pd.date_range("1/1/2000", periods=4, freq="min") + series, _ = create_test_series_in_defined_mode( + [0.0, None, 2.0, 3.0], index=index, df_mode=df_mode_pair[0] + ) + df, _ = create_test_df_in_defined_mode({"s": series}, df_mode=df_mode_pair[1]) + with warns_that_defaulting_to_pandas(): + # We are only testing that this defaults to pandas, so we will just check for + # the warning + df.asfreq(freq="30S") + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_assign(df_mode_pair): + data = test_data_values[0] + + def assign_one_column(df1, df2): + df1.assign(new_column=pd.Series(df2.iloc[:, 0])) + + eval_general_interop(data, None, assign_one_column, df_mode_pair) + + def assign_multiple_columns(df1, df2): + df1.assign( + new_column=pd.Series(df2.iloc[:, 0]), new_column2=pd.Series(df2.iloc[:, 1]) + ) + + eval_general_interop(data, None, assign_multiple_columns, df_mode_pair) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_combine_first(df_mode_pair): + data1 = {"A": [None, 0], "B": [None, 4]} + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + data1, df_mode=df_mode_pair[0] + ) + data2 = {"A": [1, 1], "B": [3, 3]} + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + data2, df_mode=df_mode_pair[1] + ) + + df_equals( + modin_df1.combine_first(modin_df2), + pandas_df1.combine_first(pandas_df2), + # https://github.com/modin-project/modin/issues/5959 + check_dtypes=False, + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_dot(data, df_mode_pair): + + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + col_len = 
len(modin_df.columns) + + # Test series input + modin_series, pandas_series = create_test_series_in_defined_mode( + np.arange(col_len), + index=pandas_df.columns, + df_mode=df_mode_pair[1], + ) + modin_result = modin_df.dot(modin_series) + pandas_result = pandas_df.dot(pandas_series) + df_equals(modin_result, pandas_result) + + def dot_func(df1, df2): + return df1.dot(df2.T) + + # modin_result = modin_df.dot(modin_df.T) + # pandas_result = pandas_df.dot(pandas_df.T) + # df_equals(modin_result, pandas_result) + # Test dataframe input + eval_general_interop(data, None, dot_func, df_mode_pair) + + # Test when input series index doesn't line up with columns + with pytest.raises(ValueError): + modin_series_without_index, _ = create_test_series_in_defined_mode( + np.arange(col_len), df_mode=df_mode_pair[1] + ) + modin_df.dot(modin_series_without_index) + + # Test case when left dataframe has size (n x 1) + # and right dataframe has size (1 x n) + eval_general_interop(pandas_series, None, dot_func, df_mode_pair) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_matmul(data, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + col_len = len(modin_df.columns) + + # Test list input + arr = np.arange(col_len) + modin_result = modin_df @ arr + pandas_result = pandas_df @ arr + df_equals(modin_result, pandas_result) + + # Test bad dimensions + with pytest.raises(ValueError): + modin_df @ np.arange(col_len + 10) + + # Test series input + modin_series, pandas_series = create_test_series_in_defined_mode( + np.arange(col_len), + index=pandas_df.columns, + df_mode=df_mode_pair[1], + ) + modin_result = modin_df @ modin_series + pandas_result = pandas_df @ pandas_series + df_equals(modin_result, pandas_result) + + # Test dataframe input + def matmul_func(df1, df2): + return df1 @ df2.T + + # Test 
dataframe input + eval_general_interop(data, None, matmul_func, df_mode_pair) + + # Test when input series index doesn't line up with columns + with pytest.raises(ValueError): + modin_series_without_index, _ = create_test_series_in_defined_mode( + np.arange(col_len), df_mode=df_mode_pair[1] + ) + modin_df @ modin_series_without_index + + +@pytest.mark.parametrize("data", [test_data["int_data"]], ids=["int_data"]) +@pytest.mark.parametrize( + "index", + [ + pytest.param(lambda _, df: df.columns[0], id="single_index_col"), + pytest.param( + lambda _, df: [*df.columns[0:2], *df.columns[-7:-4]], + id="multiple_index_cols", + ), + pytest.param(None, id="default_index"), + ], +) +@pytest.mark.parametrize( + "columns", + [ + pytest.param(lambda _, df: df.columns[len(df.columns) // 2], id="single_col"), + pytest.param( + lambda _, df: [ + *df.columns[(len(df.columns) // 2) : (len(df.columns) // 2 + 4)], + df.columns[-7], + ], + id="multiple_cols", + ), + pytest.param(None, id="default_columns"), + ], +) +@pytest.mark.parametrize( + "values", + [ + pytest.param(lambda _, df: df.columns[-1], id="single_value_col"), + pytest.param(lambda _, df: df.columns[-4:-1], id="multiple_value_cols"), + ], +) +@pytest.mark.parametrize( + "aggfunc", + [ + pytest.param(lambda df, _: np.mean(df), id="callable_tree_reduce_func"), + pytest.param("mean", id="tree_reduce_func"), + pytest.param("nunique", id="full_axis_func"), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_pivot_table_data(data, index, columns, values, aggfunc, request, df_mode_pair): + if ( + "callable_tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "callable_tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or 
"tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "full_axis_func-single_value_col-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "full_axis_func-multiple_value_cols-multiple_cols-multiple_index_cols" + in request.node.callspec.id + ): + pytest.xfail(reason="https://github.com/modin-project/modin/issues/7011") + + expected_exception = None + if "default_columns-default_index" in request.node.callspec.id: + expected_exception = ValueError("No group keys passed!") + elif ( + "callable_tree_reduce_func" in request.node.callspec.id + and "int_data" in request.node.callspec.id + ): + expected_exception = TypeError("'numpy.float64' object is not callable") + + eval_general_interop( + data, + None, + operation=lambda df, _, *args, **kwargs: df.pivot_table( + *args, **kwargs + ).sort_index(axis=int(index is not None)), + df_mode_pair=df_mode_pair, + index=index, + columns=columns, + values=values, + aggfunc=aggfunc, + expected_exception=expected_exception, + ) diff --git a/modin/tests/pandas/native_df_mode/test_indexing.py b/modin/tests/pandas/native_df_mode/test_indexing.py new file mode 100644 index 00000000000..b434026394a --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_indexing.py @@ -0,0 +1,668 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific language +# governing permissions and limitations under the License. +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + RAND_HIGH, + RAND_LOW, + default_to_pandas_ignore_string, + df_equals, + eval_general, + test_data, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. +# TODO(https://github.com/modin-project/modin/issues/3655): catch all instances +# of defaulting to pandas. 
+pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + + +def eval_setitem(md_df, pd_df, value, col=None, loc=None, expected_exception=None): + if loc is not None: + col = pd_df.columns[loc] + + value_getter = value if callable(value) else (lambda *args, **kwargs: value) + + eval_general( + md_df, + pd_df, + lambda df: df.__setitem__(col, value_getter(df)), + __inplace__=True, + expected_exception=expected_exception, + ) + df_mode_pair_list = list(product(NativeDataframeMode.choices, repeat=2)) + for df_mode_pair in df_mode_pair_list: + eval_general_interop( + pd_df, + None, + lambda df1, df2: df1.__setitem__(col, value_getter(df2)), + df_mode_pair, + __inplace__=True, + expected_exception=expected_exception, + ) + + +def eval_loc(md_df, pd_df, value, key): + if isinstance(value, tuple): + assert len(value) == 2 + # case when value for pandas different + md_value, pd_value = value + else: + md_value, pd_value = value, value + + eval_general( + md_df, + pd_df, + lambda df: df.loc.__setitem__( + key, pd_value if isinstance(df, pandas.DataFrame) else md_value + ), + __inplace__=True, + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "key_func", + [ + # test for the case from https://github.com/modin-project/modin/issues/4308 + lambda df: "non_existing_column", + lambda df: df.columns[0], + lambda df: df.index, + lambda df: [df.index, df.columns[0]], + lambda df: ( + pandas.Series(list(range(len(df.index)))) + if isinstance(df, pandas.DataFrame) + else pd.Series(list(range(len(df)))) + ), + ], + ids=[ + "non_existing_column", + "first_column_name", + "original_index", + "list_of_index_and_first_column_name", + "series_of_integers", + ], +) +@pytest.mark.parametrize( + "drop_kwargs", + [{"drop": True}, {"drop": False}, {}], + ids=["drop_True", "drop_False", "no_drop_param"], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def 
test_set_index(data, key_func, drop_kwargs, request, df_mode_pair): + if ( + "list_of_index_and_first_column_name" in request.node.name + and "drop_False" in request.node.name + ): + pytest.xfail( + reason="KeyError: https://github.com/modin-project/modin/issues/5636" + ) + expected_exception = None + if "non_existing_column" in request.node.callspec.id: + expected_exception = KeyError( + "None of ['non_existing_column'] are in the columns" + ) + + eval_general_interop( + data, + None, + lambda df1, df2: df1.set_index(key_func(df2), **drop_kwargs), + expected_exception=expected_exception, + df_mode_pair=df_mode_pair, + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_loc(data, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + + indices = [i % 3 == 0 for i in range(len(modin_df.index))] + columns = [i % 5 == 0 for i in range(len(modin_df.columns))] + + # Key is a Modin or pandas series of booleans + series1, _ = create_test_series_in_defined_mode(indices, df_mode=df_mode_pair[1]) + series2, _ = create_test_series_in_defined_mode( + columns, index=modin_df.columns, df_mode=df_mode_pair[1] + ) + df_equals( + modin_df.loc[series1, series2], + pandas_df.loc[ + pandas.Series(indices), pandas.Series(columns, index=modin_df.columns) + ], + ) + + +@pytest.mark.parametrize("left, right", [(2, 1), (6, 1), (lambda df: 70, 1), (90, 70)]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_loc_insert_row(left, right, df_mode_pair): + # This test case comes from + # https://github.com/modin-project/modin/issues/3764 + data = [[1, 2, 3], [4, 5, 6]] + + def _test_loc_rows(df1, df2): + df1.loc[left] = df2.loc[right] + return df1 + + expected_exception = None + if right == 70: +
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7024") + + eval_general_interop( + data, + None, + _test_loc_rows, + expected_exception=expected_exception, + df_mode_pair=df_mode_pair, + ) + + +@pytest.fixture(params=list(product(NativeDataframeMode.choices, repeat=2))) +def loc_iter_dfs_interop(request): + df_mode_pair = request.param + columns = ["col1", "col2", "col3"] + index = ["row1", "row2", "row3"] + md_df1, pd_df1 = create_test_df_in_defined_mode( + {col: ([idx] * len(index)) for idx, col in enumerate(columns)}, + columns=columns, + index=index, + df_mode=df_mode_pair[0], + ) + md_df2, pd_df2 = create_test_df_in_defined_mode( + {col: ([idx] * len(index)) for idx, col in enumerate(columns)}, + columns=columns, + index=index, + df_mode=df_mode_pair[1], + ) + return md_df1, pd_df1, md_df2, pd_df2 + + +@pytest.mark.parametrize("reverse_order", [False, True]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_loc_iter_assignment(loc_iter_dfs_interop, reverse_order, axis): + if reverse_order and axis: + pytest.xfail( + "Due to internal sorting of lookup values assignment order is lost, see GH-#2552" + ) + + md_df1, pd_df1, md_df2, pd_df2 = loc_iter_dfs_interop + + select = [slice(None), slice(None)] + select[axis] = sorted(pd_df1.axes[axis][:-1], reverse=reverse_order) + select = tuple(select) + + pd_df1.loc[select] = pd_df1.loc[select] + pd_df2.loc[select] + md_df1.loc[select] = md_df1.loc[select] + md_df2.loc[select] + df_equals(md_df1, pd_df1) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_loc_series(df_mode_pair): + md_df1, pd_df1 = create_test_df_in_defined_mode( + {"a": [1, 2], "b": [3, 4]}, df_mode=df_mode_pair[0] + ) + md_df2, pd_df2 = create_test_df_in_defined_mode( + {"a": [1, 2], "b": [3, 4]}, df_mode=df_mode_pair[1] + ) + + pd_df1.loc[pd_df2["a"] > 1, "b"] = np.log(pd_df1["b"]) + md_df1.loc[md_df2["a"] > 1, "b"] = np.log(md_df1["b"]) + + df_equals(pd_df1, md_df1) 
+ + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_reindex_like(df_mode_pair): + o_data = [ + [24.3, 75.7, "high"], + [31, 87.8, "high"], + [22, 71.6, "medium"], + [35, 95, "medium"], + ] + o_columns = ["temp_celsius", "temp_fahrenheit", "windspeed"] + o_index = pd.date_range(start="2014-02-12", end="2014-02-15", freq="D") + new_data = [[28, "low"], [30, "low"], [35.1, "medium"]] + new_columns = ["temp_celsius", "windspeed"] + new_index = pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]) + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + o_data, + columns=o_columns, + index=o_index, + df_mode=df_mode_pair[0], + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + new_data, + columns=new_columns, + index=new_index, + df_mode=df_mode_pair[1], + ) + modin_result = modin_df2.reindex_like(modin_df1) + pandas_result = pandas_df2.reindex_like(pandas_df1) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_reindex_multiindex(df_mode_pair): + data1, data2 = np.random.randint(1, 20, (5, 5)), np.random.randint(10, 25, 6) + index = np.array(["AUD", "BRL", "CAD", "EUR", "INR"]) + pandas_midx = pandas.MultiIndex.from_product( + [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"] + ) + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + data=data1, index=index, columns=index, df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + data=data2, index=pandas_midx, df_mode=df_mode_pair[1] + ) + + modin_df2.columns, pandas_df2.columns = ["Notional"], ["Notional"] + md_midx = pd.MultiIndex.from_product([modin_df2.index.levels[0], modin_df1.index]) + pd_midx = pandas.MultiIndex.from_product( + [pandas_df2.index.levels[0], pandas_df1.index] + ) + # reindex without axis, index, or columns + modin_result = modin_df1.reindex(md_midx, 
fill_value=0) + pandas_result = pandas_df1.reindex(pd_midx, fill_value=0) + df_equals(modin_result, pandas_result) + # reindex with only axis + modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0) + pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0) + df_equals(modin_result, pandas_result) + # reindex with axis and level + modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0, level=0) + pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0, level=0) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_getitem_empty_mask(df_mode_pair): + # modin-project/modin#517 + modin_frames = [] + pandas_frames = [] + data1 = np.random.randint(0, 100, size=(100, 4)) + mdf1, pdf1 = create_test_df_in_defined_mode( + data1, columns=list("ABCD"), df_mode=df_mode_pair[0] + ) + + modin_frames.append(mdf1) + pandas_frames.append(pdf1) + + data2 = np.random.randint(0, 100, size=(100, 4)) + mdf2, pdf2 = create_test_df_in_defined_mode( + data2, columns=list("ABCD"), df_mode=df_mode_pair[1] + ) + modin_frames.append(mdf2) + pandas_frames.append(pdf2) + + data3 = np.random.randint(0, 100, size=(100, 4)) + mdf3, pdf3 = create_test_df_in_defined_mode( + data3, columns=list("ABCD"), df_mode=df_mode_pair[0] + ) + modin_frames.append(mdf3) + pandas_frames.append(pdf3) + + modin_data = pd.concat(modin_frames) + pandas_data = pandas.concat(pandas_frames) + df_equals( + modin_data[[False for _ in modin_data.index]], + pandas_data[[False for _ in modin_data.index]], + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test___setitem__mask(df_mode_pair): + # DataFrame mask: + data = test_data["int_data"] + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[1] + ) + +
mean = int((RAND_HIGH + RAND_LOW) / 2) + pandas_df1[pandas_df2 > mean] = -50 + modin_df1[modin_df2 > mean] = -50 + + df_equals(modin_df1, pandas_df1) + + +@pytest.mark.parametrize( + "data", + [ + {}, + {"id": [], "max_speed": [], "health": []}, + {"id": [1], "max_speed": [2], "health": [3]}, + {"id": [4, 40, 400], "max_speed": [111, 222, 333], "health": [33, 22, 11]}, + ], + ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"], +) +@pytest.mark.parametrize( + "value", + [[11, 22], [11, 22, 33]], + ids=["2_length_val", "3_length_val"], +) +@pytest.mark.parametrize("convert_to_series", [False, True]) +@pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + + def applyier(df): + if convert_to_series: + converted_value = ( + pandas.Series(value) + if isinstance(df, pandas.DataFrame) + else create_test_series_in_defined_mode(value, df_mode=df_mode_pair[1])[ + 0 + ] + ) + else: + converted_value = value + df[new_col_id] = converted_value + return df + + expected_exception = None + if not convert_to_series: + values_length = len(value) + index_length = len(pandas_df.index) + expected_exception = ValueError( + f"Length of values ({values_length}) does not match length of index ({index_length})" + ) + + eval_general( + modin_df, + pandas_df, + applyier, + # https://github.com/modin-project/modin/issues/5961 + comparator_kwargs={ + "check_dtypes": not (len(pandas_df) == 0 and len(pandas_df.columns) != 0) + }, + expected_exception=expected_exception, + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_setitem_on_empty_df_4407(df_mode_pair): + data = {} + index = pd.date_range(end="1/1/2018",
periods=0, freq="D") + column = pd.date_range(end="1/1/2018", periods=1, freq="h")[0] + modin_df, pandas_df = create_test_df_in_defined_mode( + data, columns=index, df_mode=df_mode_pair[0] + ) + modin_ser, pandas_ser = create_test_series_in_defined_mode( + [1], df_mode=df_mode_pair[1] + ) + modin_df[column] = modin_ser + pandas_df[column] = pandas_ser + + df_equals(modin_df, pandas_df) + assert modin_df.columns.freq == pandas_df.columns.freq + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_setitem_2d_insertion(df_mode_pair): + def build_value_picker(modin_value, pandas_value): + """Build a function that returns either Modin or pandas DataFrame depending on the passed frame.""" + return lambda source_df, *args, **kwargs: ( + modin_value + if isinstance(source_df, (pd.DataFrame, pd.Series)) + else pandas_value + ) + + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data["int_data"], df_mode=df_mode_pair[0] + ) + + # Easy case - key and value.columns are equal + modin_value, pandas_value = create_test_df_in_defined_mode( + { + "new_value1": np.arange(len(modin_df)), + "new_value2": np.arange(len(modin_df)), + }, + df_mode=df_mode_pair[1], + ) + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value, pandas_value), + col=["new_value1", "new_value2"], + ) + + # Key and value.columns have equal values but in different order + new_columns = ["new_value3", "new_value4"] + modin_value.columns, pandas_value.columns = new_columns, new_columns + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value, pandas_value), + col=["new_value4", "new_value3"], + ) + + # Key and value.columns have different values + new_columns = ["new_value5", "new_value6"] + modin_value.columns, pandas_value.columns = new_columns, new_columns + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value, pandas_value), + col=["__new_value5", "__new_value6"], + ) + + # Key and 
value.columns have different lengths, testing that both raise the same exception + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value.iloc[:, [0]], pandas_value.iloc[:, [0]]), + col=["new_value7", "new_value8"], + expected_exception=ValueError("Columns must be same length as key"), + ) + + +@pytest.mark.parametrize("does_value_have_different_columns", [True, False]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_setitem_2d_update(does_value_have_different_columns, df_mode_pair): + def test(dfs, iloc): + """Update columns on the given numeric indices.""" + df1, df2 = dfs + cols1 = df1.columns[iloc].tolist() + cols2 = df2.columns[iloc].tolist() + df1[cols1] = df2[cols2] + return df1 + + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data["int_data"], df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + test_data["int_data"], df_mode=df_mode_pair[1] + ) + modin_df2 *= 10 + pandas_df2 *= 10 + + if does_value_have_different_columns: + new_columns = [f"{col}_new" for col in modin_df.columns] + modin_df2.columns = new_columns + pandas_df2.columns = new_columns + + modin_dfs = (modin_df, modin_df2) + pandas_dfs = (pandas_df, pandas_df2) + + eval_general(modin_dfs, pandas_dfs, test, iloc=[0, 1, 2]) + eval_general(modin_dfs, pandas_dfs, test, iloc=[0, -1]) + eval_general( + modin_dfs, pandas_dfs, test, iloc=slice(1, None) + ) # (start=1, stop=None) + eval_general( + modin_dfs, pandas_dfs, test, iloc=slice(None, -2) + ) # (start=None, stop=-2) + eval_general( + modin_dfs, + pandas_dfs, + test, + iloc=[0, 1, 5, 6, 9, 10, -2, -1], + ) + eval_general( + modin_dfs, + pandas_dfs, + test, + iloc=[5, 4, 0, 10, 1, -1], + ) + eval_general( + modin_dfs, pandas_dfs, test, iloc=slice(None, None, 2) + ) # (start=None, stop=None, step=2) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def 
test___setitem__single_item_in_series(df_mode_pair): + # Test assigning a single item in a Series for issue + # https://github.com/modin-project/modin/issues/3860 + modin_series1, pandas_series1 = create_test_series_in_defined_mode( + 99, df_mode=df_mode_pair[0] + ) + modin_series2, pandas_series2 = create_test_series_in_defined_mode( + 100, df_mode=df_mode_pair[1] + ) + modin_series1[:1] = modin_series2 + pandas_series1[:1] = pandas_series2 + df_equals(modin_series1, pandas_series1) + + +@pytest.mark.parametrize( + "value", + [ + 1, + np.int32(1), + 1.0, + "str val", + pandas.Timestamp("1/4/2018"), + np.datetime64(0, "ms"), + True, + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_loc_boolean_assignment_scalar_dtypes(value, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + { + "a": [1, 2, 3], + "b": [3.0, 5.0, 6.0], + "c": ["a", "b", "c"], + "d": [1.0, "c", 2.0], + "e": pandas.to_datetime(["1/1/2018", "1/2/2018", "1/3/2018"]), + "f": [True, False, True], + }, + df_mode=df_mode_pair[0], + ) + modin_idx, pandas_idx = create_test_series_in_defined_mode( + [False, True, True], df_mode=df_mode_pair[1] + ) + + modin_df.loc[modin_idx] = value + pandas_df.loc[pandas_idx] = value + df_equals(modin_df, pandas_df) + + +# This is a very subtle bug that comes from: +# https://github.com/modin-project/modin/issues/4945 +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_lazy_eval_index(df_mode_pair): + data = {"col0": [0, 1]} + + def func(df1, df2): + df_copy = df1[df2["col0"] < 6].copy() + # The problem here is that the index is not copied over so it needs + # to get recomputed at some point. Our implementation of __setitem__ + # requires us to build a mask and insert the value from the right + # hand side into the new DataFrame.
However, it's possible that we + # won't have any new partitions, so we will end up computing an empty + # index. + df_copy["col0"] = df_copy["col0"].apply(lambda x: x + 1) + return df_copy + + eval_general_interop(data, None, func, df_mode_pair=df_mode_pair) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_index_of_empty_frame(df_mode_pair): + # Test on an empty frame created by user + + # Test on an empty frame produced by Modin's logic + data = test_data_values[0] + md_df1, pd_df1 = create_test_df_in_defined_mode( + data, + index=pandas.RangeIndex(len(next(iter(data.values()))), name="index name"), + df_mode=df_mode_pair[0], + ) + md_df2, pd_df2 = create_test_df_in_defined_mode( + data, + index=pandas.RangeIndex(len(next(iter(data.values()))), name="index name"), + df_mode=df_mode_pair[1], + ) + + md_res = md_df1.query(f"{md_df2.columns[0]} > {RAND_HIGH}") + pd_res = pd_df1.query(f"{pd_df2.columns[0]} > {RAND_HIGH}") + + assert md_res.empty and pd_res.empty + df_equals(md_res.index, pd_res.index) diff --git a/modin/tests/pandas/native_df_mode/test_iter.py b/modin/tests/pandas/native_df_mode/test_iter.py new file mode 100644 index 00000000000..a2e176d4372 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_iter.py @@ -0,0 +1,137 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import warnings +from itertools import product + +import matplotlib +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.pandas.utils import SET_DATAFRAME_ATTRIBUTE_WARNING +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, +) +from modin.tests.pandas.utils import df_equals, eval_general + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test___setattr__mutating_column(df_mode_pair): + # Use case from issue #4577 + modin_df, pandas_df = create_test_df_in_defined_mode( + [[1]], columns=["col0"], df_mode=df_mode_pair[0] + ) + # Replacing a column with a list should mutate the column in place. + pandas_df.col0 = [3] + modin_df.col0 = [3] + modin_ser, pandas_ser = create_test_series_in_defined_mode( + [3], df_mode=df_mode_pair[1] + ) + df_equals(modin_df, pandas_df) + # Check that the col0 attribute reflects the value update. 
+ df_equals(modin_df.col0, pandas_df.col0) + + pandas_df.col0 = pandas_ser + modin_df.col0 = modin_ser + + # Check that the col0 attribute reflects this update + df_equals(modin_df, pandas_df) + + pandas_df.loc[0, "col0"] = 4 + modin_df.loc[0, "col0"] = 4 + + # Check that the col0 attribute reflects update via loc + df_equals(modin_df, pandas_df) + assert modin_df.col0.equals(modin_df["col0"]) + + # Check that attempting to add a new col via attributes raises warning + # and adds the provided list as a new attribute and not a column. + with pytest.warns( + UserWarning, + match=SET_DATAFRAME_ATTRIBUTE_WARNING, + ): + modin_df.col1 = [4] + + with warnings.catch_warnings(): + warnings.filterwarnings( + action="error", + message=SET_DATAFRAME_ATTRIBUTE_WARNING, + ) + modin_df.col1 = [5] + modin_df.new_attr = 6 + modin_df.col0 = 7 + + assert "new_attr" in dir( + modin_df + ), "Modin attribute was not correctly added to the df." + assert ( + "new_attr" not in modin_df + ), "New attribute was not correctly added to columns." + assert modin_df.new_attr == 6, "Modin attribute value was set incorrectly." + assert isinstance( + modin_df.col0, pd.Series + ), "Scalar was not broadcasted properly to an existing column." 
+ + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_isin_with_modin_objects(df_mode_pair): + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + {"a": [1, 2], "b": [3, 4]}, df_mode=df_mode_pair[0] + ) + modin_series, pandas_series = create_test_series_in_defined_mode( + [1, 4, 5, 6], df_mode=df_mode_pair[1] + ) + + eval_general( + (modin_df1, modin_series), + (pandas_df1, pandas_series), + lambda srs: srs[0].isin(srs[1]), + ) + + modin_df2 = modin_series.to_frame("a") + pandas_df2 = pandas_series.to_frame("a") + + eval_general( + (modin_df1, modin_df2), + (pandas_df1, pandas_df2), + lambda srs: srs[0].isin(srs[1]), + ) + + # Check case when indices are not matching + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + {"a": [1, 2], "b": [3, 4]}, + index=[10, 11], + df_mode=df_mode_pair[0], + ) + + eval_general( + (modin_df1, modin_series), + (pandas_df1, pandas_series), + lambda srs: srs[0].isin(srs[1]), + ) + eval_general( + (modin_df1, modin_df2), + (pandas_df1, pandas_df2), + lambda srs: srs[0].isin(srs[1]), + ) diff --git a/modin/tests/pandas/native_df_mode/test_join_sort.py b/modin/tests/pandas/native_df_mode/test_join_sort.py new file mode 100644 index 00000000000..62565dde382 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_join_sort.py @@ -0,0 +1,411 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.pandas.io import to_pandas +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + default_to_pandas_ignore_string, + df_equals, + eval_general, + random_state, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. 
+pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + +# Initialize env for storage format detection in @pytest.mark.* +pd.DataFrame() + + +def df_equals_and_sort(df1, df2): + """Sort dataframe's rows and run ``df_equals()`` for them.""" + df1 = df1.sort_values(by=df1.columns.tolist(), ignore_index=True) + df2 = df2.sort_values(by=df2.columns.tolist(), ignore_index=True) + df_equals(df1, df2) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_combine(data, df_mode_pair): + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[1] + ) + modin_df_1.combine( + modin_df_2 + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 + ) + pandas_df_1.combine( + pandas_df_2 + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 + ) + + +@pytest.mark.parametrize( + "test_data, test_data2", + [ + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(128, 64)), + ), + ( + np.random.randint(0, 100, size=(128, 64)), + np.random.randint(0, 100, size=(64, 64)), + ), + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(64, 128)), + ), + ( + np.random.randint(0, 100, size=(64, 128)), + np.random.randint(0, 100, size=(64, 64)), + ), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_join(test_data, test_data2, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[0], + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + test_data2, + columns=["col{}".format(i) for i in 
range(test_data2.shape[1])], + index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[1], + ) + + hows = ["inner", "left", "right", "outer"] + ons = ["col33", "col34"] + sorts = [False, True] + assert len(ons) == len(sorts), "the loop below is designed for this condition" + for i in range(len(hows)): + for j in range(len(ons)): + modin_result = modin_df.join( + modin_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + pandas_result = pandas_df.join( + pandas_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + if sorts[j]: + # sorting in `join` is implemented through range partitioning technique + # therefore the order of the rows after it does not match the pandas, + # so additional sorting is needed in order to get the same result as for pandas + df_equals_and_sort(modin_result, pandas_result) + else: + df_equals(modin_result, pandas_result) + + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6], + } + + modin_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) + + frame_data2 = {"col5": [0], "col6": [1]} + modin_df2 = pd.DataFrame(frame_data2) + pandas_df2 = pandas.DataFrame(frame_data2) + + join_types = ["left", "right", "outer", "inner"] + for how in join_types: + modin_join = modin_df.join(modin_df2, how=how) + pandas_join = pandas_df.join(pandas_df2, how=how) + df_equals(modin_join, pandas_join) + + frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]} + + modin_df3 = pd.DataFrame(frame_data3) + pandas_df3 = pandas.DataFrame(frame_data3) + + join_types = ["left", "outer", "inner"] + for how in join_types: + modin_join = modin_df.join([modin_df2, modin_df3], how=how) + pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how) + df_equals(modin_join, pandas_join) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, 
repeat=2)) +) +def test_join_cross_6786(df_mode_pair): + data = [[7, 8, 9], [10, 11, 12]] + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + data, columns=["x", "y", "z"], df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + data, columns=["x", "y", "z"], df_mode=df_mode_pair[1] + ) + modin_join = modin_df_1.join( + modin_df_2[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p" + ) + pandas_join = pandas_df_1.join( + pandas_df_2[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p" + ) + df_equals(modin_join, pandas_join) + + +@pytest.mark.parametrize( + "test_data, test_data2", + [ + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(128, 64)), + ), + ( + np.random.randint(0, 100, size=(128, 64)), + np.random.randint(0, 100, size=(64, 64)), + ), + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(64, 128)), + ), + ( + np.random.randint(0, 100, size=(64, 128)), + np.random.randint(0, 100, size=(64, 64)), + ), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_merge(test_data, test_data2, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[0], + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[1], + ) + hows = ["left", "inner", "right"] + ons = ["col33", ["col33", "col34"]] + sorts = [False, True] + assert len(ons) == len(sorts), "the loop below is designed for this condition" + for i in range(len(hows)): + for j in range(len(ons)): + modin_result = modin_df.merge( + modin_df2, how=hows[i], 
on=ons[j], sort=sorts[j] + ) + pandas_result = pandas_df.merge( + pandas_df2, how=hows[i], on=ons[j], sort=sorts[j] + ) + # FIXME: https://github.com/modin-project/modin/issues/2246 + df_equals_and_sort(modin_result, pandas_result) + + modin_result = modin_df.merge( + modin_df2, + how=hows[i], + left_on="key", + right_on="key", + sort=sorts[j], + ) + pandas_result = pandas_df.merge( + pandas_df2, + how=hows[i], + left_on="key", + right_on="key", + sort=sorts[j], + ) + # FIXME: https://github.com/modin-project/modin/issues/2246 + df_equals_and_sort(modin_result, pandas_result) + + +@pytest.mark.parametrize("how", ["left", "inner", "right"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_merge_empty( + how, + df_mode_pair, +): + data = np.random.randint(0, 100, size=(64, 64)) + eval_general_interop( + data, + None, + lambda df1, df2: df1.merge(df2.iloc[:0], how=how), + df_mode_pair, + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_merge_with_mi_columns(df_mode_pair): + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + { + ("col0", "a"): [1, 2, 3, 4], + ("col0", "b"): [2, 3, 4, 5], + ("col1", "a"): [3, 4, 5, 6], + }, + df_mode=df_mode_pair[0], + ) + + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + { + ("col0", "a"): [1, 2, 3, 4], + ("col0", "c"): [2, 3, 4, 5], + ("col1", "a"): [3, 4, 5, 6], + }, + df_mode=df_mode_pair[1], + ) + + eval_general( + (modin_df1, modin_df2), + (pandas_df1, pandas_df2), + lambda dfs: dfs[0].merge(dfs[1], on=[("col0", "a")]), + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_where(df_mode_pair): + columns = list("abcdefghij") + + frame_data = random_state.randn(100, 10) + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + frame_data, columns=columns, df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = 
create_test_df_in_defined_mode( + frame_data, columns=columns, df_mode=df_mode_pair[1] + ) + pandas_cond_df = pandas_df_2 % 5 < 2 + modin_cond_df = modin_df_2 % 5 < 2 + + pandas_result = pandas_df_1.where(pandas_cond_df, -pandas_df_2) + modin_result = modin_df_1.where(modin_cond_df, -modin_df_2) + assert all((to_pandas(modin_result) == pandas_result).all()) + + # test case when other is Series + other_data = random_state.randn(len(pandas_df_1)) + modin_other, pandas_other = create_test_series_in_defined_mode( + other_data, df_mode=df_mode_pair[0] + ) + pandas_result = pandas_df_1.where(pandas_cond_df, pandas_other, axis=0) + modin_result = modin_df_1.where(modin_cond_df, modin_other, axis=0) + df_equals(modin_result, pandas_result) + + # Test that we choose the right values to replace when `other` == `True` + # everywhere. + other_data = np.full(shape=pandas_df_1.shape, fill_value=True) + modin_other, pandas_other = create_test_df_in_defined_mode( + other_data, columns=columns, df_mode=df_mode_pair[0] + ) + pandas_result = pandas_df_1.where(pandas_cond_df, pandas_other) + modin_result = modin_df_1.where(modin_cond_df, modin_other) + df_equals(modin_result, pandas_result) + + other = pandas_df_1.loc[3] + pandas_result = pandas_df_1.where(pandas_cond_df, other, axis=1) + modin_result = modin_df_1.where(modin_cond_df, other, axis=1) + assert all((to_pandas(modin_result) == pandas_result).all()) + + other = pandas_df_1["e"] + pandas_result = pandas_df_1.where(pandas_cond_df, other, axis=0) + modin_result = modin_df_1.where(modin_cond_df, other, axis=0) + assert all((to_pandas(modin_result) == pandas_result).all()) + + pandas_result = pandas_df_1.where(pandas_df_2 < 2, True) + modin_result = modin_df_1.where(modin_df_2 < 2, True) + assert all((to_pandas(modin_result) == pandas_result).all()) + + +@pytest.mark.parametrize("align_axis", ["index", "columns"]) +@pytest.mark.parametrize("keep_shape", [False, True]) +@pytest.mark.parametrize("keep_equal", [False, True]) 
+@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_compare(align_axis, keep_shape, keep_equal, df_mode_pair): + kwargs = { + "align_axis": align_axis, + "keep_shape": keep_shape, + "keep_equal": keep_equal, + } + frame_data1 = random_state.randn(100, 10) + frame_data2 = random_state.randn(100, 10) + modin_df, pandas_df = create_test_df_in_defined_mode( + frame_data1, columns=list("abcdefghij"), df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + frame_data2, columns=list("abcdefghij"), df_mode=df_mode_pair[0] + ) + modin_result = modin_df.compare(modin_df2, **kwargs) + pandas_result = pandas_df.compare(pandas_df2, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) + + modin_result = modin_df2.compare(modin_df, **kwargs) + pandas_result = pandas_df2.compare(pandas_df, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) + + series_data1 = ["a", "b", "c", "d", "e"] + series_data2 = ["a", "a", "c", "b", "e"] + modin_series1, pandas_series1 = create_test_series_in_defined_mode( + series_data1, df_mode=df_mode_pair[0] + ) + modin_series2, pandas_series2 = create_test_series_in_defined_mode( + series_data2, df_mode=df_mode_pair[1] + ) + + modin_result = modin_series1.compare(modin_series2, **kwargs) + pandas_result = pandas_series1.compare(pandas_series2, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) + + modin_result = modin_series2.compare(modin_series1, **kwargs) + pandas_result = pandas_series2.compare(pandas_series1, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) diff --git a/modin/tests/pandas/native_df_mode/test_map_metadata.py b/modin/tests/pandas/native_df_mode/test_map_metadata.py new file mode 100644 index 00000000000..e9e460ffbc8 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_map_metadata.py @@ -0,0 +1,258 @@ +# Licensed to Modin Development Team under one or more contributor license 
agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + + +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions, StorageFormat +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, +) +from modin.tests.pandas.utils import ( + RAND_HIGH, + RAND_LOW, + axis_keys, + axis_values, + default_to_pandas_ignore_string, + df_equals, + eval_general, + name_contains, + numeric_dfs, + random_state, + test_data, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. 
+pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + + +def eval_insert(modin_df, pandas_df, **kwargs): + if "col" in kwargs and "column" not in kwargs: + kwargs["column"] = kwargs.pop("col") + _kwargs = {"loc": 0, "column": "New column"} + _kwargs.update(kwargs) + + eval_general( + modin_df, + pandas_df, + operation=lambda df, **kwargs: df.insert(**kwargs), + __inplace__=True, + **_kwargs, + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_empty_df(df_mode_pair): + modin_df, pd_df = create_test_df_in_defined_mode(None, df_mode=df_mode_pair[0]) + md_series, pd_series = create_test_series_in_defined_mode( + [1, 2, 3, 4, 5], df_mode=df_mode_pair[1] + ) + modin_df["a"] = md_series + pd_df["a"] = pd_series + df_equals(modin_df, pd_df) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_astype(df_mode_pair): + td = pandas.DataFrame(test_data["int_data"])[["col1", "index", "col3", "col4"]] + modin_df, pandas_df = create_test_df_in_defined_mode( + td.values, + index=td.index, + columns=td.columns, + df_mode=df_mode_pair[0], + ) + + def astype_func(df): + md_ser, pd_ser = create_test_series_in_defined_mode( + [str, str], index=["col1", "col1"], df_mode=df_mode_pair[1] + ) + if isinstance(df, pd.DataFrame): + return df.astype(md_ser) + else: + return df.astype(pd_ser) + + # The dtypes series must have a unique index. 
+ eval_general( + modin_df, + pandas_df, + astype_func, + expected_exception=ValueError( + "cannot reindex on an axis with duplicate labels" + ), + ) + + +########################################################################### + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_convert_dtypes_5653(df_mode_pair): + modin_part1, _ = create_test_df_in_defined_mode( + {"col1": ["a", "b", "c", "d"]}, df_mode=df_mode_pair[0] + ) + modin_part2, _ = create_test_df_in_defined_mode( + {"col1": [None, None, None, None]}, df_mode=df_mode_pair[1] + ) + modin_df = pd.concat([modin_part1, modin_part2]) + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": + assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) + modin_df = modin_df.convert_dtypes() + assert len(modin_df.dtypes) == 1 + assert modin_df.dtypes.iloc[0] == "string" + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"]) +@pytest.mark.exclude_in_sanity +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_clip(request, data, axis, bound_type, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + + if name_contains(request.node.name, numeric_dfs): + ind_len = ( + len(modin_df.index) + if not pandas.DataFrame()._get_axis_number(axis) + else len(modin_df.columns) + ) + + lower = random_state.randint(RAND_LOW, RAND_HIGH, ind_len) + upper = random_state.randint(RAND_LOW, RAND_HIGH, ind_len) + + if bound_type == "series": + modin_lower, pandas_lower = create_test_series_in_defined_mode( + lower, df_mode=df_mode_pair[1] + ) + modin_upper, pandas_upper = create_test_series_in_defined_mode( + upper, df_mode=df_mode_pair[0] + ) + else: + modin_lower = 
pandas_lower = lower + modin_upper = pandas_upper = upper + + # test lower and upper list bound on each column + modin_result = modin_df.clip(modin_lower, modin_upper, axis=axis) + pandas_result = pandas_df.clip(pandas_lower, pandas_upper, axis=axis) + df_equals(modin_result, pandas_result) + + # test only upper list bound on each column + modin_result = modin_df.clip(np.nan, modin_upper, axis=axis) + pandas_result = pandas_df.clip(np.nan, pandas_upper, axis=axis) + df_equals(modin_result, pandas_result) + + with pytest.raises(ValueError): + modin_df.clip(lower=[1, 2, 3], axis=None) + + +@pytest.mark.parametrize( + "data, other_data", + [ + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), + ( + {"A": ["a", "b", "c"], "B": ["x", "y", "z"]}, + {"B": ["d", "e", "f", "g", "h", "i"]}, + ), + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, np.nan, 6]}), + ], +) +@pytest.mark.parametrize("errors", ["raise", "ignore"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_update(data, other_data, errors, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + other_modin_df, other_pandas_df = create_test_df_in_defined_mode( + other_data, df_mode=df_mode_pair[1] + ) + expected_exception = None + if errors == "raise": + expected_exception = ValueError("Data overlaps.") + eval_general( + modin_df, + pandas_df, + lambda df: ( + df.update(other_modin_df, errors=errors) + if isinstance(df, pd.DataFrame) + else df.update(other_pandas_df, errors=errors) + ), + __inplace__=True, + expected_exception=expected_exception, + ) + + +@pytest.mark.parametrize( + "get_index", + [ + pytest.param(lambda idx: None, id="None_idx"), + pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"), + pytest.param(lambda idx: idx, id="Equal_idx"), + pytest.param(lambda idx: 
idx[::-1], id="Reversed_idx"), + ], +) +@pytest.mark.parametrize( + "get_columns", + [ + pytest.param(lambda idx: None, id="None_idx"), + pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"), + pytest.param(lambda idx: idx, id="Equal_idx"), + pytest.param(lambda idx: idx[::-1], id="Reversed_idx"), + ], +) +@pytest.mark.parametrize("dtype", [None, "str"]) +@pytest.mark.exclude_in_sanity +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_constructor_from_modin_series(get_index, get_columns, dtype, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data_values[0], df_mode=df_mode_pair[0] + ) + + modin_data = {f"new_col{i}": modin_df.iloc[:, i] for i in range(modin_df.shape[1])} + pandas_data = { + f"new_col{i}": pandas_df.iloc[:, i] for i in range(pandas_df.shape[1]) + } + + index = get_index(modin_df.index) + columns = get_columns(list(modin_data.keys())) + + new_modin = pd.DataFrame(modin_data, index=index, columns=columns, dtype=dtype) + new_pandas = pandas.DataFrame( + pandas_data, index=index, columns=columns, dtype=dtype + ) + df_equals(new_modin, new_pandas) diff --git a/modin/tests/pandas/native_df_mode/test_pickle.py b/modin/tests/pandas/native_df_mode/test_pickle.py new file mode 100644 index 00000000000..cf9b4dfcb9c --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_pickle.py @@ -0,0 +1,73 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from itertools import product + +import numpy as np +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, PersistentPickle +from modin.tests.pandas.native_df_mode.utils import create_test_df_in_defined_mode +from modin.tests.pandas.utils import df_equals + + +@pytest.fixture +def modin_df(): + return pd.DataFrame({"col1": np.arange(1000), "col2": np.arange(2000, 3000)}) + + +@pytest.fixture +def modin_column(modin_df): + return modin_df["col1"] + + +@pytest.fixture(params=[True, False]) +def persistent(request): + old = PersistentPickle.get() + PersistentPickle.put(request.param) + yield request.param + PersistentPickle.put(old) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test__reduce__(df_mode_pair): + # `DataFrame.__reduce__` will be called implicitly when lambda expressions are + # pre-processed for the distributed engine. 
+ dataframe_data = ["Major League Baseball", "National Basketball Association"] + abbr_md, abbr_pd = create_test_df_in_defined_mode( + dataframe_data, index=["MLB", "NBA"], df_mode=df_mode_pair[0] + ) + + dataframe_data = { + "name": ["Mariners", "Lakers"] * 500, + "league_abbreviation": ["MLB", "NBA"] * 500, + } + teams_md, teams_pd = create_test_df_in_defined_mode( + dataframe_data, df_mode=df_mode_pair[1] + ) + + result_md = ( + teams_md.set_index("name") + .league_abbreviation.apply(lambda abbr: abbr_md[0].loc[abbr]) + .rename("league") + ) + + result_pd = ( + teams_pd.set_index("name") + .league_abbreviation.apply(lambda abbr: abbr_pd[0].loc[abbr]) + .rename("league") + ) + df_equals(result_md, result_pd) diff --git a/modin/tests/pandas/native_df_mode/test_window.py b/modin/tests/pandas/native_df_mode/test_window.py new file mode 100644 index 00000000000..7e8e5da9342 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_window.py @@ -0,0 +1,101 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.tests.pandas.native_df_mode.utils import create_test_df_in_defined_mode +from modin.tests.pandas.utils import df_equals + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_fillna_4660(df_mode_pair): + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + {"a": ["a"], "b": ["b"], "c": [pd.NA]}, + index=["row1"], + df_mode=df_mode_pair[0], + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + {"a": ["a"], "b": ["b"], "c": [pd.NA]}, + index=["row1"], + df_mode=df_mode_pair[1], + ) + modin_result = modin_df_1["c"].fillna(modin_df_2["b"]) + pandas_result = pandas_df_1["c"].fillna(pandas_df_2["b"]) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_fillna_dict_series(df_mode_pair): + frame_data = { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + frame_data, df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + frame_data, df_mode=df_mode_pair[1] + ) + + df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5})) + + df_equals( + modin_df.fillna({"a": 0, "b": 5, "d": 7}), + df.fillna({"a": 0, "b": 5, "d": 7}), + ) + + # Series treated same as dict + df_equals( + modin_df_1.fillna(modin_df_2.max()), pandas_df_1.fillna(pandas_df_2.max()) + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def 
test_fillna_dataframe(df_mode_pair): + frame_data = { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + frame_data, index=list("VWXYZ"), df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + {"a": [np.nan, 10, 20, 30, 40], "b": [50, 60, 70, 80, 90], "foo": ["bar"] * 5}, + index=list("VWXuZ"), + df_mode=df_mode_pair[1], + ) + + # only those columns and indices which are shared get filled + df_equals(modin_df_1.fillna(modin_df_2), pandas_df_1.fillna(pandas_df_2)) diff --git a/modin/tests/pandas/native_df_mode/utils.py b/modin/tests/pandas/native_df_mode/utils.py new file mode 100644 index 00000000000..9e9d77ac1f7 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/utils.py @@ -0,0 +1,133 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
from modin.config import Engine
from modin.config.pubsub import context
from modin.tests.pandas.utils import (
    NoModinException,
    create_test_dfs,
    create_test_series,
    df_equals,
)
from modin.utils import try_cast_to_pandas


def create_test_df_in_defined_mode(
    *args, post_fn=None, backend=None, df_mode=None, **kwargs
):
    """
    Call ``create_test_dfs`` inside a ``context(NativeDataframeMode=df_mode)`` block.

    Parameters
    ----------
    *args : tuple
        Positional arguments forwarded unchanged to ``create_test_dfs``.
    post_fn : callable, optional
        Forwarded to ``create_test_dfs``.
    backend : str, optional
        Forwarded to ``create_test_dfs``.
    df_mode : str, optional
        Value passed as ``NativeDataframeMode`` to ``context`` for the duration
        of the call.
    **kwargs : dict
        Keyword arguments forwarded unchanged to ``create_test_dfs``.

    Returns
    -------
    object
        Whatever ``create_test_dfs`` returns; callers in these tests unpack it
        as a ``(modin_df, pandas_df)`` pair.
    """
    with context(NativeDataframeMode=df_mode):
        return create_test_dfs(*args, post_fn=post_fn, backend=backend, **kwargs)


def create_test_series_in_defined_mode(
    vals, sort=False, backend=None, df_mode=None, **kwargs
):
    """
    Call ``create_test_series`` inside a ``context(NativeDataframeMode=df_mode)`` block.

    Parameters
    ----------
    vals : object
        Data forwarded to ``create_test_series``.
    sort : bool, default: False
        Forwarded to ``create_test_series``.
    backend : str, optional
        Forwarded to ``create_test_series``.
    df_mode : str, optional
        Value passed as ``NativeDataframeMode`` to ``context`` for the duration
        of the call.
    **kwargs : dict
        Keyword arguments forwarded unchanged to ``create_test_series``.

    Returns
    -------
    object
        Whatever ``create_test_series`` returns; callers in these tests unpack
        it as a ``(modin_series, pandas_series)`` pair.
    """
    with context(NativeDataframeMode=df_mode):
        return create_test_series(vals, sort=sort, backend=backend, **kwargs)


def eval_general_interop(
    data,
    backend,
    operation,
    df_mode_pair,
    comparator=df_equals,
    __inplace__=False,
    expected_exception=None,
    check_kwargs_callable=True,
    md_extra_kwargs=None,
    comparator_kwargs=None,
    **kwargs,
):
    """
    Run a two-operand operation on Modin and pandas frames and compare the results.

    Two Modin/pandas pairs are built from the same `data`, the first under
    ``df_mode_pair[0]`` and the second under ``df_mode_pair[1]``. `operation`
    is then called as ``operation(df1, df2, **kwargs)`` once with the pandas
    pair and once with the Modin pair, and the two results are handed to
    `comparator`.

    Parameters
    ----------
    data : object
        Data used to build both pairs of test frames.
    backend : str or None
        Forwarded to ``create_test_df_in_defined_mode``.
    operation : callable
        Function taking ``(df1, df2, **kwargs)``.
    df_mode_pair : tuple
        Two ``NativeDataframeMode`` values: one for the first pair of frames,
        one for the second.
    comparator : callable, default: df_equals
        Called as ``comparator(md_result, pd_result, **comparator_kwargs)``.
    __inplace__ : bool, default: False
        If True, the return value of `operation` is ignored and the first
        Modin/pandas frames themselves are compared instead.
    expected_exception : Exception or False, optional
        Exception instance that both libraries are expected to raise (type is
        checked for Modin, ``args`` are checked for both). ``False`` disables
        the message check. When left as ``None`` and pandas raises, the pandas
        exception is re-raised so that unexpected errors are not hidden.
    check_kwargs_callable : bool, default: True
        If True, every callable value in `kwargs` is first evaluated as
        ``value(df1, df2)`` for both libraries and the results are used as the
        actual keyword values.
    md_extra_kwargs : dict, optional
        Extra keyword arguments passed only to the Modin call.
    comparator_kwargs : dict, optional
        Keyword arguments forwarded to `comparator`.
    **kwargs : dict
        Keyword arguments forwarded to `operation`.

    Raises
    ------
    NoModinException
        If pandas raises but Modin does not.
    AssertionError
        If the exceptions raised by the two libraries do not match each other
        or `expected_exception`.
    """
    df_mode1, df_mode2 = df_mode_pair
    modin_df1, pandas_df1 = create_test_df_in_defined_mode(
        data, backend=backend, df_mode=df_mode1
    )
    modin_df2, pandas_df2 = create_test_df_in_defined_mode(
        data, backend=backend, df_mode=df_mode2
    )
    md_kwargs, pd_kwargs = {}, {}

    def execute_callable(fn, inplace=False, md_kwargs=None, pd_kwargs=None):
        """Run `fn` for both libraries; return ``None`` when both raised acceptably."""
        # ``None`` sentinels instead of ``{}`` defaults: a dict literal in the
        # signature would be a single object shared by every call.
        md_kwargs = {} if md_kwargs is None else md_kwargs
        pd_kwargs = {} if pd_kwargs is None else pd_kwargs
        try:
            pd_result = fn(pandas_df1, pandas_df2, **pd_kwargs)
        except Exception as pd_e:
            try:
                if inplace:
                    _ = fn(modin_df1, modin_df2, **md_kwargs)
                    try_cast_to_pandas(modin_df1)  # force materialization
                else:
                    try_cast_to_pandas(
                        fn(modin_df1, modin_df2, **md_kwargs)
                    )  # force materialization
            except Exception as md_e:
                assert isinstance(
                    md_e, type(pd_e)
                ), "Got Modin Exception type {}, but pandas Exception type {} was expected".format(
                    type(md_e), type(pd_e)
                )
                if expected_exception:
                    if Engine.get() == "Ray":
                        from ray.exceptions import RayTaskError

                        # unwrap ray exceptions from remote worker
                        if isinstance(md_e, RayTaskError):
                            md_e = md_e.args[0]
                    assert (
                        type(md_e) is type(expected_exception)
                        and md_e.args == expected_exception.args
                    ), f"not acceptable Modin's exception: [{repr(md_e)}]"
                    assert (
                        pd_e.args == expected_exception.args
                    ), f"not acceptable Pandas' exception: [{repr(pd_e)}]"
                elif expected_exception is False:
                    # The only way to disable exception message checking.
                    pass
                else:
                    # It’s not enough that Modin and pandas have the same types of exceptions;
                    # we need to explicitly specify the instance of an exception
                    # (using `expected_exception`) in tests so that we can check exception messages.
                    # This allows us to eliminate situations where exceptions are thrown
                    # that we don't expect, which could hide different bugs.
                    raise pd_e
            else:
                raise NoModinException(
                    f"Modin doesn't throw an exception, while pandas does: [{repr(pd_e)}]"
                )
        else:
            md_result = fn(modin_df1, modin_df2, **md_kwargs)
            return (md_result, pd_result) if not inplace else (modin_df1, pandas_df1)

    for key, value in kwargs.items():
        if check_kwargs_callable and callable(value):
            values = execute_callable(value)
            # that means, that callable raised an exception
            if values is None:
                return
            else:
                md_value, pd_value = values
        else:
            md_value, pd_value = value, value

        md_kwargs[key] = md_value
        pd_kwargs[key] = pd_value

    if md_extra_kwargs:
        assert isinstance(md_extra_kwargs, dict)
        md_kwargs.update(md_extra_kwargs)

    values = execute_callable(
        operation, md_kwargs=md_kwargs, pd_kwargs=pd_kwargs, inplace=__inplace__
    )
    if values is not None:
        comparator(*values, **(comparator_kwargs or {}))