diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f0fc8437480..76f9eab1b52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -221,9 +221,8 @@ jobs: id: filter with: filters: | - test-small-query-compiler: + test-native-dataframe-mode: - 'modin/core/storage_formats/pandas/native_query_compiler.py' - - 'modin/core/storage_formats/pandas/query_compiler.py' - 'modin/core/storage_formats/base/query_compiler.py' shared: &shared - 'modin/core/execution/dispatching/**' @@ -636,9 +635,9 @@ jobs: python-version: ${{matrix.python-version}} - run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py - test-small-query-compiler: + test-native-dataframe-mode: needs: [ lint-flake8, execution-filter] - if: ${{ needs.execution-filter.outputs.test-small-query-compiler == 'true' }} + if: ${{ needs.execution-filter.outputs.test-native-dataframe-mode == 'true' }} runs-on: ubuntu-latest defaults: run: @@ -648,15 +647,13 @@ jobs: python-version: ["3.9"] env: MODIN_NATIVE_DATAFRAME_MODE: "Pandas" - name: test-small-query-compiler python ${{matrix.python-version}}) + name: test-native-dataframe-mode python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - - run: python -m pytest modin/tests/config/test_envvars.py - - run: python -m pytest modin/tests/config/test_parameter.py - run: python -m pytest modin/tests/pandas/dataframe/test_binary.py - run: python -m pytest modin/tests/pandas/dataframe/test_default.py - run: python -m pytest modin/tests/pandas/dataframe/test_indexing.py diff --git a/modin/config/envvars.py b/modin/config/envvars.py index f39f4a79ea2..676f1a31d8a 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -915,18 +915,23 @@ def _check_vars() -> None: class NativeDataframeMode(EnvironmentVariable, type=str): """ + Configures the query compiler to process Modin 
data. + When this config is set to ``Default``, ``PandasQueryCompiler`` is used, which leads to Modin executing dataframes in distributed fashion. - When set to a string (e.g., ``Pandas``), ``NativeQueryCompiler`` is used, + When set to a string (e.g., ``pandas``), ``NativeQueryCompiler`` is used, which handles the dataframes without distributing, - falling back to native library functions (e.g., ``Pandas``). + falling back to native library functions (e.g., ``pandas``). This could be beneficial for handling relatively small dataframes without involving additional overhead of communication between processes. """ varname = "MODIN_NATIVE_DATAFRAME_MODE" - choices = ("Default", "Pandas",) + choices = ( + "Default", + "Pandas", + ) default = "Default" diff --git a/modin/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py index 39733724a35..bfe331cfc6e 100644 --- a/modin/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -352,6 +352,20 @@ def binary_operator(df, other, **kwargs): def _register_expanding(func): + """ + Build function that applies the specified expanding window function. + + Parameters + ---------- + func : str + Expanding window function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + def expanding_operator(df, fold_axis, rolling_args, *args, **kwargs): squeeze_self = kwargs.pop("squeeze_self", False) @@ -497,7 +511,6 @@ def _register_default_pandas( squeeze_kwargs=False, return_raw=False, in_place=False, - filter_kwargs=[], ): """ Build function that apply specified method of the passed frame. @@ -516,8 +529,6 @@ def _register_default_pandas( If True, and the result not DataFrame or Series it is returned as is without wrapping in query compiler. 
in_place : bool, default: False If True, the specified function will be applied on the passed frame in place. - filter_kwargs : list, default: [] - List of key word argument names to remove. Returns ------- @@ -529,7 +540,7 @@ def caller(query_compiler, *args, **kwargs): df = query_compiler._modin_frame if is_series: df = df.squeeze(axis=1) - exclude_names = ["fold_axis"] + filter_kwargs + exclude_names = ["fold_axis", "dtypes"] kwargs = kwargs.copy() for name in exclude_names: kwargs.pop(name, None) @@ -565,7 +576,9 @@ class NativeQueryCompiler(BaseQueryCompiler): Parameters ---------- pandas_frame : pandas.DataFrame - Pandas frame to query with the compiled queries. + The pandas frame to query with the compiled queries. + shape_hint : {"row", "column", None}, default: None + Shape hint for frames known to be a column or a row, otherwise None. """ _modin_frame: pandas.DataFrame @@ -767,7 +780,7 @@ def setitem_bool(self, row_loc, col_loc, item): dt_weekofyear = _register_default_pandas(_dt_prop_map("weekofyear")) dt_year = _register_default_pandas(_dt_prop_map("year")) duplicated = _register_default_pandas(pandas.DataFrame.duplicated) - eq = _register_default_pandas(_register_binary("eq"), filter_kwargs=["dtypes"]) + eq = _register_default_pandas(_register_binary("eq")) equals = _register_default_pandas(_register_binary("equals")) eval = _register_default_pandas(pandas.DataFrame.eval) explode = _register_default_pandas(pandas.DataFrame.explode) @@ -819,7 +832,7 @@ def setitem_bool(self, row_loc, col_loc, item): pandas.DataFrame.first_valid_index, return_raw=True ) floordiv = _register_default_pandas(_register_binary("floordiv")) - ge = _register_default_pandas(_register_binary("ge"), filter_kwargs=["dtypes"]) + ge = _register_default_pandas(_register_binary("ge")) get_dummies = _register_default_pandas(_get_dummies) getitem_array = _register_default_pandas(_getitem_array) getitem_row_array = _register_default_pandas(_getitem_row_array) @@ -846,7 +859,7 @@ def 
setitem_bool(self, row_loc, col_loc, item): groupby_std = _register_default_pandas(_groupby("std")) groupby_sum = _register_default_pandas(_groupby("sum")) groupby_var = _register_default_pandas(_groupby("var")) - gt = _register_default_pandas(_register_binary("gt"), filter_kwargs=["dtypes"]) + gt = _register_default_pandas(_register_binary("gt")) idxmax = _register_default_pandas(pandas.DataFrame.idxmax) idxmin = _register_default_pandas(pandas.DataFrame.idxmin) infer_objects = _register_default_pandas( @@ -871,8 +884,8 @@ def setitem_bool(self, row_loc, col_loc, item): last_valid_index = _register_default_pandas( pandas.DataFrame.last_valid_index, return_raw=True ) - le = _register_default_pandas(_register_binary("le"), filter_kwargs=["dtypes"]) - lt = _register_default_pandas(_register_binary("lt"), filter_kwargs=["dtypes"]) + le = _register_default_pandas(_register_binary("le")) + lt = _register_default_pandas(_register_binary("lt")) # mad = _register_default_pandas(pandas.DataFrame.mad) mask = _register_default_pandas(pandas.DataFrame.mask) max = _register_default_pandas(pandas.DataFrame.max) @@ -886,7 +899,7 @@ def setitem_bool(self, row_loc, col_loc, item): mod = _register_default_pandas(_register_binary("mod")) mode = _register_default_pandas(pandas.DataFrame.mode) mul = _register_default_pandas(_register_binary("mul")) - ne = _register_default_pandas(_register_binary("ne"), filter_kwargs=["dtypes"]) + ne = _register_default_pandas(_register_binary("ne")) negative = _register_default_pandas(pandas.DataFrame.__neg__) nlargest = _register_default_pandas(pandas.DataFrame.nlargest) notna = _register_default_pandas(pandas.DataFrame.notna) @@ -1214,7 +1227,7 @@ def from_pandas(cls, df, data_cls): @classmethod def from_arrow(cls, at, data_cls): - return + return cls(at.to_pandas()) def free(self): return @@ -1231,7 +1244,7 @@ def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): @classmethod def from_dataframe(cls, df, data_cls): - return 
cls(data_cls.from_dataframe(df)) + return cls(pandas.api.interchange.from_dataframe(df)) # END Dataframe exchange protocol diff --git a/modin/tests/pandas/dataframe/test_iter.py b/modin/tests/pandas/dataframe/test_iter.py index e420f154a0f..38ab70524a2 100644 --- a/modin/tests/pandas/dataframe/test_iter.py +++ b/modin/tests/pandas/dataframe/test_iter.py @@ -232,6 +232,8 @@ def test___repr__(): "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" pandas_df = pandas.read_csv(io.StringIO(string_data)) + # Using `force` for `NativeDataframeMode` as the warnings are raised at the API layer, + # before getting into the Query Compiler layer. with warns_that_defaulting_to_pandas(force=True): modin_df = pd.read_csv(io.StringIO(string_data)) assert repr(pandas_df) == repr(modin_df)