diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1108fd6ffa7..ecedb33b929 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -647,7 +647,7 @@ jobs: matrix: python-version: ["3.9"] env: - MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER: "True" + MODIN_NATIVE_DATAFRAME_MODE: "Native_pandas" name: test-small-query-compiler python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 @@ -667,21 +667,6 @@ jobs: - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py - run: python -m pytest modin/tests/pandas/dataframe/test_window.py - - run: python -m pytest modin/tests/pandas/extensions/test_dataframe_extensions.py - - run: python -m pytest modin/tests/pandas/extensions/test_pd_extensions.py - - run: python -m pytest modin/tests/pandas/extensions/test_series_extensions.py - - run: python -m pytest modin/tests/pandas/integrations/test_lazy_import.py - - run: python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py - - run: python -m pytest modin/tests/pandas/internals/test_repartition.py - - run: python -m pytest modin/tests/pandas/test_api.py - - run: python -m pytest modin/tests/pandas/test_concat.py - - run: python -m pytest modin/tests/pandas/test_expanding.py - - run: python -m pytest modin/tests/pandas/test_general.py - - run: python -m pytest modin/tests/pandas/test_groupby.py - - run: python -m pytest modin/tests/pandas/test_io.py - - run: python -m pytest modin/tests/pandas/test_reshape.py - - run: python -m pytest modin/tests/pandas/test_rolling.py - - run: python -m pytest modin/tests/pandas/test_series.py - uses: codecov/codecov-action@v2 merge-coverage-artifacts: diff --git a/modin/config/__init__.py b/modin/config/__init__.py index b3c7f8f54f0..bcacded395f 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -39,6 +39,7 @@ MinPartitionSize, MinRowPartitionSize, ModinNumpy, + NativeDataframeMode, NPartitions, 
PersistentPickle, ProgressBar, @@ -53,7 +54,6 @@ TestReadFromPostgres, TestReadFromSqlServer, TrackFileLeaks, - UsePlainPandasQueryCompiler, ) from modin.config.pubsub import Parameter, ValueSource, context @@ -69,7 +69,7 @@ "CpuCount", "GpuCount", "Memory", - "UsePlainPandasQueryCompiler", + "NativeDataframeMode", # Ray specific "IsRayCluster", "RayRedisAddress", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 3010dc28bdc..c2feb1841b4 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -913,11 +913,23 @@ def _check_vars() -> None: ) -class UsePlainPandasQueryCompiler(EnvironmentVariable, type=bool): - """Set to true to use implementation of PlainPandasQueryCompiler.""" +class NativeDataframeMode(EnvironmentVariable, type=str): + """ + The mode of execution used for handling dataframes in Modin. - varname = "MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER" - default = False + When the env variable is set to None the PandasQueryCompiler would be used + which would lead to modin executing dataframes in distributed fashion. + When set to Native_pandas NativeQueryCompiler is used which handles the + dataframes without distributing, falling back to native pandas functions. + + In the future, more execution modes can be added for single-node execution, so + the parameter is kept as a string. 
+ + """ + + varname = "MODIN_NATIVE_DATAFRAME_MODE" + choices = ("Native_pandas",) + default = None _check_vars() diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index 9b71a067159..ee1b68b2dee 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -26,11 +26,10 @@ import pandas from pandas.util._decorators import doc -from modin.config import UsePlainPandasQueryCompiler - +from modin.config import NativeDataframeMode from modin.core.io import BaseIO -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - PlainPandasQueryCompiler, +from modin.experimental.core.storage_formats.pandas.native_query_compiler import ( + NativeQueryCompiler, ) from modin.utils import get_current_execution @@ -173,9 +172,9 @@ def prepare(cls): method="io.from_pandas", ) def _from_pandas(cls, df): - if UsePlainPandasQueryCompiler.get(): + if NativeDataframeMode.get(): df_copy = df.copy() - return PlainPandasQueryCompiler(df_copy) + return NativeQueryCompiler(df_copy) return cls.io_cls.from_pandas(df) @classmethod diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py similarity index 97% rename from modin/experimental/core/storage_formats/pandas/small_query_compiler.py rename to modin/experimental/core/storage_formats/pandas/native_query_compiler.py index bfccb0fd105..9f8cf19295b 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py @@ -12,9 +12,9 @@ # governing permissions and limitations under the License. """ -Module contains ``PlainPandasQueryCompiler`` class. +Module contains ``NativeQueryCompiler`` class. 
-``PlainPandasQueryCompiler`` is responsible for compiling efficient DataFrame algebra +``NativeQueryCompiler`` is responsible for compiling efficient DataFrame algebra queries for small data and empty ``PandasDataFrame``. """ @@ -24,7 +24,7 @@ import pandas from pandas.core.dtypes.common import is_list_like, is_scalar -from modin.config.envvars import UsePlainPandasQueryCompiler +from modin.config.envvars import NativeDataframeMode from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, @@ -608,7 +608,7 @@ def caller(query_compiler, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class PlainPandasQueryCompiler(BaseQueryCompiler): +class NativeQueryCompiler(BaseQueryCompiler): """ Query compiler for the pandas storage format. @@ -623,7 +623,7 @@ class PlainPandasQueryCompiler(BaseQueryCompiler): """ def __init__(self, pandas_frame): - assert UsePlainPandasQueryCompiler.get() + assert NativeDataframeMode.get() == "Native_Pandas" if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): @@ -636,6 +636,59 @@ def __init__(self, pandas_frame): def execute(self): pass + @property + def frame_has_materialized_dtypes(self) -> bool: + """ + Check if the underlying dataframe has materialized dtypes. + + Returns + ------- + bool + """ + return True + + def set_frame_dtypes_cache(self, dtypes): + """ + Set dtypes cache for the underlying dataframe. + + Parameters + ---------- + dtypes : pandas.Series, ModinDtypes, callable or None + """ + pass + + def set_frame_index_cache(self, index): + """ + Set index cache for the underlying dataframe. + + Parameters + ---------- + index : sequence, callable or None + """ + pass + + @property + def frame_has_index_cache(self): + """ + Check if the index cache exists for the underlying dataframe. 
+ + Returns + ------- + bool + """ + return True + + @property + def frame_has_dtypes_cache(self) -> bool: + """ + Check if the dtypes cache exists for the underlying dataframe. + + Returns + ------- + bool + """ + return True + def take_2d_positional(self, index=None, columns=None): index = slice(None) if index is None else index columns = slice(None) if columns is None else columns diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 17351b21839..1b643cfcdba 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat, UsePlainPandasQueryCompiler +from modin.config import NativeDataframeMode, NPartitions, StorageFormat from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) @@ -211,8 +211,8 @@ def operation(df): reason="Modin on this engine doesn't create virtual partitions.", ) @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize( "left_virtual,right_virtual", [(True, False), (False, True), (True, True)] diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 50e11b4068b..016ee2e7ac9 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -23,7 +23,7 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler +from modin.config import Engine, NativeDataframeMode, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, @@ -90,7 +90,7 @@ def 
test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): operation = getattr(modin_df, op) @@ -108,7 +108,7 @@ def test_style(): data = test_data_values[0] with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): pd.DataFrame(data).style @@ -120,7 +120,7 @@ def test_to_timestamp(): with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): df.to_period().to_timestamp() @@ -137,8 +137,8 @@ def test_to_numpy(data): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): @@ -153,7 +153,7 @@ def test_asfreq(): df = pd.DataFrame({"s": series}) with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): # We are only testing that this defaults to pandas, so we will just check for @@ -315,7 +315,7 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - if not UsePlainPandasQueryCompiler.get(): + if not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) @@ -335,8 +335,8 @@ def test_corr_non_numeric(self, numeric_only): reason="doesn't make sense for 
non-partitioned executions", ) @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition @@ -628,10 +628,7 @@ def test_pivot(data, index, columns, values, request): or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id or ( - ( - current_execution in ("BaseOnPython",) - or UsePlainPandasQueryCompiler.get() - ) + (current_execution in ("BaseOnPython",) or NativeDataframeMode.get()) and index is lib.no_default ) ): @@ -1012,7 +1009,7 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") and StorageFormat.get() != "Base" - and not UsePlainPandasQueryCompiler.get(), + and NativeDataframeMode.get() is None, reason="https://github.com/modin-project/modin/issues/6399", ), ), diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 506ac2bb774..0f38eaa5ebe 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -21,7 +21,7 @@ from pandas._testing import ensure_clean import modin.pandas as pd -from modin.config import MinRowPartitionSize, NPartitions, UsePlainPandasQueryCompiler +from modin.config import MinRowPartitionSize, NativeDataframeMode, NPartitions from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal from modin.tests.pandas.utils import ( @@ -584,8 +584,8 @@ def test_loc_setting_single_categorical_column(): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - 
reason="PlainPandasQueryCompiler does not currently support IO functions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not currently support IO functions.", ) def test_loc_multi_index(): modin_df = pd.read_csv( @@ -2241,8 +2241,8 @@ def test___setitem__partitions_aligning(): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not currently support IO functions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not currently support IO functions.", ) def test___setitem__with_mismatched_partitions(): with ensure_clean(".csv") as fname: diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 8343468b1df..46983ffd45c 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -20,7 +20,7 @@ import pytest import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler +from modin.config import Engine, NativeDataframeMode, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( arg_keys, @@ -613,7 +613,7 @@ def test_sort_multiindex(sort_remaining): for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): df_equals( @@ -737,7 +737,7 @@ def test_sort_values_descending_with_only_two_bins(): modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() - if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): + if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( @@ -777,7 +777,7 @@ def test_sort_values_with_one_partition(ascending): np.array([["hello", "goodbye"], ["hello", 
"Hello"]]) ) - if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): + if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( @@ -897,7 +897,8 @@ def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_positio @pytest.mark.skipif( - Engine.get() not in ("Ray", "Unidist", "Dask") or UsePlainPandasQueryCompiler.get(), + Engine.get() not in ("Ray", "Unidist", "Dask") + or NativeDataframeMode.get() is not None, reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index bde65a9b845..40c910ed4cc 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -21,9 +21,9 @@ import modin.pandas as pd from modin.config import ( MinRowPartitionSize, + NativeDataframeMode, NPartitions, StorageFormat, - UsePlainPandasQueryCompiler, ) from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas @@ -304,10 +304,7 @@ def test_copy(data): assert new_modin_df.columns is not modin_df.columns assert new_modin_df.dtypes is not modin_df.dtypes - if ( - get_current_execution() != "BaseOnPython" - and not UsePlainPandasQueryCompiler.get() - ): + if get_current_execution() != "BaseOnPython" and not NativeDataframeMode.get(): assert np.array_equal( new_modin_df._query_compiler._modin_frame._partitions, modin_df._query_compiler._modin_frame._partitions, @@ -574,8 +571,8 @@ def test_astype_int64_to_astype_category_github_issue_6259(): reason="BaseOnPython doesn't have proxy categories", ) @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler doesn't have 
proxy categories", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler doesn't have proxy categories", ) class TestCategoricalProxyDtype: """This class contains test and test usilities for the ``LazyProxyCategoricalDtype`` class.""" @@ -800,8 +797,8 @@ def comparator(df1, df2): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) def test_convert_dtypes_multiple_row_partitions(): # Column 0 should have string dtype @@ -827,7 +824,7 @@ def test_convert_dtypes_5653(): modin_part1 = pd.DataFrame({"col1": ["a", "b", "c", "d"]}) modin_part2 = pd.DataFrame({"col1": [None, None, None, None]}) modin_df = pd.concat([modin_part1, modin_part2]) - if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): + if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) modin_df = modin_df.convert_dtypes() assert len(modin_df.dtypes) == 1 diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py index 5a962061e47..d96a38bc21e 100644 --- a/modin/tests/pandas/test_expanding.py +++ b/modin/tests/pandas/test_expanding.py @@ -18,7 +18,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, UsePlainPandasQueryCompiler +from modin.config import NativeDataframeMode, NPartitions from modin.tests.test_utils import warns_that_defaulting_to_pandas from .utils import ( @@ -71,7 +71,7 @@ def test_dataframe(data, min_periods, axis, method, kwargs): def test_dataframe_corr_cov(data, min_periods, axis, method): with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): eval_general( @@ -87,7 +87,7 @@ def test_dataframe_corr_cov_with_self(method): mdf, pdf = 
create_test_dfs(test_data["float_nan_data"]) with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): eval_general(