renaming PlainPandasQueryCompiler to NativeDataframeMode

modin-project · Jun 5, 2024 · 1984aa1 · 1984aa1
1 parent c486925
commit 1984aa1
Show file tree

Hide file tree

Showing 11 changed files with 118 additions and 74 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -647,7 +647,7 @@ jobs:
       matrix:
         python-version: ["3.9"]
     env:
-      MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER: "True"
+      MODIN_NATIVE_DATAFRAME_MODE: "Native_pandas"
     name: test-small-query-compiler python ${{matrix.python-version}})
     steps:
       - uses: actions/checkout@v4
@@ -667,21 +667,6 @@ jobs:
       - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py
       - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py
       - run: python -m pytest modin/tests/pandas/dataframe/test_window.py
-      - run: python -m pytest modin/tests/pandas/extensions/test_dataframe_extensions.py
-      - run: python -m pytest modin/tests/pandas/extensions/test_pd_extensions.py
-      - run: python -m pytest modin/tests/pandas/extensions/test_series_extensions.py
-      - run: python -m pytest modin/tests/pandas/integrations/test_lazy_import.py
-      - run: python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py
-      - run: python -m pytest modin/tests/pandas/internals/test_repartition.py
-      - run: python -m pytest modin/tests/pandas/test_api.py
-      - run: python -m pytest modin/tests/pandas/test_concat.py
-      - run: python -m pytest modin/tests/pandas/test_expanding.py
-      - run: python -m pytest modin/tests/pandas/test_general.py
-      - run: python -m pytest modin/tests/pandas/test_groupby.py
-      - run: python -m pytest modin/tests/pandas/test_io.py
-      - run: python -m pytest modin/tests/pandas/test_reshape.py
-      - run: python -m pytest modin/tests/pandas/test_rolling.py
-      - run: python -m pytest modin/tests/pandas/test_series.py
       - uses: codecov/codecov-action@v2
 
   merge-coverage-artifacts:

diff --git a/modin/config/__init__.py b/modin/config/__init__.py
@@ -37,6 +37,7 @@
     Memory,
     MinPartitionSize,
     ModinNumpy,
+    NativeDataframeMode,
     NPartitions,
     PersistentPickle,
     ProgressBar,
@@ -51,7 +52,6 @@
     TestReadFromPostgres,
     TestReadFromSqlServer,
     TrackFileLeaks,
-    UsePlainPandasQueryCompiler,
 )
 from modin.config.pubsub import Parameter, ValueSource, context
 
@@ -67,7 +67,7 @@
     "CpuCount",
     "GpuCount",
     "Memory",
-    "UsePlainPandasQueryCompiler",
+    "NativeDataframeMode",
     # Ray specific
     "IsRayCluster",
     "RayRedisAddress",

diff --git a/modin/config/envvars.py b/modin/config/envvars.py
@@ -848,11 +848,23 @@ def _check_vars() -> None:
         )
 
 
-class UsePlainPandasQueryCompiler(EnvironmentVariable, type=bool):
-    """Set to true to use implementation of PlainPandasQueryCompiler."""
+class NativeDataframeMode(EnvironmentVariable, type=str):
+    """
+    The mode of execution used for handling dataframes in Modin.
 
-    varname = "MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER"
-    default = False
+    When the env variable is not set (the default, ``None``), ``PandasQueryCompiler``
+    is used, so Modin executes dataframes in a distributed fashion.
+    When set to ``Native_pandas``, ``NativeQueryCompiler`` is used, which handles
+    dataframes without distributing them, falling back to native pandas functions.
+
+    More execution modes may be added for single-node execution in the future,
+    so the parameter is kept as a string.
+
+    """
+
+    varname = "MODIN_NATIVE_DATAFRAME_MODE"
+    choices = ("Native_pandas",)
+    default = None
 
 
 _check_vars()
diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py
@@ -26,11 +26,10 @@
 import pandas
 from pandas.util._decorators import doc
 
-from modin.config import  UsePlainPandasQueryCompiler
-
+from modin.config import NativeDataframeMode
 from modin.core.io import BaseIO
-from modin.experimental.core.storage_formats.pandas.small_query_compiler import (
-    PlainPandasQueryCompiler,
+from modin.experimental.core.storage_formats.pandas.native_query_compiler import (
+    NativeQueryCompiler,
 )
 from modin.utils import get_current_execution
 
@@ -173,9 +172,9 @@ def prepare(cls):
         method="io.from_pandas",
     )
     def _from_pandas(cls, df):
-        if UsePlainPandasQueryCompiler.get():
+        if NativeDataframeMode.get():
             df_copy = df.copy()
-            return PlainPandasQueryCompiler(df_copy)
+            return NativeQueryCompiler(df_copy)
         return cls.io_cls.from_pandas(df)
 
     @classmethod

diff --git a/...ge_formats/pandas/small_query_compiler.py → ...e_formats/pandas/native_query_compiler.py b/...ge_formats/pandas/small_query_compiler.py → ...e_formats/pandas/native_query_compiler.py
@@ -12,9 +12,9 @@
 # governing permissions and limitations under the License.
 
 """
-Module contains ``PlainPandasQueryCompiler`` class.
+Module contains ``NativeQueryCompiler`` class.
 
-``PlainPandasQueryCompiler`` is responsible for compiling efficient DataFrame algebra
+``NativeQueryCompiler`` is responsible for compiling efficient DataFrame algebra
 queries for small data and empty ``PandasDataFrame``.
 """
 
@@ -24,7 +24,7 @@
 import pandas
 from pandas.core.dtypes.common import is_list_like, is_scalar
 
-from modin.config.envvars import UsePlainPandasQueryCompiler
+from modin.config.envvars import NativeDataframeMode
 from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
 from modin.utils import (
     MODIN_UNNAMED_SERIES_LABEL,
@@ -608,7 +608,7 @@ def caller(query_compiler, *args, **kwargs):
 
 
 @_inherit_docstrings(BaseQueryCompiler)
-class PlainPandasQueryCompiler(BaseQueryCompiler):
+class NativeQueryCompiler(BaseQueryCompiler):
     """
     Query compiler for the pandas storage format.
 
@@ -623,7 +623,7 @@ class PlainPandasQueryCompiler(BaseQueryCompiler):
     """
 
     def __init__(self, pandas_frame):
-        assert UsePlainPandasQueryCompiler.get()
+        assert NativeDataframeMode.get() == "Native_Pandas"
         if hasattr(pandas_frame, "_to_pandas"):
             pandas_frame = pandas_frame._to_pandas()
         if is_scalar(pandas_frame):
@@ -636,6 +636,59 @@ def __init__(self, pandas_frame):
     def execute(self):
         pass
 
+    @property
+    def frame_has_materialized_dtypes(self) -> bool:
+        """
+        Check if the underlying dataframe has materialized dtypes.
+
+        Returns
+        -------
+        bool
+        """
+        return True
+
+    def set_frame_dtypes_cache(self, dtypes):
+        """
+        Set dtypes cache for the underlying dataframe.
+
+        Parameters
+        ----------
+        dtypes : pandas.Series, ModinDtypes, callable or None
+        """
+        pass
+
+    def set_frame_index_cache(self, index):
+        """
+        Set index cache for underlying dataframe.
+
+        Parameters
+        ----------
+        index : sequence, callable or None
+        """
+        pass
+
+    @property
+    def frame_has_index_cache(self):
+        """
+        Check if the index cache exists for underlying dataframe.
+
+        Returns
+        -------
+        bool
+        """
+        return True
+
+    @property
+    def frame_has_dtypes_cache(self) -> bool:
+        """
+        Check if the dtypes cache exists for the underlying dataframe.
+
+        Returns
+        -------
+        bool
+        """
+        return True
+
     def take_2d_positional(self, index=None, columns=None):
         index = slice(None) if index is None else index
         columns = slice(None) if columns is None else columns

diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py
@@ -17,7 +17,7 @@
 import pytest
 
 import modin.pandas as pd
-from modin.config import NPartitions, StorageFormat, UsePlainPandasQueryCompiler
+from modin.config import NativeDataframeMode, NPartitions, StorageFormat
 from modin.core.dataframe.pandas.partitioning.axis_partition import (
     PandasDataframeAxisPartition,
 )
@@ -211,8 +211,8 @@ def operation(df):
     reason="Modin on this engine doesn't create virtual partitions.",
 )
 @pytest.mark.skipif(
-    UsePlainPandasQueryCompiler.get(),
-    reason="PlainPandasQueryCompiler does not contain partitions.",
+    NativeDataframeMode.get() is not None,
+    reason="NativeQueryCompiler does not contain partitions.",
 )
 @pytest.mark.parametrize(
     "left_virtual,right_virtual", [(True, False), (False, True), (True, True)]

diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py
@@ -23,7 +23,7 @@
 from numpy.testing import assert_array_equal
 
 import modin.pandas as pd
-from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler
+from modin.config import Engine, NativeDataframeMode, NPartitions, StorageFormat
 from modin.pandas.io import to_pandas
 from modin.tests.pandas.utils import (
     axis_keys,
@@ -90,7 +90,7 @@ def test_ops_defaulting_to_pandas(op, make_args):
     modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1)
     with (
         warns_that_defaulting_to_pandas()
-        if not UsePlainPandasQueryCompiler.get()
+        if not NativeDataframeMode.get()
         else contextlib.nullcontext()
     ):
         operation = getattr(modin_df, op)
@@ -108,7 +108,7 @@ def test_style():
     data = test_data_values[0]
     with (
         warns_that_defaulting_to_pandas()
-        if not UsePlainPandasQueryCompiler.get()
+        if not NativeDataframeMode.get()
         else contextlib.nullcontext()
     ):
         pd.DataFrame(data).style
@@ -120,7 +120,7 @@ def test_to_timestamp():
 
     with (
         warns_that_defaulting_to_pandas()
-        if not UsePlainPandasQueryCompiler.get()
+        if not NativeDataframeMode.get()
         else contextlib.nullcontext()
     ):
         df.to_period().to_timestamp()
@@ -137,8 +137,8 @@ def test_to_numpy(data):
 
 
 @pytest.mark.skipif(
-    UsePlainPandasQueryCompiler.get(),
-    reason="PlainPandasQueryCompiler does not contain partitions.",
+    NativeDataframeMode.get() is not None,
+    reason="NativeQueryCompiler does not contain partitions.",
 )
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_partition_to_numpy(data):
@@ -153,7 +153,7 @@ def test_asfreq():
     df = pd.DataFrame({"s": series})
     with (
         warns_that_defaulting_to_pandas()
-        if not UsePlainPandasQueryCompiler.get()
+        if not NativeDataframeMode.get()
         else contextlib.nullcontext()
     ):
         # We are only testing that this defaults to pandas, so we will just check for
@@ -315,7 +315,7 @@ def test_corr_min_periods(self, min_periods):
                 {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]}
             )
             modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]])
-            if not UsePlainPandasQueryCompiler.get():
+            if not NativeDataframeMode.get():
                 assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1)
             eval_general(
                 modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods)
@@ -335,8 +335,8 @@ def test_corr_non_numeric(self, numeric_only):
         reason="doesn't make sense for non-partitioned executions",
     )
     @pytest.mark.skipif(
-        UsePlainPandasQueryCompiler.get(),
-        reason="PlainPandasQueryCompiler does not contain partitions.",
+        NativeDataframeMode.get() is not None,
+        reason="NativeQueryCompiler does not contain partitions.",
     )
     def test_corr_nans_in_different_partitions(self):
         # NaN in the first partition
@@ -628,10 +628,7 @@ def test_pivot(data, index, columns, values, request):
         or "default-one_column-several_columns_index" in request.node.callspec.id
         or "default-one_column-one_column_index" in request.node.callspec.id
         or (
-            (
-                current_execution in ("BaseOnPython",)
-                or UsePlainPandasQueryCompiler.get()
-            )
+            (current_execution in ("BaseOnPython",) or NativeDataframeMode.get())
             and index is lib.no_default
         )
     ):
@@ -1012,7 +1009,7 @@ def test_resampler_functions_with_arg(rule, axis, method_arg):
             marks=pytest.mark.xfail(
                 condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python")
                 and StorageFormat.get() != "Base"
-                and not UsePlainPandasQueryCompiler.get(),
+                and NativeDataframeMode.get() is None,
                 reason="https://github.com/modin-project/modin/issues/6399",
             ),
         ),

diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py
@@ -21,7 +21,7 @@
 from pandas._testing import ensure_clean
 
 import modin.pandas as pd
-from modin.config import MinPartitionSize, NPartitions, UsePlainPandasQueryCompiler
+from modin.config import MinPartitionSize, NativeDataframeMode, NPartitions
 from modin.pandas.indexing import is_range_like
 from modin.pandas.testing import assert_index_equal
 from modin.tests.pandas.utils import (
@@ -584,8 +584,8 @@ def test_loc_setting_single_categorical_column():
 
 
 @pytest.mark.skipif(
-    UsePlainPandasQueryCompiler.get(),
-    reason="PlainPandasQueryCompiler does not currently support IO functions.",
+    NativeDataframeMode.get() is not None,
+    reason="NativeQueryCompiler does not currently support IO functions.",
 )
 def test_loc_multi_index():
     modin_df = pd.read_csv(
@@ -2241,8 +2241,8 @@ def test___setitem__partitions_aligning():
 
 
 @pytest.mark.skipif(
-    UsePlainPandasQueryCompiler.get(),
-    reason="PlainPandasQueryCompiler does not currently support IO functions.",
+    NativeDataframeMode.get() is not None,
+    reason="NativeQueryCompiler does not currently support IO functions.",
 )
 def test___setitem__with_mismatched_partitions():
     with ensure_clean(".csv") as fname: