diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cce78ec2a64..1108fd6ffa7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -214,6 +214,7 @@ jobs:
       unidist: ${{ steps.filter.outputs.unidist }}
       engines: ${{ steps.engines.outputs.engines }}
       experimental: ${{ steps.experimental.outputs.experimental }}
+      test-small-query-compiler: ${{ steps.filter.outputs.test-small-query-compiler }}
     steps:
       - uses: actions/checkout@v4
       - uses: dorny/paths-filter@v3
@@ -636,8 +637,8 @@ jobs:
       - run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py

   test-small-query-compiler:
-    needs: [changes, lint-flake8, lint-black, test-api, test-headers]
-    if: ${{ needs.changes.outputs.test-small-query-compiler == 'true' }}
+    needs: [lint-flake8, execution-filter]
+    if: ${{ needs.execution-filter.outputs.test-small-query-compiler == 'true' }}
     runs-on: ubuntu-latest
     defaults:
       run:
diff --git a/docs/conf.py b/docs/conf.py
index 346fbcd8611..99355183995 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -54,9 +54,7 @@ def noop_decorator(*args, **kwargs):
 if not hasattr(sys.modules["unidist"].core.base, "object_ref"):
     sys.modules["unidist"].core.base.object_ref = type("object_ref", (object,), {})
 if not hasattr(sys.modules["unidist"].core.base.object_ref, "ObjectRef"):
-    sys.modules["unidist"].core.base.object_ref.ObjectRef = type(
-        "ObjectRef", (object,), {}
-    )
+    sys.modules["unidist"].core.base.object_ref.ObjectRef = type("ObjectRef", (object,), {})

 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 import modin
diff --git a/modin/config/__init__.py b/modin/config/__init__.py
index 3e39b9e811a..467495d6b0a 100644
--- a/modin/config/__init__.py
+++ b/modin/config/__init__.py
@@ -52,7 +52,6 @@
     TestReadFromSqlServer,
     TrackFileLeaks,
     UsePlainPandasQueryCompiler,
-
 )

 from modin.config.pubsub import Parameter, ValueSource, context
diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py
index f18887e0ab1..11911996b65 100644
--- a/modin/core/dataframe/algebra/binary.py
+++ b/modin/core/dataframe/algebra/binary.py
@@ -70,13 +70,10 @@ def maybe_compute_dtypes_common_cast(
     The dtypes of the operands are supposed to be known.
     """
     if not trigger_computations:
-        if not first._modin_frame.has_materialized_dtypes:
+        if not first.has_materialized_dtypes():
             return None

-        if (
-            isinstance(second, type(first))
-            and not second._modin_frame.has_materialized_dtypes
-        ):
+        if isinstance(second, type(first)) and not second.has_materialized_dtypes():
             return None

     dtypes_first = first._modin_frame.dtypes.to_dict()
diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py
index 9dd42fa46b2..b6aca9bcc6a 100644
--- a/modin/core/dataframe/algebra/tree_reduce.py
+++ b/modin/core/dataframe/algebra/tree_reduce.py
@@ -67,7 +67,7 @@ def caller(
             _axis = kwargs.get("axis") if axis is None else axis

             new_dtypes = None
-            if compute_dtypes and query_compiler._modin_frame.has_materialized_dtypes:
+            if compute_dtypes and query_compiler.has_materialized_dtypes():
                 new_dtypes = str(compute_dtypes(query_compiler.dtypes, *args, **kwargs))

             return query_compiler.__constructor__(
diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index 8e50e69bc2b..337882b702a 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -4521,6 +4521,27 @@ def has_multiindex(self, axis=0):
         assert axis == 1
         return isinstance(self.columns, pandas.MultiIndex)

+    def has_materialized_dtypes(self):
+        """
+        Check if the underlying modin frame has materialized dtypes.
+
+        Returns
+        -------
+        bool
+            True if the underlying modin frame has materialized dtypes, False otherwise.
+        """
+        return True
+
+    def set_frame_dtypes_cache(self, dtypes):
+        """
+        Set dtypes cache for the underlying modin frame.
+
+        Parameters
+        ----------
+        dtypes : pandas.Series, ModinDtypes, callable or None
+        """
+        pass
+
     def get_index_name(self, axis=0):
         """
         Get index name of specified axis.
diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py
index 3959b86f3ab..e72babb7ea8 100644
--- a/modin/core/storage_formats/pandas/aggregations.py
+++ b/modin/core/storage_formats/pandas/aggregations.py
@@ -71,7 +71,7 @@ def corr_method(
                 np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
                 index=new_columns,
             )
-        elif numeric_only and qc._modin_frame.has_materialized_dtypes:
+        elif numeric_only and qc.has_materialized_dtypes():
             old_dtypes = qc._modin_frame.dtypes

             new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 5281e92d8c5..df93d5c774a 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -365,6 +365,27 @@ def copy(self):

     # END Copy

+    def has_materialized_dtypes(self):
+        """
+        Check if the underlying modin frame has materialized dtypes.
+
+        Returns
+        -------
+        bool
+            True if the underlying modin frame has materialized dtypes, False otherwise.
+        """
+        return self._modin_frame.has_materialized_dtypes
+
+    def set_frame_dtypes_cache(self, dtypes):
+        """
+        Set dtypes cache for the underlying modin frame.
+
+        Parameters
+        ----------
+        dtypes : pandas.Series, ModinDtypes, callable or None
+        """
+        self._modin_frame.set_dtypes_cache(dtypes)
+
     # Append/Concat/Join (Not Merge)
     # The append/concat/join operations should ideally never trigger remote
     # compute. These operations should only ever be manipulations of the
@@ -580,10 +601,7 @@ def reindex(self, axis, labels, **kwargs):
         new_index, indexer = (self.index, None) if axis else self.index.reindex(labels)
         new_columns, _ = self.columns.reindex(labels) if axis else (self.columns, None)
         new_dtypes = None
-        if (
-            self._modin_frame.has_materialized_dtypes
-            and kwargs.get("method", None) is None
-        ):
+        if self.has_materialized_dtypes() and kwargs.get("method", None) is None:
             # For columns, defining types is easier because we don't have to calculate the common
             # type, since the entire column is filled. A simple `reindex` covers our needs.
             # For rows, we can avoid calculating common types if we know that no new strings of
@@ -2650,7 +2668,7 @@ def fillna(df):
                 }
                 return df.fillna(value=func_dict, **kwargs)

-            if self._modin_frame.has_materialized_dtypes:
+            if self.has_materialized_dtypes():
                 dtypes = self._modin_frame.dtypes
                 value_dtypes = pandas.DataFrame(
                     {k: [v] for (k, v) in value.items()}
@@ -2663,7 +2681,7 @@ def fillna(df):
                     new_dtypes = dtypes

         else:
-            if self._modin_frame.has_materialized_dtypes:
+            if self.has_materialized_dtypes():
                 dtype = pandas.Series(value).dtype
                 if all(
                     find_common_type([t, dtype]) == t for t in self._modin_frame.dtypes
@@ -2898,7 +2916,7 @@ def _set_item(df, row_loc):  # pragma: no cover
             df.loc[row_loc.squeeze(axis=1), col_loc] = item
             return df

-        if self._modin_frame.has_materialized_dtypes and is_scalar(item):
+        if self.has_materialized_dtypes() and is_scalar(item):
             new_dtypes = self.dtypes.copy()
             old_dtypes = new_dtypes[col_loc]
             item_type = extract_dtype(item)
@@ -4607,7 +4625,7 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item):
             # compute dtypes only if assigning entire columns
             isinstance(row_numeric_index, slice)
             and row_numeric_index == slice(None)
-            and self._modin_frame.has_materialized_dtypes
+            and self.has_materialized_dtypes()
         ):
             new_dtypes = self.dtypes.copy()
             new_dtypes.iloc[col_numeric_index] = broadcasted_dtypes.values
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index bf7670c2621..6c72324f27f 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -1075,7 +1075,7 @@ def astype(

         if not copy:
             # If the new types match the old ones, then copying can be avoided
-            if self._query_compiler._modin_frame.has_materialized_dtypes:
+            if self._query_compiler.has_materialized_dtypes():
                 frame_dtypes = self._query_compiler._modin_frame.dtypes
                 if isinstance(dtype, dict):
                     for col in dtype:
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 0acc0b1ccdc..516537c8108 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -155,8 +155,7 @@ def __init__(
         # use this list to update inplace when there is a shallow copy.
         self._siblings = []
         if isinstance(data, (DataFrame, Series)):
-            query_compiler = data._query_compiler.copy()
-            self._query_compiler = query_compiler
+            self._query_compiler = data._query_compiler.copy()
             if index is not None and any(i not in data.index for i in index):
                 raise NotImplementedError(
                     "Passing non-existant columns or index values to constructor not"
diff --git a/modin/pandas/io.py b/modin/pandas/io.py
index 615bab25762..5bb599dd749 100644
--- a/modin/pandas/io.py
+++ b/modin/pandas/io.py
@@ -64,7 +64,6 @@
 from pandas.io.parsers import TextFileReader
 from pandas.io.parsers.readers import _c_parser_defaults

-
 from modin.config import ModinNumpy, UsePlainPandasQueryCompiler
 from modin.error_message import ErrorMessage
 from modin.experimental.core.storage_formats.pandas.small_query_compiler import (
diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py
index 09b19637e88..ef7f199b57c 100644
--- a/modin/pandas/utils.py
+++ b/modin/pandas/utils.py
@@ -22,13 +22,7 @@
 from pandas._typing import AggFuncType, AggFuncTypeBase, AggFuncTypeDict, IndexLabel
 from pandas.util._decorators import doc

-
-from modin.experimental.core.storage_formats.pandas.small_query_compiler import (
-    SmallQueryCompiler,
-)
-
-from modin.utils import hashable
+from modin.utils import hashable

 _doc_binary_operation = """
 Return {operation} of {left} and `{right}` (binary operator `{bin_op}`).
diff --git a/modin/tests/core/storage_formats/pandas/test_internals.py b/modin/tests/core/storage_formats/pandas/test_internals.py
index 9a16022a371..680e3825c3f 100644
--- a/modin/tests/core/storage_formats/pandas/test_internals.py
+++ b/modin/tests/core/storage_formats/pandas/test_internals.py
@@ -1138,14 +1138,14 @@ def test_binary_op_preserve_dtypes():
     def setup_cache(df, has_cache=True):
         if has_cache:
             _ = df.dtypes
-            assert df._query_compiler._modin_frame.has_materialized_dtypes
+            assert df._query_compiler.has_materialized_dtypes()
         else:
-            df._query_compiler._modin_frame.set_dtypes_cache(None)
-            assert not df._query_compiler._modin_frame.has_materialized_dtypes
+            df._query_compiler.set_frame_dtypes_cache(None)
+            assert not df._query_compiler.has_materialized_dtypes()
         return df

     def assert_cache(df, has_cache=True):
-        assert not (has_cache ^ df._query_compiler._modin_frame.has_materialized_dtypes)
+        assert not (has_cache ^ df._query_compiler.has_materialized_dtypes())

     # Check when `other` is a non-distributed object
     assert_cache(setup_cache(df) + 2.0)
@@ -1195,30 +1195,30 @@ def test_setitem_bool_preserve_dtypes():
     df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
     indexer = pd.Series([True, False, True, False])

-    assert df._query_compiler._modin_frame.has_materialized_dtypes
+    assert df._query_compiler.has_materialized_dtypes()

     # slice(None) as a col_loc
     df.loc[indexer] = 2.0
-    assert df._query_compiler._modin_frame.has_materialized_dtypes
+    assert df._query_compiler.has_materialized_dtypes()

     # list as a col_loc
     df.loc[indexer, ["a", "b"]] = 2.0
-    assert df._query_compiler._modin_frame.has_materialized_dtypes
+    assert df._query_compiler.has_materialized_dtypes()

     # scalar as a col_loc
     df.loc[indexer, "a"] = 2.0
-    assert df._query_compiler._modin_frame.has_materialized_dtypes
+    assert df._query_compiler.has_materialized_dtypes()


 def test_setitem_unhashable_preserve_dtypes():
     df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
-    assert df._query_compiler._modin_frame.has_materialized_dtypes
+    assert df._query_compiler.has_materialized_dtypes()

     df2 = pd.DataFrame([[9, 9], [5, 5]])
-    assert df2._query_compiler._modin_frame.has_materialized_dtypes
+    assert df2._query_compiler.has_materialized_dtypes()

     df[[1, 2]] = df2
-    assert df._query_compiler._modin_frame.has_materialized_dtypes
+    assert df._query_compiler.has_materialized_dtypes()


 @pytest.mark.parametrize("modify_config", [{RangePartitioning: True}], indirect=True)
@@ -1246,7 +1246,7 @@ def test_reindex_preserve_dtypes(kwargs):
     df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})

     reindexed_df = df.reindex(**kwargs)
-    assert reindexed_df._query_compiler._modin_frame.has_materialized_dtypes
+    assert reindexed_df._query_compiler.has_materialized_dtypes()


 class TestModinIndexIds:
@@ -2039,7 +2039,7 @@ def test_concat_axis_1(
         )
         # setting columns cache to 'None', in order to prevent completing 'dtypes' with the materialized columns
         md_df._query_compiler._modin_frame.set_columns_cache(None)
-        md_df._query_compiler._modin_frame.set_dtypes_cache(
+        md_df._query_compiler.set_frame_dtypes_cache(
            ModinDtypes(
                DtypesDescriptor(
                    known_dtypes,
@@ -2100,7 +2100,7 @@ def test_update_parent(self):

        # 'df2' will have a 'DtypesDescriptor' with unknown dtypes for a column 'c'
        df2 = pd.DataFrame({"c": [2, 3, 4]})
-        df2._query_compiler._modin_frame.set_dtypes_cache(None)
+        df2._query_compiler.set_frame_dtypes_cache(None)
        dtypes_cache = df2._query_compiler._modin_frame._dtypes
        assert isinstance(
            dtypes_cache._value, DtypesDescriptor
@@ -2226,7 +2226,7 @@ def test_set_index_with_dupl_labels(self):
        """Verify that setting duplicated columns doesn't propagate any errors to a user."""
        df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [3.5, 4.4, 5.5, 6.6]})
        # making sure that dtypes are represented by an unmaterialized dtypes-descriptor
-        df._query_compiler._modin_frame.set_dtypes_cache(None)
+        df._query_compiler.set_frame_dtypes_cache(None)
        df.columns = ["a", "a"]

        assert df.dtypes.equals(
@@ -2252,8 +2252,8 @@ def test_concat_mi(self):
        )

        # Drop actual dtypes in order to use partially-known dtypes
-        md_df1._query_compiler._modin_frame.set_dtypes_cache(None)
-        md_df2._query_compiler._modin_frame.set_dtypes_cache(None)
+        md_df1._query_compiler.set_frame_dtypes_cache(None)
+        md_df2._query_compiler.set_frame_dtypes_cache(None)

        md_res = pd.concat([md_df1, md_df2], axis=1)
        pd_res = pandas.concat([pd_df1, pd_df2], axis=1)
@@ -2282,9 +2282,9 @@ def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype):
        with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
            df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [3, 4]})
            if self_dtype == "materialized":
-                assert df._query_compiler._modin_frame.has_materialized_dtypes
+                assert df._query_compiler.has_materialized_dtypes()
            elif self_dtype == "partial":
-                df._query_compiler._modin_frame.set_dtypes_cache(
+                df._query_compiler.set_frame_dtypes_cache(
                    ModinDtypes(
                        DtypesDescriptor(
                            {"a": np.dtype("int64")},
@@ -2293,7 +2293,7 @@ def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype):
                    )
                )
            elif self_dtype == "unknown":
-                df._query_compiler._modin_frame.set_dtypes_cache(None)
+                df._query_compiler.set_frame_dtypes_cache(None)
            else:
                raise NotImplementedError(self_dtype)

@@ -2304,7 +2304,7 @@ def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype):
                    [np.dtype("int64"), value_dtype, np.dtype("int64")],
                    index=["a", "b", "c"],
                )
-                assert df._query_compiler._modin_frame.has_materialized_dtypes
+                assert df._query_compiler.has_materialized_dtypes()
                assert df.dtypes.equals(result_dtype)
            elif self_dtype == "partial":
                result_dtype = DtypesDescriptor(
@@ -2339,9 +2339,9 @@ def test_preserve_dtypes_insert(self, self_dtype, value, value_dtype):
        with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
            df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
            if self_dtype == "materialized":
-                assert df._query_compiler._modin_frame.has_materialized_dtypes
+                assert df._query_compiler.has_materialized_dtypes()
            elif self_dtype == "partial":
-                df._query_compiler._modin_frame.set_dtypes_cache(
+                df._query_compiler.set_frame_dtypes_cache(
                    ModinDtypes(
                        DtypesDescriptor(
                            {"a": np.dtype("int64")}, cols_with_unknown_dtypes=["b"]
@@ -2349,7 +2349,7 @@ def test_preserve_dtypes_insert(self, self_dtype, value, value_dtype):
                    )
                )
            elif self_dtype == "unknown":
-                df._query_compiler._modin_frame.set_dtypes_cache(None)
+                df._query_compiler.set_frame_dtypes_cache(None)
            else:
                raise NotImplementedError(self_dtype)

@@ -2360,7 +2360,7 @@ def test_preserve_dtypes_insert(self, self_dtype, value, value_dtype):
                    [value_dtype, np.dtype("int64"), np.dtype("int64")],
                    index=["c", "a", "b"],
                )
-                assert df._query_compiler._modin_frame.has_materialized_dtypes
+                assert df._query_compiler.has_materialized_dtypes()
                assert df.dtypes.equals(result_dtype)
            elif self_dtype == "partial":
                result_dtype = DtypesDescriptor(
@@ -2390,7 +2390,7 @@ def test_get_dummies_case(self):
            cols = [col for col in res.columns if col != "items"]
            res[cols] = res[cols] / res[cols].mean()

-            assert res._query_compiler._modin_frame.has_materialized_dtypes
+            assert res._query_compiler.has_materialized_dtypes()

            patch.assert_not_called()

@@ -2405,19 +2405,19 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
        else:
            df._query_compiler._modin_frame.set_index_cache(None)
            assert not df._query_compiler._modin_frame.has_materialized_index
-        assert df._query_compiler._modin_frame.has_materialized_dtypes
+        assert df._query_compiler.has_materialized_dtypes()

        res = df.reset_index(drop=drop)
        if drop:
            # we droped the index, so columns and dtypes shouldn't change
-            assert res._query_compiler._modin_frame.has_materialized_dtypes
+            assert res._query_compiler.has_materialized_dtypes()
            assert res.dtypes.equals(df.dtypes)
        else:
            if has_materialized_index:
                # we should have inserted index dtype into the descriptor,
                # and since both of them are materialized, the result should be
                # materialized too
-                assert res._query_compiler._modin_frame.has_materialized_dtypes
+                assert res._query_compiler.has_materialized_dtypes()
                assert res.dtypes.equals(
                    pandas.Series(
                        [np.dtype("int64"), np.dtype("int64")], index=["index", "a"]
@@ -2436,7 +2436,7 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):

        # case 2: 'df' has partial dtype by default
        df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
-        df._query_compiler._modin_frame.set_dtypes_cache(
+        df._query_compiler.set_frame_dtypes_cache(
            ModinDtypes(
                DtypesDescriptor(
                    {"a": np.dtype("int64")}, cols_with_unknown_dtypes=["b"]
diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py
index 9c6ed0cc2af..6c2d88390db 100644
--- a/modin/tests/pandas/dataframe/test_map_metadata.py
+++ b/modin/tests/pandas/dataframe/test_map_metadata.py
@@ -19,7 +19,12 @@
 import pytest

 import modin.pandas as pd
-from modin.config import MinPartitionSize, NPartitions, StorageFormat
+from modin.config import (
+    MinPartitionSize,
+    NPartitions,
+    StorageFormat,
+    UsePlainPandasQueryCompiler,
+)
 from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype
 from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas
 from modin.pandas.testing import assert_index_equal, assert_series_equal
@@ -299,7 +304,10 @@ def test_copy(data):
     assert new_modin_df.columns is not modin_df.columns
     assert new_modin_df.dtypes is not modin_df.dtypes

-    if get_current_execution() != "BaseOnPython":
+    if (
+        get_current_execution() != "BaseOnPython"
+        and not UsePlainPandasQueryCompiler.get()
+    ):
         assert np.array_equal(
             new_modin_df._query_compiler._modin_frame._partitions,
             modin_df._query_compiler._modin_frame._partitions,
@@ -473,13 +481,13 @@ def test_astype_copy(has_dtypes):
     data = [1]
     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
     if not has_dtypes:
-        modin_df._query_compiler._modin_frame.set_dtypes_cache(None)
+        modin_df._query_compiler.set_frame_dtypes_cache(None)
     eval_general(modin_df, pandas_df, lambda df: df.astype(str, copy=False))

     # trivial case where copying can be avoided, behavior should match pandas
     s1 = pd.Series([1, 2])
     if not has_dtypes:
-        modin_df._query_compiler._modin_frame.set_dtypes_cache(None)
+        modin_df._query_compiler.set_frame_dtypes_cache(None)
     s2 = s1.astype("int64", copy=False)
     s2[0] = 10
     df_equals(s1, s2)
@@ -565,6 +573,10 @@ def test_astype_int64_to_astype_category_github_issue_6259():
     get_current_execution() == "BaseOnPython",
     reason="BaseOnPython doesn't have proxy categories",
 )
+@pytest.mark.skipif(
+    UsePlainPandasQueryCompiler.get(),
+    reason="PlainPandasQueryCompiler doesn't have proxy categories",
+)
 class TestCategoricalProxyDtype:
     """This class contains test and test usilities for the ``LazyProxyCategoricalDtype`` class."""

@@ -787,6 +799,10 @@ def comparator(df1, df2):
     )


+@pytest.mark.skipif(
+    UsePlainPandasQueryCompiler.get(),
+    reason="PlainPandasQueryCompiler does not contain partitions.",
+)
 def test_convert_dtypes_multiple_row_partitions():
     # Column 0 should have string dtype
     modin_part1 = pd.DataFrame(["a"]).convert_dtypes()
@@ -811,7 +827,7 @@ def test_convert_dtypes_5653():
     modin_part1 = pd.DataFrame({"col1": ["a", "b", "c", "d"]})
     modin_part2 = pd.DataFrame({"col1": [None, None, None, None]})
     modin_df = pd.concat([modin_part1, modin_part2])
-    if StorageFormat.get() == "Pandas":
+    if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get():
         assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1)
     modin_df = modin_df.convert_dtypes()
     assert len(modin_df.dtypes) == 1
diff --git a/setup.cfg b/setup.cfg
index ab9bf80c882..884ec718a6a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -73,6 +73,3 @@ exclude_lines =
     raise ImportError
     assert
     pass
-
-[pytest]
-addopts = --env=MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER=True
\ No newline at end of file
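
A minimal usage sketch (not part of the patch) of the new query-compiler helpers introduced above; it assumes the default pandas storage format, where the query compiler wraps a `_modin_frame`, and mirrors what the updated tests in `test_internals.py` exercise:

```python
import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

# Callers now go through the query compiler instead of reaching into the
# private frame: `qc.has_materialized_dtypes()` replaces
# `qc._modin_frame.has_materialized_dtypes`.
if df._query_compiler.has_materialized_dtypes():
    print(df.dtypes)

# Dropping (or replacing) the dtypes cache likewise goes through the new
# `set_frame_dtypes_cache`, which delegates to `_modin_frame.set_dtypes_cache`.
df._query_compiler.set_frame_dtypes_cache(None)
assert not df._query_compiler.has_materialized_dtypes()
```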