FEAT-#7340: Add more granular lazy flags to query compiler (#7348)
Signed-off-by: Jonathan Shi <[email protected]>
noloerino committed Jul 29, 2024
1 parent 7c1dde0 commit a40cef7
Showing 7 changed files with 111 additions and 27 deletions.
45 changes: 37 additions & 8 deletions modin/core/storage_formats/base/query_compiler.py
@@ -120,10 +120,21 @@ class BaseQueryCompiler(
Attributes
----------
lazy_execution : bool
Whether underlying execution engine is designed to be executed in a lazy mode only.
If True, such QueryCompiler will be handled differently at the front-end in order
to reduce execution triggering as much as possible.
lazy_row_labels : bool, default False
True if the backend defers computations of the row labels (`df.index` for a frame).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_row_count : bool, default False
True if the backend defers computations of the number of rows (`len(df.index)`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_types : bool, default False
True if the backend defers computations of the column types (`df.dtypes`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_labels : bool, default False
True if the backend defers computations of the column labels (`df.columns`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_count : bool, default False
True if the backend defers computations of the number of columns (`len(df.columns)`).
Used by the frontend to avoid unnecessary execution or defer error validation.
_shape_hint : {"row", "column", None}, default: None
Shape hint for frames known to be a column or a row, otherwise None.
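The attributes above replace the single coarse lazy_execution flag with one flag per piece of frame metadata. As a rough illustration (not part of this commit; the class name and flag choices are hypothetical), a backend that produces row labels and row counts lazily but keeps column metadata eager might declare itself like this:

from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler


class RowLazyQueryCompiler(BaseQueryCompiler):
    # Hypothetical backend sketch: row-axis metadata is deferred...
    lazy_row_labels = True
    lazy_row_count = True
    # ...while column labels, column count, and dtypes remain eager, so the
    # frontend can still validate against them without triggering execution.
    lazy_column_types = False
    lazy_column_labels = False
    lazy_column_count = False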
@@ -197,7 +208,25 @@ def default_to_pandas(self, pandas_op, *args, **kwargs) -> Self:
# some of these abstract methods, but for the sake of generality they are
# treated differently.

lazy_execution = False
lazy_row_labels = False
lazy_row_count = False
lazy_column_types = False
lazy_column_labels = False
lazy_column_count = False

@property
def lazy_shape(self):
"""
Whether either of the underlying dataframe's dimensions (row count/column count) is computed lazily.
If True, the frontend should avoid length/shape checks as much as possible.
Returns
-------
bool
"""
return self.lazy_row_count or self.lazy_column_count

_shape_hint = None

# Metadata modification abstract methods
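lazy_shape is derived rather than stored: it reports whether asking for either dimension could trigger execution. Below is a minimal standalone sketch of the same logic; the demo class is hypothetical, only the attribute names mirror the query compiler flags above.

class _ShapeFlags:
    def __init__(self, lazy_row_count, lazy_column_count):
        self.lazy_row_count = lazy_row_count
        self.lazy_column_count = lazy_column_count

    @property
    def lazy_shape(self):
        # Lazy if either dimension would have to be computed on demand.
        return self.lazy_row_count or self.lazy_column_count


assert _ShapeFlags(False, False).lazy_shape is False
assert _ShapeFlags(True, False).lazy_shape is True
assert _ShapeFlags(False, True).lazy_shape is True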
@@ -4524,7 +4553,7 @@ def has_multiindex(self, axis=0):
@property
def frame_has_materialized_dtypes(self) -> bool:
"""
Check if the undelying dataframe has materialized dtypes.
Check if the underlying dataframe has materialized dtypes.
Returns
-------
@@ -4535,7 +4564,7 @@ def frame_has_materialized_dtypes(self) -> bool:
@property
def frame_has_materialized_columns(self) -> bool:
"""
Check if the undelying dataframe has materialized columns.
Check if the underlying dataframe has materialized columns.
Returns
-------
@@ -4546,7 +4575,7 @@ def frame_has_materialized_columns(self) -> bool:
@property
def frame_has_materialized_index(self) -> bool:
"""
Check if the undelying dataframe has materialized index.
Check if the underlying dataframe has materialized index.
Returns
-------
64 changes: 57 additions & 7 deletions modin/core/storage_formats/pandas/query_compiler.py
@@ -276,19 +276,69 @@ def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[str] = Non
self._shape_hint = shape_hint

@property
def lazy_execution(self):
def lazy_row_labels(self):
"""
Whether underlying Modin frame should be executed in a lazy mode.
Whether the row labels are computed lazily.
If True, such QueryCompiler will be handled differently at the front-end in order
to reduce triggering the computation as much as possible.
Equivalent to `not self.frame_has_materialized_index`.
Returns
-------
bool
"""
frame = self._modin_frame
return not frame.has_materialized_index or not frame.has_materialized_columns
return not self.frame_has_materialized_index

@property
def lazy_row_count(self):
"""
Whether the row count is computed lazily.
Equivalent to `not self.frame_has_materialized_index`.
Returns
-------
bool
"""
return not self.frame_has_materialized_index

@property
def lazy_column_types(self):
"""
Whether the dtypes are computed lazily.
Equivalent to `not self.frame_has_materialized_dtypes`.
Returns
-------
bool
"""
return not self.frame_has_materialized_dtypes

@property
def lazy_column_labels(self):
"""
Whether the column labels are computed lazily.
Equivalent to `not self.frame_has_materialized_columns`.
Returns
-------
bool
"""
return not self.frame_has_materialized_columns

@property
def lazy_column_count(self):
"""
Whether the column count is computed lazily.
Equivalent to `not self.frame_has_materialized_columns`.
Returns
-------
bool
"""
return not self.frame_has_materialized_columns

def finalize(self):
self._modin_frame.finalize()
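Each of the PandasQueryCompiler properties above simply inverts the corresponding frame_has_materialized_* check on the underlying frame. A hedged usage sketch (assuming Modin is installed with its default pandas storage format): consult the flag before touching metadata that may not be materialized yet.

import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
qc = df._query_compiler

# Only read dtypes eagerly when they are already materialized; otherwise
# skip, since accessing them could trigger their computation.
if not qc.lazy_column_types:
    print(df.dtypes)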
@@ -607,7 +657,7 @@ def reindex(self, axis, labels, **kwargs):
return self.__constructor__(new_modin_frame)

def reset_index(self, **kwargs) -> PandasQueryCompiler:
if self.lazy_execution:
if self.lazy_row_labels:

def _reset(df, *axis_lengths, partition_idx): # pragma: no cover
df = df.reset_index(**kwargs)
17 changes: 11 additions & 6 deletions modin/pandas/base.py
@@ -1068,7 +1068,7 @@ def astype(
# will handle errors where dtype dict includes keys that are not
# in columns.
if (
not self._query_compiler.lazy_execution
not self._query_compiler.lazy_column_labels
and not set(dtype.keys()).issubset(set(self._query_compiler.columns))
and errors == "raise"
):
@@ -1462,7 +1462,9 @@ def drop(
axes[axis] = [axes[axis]]
# In case of lazy execution we should bypass these error checking components
# because they can force the materialization of the row or column labels.
if self._query_compiler.lazy_execution:
if (axis == "index" and self._query_compiler.lazy_row_labels) or (
axis == "columns" and self._query_compiler.lazy_column_labels
):
continue
if errors == "raise":
non_existent = pandas.Index(axes[axis]).difference(
@@ -2657,7 +2659,10 @@ def reset_index(
# exist.
if (
not drop
and not self._query_compiler.lazy_execution
and not (
self._query_compiler.lazy_column_labels
or self._query_compiler.lazy_row_labels
)
and not self._query_compiler.has_multiindex()
and all(n in self.columns for n in ["level_0", "index"])
):
@@ -3944,7 +3949,7 @@ def __getitem__(self, key) -> Self:
BasePandasDataset
Located dataset.
"""
if not self._query_compiler.lazy_execution and len(self) == 0:
if not self._query_compiler.lazy_row_count and len(self) == 0:
return self._default_to_pandas("__getitem__", key)
# see if we can slice the rows
# This lets us reuse code in pandas to error check
@@ -4075,7 +4080,7 @@ def _getitem_slice(self, key: slice) -> Self:
if is_full_grab_slice(
key,
# Avoid triggering shape computation for lazy executions
sequence_len=(None if self._query_compiler.lazy_execution else len(self)),
sequence_len=(None if self._query_compiler.lazy_row_count else len(self)),
):
return self.copy()
return self.iloc[key]
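The frontend changes in base.py swap the old blanket lazy_execution checks for the specific flag each code path actually depends on: the length-based fast paths (__getitem__, _getitem_slice, and the DataFrame.insert/setitem paths further below) only need lazy_row_count. A hypothetical helper capturing that guard:

def row_count_or_none(df):
    # Hypothetical sketch: return len(df) only when it will not trigger
    # materialization of the row labels; otherwise report "unknown".
    if df._query_compiler.lazy_row_count:
        return None
    return len(df)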
@@ -4301,7 +4306,7 @@ def __getattribute__(self, item) -> Any:
Any
"""
attr = super().__getattribute__(item)
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape:
# We default to pandas on empty DataFrames. This avoids a large amount of
# pain in underlying implementation and returns a result immediately rather
# than dealing with the edge cases that empty DataFrames have.
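Label validation is likewise skipped per axis: drop() now bypasses its error checks only when the labels on the axis being dropped are lazy, rather than whenever anything about the frame is lazy. A rough sketch of that pattern (the helper itself is hypothetical, not part of this commit):

import pandas


def validate_labels_exist(qc, axis, labels, errors="raise"):
    # Skip validation only when it would force materialization of the labels
    # on this particular axis.
    if axis == "index" and qc.lazy_row_labels:
        return
    if axis == "columns" and qc.lazy_column_labels:
        return
    existing = qc.index if axis == "index" else qc.columns
    non_existent = pandas.Index(labels).difference(existing)
    if errors == "raise" and len(non_existent) > 0:
        raise KeyError(f"{list(non_existent)} not found in axis")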
4 changes: 2 additions & 2 deletions modin/pandas/dataframe.py
@@ -1084,7 +1084,7 @@ def insert(
+ f"{len(value.columns)} columns instead."
)
value = value.squeeze(axis=1)
if not self._query_compiler.lazy_execution and len(self.index) == 0:
if not self._query_compiler.lazy_row_count and len(self.index) == 0:
if not hasattr(value, "index"):
try:
value = pandas.Series(value)
@@ -2783,7 +2783,7 @@ def setitem_unhashable_key(df, value):
if not isinstance(value, (Series, Categorical, np.ndarray, list, range)):
value = list(value)

if not self._query_compiler.lazy_execution and len(self.index) == 0:
if not self._query_compiler.lazy_row_count and len(self.index) == 0:
new_self = self.__constructor__({key: value}, columns=self.columns)
self._update_inplace(new_self._query_compiler)
else:
2 changes: 1 addition & 1 deletion modin/pandas/general.py
@@ -491,7 +491,7 @@ def concat(
for obj in list_of_objs
if (
isinstance(obj, (Series, pandas.Series))
or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_execution)
or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_shape)
or sum(obj.shape) > 0
)
]
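In concat, objects whose shape is lazy are kept in the list even though sum(obj.shape) cannot be checked cheaply; proving such a frame is empty would require materializing its dimensions. Restated as a standalone predicate (a sketch mirroring the filter above, not an API added by this commit):

def keep_in_concat(obj):
    # Keep Series, keep frames whose shape is computed lazily (we cannot
    # cheaply prove they are empty), and keep any object with non-zero shape.
    import pandas
    from modin.pandas import DataFrame, Series

    return (
        isinstance(obj, (Series, pandas.Series))
        or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_shape)
        or sum(obj.shape) > 0
    )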
2 changes: 1 addition & 1 deletion modin/pandas/groupby.py
@@ -196,7 +196,7 @@ def __getattr__(self, key):

def __getattribute__(self, item):
attr = super().__getattribute__(item)
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape:
# We default to pandas on empty DataFrames. This avoids a large amount of
# pain in underlying implementation and returns a result immediately rather
# than dealing with the edge cases that empty DataFrames have.
4 changes: 2 additions & 2 deletions modin/tests/pandas/test_groupby.py
@@ -2752,15 +2752,15 @@ def lazy_frame(self):
donor_obj = pd.DataFrame()._query_compiler

self._mock_obj = mock.patch(
f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_execution",
f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_shape",
new_callable=mock.PropertyMock,
)
patch_obj = self._mock_obj.__enter__()
patch_obj.return_value = True

df = pd.DataFrame(**self._df_kwargs)
# The frame is lazy until `self.__exit__()` is called
assert df._query_compiler.lazy_execution
assert df._query_compiler.lazy_shape
return df

def __enter__(self):
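The test fixture now patches lazy_shape instead of lazy_execution. A condensed sketch of the same mocking technique (assuming Modin and an initialized execution engine), using mock.PropertyMock to make an eager query compiler report a lazily computed shape:

from unittest import mock

import modin.pandas as pd

qc_cls = type(pd.DataFrame()._query_compiler)

with mock.patch.object(
    qc_cls, "lazy_shape", new_callable=mock.PropertyMock
) as lazy_shape:
    lazy_shape.return_value = True
    df = pd.DataFrame({"a": [1, 2, 3]})
    # Code paths guarded by lazy_shape can now be exercised on the eager backend.
    assert df._query_compiler.lazy_shape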
