FEAT-#7340: Add more granular lazy flags to query compiler (#7348)
Signed-off-by: Jonathan Shi <[email protected]>
noloerino committed Jul 29, 2024
1 parent 7c1dde0 commit a40cef7
Showing 7 changed files with 111 additions and 27 deletions.
45 changes: 37 additions & 8 deletions modin/core/storage_formats/base/query_compiler.py
@@ -120,10 +120,21 @@ class BaseQueryCompiler(
Attributes
----------
lazy_execution : bool
Whether underlying execution engine is designed to be executed in a lazy mode only.
If True, such QueryCompiler will be handled differently at the front-end in order
to reduce execution triggering as much as possible.
lazy_row_labels : bool, default False
True if the backend defers computations of the row labels (`df.index` for a frame).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_row_count : bool, default False
True if the backend defers computations of the number of rows (`len(df.index)`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_types : bool, default False
True if the backend defers computations of the column types (`df.dtypes`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_labels : bool, default False
True if the backend defers computations of the column labels (`df.columns`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_count : bool, default False
True if the backend defers computations of the number of columns (`len(df.columns)`).
Used by the frontend to avoid unnecessary execution or defer error validation.
_shape_hint : {"row", "column", None}, default: None
Shape hint for frames known to be a column or a row, otherwise None.
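The attributes above replace the single coarse lazy_execution flag with one flag per piece of frame metadata. As a rough illustration (not part of this commit; the class name and flag choices are hypothetical), a backend that produces row labels and row counts lazily but keeps column metadata eager might declare itself like this:

from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler


class RowLazyQueryCompiler(BaseQueryCompiler):
    # Hypothetical backend sketch: row-axis metadata is deferred...
    lazy_row_labels = True
    lazy_row_count = True
    # ...while column labels, column count, and dtypes remain eager, so the
    # frontend can still validate against them without triggering execution.
    lazy_column_types = False
    lazy_column_labels = False
    lazy_column_count = False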
@@ -197,7 +208,25 @@ def default_to_pandas(self, pandas_op, *args, **kwargs) -> Self:
# some of these abstract methods, but for the sake of generality they are
# treated differently.

lazy_execution = False
lazy_row_labels = False
lazy_row_count = False
lazy_column_types = False
lazy_column_labels = False
lazy_column_count = False

@property
def lazy_shape(self):
"""
Whether either of the underlying dataframe's dimensions (row count/column count) is computed lazily.
If True, the frontend should avoid length/shape checks as much as possible.
Returns
-------
bool
"""
return self.lazy_row_count or self.lazy_column_count

_shape_hint = None

# Metadata modification abstract methods
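lazy_shape is derived rather than stored: it reports whether asking for either dimension could trigger execution. Below is a minimal standalone sketch of the same logic; the demo class is hypothetical, only the attribute names mirror the query compiler flags above.

class _ShapeFlags:
    def __init__(self, lazy_row_count, lazy_column_count):
        self.lazy_row_count = lazy_row_count
        self.lazy_column_count = lazy_column_count

    @property
    def lazy_shape(self):
        # Lazy if either dimension would have to be computed on demand.
        return self.lazy_row_count or self.lazy_column_count


assert _ShapeFlags(False, False).lazy_shape is False
assert _ShapeFlags(True, False).lazy_shape is True
assert _ShapeFlags(False, True).lazy_shape is True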
@@ -4524,7 +4553,7 @@ def has_multiindex(self, axis=0):
@property
def frame_has_materialized_dtypes(self) -> bool:
"""
Check if the undelying dataframe has materialized dtypes.
Check if the underlying dataframe has materialized dtypes.
Returns
-------
@@ -4535,7 +4564,7 @@ def frame_has_materialized_dtypes(self) -> bool:
@property
def frame_has_materialized_columns(self) -> bool:
"""
Check if the undelying dataframe has materialized columns.
Check if the underlying dataframe has materialized columns.
Returns
-------
@@ -4546,7 +4575,7 @@ def frame_has_materialized_columns(self) -> bool:
@property
def frame_has_materialized_index(self) -> bool:
"""
Check if the undelying dataframe has materialized index.
Check if the underlying dataframe has materialized index.
Returns
-------
64 changes: 57 additions & 7 deletions modin/core/storage_formats/pandas/query_compiler.py
@@ -276,19 +276,69 @@ def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[str] = Non
self._shape_hint = shape_hint

@property
def lazy_execution(self):
def lazy_row_labels(self):
"""
Whether underlying Modin frame should be executed in a lazy mode.
Whether the row labels are computed lazily.
If True, such QueryCompiler will be handled differently at the front-end in order
to reduce triggering the computation as much as possible.
Equivalent to `not self.frame_has_materialized_index`.
Returns
-------
bool
"""
frame = self._modin_frame
return not frame.has_materialized_index or not frame.has_materialized_columns
return not self.frame_has_materialized_index

@property
def lazy_row_count(self):
"""
Whether the row count is computed lazily.
Equivalent to `not self.frame_has_materialized_index`.
Returns
-------
bool
"""
return not self.frame_has_materialized_index

@property
def lazy_column_types(self):
"""
Whether the dtypes are computed lazily.
Equivalent to `not self.frame_has_materialized_dtypes`.
Returns
-------
bool
"""
return not self.frame_has_materialized_dtypes

@property
def lazy_column_labels(self):
"""
Whether the column labels are computed lazily.
Equivalent to `not self.frame_has_materialized_columns`.
Returns
-------
bool
"""
return not self.frame_has_materialized_columns

@property
def lazy_column_count(self):
"""
Whether the column count is computed lazily.
Equivalent to `not self.frame_has_materialized_columns`.
Returns
-------
bool
"""
return not self.frame_has_materialized_columns

def finalize(self):
self._modin_frame.finalize()
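Each of the PandasQueryCompiler properties above simply inverts the corresponding frame_has_materialized_* check on the underlying frame. A hedged usage sketch (assuming Modin is installed with its default pandas storage format): consult the flag before touching metadata that may not be materialized yet.

import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
qc = df._query_compiler

# Only read dtypes eagerly when they are already materialized; otherwise
# skip, since accessing them could trigger their computation.
if not qc.lazy_column_types:
    print(df.dtypes)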
@@ -607,7 +657,7 @@ def reindex(self, axis, labels, **kwargs):
return self.__constructor__(new_modin_frame)

def reset_index(self, **kwargs) -> PandasQueryCompiler:
if self.lazy_execution:
if self.lazy_row_labels:

def _reset(df, *axis_lengths, partition_idx): # pragma: no cover
df = df.reset_index(**kwargs)
17 changes: 11 additions & 6 deletions modin/pandas/base.py
@@ -1068,7 +1068,7 @@ def astype(
# will handle errors where dtype dict includes keys that are not
# in columns.
if (
not self._query_compiler.lazy_execution
not self._query_compiler.lazy_column_labels
and not set(dtype.keys()).issubset(set(self._query_compiler.columns))
and errors == "raise"
):
@@ -1462,7 +1462,9 @@ def drop(
axes[axis] = [axes[axis]]
# In case of lazy execution we should bypass these error checking components
# because they can force the materialization of the row or column labels.
if self._query_compiler.lazy_execution:
if (axis == "index" and self._query_compiler.lazy_row_labels) or (
axis == "columns" and self._query_compiler.lazy_column_labels
):
continue
if errors == "raise":
non_existent = pandas.Index(axes[axis]).difference(
@@ -2657,7 +2659,10 @@ def reset_index(
# exist.
if (
not drop
and not self._query_compiler.lazy_execution
and not (
self._query_compiler.lazy_column_labels
or self._query_compiler.lazy_row_labels
)
and not self._query_compiler.has_multiindex()
and all(n in self.columns for n in ["level_0", "index"])
):
@@ -3944,7 +3949,7 @@ def __getitem__(self, key) -> Self:
BasePandasDataset
Located dataset.
"""
if not self._query_compiler.lazy_execution and len(self) == 0:
if not self._query_compiler.lazy_row_count and len(self) == 0:
return self._default_to_pandas("__getitem__", key)
# see if we can slice the rows
# This lets us reuse code in pandas to error check
@@ -4075,7 +4080,7 @@ def _getitem_slice(self, key: slice) -> Self:
if is_full_grab_slice(
key,
# Avoid triggering shape computation for lazy executions
sequence_len=(None if self._query_compiler.lazy_execution else len(self)),
sequence_len=(None if self._query_compiler.lazy_row_count else len(self)),
):
return self.copy()
return self.iloc[key]
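The frontend changes in base.py swap the old blanket lazy_execution checks for the specific flag each code path actually depends on: the length-based fast paths (__getitem__, _getitem_slice, and the DataFrame.insert/setitem paths further below) only need lazy_row_count. A hypothetical helper capturing that guard:

def row_count_or_none(df):
    # Hypothetical sketch: return len(df) only when it will not trigger
    # materialization of the row labels; otherwise report "unknown".
    if df._query_compiler.lazy_row_count:
        return None
    return len(df)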
@@ -4301,7 +4306,7 @@ def __getattribute__(self, item) -> Any:
Any
"""
attr = super().__getattribute__(item)
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape:
# We default to pandas on empty DataFrames. This avoids a large amount of
# pain in underlying implementation and returns a result immediately rather
# than dealing with the edge cases that empty DataFrames have.
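Label validation is likewise skipped per axis: drop() now bypasses its error checks only when the labels on the axis being dropped are lazy, rather than whenever anything about the frame is lazy. A rough sketch of that pattern (the helper itself is hypothetical, not part of this commit):

import pandas


def validate_labels_exist(qc, axis, labels, errors="raise"):
    # Skip validation only when it would force materialization of the labels
    # on this particular axis.
    if axis == "index" and qc.lazy_row_labels:
        return
    if axis == "columns" and qc.lazy_column_labels:
        return
    existing = qc.index if axis == "index" else qc.columns
    non_existent = pandas.Index(labels).difference(existing)
    if errors == "raise" and len(non_existent) > 0:
        raise KeyError(f"{list(non_existent)} not found in axis")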
4 changes: 2 additions & 2 deletions modin/pandas/dataframe.py
@@ -1084,7 +1084,7 @@ def insert(
+ f"{len(value.columns)} columns instead."
)
value = value.squeeze(axis=1)
if not self._query_compiler.lazy_execution and len(self.index) == 0:
if not self._query_compiler.lazy_row_count and len(self.index) == 0:
if not hasattr(value, "index"):
try:
value = pandas.Series(value)
@@ -2783,7 +2783,7 @@ def setitem_unhashable_key(df, value):
if not isinstance(value, (Series, Categorical, np.ndarray, list, range)):
value = list(value)

if not self._query_compiler.lazy_execution and len(self.index) == 0:
if not self._query_compiler.lazy_row_count and len(self.index) == 0:
new_self = self.__constructor__({key: value}, columns=self.columns)
self._update_inplace(new_self._query_compiler)
else:
2 changes: 1 addition & 1 deletion modin/pandas/general.py
@@ -491,7 +491,7 @@ def concat(
for obj in list_of_objs
if (
isinstance(obj, (Series, pandas.Series))
or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_execution)
or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_shape)
or sum(obj.shape) > 0
)
]
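In concat, objects whose shape is lazy are kept in the list even though sum(obj.shape) cannot be checked cheaply; proving such a frame is empty would require materializing its dimensions. Restated as a standalone predicate (a sketch mirroring the filter above, not an API added by this commit):

def keep_in_concat(obj):
    # Keep Series, keep frames whose shape is computed lazily (we cannot
    # cheaply prove they are empty), and keep any object with non-zero shape.
    import pandas
    from modin.pandas import DataFrame, Series

    return (
        isinstance(obj, (Series, pandas.Series))
        or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_shape)
        or sum(obj.shape) > 0
    )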
2 changes: 1 addition & 1 deletion modin/pandas/groupby.py
@@ -196,7 +196,7 @@ def __getattr__(self, key):

def __getattribute__(self, item):
attr = super().__getattribute__(item)
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape:
# We default to pandas on empty DataFrames. This avoids a large amount of
# pain in underlying implementation and returns a result immediately rather
# than dealing with the edge cases that empty DataFrames have.
4 changes: 2 additions & 2 deletions modin/tests/pandas/test_groupby.py
@@ -2752,15 +2752,15 @@ def lazy_frame(self):
donor_obj = pd.DataFrame()._query_compiler

self._mock_obj = mock.patch(
f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_execution",
f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_shape",
new_callable=mock.PropertyMock,
)
patch_obj = self._mock_obj.__enter__()
patch_obj.return_value = True

df = pd.DataFrame(**self._df_kwargs)
# The frame is lazy until `self.__exit__()` is called
assert df._query_compiler.lazy_execution
assert df._query_compiler.lazy_shape
return df

def __enter__(self):
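The test fixture now patches lazy_shape instead of lazy_execution. A condensed sketch of the same mocking technique (assuming Modin and an initialized execution engine), using mock.PropertyMock to make an eager query compiler report a lazily computed shape:

from unittest import mock

import modin.pandas as pd

qc_cls = type(pd.DataFrame()._query_compiler)

with mock.patch.object(
    qc_cls, "lazy_shape", new_callable=mock.PropertyMock
) as lazy_shape:
    lazy_shape.return_value = True
    df = pd.DataFrame({"a": [1, 2, 3]})
    # Code paths guarded by lazy_shape can now be exercised on the eager backend.
    assert df._query_compiler.lazy_shape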
