-
Notifications
You must be signed in to change notification settings - Fork 653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FIX-#4478: Support level param in query and eval. #4529
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -795,25 +795,40 @@ def equals(self, other): # noqa: PR01, RT01, D200 | |||||
and self.eq(other).all().all() | ||||||
) | ||||||
|
||||||
def _update_var_dicts_in_kwargs(self, expr, kwargs): | ||||||
def _update_var_dicts_in_kwargs(self, expr, level: int, kwargs): | ||||||
""" | ||||||
Copy variables with "@" prefix in `local_dict` and `global_dict` keys of kwargs. | ||||||
|
||||||
This function exists because we neeed to infer local and variables | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this say |
||||||
ourselves here in the main node instead of having the remote functions | ||||||
infer them on their own. The remote functions take place in a different | ||||||
thread with their own stacks that normally do not match the stack that | ||||||
leads to the Modin eval or query call. | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
expr : str | ||||||
The expression string to search variables with "@" prefix. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
level : int | ||||||
The number of prior stack frames back to look for local variables. | ||||||
level=0 means look just one frame back. | ||||||
kwargs : dict | ||||||
See the documentation for eval() for complete details on the keyword arguments accepted by query(). | ||||||
""" | ||||||
if "@" not in expr: | ||||||
return | ||||||
frame = sys._getframe() | ||||||
try: | ||||||
f_locals = frame.f_back.f_back.f_back.f_back.f_locals | ||||||
f_globals = frame.f_back.f_back.f_back.f_back.f_globals | ||||||
finally: | ||||||
del frame | ||||||
|
||||||
# Bump level up one because this function is wrapped in the Modin | ||||||
# logging function wrapper. The alternative is for the wrapper to | ||||||
# give the Modin functions that deal with `level` the special | ||||||
# treatment of bumping `level` up by one, but that's not so nice | ||||||
# either. | ||||||
level += 1 | ||||||
Comment on lines
+821
to
+826
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we should just discard any |
||||||
|
||||||
frame = sys._getframe(level + 1) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are we bumping level by one again here? |
||||||
f_locals = frame.f_locals | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason you removed the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, please retain the |
||||||
f_globals = frame.f_globals | ||||||
del frame | ||||||
local_names = set(re.findall(r"@([\w]+)", expr)) | ||||||
local_dict = {} | ||||||
global_dict = {} | ||||||
|
@@ -836,14 +851,27 @@ def eval(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200 | |||||
""" | ||||||
Evaluate a string describing operations on ``DataFrame`` columns. | ||||||
""" | ||||||
# Remove `level` from the kwargs if it's already there. "level" doesn't | ||||||
# make sense within the remote execution context, where the remote | ||||||
# functions have a stack which is different from the stack on the | ||||||
# driver node that ends in the modin eval call. | ||||||
level = kwargs.pop("level", 0) | ||||||
|
||||||
# Bump level up one because this function is wrapped in the Modin | ||||||
# logging function wrapper. The alternative is for the wrapper to | ||||||
# give the Modin functions that deal with `level` the special | ||||||
# treatment of bumping `level` up by one, but that's not so nice | ||||||
# either. | ||||||
level += 1 | ||||||
self._validate_eval_query(expr, **kwargs) | ||||||
inplace = validate_bool_kwarg(inplace, "inplace") | ||||||
self._update_var_dicts_in_kwargs(expr, kwargs) | ||||||
self._update_var_dicts_in_kwargs(expr, level + 1, kwargs) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you had already increased |
||||||
# Make sure to not pass level to query compiler | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this comment is needed |
||||||
new_query_compiler = self._query_compiler.eval(expr, **kwargs) | ||||||
return_type = type( | ||||||
pandas.DataFrame(columns=self.columns) | ||||||
.astype(self.dtypes) | ||||||
.eval(expr, **kwargs) | ||||||
.eval(expr, level=level + 1, **kwargs) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. again, why |
||||||
).__name__ | ||||||
if return_type == type(self).__name__: | ||||||
return self._create_or_update_from_compiler(new_query_compiler, inplace) | ||||||
|
@@ -1692,9 +1720,22 @@ def query(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200 | |||||
""" | ||||||
Query the columns of a ``DataFrame`` with a boolean expression. | ||||||
""" | ||||||
self._update_var_dicts_in_kwargs(expr, kwargs) | ||||||
# Remove `level` from the kwargs if it's already there. "level" doesn't | ||||||
# make sense within the remote execution context, where the remote | ||||||
# functions have a stack which is different from the stack on the | ||||||
# driver node that ends in the modin eval call. | ||||||
level = kwargs.pop("level", 0) | ||||||
|
||||||
# Bump level up one because this function is wrapped in the Modin | ||||||
# logging function wrapper. The alternative is for the wrapper to | ||||||
# give the Modin functions that deal with `level` the special | ||||||
# treatment of bumping `level` up by one, but that's not so nice | ||||||
# eitiher. | ||||||
level += 1 | ||||||
Comment on lines
+1723
to
+1734
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. given I see exact same wall of comments three times already, maybe we should turn this into a helper function like def _extract_eval_level(kwargs):
"""
Extract `level` from eval or query `kwargs`.
Remove `level` from the kwargs if it's already there. "level" doesn't
make sense within the remote execution context, where the remote
functions have a stack which is different from the stack on the
driver node that ends in the modin eval call.
Bump level up one because eval/query are wrapped in the Modin
logging function wrapper.
"""
level = kwargs.pop("level", 0) + 1
return level, kwargs and then use it here like def query(self, expr, inplace=False, **kwargs):
# docstring
level, kwargs = _extract_eval_level(kwargs)
# do the rest of the stuff |
||||||
self._update_var_dicts_in_kwargs(expr, level + 1, kwargs) | ||||||
self._validate_eval_query(expr, **kwargs) | ||||||
inplace = validate_bool_kwarg(inplace, "inplace") | ||||||
# Make sure to not pass level to query compiler. | ||||||
new_query_compiler = self._query_compiler.query(expr, **kwargs) | ||||||
return self._create_or_update_from_compiler(new_query_compiler, inplace) | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -285,12 +285,29 @@ def test_eval_df_arithmetic_subexpression(): | |
|
||
@pytest.mark.parametrize("method", ["query", "eval"]) | ||
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) | ||
@pytest.mark.parametrize("local_var", [2]) | ||
def test_eval_and_query_with_local_and_global_var(method, data, local_var): | ||
@pytest.mark.parametrize("level", [0, 1]) | ||
def test_eval_and_query_with_local_and_global_var(method, data, level): | ||
# NOTE: if this test is failing because pandas or modin can't find | ||
# the local variable and you are investigating why, it's worth | ||
# noting that Modin's `query` and `eval` functions bump scope `level` | ||
# up by 1 to reflect the Modin logging function wrapper. | ||
local_var = 1 # noqa: F841 eval and query can reach back in the call | ||
# stack to access this variable | ||
|
||
def df_method_with_local_var(df, method: str, *args, **kwargs): | ||
# Give this level of the stack a different value of local_var, so that | ||
# passing `level=0` uses value 0, and `level=1` uses value 1. | ||
local_var = 0 # noqa: F841 eval and query can reach back in the call | ||
# stack to access this variable | ||
return getattr(df, method)(*args, **kwargs) | ||
|
||
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) | ||
op = "+" if method == "eval" else "<" | ||
for expr in (f"col1 {op} @local_var", f"col1 {op} @TEST_VAR"): | ||
df_equals(getattr(modin_df, method)(expr), getattr(pandas_df, method)(expr)) | ||
df_equals( | ||
df_method_with_local_var(modin_df, method, expr, level=level), | ||
df_method_with_local_var(pandas_df, method, expr, level=level), | ||
Comment on lines
+308
to
+309
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should also test the case of omitting the |
||
) | ||
|
||
|
||
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: might as well add type annotations for
expr
andkwargs
here.