From 42d36970f71044264832c404e303fcada9323807 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 9 Apr 2024 10:32:15 +0200 Subject: [PATCH] fix: Turn off cse if cache node found (#15554) --- .../src/logical_plan/optimizer/mod.rs | 33 ++++++++++--------- py-polars/polars/lazyframe/frame.py | 6 +++- py-polars/tests/unit/test_cse.py | 9 +++++ 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/crates/polars-plan/src/logical_plan/optimizer/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/mod.rs index afd3fc79c76b..d89deb6a195b 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/mod.rs @@ -113,21 +113,24 @@ pub fn optimize( } #[cfg(feature = "cse")] - let _cse_plan_changed = - if comm_subplan_elim && members.has_joins_or_unions && members.has_duplicate_scans() { - if verbose { - eprintln!("found multiple sources; run comm_subplan_elim") - } - let (lp, changed, cid2c) = cse::elim_cmn_subplans(lp_top, lp_arena, expr_arena); - - prune_unused_caches(lp_arena, cid2c); - - lp_top = lp; - members.has_cache |= changed; - changed - } else { - false - }; + let _cse_plan_changed = if comm_subplan_elim + && members.has_joins_or_unions + && members.has_duplicate_scans() + && !members.has_cache + { + if verbose { + eprintln!("found multiple sources; run comm_subplan_elim") + } + let (lp, changed, cid2c) = cse::elim_cmn_subplans(lp_top, lp_arena, expr_arena); + + prune_unused_caches(lp_arena, cid2c); + + lp_top = lp; + members.has_cache |= changed; + changed + } else { + false + }; #[cfg(not(feature = "cse"))] let _cse_plan_changed = false; diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 8fc7ce2712a6..b420aaed80a2 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2371,7 +2371,11 @@ def lazy(self) -> Self: return self def cache(self) -> Self: - """Cache the result once the execution of the physical plan hits this node.""" + """ + Cache the result once the execution of the physical plan hits this node. + + It is not recommended using this as the optimizer likely can do a better job. + """ return self._from_pyldf(self._ldf.cache()) def cast( diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index 4d01f5ec674e..cec9c394009b 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -658,3 +658,12 @@ def test_cse_15536() -> None: data.filter(pl.lit(True) & (pl.col("a") == 7) | (pl.col("a") == 8)), ] ).collect()["a"].to_list() == [6, 9, 7, 8] + + +def test_cse_15548() -> None: + ldf = pl.LazyFrame({"a": [1, 2, 3]}) + ldf2 = ldf.filter(pl.col("a") == 1).cache() + ldf3 = pl.concat([ldf, ldf2]) + + assert len(ldf3.collect(comm_subplan_elim=False)) == 4 + assert len(ldf3.collect(comm_subplan_elim=True)) == 4