Skip to content

Commit

Permalink
fix(python): fix panic when doing a scan_parquet with hive partioning (
Browse files Browse the repository at this point in the history
…#15381)

Co-authored-by: ritchie <[email protected]>
  • Loading branch information
kszlim and ritchie46 authored Mar 29, 2024
1 parent f61594a commit 31bb26f
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ pub(super) fn set_cache_states(
lp_arena: &mut Arena<ALogicalPlan>,
expr_arena: &mut Arena<AExpr>,
scratch: &mut Vec<Node>,
hive_partition_eval: HiveEval<'_>,
verbose: bool,
) -> PolarsResult<()> {
let mut stack = Vec::with_capacity(4);
Expand Down Expand Up @@ -262,7 +263,7 @@ pub(super) fn set_cache_states(
// back to the cache node again
if !cache_schema_and_children.is_empty() {
let mut proj_pd = ProjectionPushDown::new();
let pred_pd = PredicatePushDown::new(Default::default()).block_at_cache(false);
let pred_pd = PredicatePushDown::new(hive_partition_eval).block_at_cache(false);
for (_cache_id, v) in cache_schema_and_children {
// # CHECK IF WE NEED TO REMOVE CACHES
// If we encounter multiple predicates we remove the cache nodes completely as we don't
Expand Down
9 changes: 8 additions & 1 deletion crates/polars-plan/src/logical_plan/optimizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,14 @@ pub fn optimize(
lp_top = opt.optimize_loop(&mut rules, expr_arena, lp_arena, lp_top)?;

if members.has_joins_or_unions && members.has_cache {
cache_states::set_cache_states(lp_top, lp_arena, expr_arena, scratch, verbose)?;
cache_states::set_cache_states(
lp_top,
lp_arena,
expr_arena,
scratch,
hive_partition_eval,
verbose,
)?;
}

// This one should run (nearly) last as this modifies the projections
Expand Down
6 changes: 6 additions & 0 deletions py-polars/tests/unit/io/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ def test_hive_partitioned_predicate_pushdown_skips_correct_number_of_files(
assert q.filter(pl.col("a").is_in([1, 4])).collect().shape == (2, 2)
assert "hive partitioning: skipped 3 files" in capfd.readouterr().err

# Ensure the CSE can work with hive partitions.
q = q.filter(pl.col("a").gt(2))
assert q.join(q, on="a", how="left").collect(comm_subplan_elim=True).to_dict(
as_series=False
) == {"d": [3, 4], "a": [3, 4], "d_right": [3, 4]}


@pytest.mark.skip(
reason="Broken by pyarrow 15 release: https://github.com/pola-rs/polars/issues/13892"
Expand Down

0 comments on commit 31bb26f

Please sign in to comment.