From 314f5a514351e359cfb36b8bdf703c1082d8d936 Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Thu, 17 Aug 2023 23:26:11 +0800 Subject: [PATCH] [Fix](orc-reader) Fix filling partition or missing column used incorrect row count. (#23096) [Fix](orc-reader) Fix filling partition or missing column used incorrect row count. `_row_reader->nextBatch` returns the number of rows read. When ORC lazy materialization is turned on, that number includes filtered rows, so the caller must look at `numElements` in the row batch to determine how many rows were not filtered and will therefore be filled into the block. In this case, filling partition or missing columns used an incorrect row count, which causes the BE to crash on `filter.size() != offsets.size()` in the filter column step. When ORC lazy materialization is turned off, add `_convert_dict_cols_to_string_cols(block, nullptr)` if `(block->rows() == 0)`. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 14 ++++-- .../test_external_catalog_hive_partition.out | 50 +++++++++++++++++++ ...est_external_catalog_hive_partition.groovy | 2 + 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 130d06bea825e9..095f3f85b65003 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1413,8 +1413,10 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { } *read_rows = rr; - RETURN_IF_ERROR(_fill_partition_columns(block, rr, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, rr, _lazy_read_ctx.missing_columns)); + RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements, + _lazy_read_ctx.partition_columns)); + RETURN_IF_ERROR( + _fill_missing_columns(block, _batch->numElements, _lazy_read_ctx.missing_columns)); if (block->rows() == 0) { *eof = true; @@ -1487,16 +1489,18 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, 
bool* eof) { } *read_rows = rr; + RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements, + _lazy_read_ctx.partition_columns)); RETURN_IF_ERROR( - _fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns)); + _fill_missing_columns(block, _batch->numElements, _lazy_read_ctx.missing_columns)); if (block->rows() == 0) { + _convert_dict_cols_to_string_cols(block, nullptr); *eof = true; return Status::OK(); } - _build_delete_row_filter(block, rr); + _build_delete_row_filter(block, _batch->numElements); std::vector columns_to_filter; int column_to_keep = block->columns(); diff --git a/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out b/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out index 5608999eb55cb6..c823189e6885f8 100644 --- a/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out +++ b/regression-test/data/external_table_p2/hive/test_external_catalog_hive_partition.out @@ -23,6 +23,31 @@ -- !q06 -- 2023-01-03T00:00 100 0.3 test3 +-- !q07 -- +1994 50063846 1820677 +1995 58220229 1820677 +1995 66859335 1820677 +1997 77350500 1820677 +1995 98899109 1820677 +1996 122310373 1820677 +1996 138664326 1820677 +1995 145803300 1820677 +1998 187514084 1820677 +1994 197627203 1820677 +1993 216217095 1820677 +1997 260737890 1820677 +1998 279581856 1820677 +1992 296560224 1820677 +1993 306190854 1820677 +1997 329189126 1820677 +1992 389043491 1820677 +1997 435247522 1820677 +1998 449388167 1820677 +1994 526241665 1820677 +1998 533034534 1820677 +1996 576018657 1820677 +1997 582732039 1820677 + -- !q01 -- 0.1 test1 2023-01-01T00:00 \N 0.2 test2 2023-01-02T00:00 \N @@ -47,6 +72,31 @@ -- !q06 -- 2023-01-03T00:00 100 0.3 test3 +-- !q07 -- +1994 50063846 1820677 +1995 58220229 1820677 +1995 66859335 1820677 +1997 77350500 1820677 +1995 98899109 1820677 +1996 
122310373 1820677 +1996 138664326 1820677 +1995 145803300 1820677 +1998 187514084 1820677 +1994 197627203 1820677 +1993 216217095 1820677 +1997 260737890 1820677 +1998 279581856 1820677 +1992 296560224 1820677 +1993 306190854 1820677 +1997 329189126 1820677 +1992 389043491 1820677 +1997 435247522 1820677 +1998 449388167 1820677 +1994 526241665 1820677 +1998 533034534 1820677 +1996 576018657 1820677 +1997 582732039 1820677 + -- !q01 -- 0.1 test1 2023-01-01T00:00 \N 0.2 test2 2023-01-02T00:00 \N diff --git a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy index 44f4353ffc63ae..196625b3b49972 100644 --- a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy +++ b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive_partition.groovy @@ -39,6 +39,7 @@ suite("test_external_catalog_hive_partition", "p2,external,hive,external_remote, qt_q04 """ select * from multi_catalog.parquet_partitioned_columns order by t_float """ qt_q05 """ select * from multi_catalog.parquet_partitioned_columns where t_int is null order by t_float """ qt_q06 """ select * from multi_catalog.parquet_partitioned_columns where t_int is not null order by t_float """ + qt_q07 """ select o_orderyear, o_orderkey, o_custkey from multi_catalog.orders_par_parquet where o_custkey=1820677 order by o_orderkey """ } // test orc format def q01_orc = { @@ -48,6 +49,7 @@ suite("test_external_catalog_hive_partition", "p2,external,hive,external_remote, qt_q04 """ select * from multi_catalog.orc_partitioned_columns order by t_float """ qt_q05 """ select * from multi_catalog.orc_partitioned_columns where t_int is null order by t_float """ qt_q06 """ select * from multi_catalog.orc_partitioned_columns where t_int is not null order by t_float """ + qt_q07 """ select o_orderyear, o_orderkey, o_custkey from multi_catalog.orders_par_orc 
where o_custkey=1820677 order by o_orderkey """ } // test text format def q01_text = {