Skip to content

Commit

Permalink
[Fix](orc-reader) Fix filling partition or missing column used incorr…
Browse files Browse the repository at this point in the history
…ect row count. (#23096)

[Fix](orc-reader) Fix incorrect row count used when filling partition or missing columns.

`_row_reader->nextBatch` returns the number of rows read. When ORC lazy materialization is turned on, that number includes filtered rows, so the caller must look at `numElements` in the row batch to determine how
many rows were not filtered and therefore need to be filled into the block.

In this case, filling the partition or missing columns used an incorrect row count, which caused a BE crash due to `filter.size() != offsets.size()` in the filter column step.

When ORC lazy materialization is turned off, this change also adds a call to `_convert_dict_cols_to_string_cols(block, nullptr)` for the case where `block->rows() == 0`.
  • Loading branch information
kaka11chen authored Aug 17, 2023
1 parent 1f19d0d commit 314f5a5
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 5 deletions.
14 changes: 9 additions & 5 deletions be/src/vec/exec/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1413,8 +1413,10 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
}
*read_rows = rr;

RETURN_IF_ERROR(_fill_partition_columns(block, rr, _lazy_read_ctx.partition_columns));
RETURN_IF_ERROR(_fill_missing_columns(block, rr, _lazy_read_ctx.missing_columns));
RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
_lazy_read_ctx.partition_columns));
RETURN_IF_ERROR(
_fill_missing_columns(block, _batch->numElements, _lazy_read_ctx.missing_columns));

if (block->rows() == 0) {
*eof = true;
Expand Down Expand Up @@ -1487,16 +1489,18 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
}
*read_rows = rr;

RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
_lazy_read_ctx.partition_columns));
RETURN_IF_ERROR(
_fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns));
RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns));
_fill_missing_columns(block, _batch->numElements, _lazy_read_ctx.missing_columns));

if (block->rows() == 0) {
_convert_dict_cols_to_string_cols(block, nullptr);
*eof = true;
return Status::OK();
}

_build_delete_row_filter(block, rr);
_build_delete_row_filter(block, _batch->numElements);

std::vector<uint32_t> columns_to_filter;
int column_to_keep = block->columns();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,31 @@
-- !q06 --
2023-01-03T00:00 100 0.3 test3

-- !q07 --
1994 50063846 1820677
1995 58220229 1820677
1995 66859335 1820677
1997 77350500 1820677
1995 98899109 1820677
1996 122310373 1820677
1996 138664326 1820677
1995 145803300 1820677
1998 187514084 1820677
1994 197627203 1820677
1993 216217095 1820677
1997 260737890 1820677
1998 279581856 1820677
1992 296560224 1820677
1993 306190854 1820677
1997 329189126 1820677
1992 389043491 1820677
1997 435247522 1820677
1998 449388167 1820677
1994 526241665 1820677
1998 533034534 1820677
1996 576018657 1820677
1997 582732039 1820677

-- !q01 --
0.1 test1 2023-01-01T00:00 \N
0.2 test2 2023-01-02T00:00 \N
Expand All @@ -47,6 +72,31 @@
-- !q06 --
2023-01-03T00:00 100 0.3 test3

-- !q07 --
1994 50063846 1820677
1995 58220229 1820677
1995 66859335 1820677
1997 77350500 1820677
1995 98899109 1820677
1996 122310373 1820677
1996 138664326 1820677
1995 145803300 1820677
1998 187514084 1820677
1994 197627203 1820677
1993 216217095 1820677
1997 260737890 1820677
1998 279581856 1820677
1992 296560224 1820677
1993 306190854 1820677
1997 329189126 1820677
1992 389043491 1820677
1997 435247522 1820677
1998 449388167 1820677
1994 526241665 1820677
1998 533034534 1820677
1996 576018657 1820677
1997 582732039 1820677

-- !q01 --
0.1 test1 2023-01-01T00:00 \N
0.2 test2 2023-01-02T00:00 \N
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ suite("test_external_catalog_hive_partition", "p2,external,hive,external_remote,
qt_q04 """ select * from multi_catalog.parquet_partitioned_columns order by t_float """
qt_q05 """ select * from multi_catalog.parquet_partitioned_columns where t_int is null order by t_float """
qt_q06 """ select * from multi_catalog.parquet_partitioned_columns where t_int is not null order by t_float """
qt_q07 """ select o_orderyear, o_orderkey, o_custkey from multi_catalog.orders_par_parquet where o_custkey=1820677 order by o_orderkey """
}
// test orc format
def q01_orc = {
Expand All @@ -48,6 +49,7 @@ suite("test_external_catalog_hive_partition", "p2,external,hive,external_remote,
qt_q04 """ select * from multi_catalog.orc_partitioned_columns order by t_float """
qt_q05 """ select * from multi_catalog.orc_partitioned_columns where t_int is null order by t_float """
qt_q06 """ select * from multi_catalog.orc_partitioned_columns where t_int is not null order by t_float """
qt_q07 """ select o_orderyear, o_orderkey, o_custkey from multi_catalog.orders_par_orc where o_custkey=1820677 order by o_orderkey """
}
// test text format
def q01_text = {
Expand Down

0 comments on commit 314f5a5

Please sign in to comment.