Skip to content

Commit

Permalink
[Configurations](multi-catalog) Add `enable_parquet_filter_by_min_max` and `enable_orc_filter_by_min_max` session variables. (apache#35290)
Browse files Browse the repository at this point in the history

backport apache#35012 apache#35320
  • Loading branch information
kaka11chen authored and weixingyu12 committed Jun 11, 2024
1 parent a4705e6 commit 34fa53f
Show file tree
Hide file tree
Showing 8 changed files with 329 additions and 6 deletions.
5 changes: 4 additions & 1 deletion be/src/vec/exec/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state,
_is_hive(params.__isset.slot_name_to_schema_pos),
_io_ctx(io_ctx),
_enable_lazy_mat(enable_lazy_mat),
_enable_filter_by_min_max(
state == nullptr ? true : state->query_options().enable_orc_filter_by_min_max),
_dict_cols_has_converted(false),
_unsupported_pushdown_types(unsupported_pushdown_types) {
TimezoneUtils::find_cctz_time_zone(ctz, _time_zone);
Expand All @@ -171,6 +173,7 @@ OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& r
_file_system(nullptr),
_io_ctx(io_ctx),
_enable_lazy_mat(enable_lazy_mat),
_enable_filter_by_min_max(true),
_dict_cols_has_converted(false) {
_init_system_properties();
_init_file_description();
Expand Down Expand Up @@ -648,7 +651,7 @@ bool static build_search_argument(std::vector<OrcPredicate>& predicates, int ind

bool OrcReader::_init_search_argument(
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range) {
if (colname_to_value_range->empty()) {
if ((!_enable_filter_by_min_max) || colname_to_value_range->empty()) {
return false;
}
std::vector<OrcPredicate> predicates;
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/exec/format/orc/vorc_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ class OrcReader : public GenericReader {

io::IOContext* _io_ctx;
bool _enable_lazy_mat = true;
bool _enable_filter_by_min_max = true;

std::vector<DecimalScaleParams> _decimal_scale_params;
size_t _decimal_scale_params_index;
Expand Down
18 changes: 13 additions & 5 deletions be/src/vec/exec/format/parquet/vparquet_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams
_io_ctx(io_ctx),
_state(state),
_meta_cache(meta_cache),
_enable_lazy_mat(enable_lazy_mat) {
_enable_lazy_mat(enable_lazy_mat),
_enable_filter_by_min_max(
state == nullptr ? true
: state->query_options().enable_parquet_filter_by_min_max) {
_init_profile();
_init_system_properties();
_init_file_description();
Expand All @@ -94,7 +97,10 @@ ParquetReader::ParquetReader(const TFileScanRangeParams& params, const TFileRang
_scan_range(range),
_io_ctx(io_ctx),
_state(state),
_enable_lazy_mat(enable_lazy_mat) {
_enable_lazy_mat(enable_lazy_mat),
_enable_filter_by_min_max(
state == nullptr ? true
: state->query_options().enable_parquet_filter_by_min_max) {
_init_system_properties();
_init_file_description();
}
Expand Down Expand Up @@ -757,8 +763,9 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group,
_statistics.read_rows += row_group.num_rows;
};

if (_lazy_read_ctx.has_complex_type || _lazy_read_ctx.conjuncts.empty() ||
_colname_to_value_range == nullptr || _colname_to_value_range->empty()) {
if ((!_enable_filter_by_min_max) || _lazy_read_ctx.has_complex_type ||
_lazy_read_ctx.conjuncts.empty() || _colname_to_value_range == nullptr ||
_colname_to_value_range->empty()) {
read_whole_row_group();
return Status::OK();
}
Expand Down Expand Up @@ -867,7 +874,8 @@ Status ParquetReader::_process_row_group_filter(const tparquet::RowGroup& row_gr

Status ParquetReader::_process_column_stat_filter(const std::vector<tparquet::ColumnChunk>& columns,
bool* filter_group) {
if (_colname_to_value_range == nullptr || _colname_to_value_range->empty()) {
if ((!_enable_filter_by_min_max) || _colname_to_value_range == nullptr ||
_colname_to_value_range->empty()) {
return Status::OK();
}
auto& schema_desc = _file_metadata->schema();
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/exec/format/parquet/vparquet_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ class ParquetReader : public GenericReader {
// Maybe null if not used
FileMetaCache* _meta_cache = nullptr;
bool _enable_lazy_mat = true;
bool _enable_filter_by_min_max = true;
const TupleDescriptor* _tuple_descriptor;
const RowDescriptor* _row_descriptor;
const std::unordered_map<std::string, int>* _colname_to_slot_id;
Expand Down
39 changes: 39 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,10 @@ public class SessionVariable implements Serializable, Writable {

public static final String ENABLE_ORC_LAZY_MAT = "enable_orc_lazy_materialization";

public static final String ENABLE_PARQUET_FILTER_BY_MIN_MAX = "enable_parquet_filter_by_min_max";

public static final String ENABLE_ORC_FILTER_BY_MIN_MAX = "enable_orc_filter_by_min_max";

public static final String INLINE_CTE_REFERENCED_THRESHOLD = "inline_cte_referenced_threshold";

public static final String ENABLE_CTE_MATERIALIZE = "enable_cte_materialize";
Expand Down Expand Up @@ -1209,6 +1213,24 @@ public void setEnableLeftZigZag(boolean enableLeftZigZag) {
needForward = true)
public boolean enableOrcLazyMat = true;


@VariableMgr.VarAttr(
name = ENABLE_PARQUET_FILTER_BY_MIN_MAX,
description = {"控制 parquet reader 是否启用 min-max 值过滤。默认为 true。",
"Controls whether to filter by min-max values in parquet reader. "
+ "The default value is true."},
needForward = true)
public boolean enableParquetFilterByMinMax = true;


@VariableMgr.VarAttr(
name = ENABLE_ORC_FILTER_BY_MIN_MAX,
description = {"控制 orc reader 是否启用 min-max 值过滤。默认为 true。",
"Controls whether to filter by min-max values in orc reader. "
+ "The default value is true."},
needForward = true)
public boolean enableOrcFilterByMinMax = true;

@VariableMgr.VarAttr(
name = EXTERNAL_TABLE_ANALYZE_PART_NUM,
description = {"收集外表统计信息行数时选取的采样分区数,默认-1表示全部分区",
Expand Down Expand Up @@ -2149,6 +2171,21 @@ public void setEnableOrcLazyMat(boolean enableOrcLazyMat) {
this.enableOrcLazyMat = enableOrcLazyMat;
}

/**
 * @return whether the parquet reader may prune row groups/pages by min-max statistics.
 */
public boolean isEnableParquetFilterByMinMax() {
    return this.enableParquetFilterByMinMax;
}

/**
 * Enables or disables min-max statistics filtering for the parquet reader.
 *
 * @param enableParquetFilterByMinMax new value for the session flag
 */
public void setEnableParquetFilterByMinMax(boolean enableParquetFilterByMinMax) {
    this.enableParquetFilterByMinMax = enableParquetFilterByMinMax;
}

/**
 * @return whether the orc reader may prune stripes by min-max statistics.
 */
public boolean isEnableOrcFilterByMinMax() {
    return this.enableOrcFilterByMinMax;
}

/**
 * Enables or disables min-max statistics filtering for the orc reader.
 *
 * @param enableOrcFilterByMinMax new value for the session flag
 */
public void setEnableOrcFilterByMinMax(boolean enableOrcFilterByMinMax) {
    this.enableOrcFilterByMinMax = enableOrcFilterByMinMax;
}

/**
* getInsertVisibleTimeoutMs.
Expand Down Expand Up @@ -2588,6 +2625,8 @@ public TQueryOptions toThrift() {

tResult.setEnableParquetLazyMat(enableParquetLazyMat);
tResult.setEnableOrcLazyMat(enableOrcLazyMat);
tResult.setEnableParquetFilterByMinMax(enableParquetFilterByMinMax);
tResult.setEnableOrcFilterByMinMax(enableOrcFilterByMinMax);

tResult.setTruncateCharOrVarcharColumns(truncateCharOrVarcharColumns);

Expand Down
3 changes: 3 additions & 0 deletions gensrc/thrift/PaloInternalService.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,9 @@ struct TQueryOptions {
90: optional bool enable_inverted_index_compound_inlist = false;
// For emergency use, skip missing version when reading rowsets
91: optional bool skip_missing_version = false;

92: optional bool enable_parquet_filter_by_min_max = true
93: optional bool enable_orc_filter_by_min_max = true
}


Expand Down
262 changes: 262 additions & 0 deletions regression-test/data/external_table_p0/hive/test_hive_basic_type.out

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,17 @@
suite("test_hive_basic_type", "p0,external,hive,external_docker,external_docker_hive") {
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled != null && enabled.equalsIgnoreCase("true")) {
for (boolean enable_filter_by_min_max : [true, false]) {
String catalog_name = "test_hive_basic_type"
String ex_db_name = "`default`"
String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
String hms_port = context.config.otherConfigs.get("hms_port")
String hdfs_port = context.config.otherConfigs.get("hdfs_port")

sql """set enable_parquet_filter_by_min_max = ${enable_filter_by_min_max};"""

sql """set enable_orc_filter_by_min_max = ${enable_filter_by_min_max};"""

sql """drop catalog if exists ${catalog_name} """

sql """CREATE CATALOG ${catalog_name} PROPERTIES (
Expand Down Expand Up @@ -144,6 +149,7 @@ suite("test_hive_basic_type", "p0,external,hive,external_docker,external_docker_
}
}
//sql """drop catalog if exists ${catalog_name} """
}
}
}

0 comments on commit 34fa53f

Please sign in to comment.