From e92dc19f7f6823b9d6d0cb622aa8de6309c0d515 Mon Sep 17 00:00:00 2001 From: wuwenchi Date: Thu, 21 Dec 2023 19:03:13 +0800 Subject: [PATCH 1/6] [fix](hive) add support for quoteChar and seperatorChar for hive (branch-2.0) (#28703)#28703 bp #28613 --- .../scripts/create_preinstalled_table.hql | 8 +++++ .../doris/planner/external/HiveScanNode.java | 17 ++++++++- .../hive/test_hive_serde_prop.out | 4 +++ .../hive/test_hive_serde_prop.groovy | 36 +++++++++++++++++++ 4 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 regression-test/data/external_table_p0/hive/test_hive_serde_prop.out create mode 100644 regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql index 4e80d7466d2f91..e798ecd7f2bb9c 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql @@ -1798,3 +1798,11 @@ create table stats_test2 (id INT, value STRING) STORED AS PARQUET; insert into stats_test1 values (1, 'name1'), (2, 'name2'), (3, 'name3'); INSERT INTO stats_test2 VALUES (1, ';'), (2, '\*'); + +create table employee_gz(name string,salary string) +row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +with serdeproperties +('quoteChar'='\"' +,'seperatorChar'=','); + +insert into employee_gz values ('a', '1.1'), ('b', '2.2'); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java index 943d30017e7c2a..58b93112477b40 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java @@ -76,6 +76,9 @@ public class HiveScanNode extends FileQueryScanNode { public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01" public static final String PROP_LINE_DELIMITER = "line.delim"; public static final String DEFAULT_LINE_DELIMITER = "\n"; + public static final String PROP_SEPERATOR_CHAR = "seperatorChar"; + public static final String PROP_QUOTA_CHAR = "quoteChar"; + public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim"; public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim"; @@ -362,7 +365,16 @@ protected Map getLocationProperties() throws UserException { protected TFileAttributes getFileAttributes() throws UserException { TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); java.util.Map delimiter = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters(); - textParams.setColumnSeparator(delimiter.getOrDefault(PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER)); + if (delimiter.containsKey(PROP_FIELD_DELIMITER)) { + textParams.setColumnSeparator(delimiter.get(PROP_FIELD_DELIMITER)); + } else if (delimiter.containsKey(PROP_SEPERATOR_CHAR)) { + textParams.setColumnSeparator(delimiter.get(PROP_SEPERATOR_CHAR)); + } else { + textParams.setColumnSeparator(DEFAULT_FIELD_DELIMITER); + } + if (delimiter.containsKey(PROP_QUOTA_CHAR)) { + textParams.setEnclose(delimiter.get(PROP_QUOTA_CHAR).getBytes()[0]); + } textParams.setLineDelimiter(delimiter.getOrDefault(PROP_LINE_DELIMITER, DEFAULT_LINE_DELIMITER)); textParams.setMapkvDelimiter(delimiter.getOrDefault(PROP_MAP_KV_DELIMITER, DEFAULT_MAP_KV_DELIMITER)); @@ -377,6 +389,9 @@ protected TFileAttributes getFileAttributes() throws UserException { TFileAttributes fileAttributes = new TFileAttributes(); fileAttributes.setTextParams(textParams); fileAttributes.setHeaderType(""); + if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) { + fileAttributes.setTrimDoubleQuotes(true); + } return fileAttributes; } diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out new file mode 100644 index 00000000000000..1cde2baec27924 --- /dev/null +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -0,0 +1,4 @@ +test_hive_serde_prop.out -- This file is automatically generated. You should know what you did if you want to edit this +-- !1 -- +a 1.1 +b 2.2 diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy new file mode 100644 index 00000000000000..41d7056d20832b --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,external") { + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String catalog_name = "test_hive_serde_prop" + String ex_db_name = "`stats_test`" + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String hms_port = context.config.otherConfigs.get("hms_port") + + sql """drop catalog if exists ${catalog_name} """ + + sql """CREATE CATALOG ${catalog_name} PROPERTIES ( + 'type'='hms', + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}', + 'hadoop.username' = 'hive' + );""" + + qt_1 """select * from ${catalog_name}.${ex_db_name}.employee_gz order by name;""" + } +} From 30934ff0c9eb2965a5891463801a288e87b5b7af Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 21 Dec 2023 20:15:24 +0800 Subject: [PATCH 2/6] [branch-2.0-fix](partial update) Don't use delete bitmap to mark delete for rows with delete sign when sequence column doesn't exist (#28800) --- .../olap/rowset/segment_v2/segment_writer.cpp | 30 ---- be/src/olap/tablet.cpp | 8 - be/src/olap/tablet_meta.h | 1 - .../test_partial_update_delete_sign.out | 2 - .../test_delete_sign_delete_bitmap.out | 54 ------- .../test_delete_sign_delete_bitmap.groovy | 140 +++++++++--------- 6 files changed, 70 insertions(+), 165 deletions(-) delete mode 100644 regression-test/data/unique_with_mow_p0/test_delete_sign_delete_bitmap.out diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 02ea943818d812..29ec7e02bc5652 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -444,13 +444,6 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* // mark key with delete sign as deleted. bool have_delete_sign = (delete_sign_column_data != nullptr && delete_sign_column_data[block_pos] != 0); - if (have_delete_sign && !_tablet_schema->has_sequence_col() && !have_input_seq_column) { - // we can directly use delete bitmap to mark the rows with delete sign as deleted - // if sequence column doesn't exist to eliminate reading delete sign columns in later reads - _mow_context->delete_bitmap->add({_opts.rowset_ctx->rowset_id, _segment_id, - DeleteBitmap::TEMP_VERSION_FOR_DELETE_SIGN}, - segment_pos); - } RowLocation loc; // save rowset shared ptr so this rowset wouldn't delete @@ -699,29 +692,6 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po _serialize_block_to_row_column(*const_cast(block)); } - if (_opts.write_type == DataWriteType::TYPE_DIRECT && _opts.enable_unique_key_merge_on_write && - !_tablet_schema->has_sequence_col() && _tablet_schema->delete_sign_idx() != -1) { - const vectorized::ColumnWithTypeAndName& delete_sign_column = - block->get_by_position(_tablet_schema->delete_sign_idx()); - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column.column)); - if (delete_sign_col.size() >= row_pos + num_rows) { - const vectorized::Int8* delete_sign_column_data = delete_sign_col.get_data().data(); - uint32_t segment_start_pos = - _column_writers[_tablet_schema->delete_sign_idx()]->get_next_rowid(); - for (size_t block_pos = row_pos, seg_pos = segment_start_pos; - seg_pos < segment_start_pos + num_rows; block_pos++, seg_pos++) { - // we can directly use delete bitmap to mark the rows with delete sign as deleted - // if sequence column doesn't exist to eliminate reading delete sign columns in later reads - if (delete_sign_column_data[block_pos]) { - _mow_context->delete_bitmap->add({_opts.rowset_ctx->rowset_id, _segment_id, - DeleteBitmap::TEMP_VERSION_FOR_DELETE_SIGN}, - seg_pos); - } - } - } - } - _olap_data_convertor->set_source_content(block, row_pos, num_rows); // find all row pos for short key indexes diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 4c1ebd721de826..611e32828bba25 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -3075,14 +3075,6 @@ Status Tablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, continue; } if (is_partial_update && rowset_writer != nullptr) { - if (delete_bitmap->contains( - {rowset_id, seg->id(), DeleteBitmap::TEMP_VERSION_FOR_DELETE_SIGN}, - row_id)) { - LOG(INFO) - << "DEBUG: skip a delete sign column while calc_segment_delete_bitmap " - << "processing confict for partial update"; - continue; - } // In publish version, record rows to be deleted for concurrent update // For example, if version 5 and 6 update a row, but version 6 only see // version 4 when write, and when publish version, version 5's value will diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 11dc3532514c8b..0ef058760eae43 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -343,7 +343,6 @@ class DeleteBitmap { // tablet's delete bitmap we can use arbitary version number in BitmapKey. Here we define some version numbers // for specific usage during this periods to avoid conflicts constexpr static inline uint64_t TEMP_VERSION_COMMON = 0; - constexpr static inline uint64_t TEMP_VERSION_FOR_DELETE_SIGN = 1; /** * diff --git a/regression-test/data/unique_with_mow_p0/partial_update/test_partial_update_delete_sign.out b/regression-test/data/unique_with_mow_p0/partial_update/test_partial_update_delete_sign.out index f14434b2f9f88c..8d3e69bbe26ac9 100644 --- a/regression-test/data/unique_with_mow_p0/partial_update/test_partial_update_delete_sign.out +++ b/regression-test/data/unique_with_mow_p0/partial_update/test_partial_update_delete_sign.out @@ -27,7 +27,6 @@ -- !2 -- -- !3 -- -1 2 \N -- !1 -- 1 1 1 1 @@ -65,7 +64,6 @@ -- !2 -- -- !3 -- -1 2 \N -- !1 -- 1 1 1 1 diff --git a/regression-test/data/unique_with_mow_p0/test_delete_sign_delete_bitmap.out b/regression-test/data/unique_with_mow_p0/test_delete_sign_delete_bitmap.out deleted file mode 100644 index 687aeab54a1009..00000000000000 --- a/regression-test/data/unique_with_mow_p0/test_delete_sign_delete_bitmap.out +++ /dev/null @@ -1,54 +0,0 @@ --- This file is automatically generated. You should know what you did if you want to edit this --- !sql -- -1 1 1 1 1 -2 2 2 2 2 -3 3 3 3 3 -4 4 4 4 4 -5 5 5 5 5 - --- !after_delete -- -2 2 2 2 2 -4 4 4 4 4 - --- !1 -- -1 1 1 1 1 0 -1 1 1 1 1 1 -2 2 2 2 2 0 -3 3 3 3 3 0 -3 3 3 3 3 1 -4 4 4 4 4 0 -5 5 5 5 5 0 -5 5 5 5 5 1 - --- !2 -- -2 2 2 2 2 0 -4 4 4 4 4 0 - --- !sql -- -1 1 1 1 1 -2 2 2 2 2 -3 3 3 3 3 -4 4 4 4 4 -5 5 5 5 5 - --- !after_delete -- -2 2 2 2 2 -4 4 4 4 4 - --- !1 -- -1 1 1 1 1 0 -1 1 1 1 1 1 -2 2 2 2 2 0 -3 3 3 3 3 0 -3 3 3 3 3 1 -4 4 4 4 4 0 -5 5 5 5 5 0 -5 5 5 5 5 1 - --- !2 -- -1 1 1 1 1 1 -2 2 2 2 2 0 -3 3 3 3 3 1 -4 4 4 4 4 0 -5 5 5 5 5 1 - diff --git a/regression-test/suites/unique_with_mow_p0/test_delete_sign_delete_bitmap.groovy b/regression-test/suites/unique_with_mow_p0/test_delete_sign_delete_bitmap.groovy index f9b89c1eea7a27..be6324d2ec76a6 100644 --- a/regression-test/suites/unique_with_mow_p0/test_delete_sign_delete_bitmap.groovy +++ b/regression-test/suites/unique_with_mow_p0/test_delete_sign_delete_bitmap.groovy @@ -17,80 +17,80 @@ suite('test_delete_sign_delete_bitmap') { - def tableName1 = "test_delete_sign_delete_bitmap1" - sql "DROP TABLE IF EXISTS ${tableName1};" - sql """ CREATE TABLE IF NOT EXISTS ${tableName1} ( - `k1` int NOT NULL, - `c1` int, - `c2` int, - `c3` int, - `c4` int - )UNIQUE KEY(k1) - DISTRIBUTED BY HASH(k1) BUCKETS 1 - PROPERTIES ( - "enable_unique_key_merge_on_write" = "true", - "disable_auto_compaction" = "true", - "replication_num" = "1" - );""" + // def tableName1 = "test_delete_sign_delete_bitmap1" + // sql "DROP TABLE IF EXISTS ${tableName1};" + // sql """ CREATE TABLE IF NOT EXISTS ${tableName1} ( + // `k1` int NOT NULL, + // `c1` int, + // `c2` int, + // `c3` int, + // `c4` int + // )UNIQUE KEY(k1) + // DISTRIBUTED BY HASH(k1) BUCKETS 1 + // PROPERTIES ( + // "enable_unique_key_merge_on_write" = "true", + // "disable_auto_compaction" = "true", + // "replication_num" = "1" + // );""" - sql "insert into ${tableName1} values(1,1,1,1,1),(2,2,2,2,2),(3,3,3,3,3),(4,4,4,4,4),(5,5,5,5,5);" - qt_sql "select * from ${tableName1} order by k1,c1,c2,c3,c4;" - // sql "insert into ${tableName1}(k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__) select k1,c1,c2,c3,c4,1 from ${tableName1} where k1 in (1,3,5);" - sql """insert into ${tableName1}(k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__) values(1,1,1,1,1,1),(3,3,3,3,3,1),(5,5,5,5,5,1);""" - sql "sync" - qt_after_delete "select * from ${tableName1} order by k1,c1,c2,c3,c4;" - sql "set skip_delete_sign=true;" - sql "set skip_storage_engine_merge=true;" - sql "set skip_delete_bitmap=true;" - sql "sync" - // skip_delete_bitmap=true, skip_delete_sign=true - qt_1 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName1} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" + // sql "insert into ${tableName1} values(1,1,1,1,1),(2,2,2,2,2),(3,3,3,3,3),(4,4,4,4,4),(5,5,5,5,5);" + // qt_sql "select * from ${tableName1} order by k1,c1,c2,c3,c4;" + // // sql "insert into ${tableName1}(k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__) select k1,c1,c2,c3,c4,1 from ${tableName1} where k1 in (1,3,5);" + // sql """insert into ${tableName1}(k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__) values(1,1,1,1,1,1),(3,3,3,3,3,1),(5,5,5,5,5,1);""" + // sql "sync" + // qt_after_delete "select * from ${tableName1} order by k1,c1,c2,c3,c4;" + // sql "set skip_delete_sign=true;" + // sql "set skip_storage_engine_merge=true;" + // sql "set skip_delete_bitmap=true;" + // sql "sync" + // // skip_delete_bitmap=true, skip_delete_sign=true + // qt_1 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName1} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" - sql "set skip_delete_sign=true;" - sql "set skip_delete_bitmap=false;" - sql "sync" - // skip_delete_bitmap=false, skip_delete_sign=true - qt_2 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName1} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" - sql "drop table if exists ${tableName1};" + // sql "set skip_delete_sign=true;" + // sql "set skip_delete_bitmap=false;" + // sql "sync" + // // skip_delete_bitmap=false, skip_delete_sign=true + // qt_2 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName1} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" + // sql "drop table if exists ${tableName1};" - sql "set skip_delete_sign=false;" - sql "set skip_storage_engine_merge=false;" - sql "set skip_delete_bitmap=false;" - sql "sync" - def tableName2 = "test_delete_sign_delete_bitmap2" - sql "DROP TABLE IF EXISTS ${tableName2};" - sql """ CREATE TABLE IF NOT EXISTS ${tableName2} ( - `k1` int NOT NULL, - `c1` int, - `c2` int, - `c3` int, - `c4` int - )UNIQUE KEY(k1) - DISTRIBUTED BY HASH(k1) BUCKETS 1 - PROPERTIES ( - "enable_unique_key_merge_on_write" = "true", - "disable_auto_compaction" = "true", - "replication_num" = "1", - "function_column.sequence_col" = 'c4' - );""" + // sql "set skip_delete_sign=false;" + // sql "set skip_storage_engine_merge=false;" + // sql "set skip_delete_bitmap=false;" + // sql "sync" + // def tableName2 = "test_delete_sign_delete_bitmap2" + // sql "DROP TABLE IF EXISTS ${tableName2};" + // sql """ CREATE TABLE IF NOT EXISTS ${tableName2} ( + // `k1` int NOT NULL, + // `c1` int, + // `c2` int, + // `c3` int, + // `c4` int + // )UNIQUE KEY(k1) + // DISTRIBUTED BY HASH(k1) BUCKETS 1 + // PROPERTIES ( + // "enable_unique_key_merge_on_write" = "true", + // "disable_auto_compaction" = "true", + // "replication_num" = "1", + // "function_column.sequence_col" = 'c4' + // );""" - sql "insert into ${tableName2} values(1,1,1,1,1),(2,2,2,2,2),(3,3,3,3,3),(4,4,4,4,4),(5,5,5,5,5);" - qt_sql "select * from ${tableName2} order by k1,c1,c2,c3,c4;" - sql """insert into ${tableName2}(k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__) values(1,1,1,1,1,1),(3,3,3,3,3,1),(5,5,5,5,5,1);""" - sql "sync" - qt_after_delete "select * from ${tableName2} order by k1,c1,c2,c3,c4;" - sql "set skip_delete_sign=true;" - sql "set skip_storage_engine_merge=true;" - sql "set skip_delete_bitmap=true;" - sql "sync" - // skip_delete_bitmap=true, skip_delete_sign=true - qt_1 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName2} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" + // sql "insert into ${tableName2} values(1,1,1,1,1),(2,2,2,2,2),(3,3,3,3,3),(4,4,4,4,4),(5,5,5,5,5);" + // qt_sql "select * from ${tableName2} order by k1,c1,c2,c3,c4;" + // sql """insert into ${tableName2}(k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__) values(1,1,1,1,1,1),(3,3,3,3,3,1),(5,5,5,5,5,1);""" + // sql "sync" + // qt_after_delete "select * from ${tableName2} order by k1,c1,c2,c3,c4;" + // sql "set skip_delete_sign=true;" + // sql "set skip_storage_engine_merge=true;" + // sql "set skip_delete_bitmap=true;" + // sql "sync" + // // skip_delete_bitmap=true, skip_delete_sign=true + // qt_1 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName2} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" - sql "set skip_delete_sign=true;" - sql "set skip_delete_bitmap=false;" - sql "sync" - // skip_delete_bitmap=false, skip_delete_sign=true - qt_2 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName2} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" - sql "drop table if exists ${tableName2};" + // sql "set skip_delete_sign=true;" + // sql "set skip_delete_bitmap=false;" + // sql "sync" + // // skip_delete_bitmap=false, skip_delete_sign=true + // qt_2 "select k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__ from ${tableName2} order by k1,c1,c2,c3,c4,__DORIS_DELETE_SIGN__;" + // sql "drop table if exists ${tableName2};" } From 6708cb6c3bb848b95c4ca2946d234347630474da Mon Sep 17 00:00:00 2001 From: zzzxl <33418555+zzzxl1993@users.noreply.github.com> Date: Thu, 21 Dec 2023 20:32:34 +0800 Subject: [PATCH 3/6] [feature](invert index) add match_phrase_prefix and match_regexp #27404 #28257 (#28715) --- be/src/clucene | 2 +- be/src/exec/olap_common.h | 4 + be/src/exec/olap_utils.h | 17 ++- be/src/olap/match_predicate.cpp | 8 +- .../query/conjunction_query.cpp | 6 +- .../query/disjunction_query.cpp | 17 ++- .../inverted_index/query/disjunction_query.h | 1 - .../query/phrase_prefix_query.cpp | 63 +++++++++ .../query/phrase_prefix_query.h | 54 ++++++++ .../inverted_index/query/prefix_query.cpp | 80 +++++++++++ .../inverted_index/query/prefix_query.h | 40 ++++++ .../inverted_index/query/regexp_query.cpp | 98 +++++++++++++ .../inverted_index/query/regexp_query.h | 46 ++++++ .../segment_v2/inverted_index_query_type.h | 8 ++ .../segment_v2/inverted_index_reader.cpp | 131 +++++++++++++++--- .../rowset/segment_v2/inverted_index_reader.h | 21 ++- be/src/vec/functions/function_tokenize.cpp | 8 +- be/src/vec/functions/match.cpp | 39 +++--- be/src/vec/functions/match.h | 34 +++++ .../org/apache/doris/nereids/DorisLexer.g4 | 2 + .../org/apache/doris/nereids/DorisParser.g4 | 2 +- fe/fe-core/src/main/cup/sql_parser.cup | 8 +- .../apache/doris/analysis/MatchPredicate.java | 22 +++ .../nereids/parser/LogicalPlanBuilder.java | 14 ++ .../nereids/trees/expressions/Match.java | 4 + .../trees/expressions/MatchPhrasePrefix.java | 49 +++++++ .../trees/expressions/MatchRegexp.java | 49 +++++++ .../visitor/ExpressionVisitor.java | 10 ++ .../org/apache/doris/qe/SessionVariable.java | 8 ++ fe/fe-core/src/main/jflex/sql_scanner.flex | 2 + gensrc/thrift/Opcodes.thrift | 2 + gensrc/thrift/PaloInternalService.thrift | 2 + .../test_index_match_phrase_prefix.out | 31 +++++ .../test_index_match_regexp.out | 16 +++ .../test_index_match_phrase_prefix.groovy | 98 +++++++++++++ .../test_index_match_regexp.groovy | 89 ++++++++++++ 36 files changed, 1018 insertions(+), 67 deletions(-) create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java create mode 100644 regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out create mode 100644 regression-test/data/inverted_index_p0/test_index_match_regexp.out create mode 100644 regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy create mode 100644 regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy diff --git a/be/src/clucene b/be/src/clucene index 70c1a692bbb127..6f8a21ffe15bd7 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 70c1a692bbb1277f107ff2ddedda41b3a223c632 +Subproject commit 6f8a21ffe15bd78a1cd3e685067ee5c9ed071827 diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 5bd06d8d54030e..7133f40e7adc8f 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -304,6 +304,10 @@ class ColumnValueRange { condition.__set_condition_op("match_all"); } else if (value.first == MatchType::MATCH_PHRASE) { condition.__set_condition_op("match_phrase"); + } else if (value.first == MatchType::MATCH_PHRASE_PREFIX) { + condition.__set_condition_op("match_phrase_prefix"); + } else if (value.first == MatchType::MATCH_REGEXP) { + condition.__set_condition_op("match_regexp"); } else if (value.first == MatchType::MATCH_ELEMENT_EQ) { condition.__set_condition_op("match_element_eq"); } else if (value.first == MatchType::MATCH_ELEMENT_LT) { diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h index 1d6bdf959302b0..1b8525dc1b9948 100644 --- a/be/src/exec/olap_utils.h +++ b/be/src/exec/olap_utils.h @@ -170,6 +170,8 @@ enum class MatchType { MATCH_ELEMENT_GT = 5, MATCH_ELEMENT_LE = 6, MATCH_ELEMENT_GE = 7, + MATCH_PHRASE_PREFIX = 8, + MATCH_REGEXP = 9, }; inline MatchType to_match_type(TExprOpcode::type type) { @@ -183,6 +185,12 @@ inline MatchType to_match_type(TExprOpcode::type type) { case TExprOpcode::type::MATCH_PHRASE: return MatchType::MATCH_PHRASE; break; + case TExprOpcode::type::MATCH_PHRASE_PREFIX: + return MatchType::MATCH_PHRASE_PREFIX; + break; + case TExprOpcode::type::MATCH_REGEXP: + return MatchType::MATCH_REGEXP; + break; case TExprOpcode::type::MATCH_ELEMENT_EQ: return MatchType::MATCH_ELEMENT_EQ; break; @@ -212,6 +220,10 @@ inline MatchType to_match_type(const std::string& condition_op) { return MatchType::MATCH_ALL; } else if (condition_op.compare("match_phrase") == 0) { return MatchType::MATCH_PHRASE; + } else if (condition_op.compare("match_phrase_prefix") == 0) { + return MatchType::MATCH_PHRASE_PREFIX; + } else if (condition_op.compare("match_regexp") == 0) { + return MatchType::MATCH_REGEXP; } else if (condition_op.compare("match_element_eq") == 0) { return MatchType::MATCH_ELEMENT_EQ; } else if (condition_op.compare("match_element_lt") == 0) { @@ -229,6 +241,8 @@ inline MatchType to_match_type(const std::string& condition_op) { inline bool is_match_condition(const std::string& op) { if (0 == strcasecmp(op.c_str(), "match_any") || 0 == strcasecmp(op.c_str(), "match_all") || 0 == strcasecmp(op.c_str(), "match_phrase") || + 0 == strcasecmp(op.c_str(), "match_phrase_prefix") || + 0 == strcasecmp(op.c_str(), "match_regexp") || 0 == strcasecmp(op.c_str(), "match_element_eq") || 0 == strcasecmp(op.c_str(), "match_element_lt") || 0 == strcasecmp(op.c_str(), "match_element_gt") || @@ -241,7 +255,8 @@ inline bool is_match_condition(const std::string& op) { inline bool is_match_operator(const TExprOpcode::type& op_type) { return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL == op_type || - TExprOpcode::MATCH_PHRASE == op_type || TExprOpcode::MATCH_ELEMENT_EQ == op_type || + TExprOpcode::MATCH_PHRASE == op_type || TExprOpcode::MATCH_PHRASE_PREFIX == op_type || + TExprOpcode::MATCH_REGEXP == op_type || TExprOpcode::MATCH_ELEMENT_EQ == op_type || TExprOpcode::MATCH_ELEMENT_LT == op_type || TExprOpcode::MATCH_ELEMENT_GT == op_type || TExprOpcode::MATCH_ELEMENT_LE == op_type || TExprOpcode::MATCH_ELEMENT_GE == op_type; } diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 61d257231553bb..8ffd6d9993609a 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -95,6 +95,12 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m case MatchType::MATCH_PHRASE: ret = InvertedIndexQueryType::MATCH_PHRASE_QUERY; break; + case MatchType::MATCH_PHRASE_PREFIX: + ret = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY; + break; + case MatchType::MATCH_REGEXP: + ret = InvertedIndexQueryType::MATCH_REGEXP_QUERY; + break; case MatchType::MATCH_ELEMENT_EQ: ret = InvertedIndexQueryType::EQUAL_QUERY; break; @@ -117,7 +123,7 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m } bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { - if (_match_type == MatchType::MATCH_PHRASE && + if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX) && iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp index b77edc79ade905..b2448a8fa8e233 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp @@ -38,12 +38,12 @@ ConjunctionQuery::~ConjunctionQuery() { } void ConjunctionQuery::add(const std::wstring& field_name, const std::vector& terms) { - if (terms.size() < 1) { - _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1"); + if (terms.empty()) { + _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms empty"); } std::vector iterators; - for (auto& term : terms) { + for (const auto& term : terms) { std::wstring ws_term = StringUtil::string_to_wstring(term); Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); _terms.push_back(t); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp index 07a159b3222408..7b797d7b54a91e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp @@ -22,26 +22,25 @@ namespace doris { DisjunctionQuery::DisjunctionQuery(IndexReader* reader) : _reader(reader) {} DisjunctionQuery::~DisjunctionQuery() { - for (auto& term : _terms) { - if (term) { - _CLDELETE(term); - } - } for (auto& term_doc : _term_docs) { if (term_doc) { _CLDELETE(term_doc); } } + for (auto& term : _terms) { + if (term) { + _CLDELETE(term); + } + } } void DisjunctionQuery::add(const std::wstring& field_name, const std::vector& terms) { - if (terms.size() < 1) { - _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1"); + if (terms.empty()) { + _CLTHROWA(CL_ERR_IllegalArgument, "DisjunctionQuery::add: terms empty"); } - for (auto& term : terms) { + for (const auto& term : terms) { std::wstring ws_term = StringUtil::string_to_wstring(term); - _wsterms.emplace_back(&ws_term); Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); _terms.push_back(t); TermDocs* term_doc = _reader->termDocs(t); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h index f42fd69dabc2ef..bb0a837f42a313 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h @@ -39,7 +39,6 @@ class DisjunctionQuery { private: IndexReader* _reader = nullptr; - std::vector _wsterms; std::vector _terms; std::vector _term_docs; std::vector _term_iterators; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp new file mode 100644 index 00000000000000..4b0340cda4a011 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "phrase_prefix_query.h" + +#include "olap/rowset//segment_v2/inverted_index/query/prefix_query.h" + +namespace doris { + +namespace segment_v2 { + +PhrasePrefixQuery::PhrasePrefixQuery(const std::shared_ptr& searcher) + : _searcher(searcher) {} + +void PhrasePrefixQuery::add(const std::wstring& field_name, const std::vector& terms) { + if (terms.empty()) { + return; + } + + for (size_t i = 0; i < terms.size(); i++) { + if (i < terms.size() - 1) { + std::wstring ws = StringUtil::string_to_wstring(terms[i]); + Term* t = _CLNEW Term(field_name.c_str(), ws.c_str()); + _query.add(t); + _CLDECDELETE(t); + } else { + std::vector prefix_terms; + PrefixQuery::get_prefix_terms(_searcher->getReader(), field_name, terms[i], + prefix_terms, _max_expansions); + if (prefix_terms.empty()) { + continue; + } + _query.add(prefix_terms); + for (auto& t : prefix_terms) { + _CLDECDELETE(t); + } + } + } +} + +void PhrasePrefixQuery::search(roaring::Roaring& roaring) { + _searcher->_search(&_query, [&roaring](const int32_t docid, const float_t /*score*/) { + roaring.add(docid); + }); +} + +} // namespace segment_v2 + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h new file mode 100644 index 00000000000000..28007620ce581e --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include + +#include "CLucene/search/MultiPhraseQuery.h" +#include "roaring/roaring.hh" + +CL_NS_USE(index) +CL_NS_USE(search) + +namespace doris { + +namespace segment_v2 { + +class PhrasePrefixQuery { +public: + PhrasePrefixQuery(const std::shared_ptr& searcher); + ~PhrasePrefixQuery() = default; + + void set_max_expansions(int32_t max_expansions) { _max_expansions = max_expansions; } + + void add(const std::wstring& field_name, const std::vector& terms); + void search(roaring::Roaring& roaring); + +private: + std::shared_ptr _searcher; + MultiPhraseQuery _query; + + int32_t _max_expansions = 50; +}; + +} // namespace segment_v2 + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp new file mode 100644 index 00000000000000..7d23d6eb60f348 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "prefix_query.h" + +namespace doris { + +void PrefixQuery::get_prefix_terms(IndexReader* reader, const std::wstring& field_name, + const std::string& prefix, + std::vector& prefix_terms, + int32_t max_expansions) { + std::wstring ws_prefix = StringUtil::string_to_wstring(prefix); + + Term* prefix_term = _CLNEW Term(field_name.c_str(), ws_prefix.c_str()); + TermEnum* enumerator = reader->terms(prefix_term); + + int32_t count = 0; + Term* lastTerm = nullptr; + try { + const TCHAR* prefixText = prefix_term->text(); + const TCHAR* prefixField = prefix_term->field(); + const TCHAR* tmp = nullptr; + size_t i = 0; + size_t prefixLen = prefix_term->textLength(); + do { + lastTerm = enumerator->term(); + if (lastTerm != nullptr && lastTerm->field() == prefixField) { + size_t termLen = lastTerm->textLength(); + if (prefixLen > termLen) { + break; + } + + tmp = lastTerm->text(); + + for (i = prefixLen - 1; i != -1; --i) { + if (tmp[i] != prefixText[i]) { + tmp = nullptr; + break; + } + } + if (tmp == nullptr) { + break; + } + + if (max_expansions > 0 && count >= max_expansions) { + break; + } + + Term* t = _CLNEW Term(field_name.c_str(), tmp); + prefix_terms.push_back(t); + count++; + } else { + break; + } + _CLDECDELETE(lastTerm); + } while (enumerator->next()); + } + _CLFINALLY({ + enumerator->close(); + _CLDELETE(enumerator); + _CLDECDELETE(lastTerm); + _CLDECDELETE(prefix_term); + }); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h new file mode 100644 index 00000000000000..5deb0c1a3628ad --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include + +CL_NS_USE(index) + +namespace doris { + +class PrefixQuery { +public: + PrefixQuery() = default; + ~PrefixQuery() = default; + + static void get_prefix_terms(IndexReader* reader, const std::wstring& field_name, + const std::string& prefix, + std::vector& prefix_terms, + int32_t max_expansions = 50); +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp new file mode 100644 index 00000000000000..83c5401bac0e5b --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "regexp_query.h" + +#include +#include + +#include "common/logging.h" + +namespace doris::segment_v2 { + +RegexpQuery::RegexpQuery(const std::shared_ptr& searcher) + : _searcher(searcher), query(searcher->getReader()) {} + +void RegexpQuery::add(const std::wstring& field_name, const std::string& pattern) { + hs_database_t* database = nullptr; + hs_compile_error_t* compile_err = nullptr; + hs_scratch_t* scratch = nullptr; + + if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8, + HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) { + LOG(ERROR) << "hyperscan compilation failed: " << compile_err->message; + hs_free_compile_error(compile_err); + return; + } + + if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) { + LOG(ERROR) << "hyperscan could not allocate scratch space."; + hs_free_database(database); + return; + } + + auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to, + unsigned int flags, void* context) -> int { + *((bool*)context) = true; + return 0; + }; + + Term* term = nullptr; + TermEnum* enumerator = nullptr; + std::vector terms; + int32_t count = 0; + + try { + enumerator = _searcher->getReader()->terms(); + while (enumerator->next()) { + term = enumerator->term(); + std::string input = lucene_wcstoutf8string(term->text(), term->textLength()); + + bool is_match = false; + if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match, + (void*)&is_match) != HS_SUCCESS) { + LOG(ERROR) << "hyperscan match failed: " << input; + break; + } + + if (is_match) { + terms.emplace_back(std::move(input)); + if (++count >= _max_expansions) { + break; + } + } + + _CLDECDELETE(term); + } + } + _CLFINALLY({ + _CLDECDELETE(term); + enumerator->close(); + _CLDELETE(enumerator); + + hs_free_scratch(scratch); + hs_free_database(database); + }) + + query.add(field_name, terms); +} + +void RegexpQuery::search(roaring::Roaring& roaring) { + query.search(roaring); +} + +} // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h new file mode 100644 index 00000000000000..3791ad50d8f78f --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h" + +CL_NS_USE(index) +CL_NS_USE(search) + +namespace doris::segment_v2 { + +class RegexpQuery { +public: + RegexpQuery(const std::shared_ptr& searcher); + ~RegexpQuery() = default; + + void set_max_expansions(int32_t max_expansions) { _max_expansions = max_expansions; } + + void add(const std::wstring& field_name, const std::string& pattern); + void search(roaring::Roaring& roaring); + +private: + std::shared_ptr _searcher; + + int32_t _max_expansions = 50; + DisjunctionQuery query; +}; + +} // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h index 1ebfe6359181e9..3037f979f6edf7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h @@ -32,6 +32,8 @@ enum class InvertedIndexQueryType { MATCH_ANY_QUERY = 5, MATCH_ALL_QUERY = 6, MATCH_PHRASE_QUERY = 7, + MATCH_PHRASE_PREFIX_QUERY = 8, + MATCH_REGEXP_QUERY = 9, }; inline std::string InvertedIndexQueryType_toString(InvertedIndexQueryType query_type) { @@ -63,6 +65,12 @@ inline std::string InvertedIndexQueryType_toString(InvertedIndexQueryType query_ case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { return "MPHRASE"; } + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: { + return "MPHRASEPREFIX"; + } + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { + return "MREGEXP"; + } default: return ""; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 7d710d72c382ab..292884e631b939 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -50,12 +50,15 @@ #include "CLucene/analysis/standard95/StandardAnalyzer.h" #include "common/config.h" #include "common/logging.h" +#include "inverted_index_query_type.h" #include "io/fs/file_system.h" #include "olap/inverted_index_parser.h" #include "olap/key_coder.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" @@ -90,7 +93,9 @@ bool InvertedIndexReader::_is_range_query(InvertedIndexQueryType query_type) { bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) { return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY); + query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY); } bool InvertedIndexReader::indexExists(io::Path& index_file_path) { @@ -141,10 +146,13 @@ std::unique_ptr InvertedIndexReader::create_reader( return reader; } -std::vector InvertedIndexReader::get_analyse_result( - lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer, - const std::string& field_name, InvertedIndexQueryType query_type, bool drop_duplicates) { - std::vector analyse_result; +void InvertedIndexReader::get_analyse_result(std::vector& analyse_result, + lucene::util::Reader* reader, + lucene::analysis::Analyzer* analyzer, + const std::string& field_name, + InvertedIndexQueryType query_type, + bool drop_duplicates) { + analyse_result.clear(); std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); std::unique_ptr token_stream( @@ -168,8 +176,6 @@ std::vector InvertedIndexReader::get_analyse_result( std::set unrepeated_result(analyse_result.begin(), analyse_result.end()); analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end()); } - - return analyse_result; } Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, @@ -246,19 +252,25 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run auto index_file_name = InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_meta.index_id()); auto index_file_path = index_dir / index_file_name; - InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared(); - inverted_index_ctx->parser_type = get_inverted_index_parser_type_from_string( - get_parser_string_from_properties(_index_meta.properties())); - inverted_index_ctx->parser_mode = - get_parser_mode_string_from_properties(_index_meta.properties()); - inverted_index_ctx->char_filter_map = - get_parser_char_filter_map_from_properties(_index_meta.properties()); + try { - auto analyzer = create_analyzer(inverted_index_ctx.get()); - auto reader = create_reader(inverted_index_ctx.get(), search_str); - inverted_index_ctx->analyzer = analyzer.get(); - std::vector analyse_result = - get_analyse_result(reader.get(), analyzer.get(), column_name, query_type); + std::vector analyse_result; + if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { + analyse_result.emplace_back(search_str); + } else { + InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared(); + inverted_index_ctx->parser_type = get_inverted_index_parser_type_from_string( + get_parser_string_from_properties(_index_meta.properties())); + inverted_index_ctx->parser_mode = + get_parser_mode_string_from_properties(_index_meta.properties()); + inverted_index_ctx->char_filter_map = + get_parser_char_filter_map_from_properties(_index_meta.properties()); + auto analyzer = create_analyzer(inverted_index_ctx.get()); + auto reader = create_reader(inverted_index_ctx.get(), search_str); + inverted_index_ctx->analyzer = analyzer.get(); + get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name, + query_type); + } if (analyse_result.empty()) { auto msg = fmt::format( @@ -267,7 +279,9 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run search_str, get_parser_string_from_properties(_index_meta.properties())); if (query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { LOG(WARNING) << msg; return Status::OK(); } else { @@ -294,6 +308,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run roaring::Roaring query_match_bitmap; bool null_bitmap_already_read = false; if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { std::string str_tokens; @@ -302,7 +317,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run str_tokens += " "; } - auto cache = InvertedIndexQueryCache::instance(); + auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCache::CacheKey cache_key; cache_key.index_path = index_file_path; cache_key.column_name = column_name; @@ -333,6 +348,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run query.reset(phrase_query); res = normal_index_search(stats, query_type, index_searcher, null_bitmap_already_read, query, term_match_bitmap); + } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) { + res = match_phrase_prefix_index_search(stats, runtime_state, field_ws, + analyse_result, index_searcher, + term_match_bitmap); } else { res = match_all_index_search(stats, runtime_state, field_ws, analyse_result, index_searcher, term_match_bitmap); @@ -346,13 +365,45 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run cache->insert(cache_key, term_match_bitmap, &cache_handle); } query_match_bitmap = *term_match_bitmap; + } else if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { + const std::string& pattern = analyse_result[0]; + + std::shared_ptr term_match_bitmap = nullptr; + auto* cache = InvertedIndexQueryCache::instance(); + + InvertedIndexQueryCache::CacheKey cache_key; + cache_key.index_path = index_file_path; + cache_key.column_name = column_name; + cache_key.query_type = query_type; + cache_key.value = pattern; + InvertedIndexQueryCacheHandle cache_handle; + if (cache->lookup(cache_key, &cache_handle)) { + stats->inverted_index_query_cache_hit++; + term_match_bitmap = cache_handle.get_bitmap(); + } else { + stats->inverted_index_query_cache_miss++; + + auto index_searcher = get_index_search(); + + term_match_bitmap = std::make_shared(); + + Status res = match_regexp_index_search(stats, runtime_state, field_ws, pattern, + index_searcher, term_match_bitmap); + if (!res.ok()) { + return res; + } + + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); + } + query_match_bitmap = *term_match_bitmap; } else { bool first = true; for (auto token : analyse_result) { std::shared_ptr term_match_bitmap = nullptr; // try to get term bitmap match result from cache to avoid query index on cache hit - auto cache = InvertedIndexQueryCache::instance(); + auto* cache = InvertedIndexQueryCache::instance(); // use EQUAL_QUERY type here since cache is for each term/token //auto token = lucene_wcstoutf8string(token_ws.c_str(), token_ws.length()); std::wstring token_ws = StringUtil::string_to_wstring(token); @@ -471,6 +522,42 @@ Status FullTextIndexReader::match_all_index_search( return Status::OK(); } +Status FullTextIndexReader::match_phrase_prefix_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, + const std::vector& analyse_result, const IndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + PhrasePrefixQuery query(index_searcher); + query.set_max_expansions(queryOptions.inverted_index_max_expansions); + query.add(field_ws, analyse_result); + query.search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + +Status FullTextIndexReader::match_regexp_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, + const std::string& pattern, const IndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + RegexpQuery query(index_searcher); + query.set_max_expansions(queryOptions.inverted_index_max_expansions); + query.add(field_ws, pattern); + query.search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + void FullTextIndexReader::check_null_bitmap(const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read) { // try to reuse index_searcher's directory to read null_bitmap to cache diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 20c5c731f9eca8..8b5c786f36f909 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -97,11 +97,12 @@ class InvertedIndexReader : public std::enable_shared_from_this get_analyse_result(lucene::util::Reader* reader, - lucene::analysis::Analyzer* analyzer, - const std::string& field_name, - InvertedIndexQueryType query_type, - bool drop_duplicates = true); + static void get_analyse_result(std::vector& analyse_result, + lucene::util::Reader* reader, + lucene::analysis::Analyzer* analyzer, + const std::string& field_name, InvertedIndexQueryType query_type, + bool drop_duplicates = true); + static std::unique_ptr create_reader(InvertedIndexCtx* inverted_index_ctx, const std::string& value); static std::unique_ptr create_analyzer( @@ -153,6 +154,16 @@ class FullTextIndexReader : public InvertedIndexReader { const std::shared_ptr& term_match_bitmap); void check_null_bitmap(const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read); + + Status match_phrase_prefix_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, + const std::vector& analyse_result, const IndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap); + + Status match_regexp_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::wstring& field_ws, const std::string& pattern, + const IndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index 11760a30f5025f..54d9bee4ae9edc 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -79,10 +79,10 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, auto reader = doris::segment_v2::InvertedIndexReader::create_reader( &inverted_index_ctx, tokenize_str.to_string()); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx.analyzer, "tokenize", - doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), inverted_index_ctx.analyzer, "tokenize", + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); for (auto token : query_tokens) { const size_t old_size = column_string_chars.size(); const size_t split_part_size = token.length(); diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index 3497b2ef7a96a5..c81c1617ca61d4 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -129,10 +129,10 @@ inline std::vector FunctionMatchBase::analyse_data_token( auto reader = doris::segment_v2::InvertedIndexReader::create_reader( inverted_index_ctx, str_ref.to_string()); - std::vector element_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, query_type, - false); + std::vector element_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + query_type, false); data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end()); } } else { @@ -140,8 +140,9 @@ inline std::vector FunctionMatchBase::analyse_data_token( auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, str_ref.to_string()); - data_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, query_type, false); + doris::segment_v2::InvertedIndexReader::get_analyse_result(data_tokens, reader.get(), + inverted_index_ctx->analyzer, + column_name, query_type, false); } return data_tokens; } @@ -160,10 +161,10 @@ Status FunctionMatchAny::execute_match(const std::string& column_name, << inverted_index_parser_type_to_string(parser_type); auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, match_query_str); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, - doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -205,10 +206,10 @@ Status FunctionMatchAll::execute_match(const std::string& column_name, << inverted_index_parser_type_to_string(parser_type); auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, match_query_str); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, - doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -256,10 +257,10 @@ Status FunctionMatchPhrase::execute_match(const std::string& column_name, << inverted_index_parser_type_to_string(parser_type); auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, match_query_str); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, - doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -313,6 +314,8 @@ void register_function_match(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function(); + factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h index b8e7f91cb019b8..13701bd2d6000a 100644 --- a/be/src/vec/functions/match.h +++ b/be/src/vec/functions/match.h @@ -128,6 +128,40 @@ class FunctionMatchPhrase : public FunctionMatchBase { ColumnUInt8::Container& result) override; }; +class FunctionMatchPhrasePrefix : public FunctionMatchBase { +public: + static constexpr auto name = "match_phrase_prefix"; + static FunctionPtr create() { return std::make_shared(); } + + String get_name() const override { return name; } + + Status execute_match(const std::string& column_name, const std::string& match_query_str, + size_t input_rows_count, const ColumnString* string_col, + InvertedIndexCtx* inverted_index_ctx, + const ColumnArray::Offsets64* array_offsets, + ColumnUInt8::Container& result) override { + return Status::Error( + "FunctionMatchPhrasePrefix not support execute_match"); + } +}; + +class FunctionMatchRegexp : public FunctionMatchBase { +public: + static constexpr auto name = "match_regexp"; + static FunctionPtr create() { return std::make_shared(); } + + String get_name() const override { return name; } + + Status execute_match(const std::string& column_name, const std::string& match_query_str, + size_t input_rows_count, const ColumnString* string_col, + InvertedIndexCtx* inverted_index_ctx, + const ColumnArray::Offsets64* array_offsets, + ColumnUInt8::Container& result) override { + return Status::Error( + "FunctionMatchRegexp not support execute_match"); + } +}; + class FunctionMatchElementEQ : public FunctionMatchBase { public: static constexpr auto name = "match_element_eq"; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index abf6e4bb27f00e..378a34f3da0bfd 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -338,6 +338,8 @@ MATCH_ELEMENT_GT: 'ELEMENT_GT'; MATCH_ELEMENT_LE: 'ELEMENT_LE'; MATCH_ELEMENT_LT: 'ELEMENT_LT'; MATCH_PHRASE: 'MATCH_PHRASE'; +MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX'; +MATCH_REGEXP: 'MATCH_REGEXP'; MATERIALIZED: 'MATERIALIZED'; MAX: 'MAX'; MAXVALUE: 'MAXVALUE'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index ec4b2672c6d861..ba44219e7e047f 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -312,7 +312,7 @@ booleanExpression predicate : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression | NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression - | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE) pattern=valueExpression + | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | MATCH_PHRASE_PREFIX | MATCH_REGEXP) pattern=valueExpression | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN | IS NOT? kind=NULL diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index 07345f6b6b9c5e..08ffc389a92814 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -468,6 +468,8 @@ terminal String KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, + KW_MATCH_PHRASE_PREFIX, + KW_MATCH_REGEXP, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, @@ -959,7 +961,7 @@ precedence left KW_AND; precedence left KW_NOT, NOT; precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS; precedence left KW_LIKE, KW_REGEXP; -precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; +precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; precedence left EQUAL, LESSTHAN, GREATERTHAN; precedence left ADD, SUBTRACT; precedence left AT, STAR, DIVIDE, MOD, KW_DIV; @@ -6985,6 +6987,10 @@ match_predicate ::= {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ALL, e1, e2); :} | expr:e1 KW_MATCH_PHRASE expr:e2 {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE, e1, e2); :} + | expr:e1 KW_MATCH_PHRASE_PREFIX expr:e2 + {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX, e1, e2); :} + | expr:e1 KW_MATCH_REGEXP expr:e2 + {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_REGEXP, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_EQ expr:e2 {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_LT expr:e2 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java index 10579614524e1d..f106aec956c72c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java @@ -50,6 +50,8 @@ public enum Operator { MATCH_ANY("MATCH_ANY", "match_any", TExprOpcode.MATCH_ANY), MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL), MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE), + MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix", TExprOpcode.MATCH_PHRASE_PREFIX), + MATCH_REGEXP("MATCH_REGEXP", "match_regexp", TExprOpcode.MATCH_REGEXP), MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ), MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT), MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT), @@ -147,6 +149,26 @@ public static void initBuiltins(FunctionSet functionSet) { symbolNotUsed, Lists.newArrayList(new ArrayType(t), t), Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_PHRASE_PREFIX.getName(), + symbolNotUsed, + Lists.newArrayList(t, t), + Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_PHRASE_PREFIX.getName(), + symbolNotUsed, + Lists.newArrayList(new ArrayType(t), t), + Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_REGEXP.getName(), + symbolNotUsed, + Lists.newArrayList(t, t), + Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_REGEXP.getName(), + symbolNotUsed, + Lists.newArrayList(new ArrayType(t), t), + Type.BOOLEAN)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 23cb6c572e2a26..447f0c28442354 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -158,6 +158,8 @@ import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase; +import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; +import org.apache.doris.nereids.trees.expressions.MatchRegexp; import org.apache.doris.nereids.trees.expressions.Mod; import org.apache.doris.nereids.trees.expressions.Multiply; import org.apache.doris.nereids.trees.expressions.NamedExpression; @@ -1927,6 +1929,18 @@ private Expression withPredicate(Expression valueExpression, PredicateContext ct getExpression(ctx.pattern) ); break; + case DorisParser.MATCH_PHRASE_PREFIX: + outExpression = new MatchPhrasePrefix( + valueExpression, + getExpression(ctx.pattern) + ); + break; + case DorisParser.MATCH_REGEXP: + outExpression = new MatchRegexp( + valueExpression, + getExpression(ctx.pattern) + ); + break; default: throw new ParseException("Unsupported predicate type: " + ctx.kind.getText(), ctx); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java index bc9837eafec5bc..5b3027365a8d91 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java @@ -50,6 +50,10 @@ public Operator op() throws AnalysisException { return Operator.MATCH_ALL; case "MATCH_PHRASE": return Operator.MATCH_PHRASE; + case "MATCH_PHRASE_PREFIX": + return Operator.MATCH_PHRASE_PREFIX; + case "MATCH_REGEXP": + return Operator.MATCH_REGEXP; default: throw new AnalysisException("UnSupported type: " + symbol); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java new file mode 100644 index 00000000000000..748da21ce30c68 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions; + +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * like expression: a MATCH_PHRASE_PREFIX 'hello w'. + */ +public class MatchPhrasePrefix extends Match { + public MatchPhrasePrefix(Expression left, Expression right) { + super(ImmutableList.of(left, right), "MATCH_PHRASE_PREFIX"); + } + + private MatchPhrasePrefix(List children) { + super(children, "MATCH_PHRASE_PREFIX"); + } + + @Override + public MatchPhrasePrefix withChildren(List children) { + Preconditions.checkArgument(children.size() == 2); + return new MatchPhrasePrefix(children); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitMatchPhrasePrefix(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java new file mode 100644 index 00000000000000..6bb55aeb8978a5 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions; + +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * like expression: a MATCH_REGEXP '^h\\w*'. + */ +public class MatchRegexp extends Match { + public MatchRegexp(Expression left, Expression right) { + super(ImmutableList.of(left, right), "MATCH_REGEXP"); + } + + private MatchRegexp(List children) { + super(children, "MATCH_REGEXP"); + } + + @Override + public MatchRegexp withChildren(List children) { + Preconditions.checkArgument(children.size() == 2); + return new MatchRegexp(children); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitMatchRegexp(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java index 4d89227ee9fcf8..179570b824e21c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java @@ -56,6 +56,8 @@ import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase; +import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; +import org.apache.doris.nereids.trees.expressions.MatchRegexp; import org.apache.doris.nereids.trees.expressions.Mod; import org.apache.doris.nereids.trees.expressions.Multiply; import org.apache.doris.nereids.trees.expressions.NamedExpression; @@ -454,6 +456,14 @@ public R visitMatchPhrase(MatchPhrase matchPhrase, C context) { return visitMatch(matchPhrase, context); } + public R visitMatchPhrasePrefix(MatchPhrasePrefix matchPhrasePrefix, C context) { + return visitMatch(matchPhrasePrefix, context); + } + + public R visitMatchRegexp(MatchRegexp matchRegexp, C context) { + return visitMatch(matchRegexp, context); + } + /* ******************************************************************************************** * Unbound expressions * ********************************************************************************************/ diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 2fd5dd7adea7b7..da4212f04185e4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -402,6 +402,7 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_UNIQUE_KEY_PARTIAL_UPDATE = "enable_unique_key_partial_update"; public static final String INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD = "inverted_index_conjunction_opt_threshold"; + public static final String INVERTED_INDEX_MAX_EXPANSIONS = "inverted_index_max_expansions"; public static final String AUTO_ANALYZE_START_TIME = "auto_analyze_start_time"; @@ -1192,6 +1193,12 @@ public void setMaxJoinNumberOfReorder(int maxJoinNumberOfReorder) { flag = VariableMgr.GLOBAL) public String autoAnalyzeEndTime = "23:59:59"; + @VariableMgr.VarAttr(name = INVERTED_INDEX_MAX_EXPANSIONS, + description = {"这个参数用来限制查询时扩展的词项(terms)的数量,以此来控制查询的性能", + "This parameter is used to limit the number of term expansions during a query," + + " thereby controlling query performance"}) + public int invertedIndexMaxExpansions = 50; + @VariableMgr.VarAttr(name = ENABLE_UNIQUE_KEY_PARTIAL_UPDATE, needForward = true) public boolean enableUniqueKeyPartialUpdate = false; @@ -2435,6 +2442,7 @@ public TQueryOptions toThrift() { tResult.setTruncateCharOrVarcharColumns(truncateCharOrVarcharColumns); tResult.setInvertedIndexConjunctionOptThreshold(invertedIndexConjunctionOptThreshold); + tResult.setInvertedIndexMaxExpansions(invertedIndexMaxExpansions); tResult.setFasterFloatConvert(fasterFloatConvert); diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index 4a19494ce80c99..86a3dc7482de60 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -309,6 +309,8 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("match_any", new Integer(SqlParserSymbols.KW_MATCH_ANY)); keywordMap.put("match_all", new Integer(SqlParserSymbols.KW_MATCH_ALL)); keywordMap.put("match_phrase", new Integer(SqlParserSymbols.KW_MATCH_PHRASE)); + keywordMap.put("match_phrase_prefix", new Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX)); + keywordMap.put("match_regexp", new Integer(SqlParserSymbols.KW_MATCH_REGEXP)); keywordMap.put("element_eq", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ)); keywordMap.put("element_lt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT)); keywordMap.put("element_gt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT)); diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift index f6444ebe218fd3..72a1d80e036222 100644 --- a/gensrc/thrift/Opcodes.thrift +++ b/gensrc/thrift/Opcodes.thrift @@ -93,4 +93,6 @@ enum TExprOpcode { MATCH_ELEMENT_GT, MATCH_ELEMENT_LE, MATCH_ELEMENT_GE, + MATCH_PHRASE_PREFIX, + MATCH_REGEXP, } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 62eb5a0827b090..9ff6a589d69dd2 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -249,6 +249,8 @@ struct TQueryOptions { 86: optional i32 analyze_timeout = 43200; 87: optional bool faster_float_convert = false; + + 88: optional i32 inverted_index_max_expansions = 50; } diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out new file mode 100644 index 00000000000000..140fd5ee937992 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out @@ -0,0 +1,31 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +863 + +-- !sql -- +863 + +-- !sql -- +235 + +-- !sql -- +235 + +-- !sql -- +166 + +-- !sql -- +166 + +-- !sql -- +56 + +-- !sql -- +56 + +-- !sql -- +7 + +-- !sql -- +7 + diff --git a/regression-test/data/inverted_index_p0/test_index_match_regexp.out b/regression-test/data/inverted_index_p0/test_index_match_regexp.out new file mode 100644 index 00000000000000..eab27de65ee45f --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_match_regexp.out @@ -0,0 +1,16 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1000 + +-- !sql -- +54 + +-- !sql -- +910 + +-- !sql -- +60 + +-- !sql -- +38 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy new file mode 100644 index 00000000000000..b23bc1b5a8b82a --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_match_phrase_prefix", "p0"){ + def indexTbName1 = "test_index_match_phrase_prefix" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + load_httplogs_data.call(indexTbName1, 'test_index_match_phrase_prefix', 'true', 'json', 'documents-1000.json') + + qt_sql """ select count() from test_index_match_phrase_prefix where request match_phrase_prefix 'ima'; """ + qt_sql """ select count() from test_index_match_phrase_prefix where request like '%ima%'; """ + + qt_sql """ select count() from test_index_match_phrase_prefix where request match_phrase_prefix 'images/h'; """ + qt_sql """ select count() from test_index_match_phrase_prefix where request like '%images/h%'; """ + + qt_sql """ select count() from test_index_match_phrase_prefix where request match_phrase_prefix 'images/hm'; """ + qt_sql """ select count() from test_index_match_phrase_prefix where request like '%images/hm%'; """ + + qt_sql """ select count() from test_index_match_phrase_prefix where request match_phrase_prefix '/french/images/n'; """ + qt_sql """ select count() from test_index_match_phrase_prefix where request like '%/french/images/n%'; """ + + qt_sql """ select count() from test_index_match_phrase_prefix where request match_phrase_prefix '/french/tickets/images/ti'; """ + qt_sql """ select count() from test_index_match_phrase_prefix where request like '%/french/tickets/images/ti%'; """ + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file diff --git a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy new file mode 100644 index 00000000000000..4c1ee1a5b0b484 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_match_regexp", "p0"){ + def indexTbName1 = "test_index_match_regexp" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + load_httplogs_data.call(indexTbName1, 'test_index_match_regexp', 'true', 'json', 'documents-1000.json') + + qt_sql """ select count() from test_index_match_regexp where request match_regexp '^h'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp '^team'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp 's\$'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp 'er\$'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp '.*tickets.*'; """ + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file From 57980712b9457314a432ffaf97dbe42a31e48025 Mon Sep 17 00:00:00 2001 From: morrySnow <101034200+morrySnow@users.noreply.github.com> Date: Fri, 22 Dec 2023 11:06:07 +0800 Subject: [PATCH 4/6] [fix](Nereids) explain should fallback too if Nereids is not enable (#28475) (#28574) pick from master PR #28475 commit b50bc0d2c902856f6727c5301d256ca6a561f8a3 --- .../trees/plans/commands/DeleteCommand.java | 8 +++++ .../commands/InsertIntoTableCommand.java | 8 +++++ .../trees/plans/commands/UpdateCommand.java | 8 +++++ .../doris/nereids/util/ReadLockTest.java | 32 +++++++++++-------- 4 files changed, 43 insertions(+), 13 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DeleteCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DeleteCommand.java index fa6fd629008389..ccaadd7cd9ccd3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DeleteCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DeleteCommand.java @@ -130,6 +130,14 @@ public LogicalPlan getLogicalQuery() { @Override public Plan getExplainPlan(ConnectContext ctx) { + if (!ctx.getSessionVariable().isEnableNereidsDML()) { + try { + ctx.getSessionVariable().enableFallbackToOriginalPlannerOnce(); + } catch (Exception e) { + throw new AnalysisException("failed to set fallback to original planner to true", e); + } + throw new AnalysisException("Nereids DML is disabled, will try to fall back to the original planner"); + } return completeQueryPlan(ctx, logicalQuery); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/InsertIntoTableCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/InsertIntoTableCommand.java index 33bbfe8d546aa4..0ff6099163c69a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/InsertIntoTableCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/InsertIntoTableCommand.java @@ -160,6 +160,14 @@ public void run(ConnectContext ctx, StmtExecutor executor) throws Exception { @Override public Plan getExplainPlan(ConnectContext ctx) { + if (!ctx.getSessionVariable().isEnableNereidsDML()) { + try { + ctx.getSessionVariable().enableFallbackToOriginalPlannerOnce(); + } catch (Exception e) { + throw new AnalysisException("failed to set fallback to original planner to true", e); + } + throw new AnalysisException("Nereids DML is disabled, will try to fall back to the original planner"); + } return this.logicalQuery; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/UpdateCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/UpdateCommand.java index 6236ba019b5f9b..92f3fb21ee5414 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/UpdateCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/UpdateCommand.java @@ -148,6 +148,14 @@ private void checkTable(ConnectContext ctx) throws AnalysisException { @Override public Plan getExplainPlan(ConnectContext ctx) throws AnalysisException { + if (!ctx.getSessionVariable().isEnableNereidsDML()) { + try { + ctx.getSessionVariable().enableFallbackToOriginalPlannerOnce(); + } catch (Exception e) { + throw new AnalysisException("failed to set fallback to original planner to true", e); + } + throw new AnalysisException("Nereids DML is disabled, will try to fall back to the original planner"); + } return completeQueryPlan(ctx, logicalQuery); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/ReadLockTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/ReadLockTest.java index 58212c2d3ba5d2..c24a846a1bb245 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/ReadLockTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/ReadLockTest.java @@ -108,20 +108,26 @@ public void testScalarSubQuery() { } @Test - public void testInserInto() { + public void testInsertInto() { String sql = "INSERT INTO supplier(s_suppkey) SELECT lo_orderkey FROM lineorder"; StatementContext statementContext = MemoTestUtils.createStatementContext(connectContext, sql); - InsertIntoTableCommand insertIntoTableCommand = (InsertIntoTableCommand) parser.parseSingle(sql); - NereidsPlanner planner = new NereidsPlanner(statementContext); - planner.plan( - (LogicalPlan) insertIntoTableCommand.getExplainPlan(connectContext), - PhysicalProperties.ANY - ); - CascadesContext cascadesContext = planner.getCascadesContext(); - List f = cascadesContext.getTables(); - Assertions.assertEquals(2, f.size()); - Set tableNames = f.stream().map(TableIf::getName).collect(Collectors.toSet()); - Assertions.assertTrue(tableNames.contains("supplier")); - Assertions.assertTrue(tableNames.contains("lineorder")); + boolean originalDML = connectContext.getSessionVariable().enableNereidsDML; + connectContext.getSessionVariable().enableNereidsDML = true; + try { + InsertIntoTableCommand insertIntoTableCommand = (InsertIntoTableCommand) parser.parseSingle(sql); + NereidsPlanner planner = new NereidsPlanner(statementContext); + planner.plan( + (LogicalPlan) insertIntoTableCommand.getExplainPlan(connectContext), + PhysicalProperties.ANY + ); + CascadesContext cascadesContext = planner.getCascadesContext(); + List f = cascadesContext.getTables(); + Assertions.assertEquals(2, f.size()); + Set tableNames = f.stream().map(TableIf::getName).collect(Collectors.toSet()); + Assertions.assertTrue(tableNames.contains("supplier")); + Assertions.assertTrue(tableNames.contains("lineorder")); + } finally { + connectContext.getSessionVariable().enableNereidsDML = originalDML; + } } } From 0c71825172cb2d2b661efb1d1fefe5f50785662e Mon Sep 17 00:00:00 2001 From: Tiewei Fang <43782773+BePPPower@users.noreply.github.com> Date: Fri, 22 Dec 2023 11:10:47 +0800 Subject: [PATCH 5/6] [Fix](Export) Fix BE core when the `columns` attribute of `export` parquet is specified as an asterisk (#28627) --- .../org/apache/doris/analysis/ExportStmt.java | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java index 38f02c49cc41e1..6eaa45cdfddba7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java @@ -39,6 +39,7 @@ import com.google.common.base.Joiner; import com.google.common.base.Preconditions; +import com.google.common.base.Splitter; import com.google.common.base.Strings; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; @@ -49,6 +50,7 @@ import java.util.Map; import java.util.Optional; import java.util.UUID; +import java.util.stream.Collectors; // EXPORT statement, export data to dirs by broker. // @@ -64,7 +66,6 @@ public class ExportStmt extends StatementBase { private static final String DEFAULT_COLUMN_SEPARATOR = "\t"; private static final String DEFAULT_LINE_DELIMITER = "\n"; - private static final String DEFAULT_COLUMNS = ""; private static final String DEFAULT_PARALLELISM = "1"; private static final Integer DEFAULT_TIMEOUT = 7200; @@ -121,7 +122,6 @@ public ExportStmt(TableRef tableRef, Expr whereExpr, String path, this.columnSeparator = DEFAULT_COLUMN_SEPARATOR; this.lineDelimiter = DEFAULT_LINE_DELIMITER; this.timeout = DEFAULT_TIMEOUT; - this.columns = DEFAULT_COLUMNS; // The ExportStmt may be created in replay thread, there is no ConnectionContext // in replay thread, so we need to clone session variable from default session variable. @@ -352,7 +352,14 @@ private void checkProperties(Map properties) throws UserExceptio properties, ExportStmt.DEFAULT_COLUMN_SEPARATOR)); this.lineDelimiter = Separator.convertSeparator(PropertyAnalyzer.analyzeLineDelimiter( properties, ExportStmt.DEFAULT_LINE_DELIMITER)); - this.columns = properties.getOrDefault(LoadStmt.KEY_IN_PARAM_COLUMNS, DEFAULT_COLUMNS); + + // null means not specified + // "" means user specified zero columns + this.columns = properties.getOrDefault(LoadStmt.KEY_IN_PARAM_COLUMNS, null); + // check columns are exits + if (this.columns != null) { + checkColumns(); + } // format this.format = properties.getOrDefault(LoadStmt.KEY_IN_PARAM_FORMAT_TYPE, "csv").toLowerCase(); @@ -384,6 +391,24 @@ private void checkProperties(Map properties) throws UserExceptio label = properties.get(LABEL); } + private void checkColumns() throws DdlException { + if (this.columns.isEmpty()) { + throw new DdlException("columns can not be empty"); + } + Database db = Env.getCurrentInternalCatalog().getDbOrDdlException(this.tblName.getDb()); + Table table = db.getTableOrDdlException(this.tblName.getTbl()); + List tableColumns = table.getBaseSchema().stream().map(column -> column.getName()) + .collect(Collectors.toList()); + Splitter split = Splitter.on(',').trimResults().omitEmptyStrings(); + + List columnsSpecified = split.splitToList(this.columns.toLowerCase()); + for (String columnName : columnsSpecified) { + if (!tableColumns.contains(columnName)) { + throw new DdlException("unknown column [" + columnName + "] in table [" + this.tblName.getTbl() + "]"); + } + } + } + @Override public String toSql() { StringBuilder sb = new StringBuilder(); From 69357f3592101795ba6214555e08bd77bdd750a0 Mon Sep 17 00:00:00 2001 From: zhiqiang Date: Fri, 22 Dec 2023 11:20:15 +0800 Subject: [PATCH 6/6] [test](regression-test) order by decs should only make effect on its nearest column #28728 (#28730) --- .../data/correctness_p0/test_order_by.out | 22 +++++++ .../correctness_p0/test_order_by.groovy | 63 +++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 regression-test/data/correctness_p0/test_order_by.out create mode 100644 regression-test/suites/correctness_p0/test_order_by.groovy diff --git a/regression-test/data/correctness_p0/test_order_by.out b/regression-test/data/correctness_p0/test_order_by.out new file mode 100644 index 00000000000000..23d447cd520e27 --- /dev/null +++ b/regression-test/data/correctness_p0/test_order_by.out @@ -0,0 +1,22 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +2023-12-19T00:01:12 2023-12-19 00:06:08.618 2023-12-18T23:56:12 +2023-12-19T00:01:12 2023-12-19 00:05:58.513 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.799 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.797 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.796 2023-12-18T23:56:12 + +-- !select -- +2023-12-18T23:56:12 2023-12-19 00:01:08.799 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.797 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.796 2023-12-18T23:56:12 +2023-12-19T00:01:12 2023-12-19 00:06:08.618 2023-12-18T23:56:12 +2023-12-19T00:01:12 2023-12-19 00:05:58.513 2023-12-18T23:56:12 + +-- !select -- +2023-12-19T00:01:12 2023-12-19 00:06:08.618 2023-12-18T23:56:12 +2023-12-19T00:01:12 2023-12-19 00:05:58.513 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.799 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.797 2023-12-18T23:56:12 +2023-12-18T23:56:12 2023-12-19 00:01:08.796 2023-12-18T23:56:12 + diff --git a/regression-test/suites/correctness_p0/test_order_by.groovy b/regression-test/suites/correctness_p0/test_order_by.groovy new file mode 100644 index 00000000000000..3bcb253f6d89b8 --- /dev/null +++ b/regression-test/suites/correctness_p0/test_order_by.groovy @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_order_by") { + sql """ + drop table if exists test_order_by; + """ + + sql """ + create table if not exists test_order_by( + create_time datetime null default current_timestamp, + run_time varchar(200) null comment '时间戳', + create_time2 datetime null + ) + duplicate key(create_time,run_time) + distributed by hash(create_time) buckets 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + insert into test_order_by values ('2023-12-18 23:56:12','2023-12-19 00:01:08.799','2023-12-18 23:56:12'); + """ + sql """ + insert into test_order_by values ('2023-12-18 23:56:12','2023-12-19 00:01:08.797','2023-12-18 23:56:12'); + """ + sql """ + insert into test_order_by values ('2023-12-18 23:56:12','2023-12-19 00:01:08.796','2023-12-18 23:56:12'); + """ + sql """ + insert into test_order_by values ('2023-12-19 00:01:12','2023-12-19 00:06:08.618','2023-12-18 23:56:12'); + """ + sql """ + insert into test_order_by values ('2023-12-19 00:01:12','2023-12-19 00:05:58.513','2023-12-18 23:56:12'); + """ + + qt_select """ + select * from test_order_by order by create_time desc; + """ + + qt_select """ + select * from test_order_by order by create_time, run_time desc; + """ + + qt_select """ + select * from test_order_by order by create_time desc, run_time desc; + """ +} \ No newline at end of file