From 4a31812237351d8fac439371f43c5a9c5d855b2b Mon Sep 17 00:00:00 2001 From: airborne12 Date: Fri, 22 Dec 2023 18:22:44 +0800 Subject: [PATCH] [Feature](inverted index) add lowercase option for inverted index analyzer (#28704) --- be/src/clucene | 2 +- be/src/olap/inverted_index_parser.cpp | 11 +- be/src/olap/inverted_index_parser.h | 5 +- .../segment_v2/inverted_index_reader.cpp | 10 +- .../segment_v2/inverted_index_writer.cpp | 6 + .../docs/data-table/index/inverted-index.md | 3 + .../docs/data-table/index/inverted-index.md | 3 + .../doris/analysis/InvertedIndexUtil.java | 9 + .../data/inverted_index_p0/test_lowercase.out | 112 +++++++++ .../inverted_index_p0/test_lowercase.groovy | 218 ++++++++++++++++++ 10 files changed, 372 insertions(+), 7 deletions(-) create mode 100644 regression-test/data/inverted_index_p0/test_lowercase.out create mode 100644 regression-test/suites/inverted_index_p0/test_lowercase.groovy diff --git a/be/src/clucene b/be/src/clucene index 4bd7d4501739c79..d95d6be91ecd4e4 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 4bd7d4501739c798c98b30d6350b243942d5f9bc +Subproject commit d95d6be91ecd4e471306caa57b580ba548605962 diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 3d498ff53825185..17cddc042f05424 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -37,8 +37,6 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_ default: return INVERTED_INDEX_PARSER_UNKNOWN; } - - return INVERTED_INDEX_PARSER_UNKNOWN; } InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str) { @@ -128,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties( } } +std::string get_parser_lowercase_from_properties( + const std::map& properties) { + if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) { + return 
properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY); + } else { + return ""; + } +} + } // namespace doris diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index ca1efe773af558d..4a84823d14c2d30 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -72,6 +72,8 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_r const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above"; const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256"; +const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case"; + std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str); @@ -88,5 +90,6 @@ CharFilterMap get_parser_char_filter_map_from_properties( // get parser ignore_above value from properties std::string get_parser_ignore_above_value_from_properties( const std::map& properties); - +std::string get_parser_lowercase_from_properties( + const std::map& properties); } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 8a226ac123f6438..6928a976fa8e7bf 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -152,8 +152,7 @@ void InvertedIndexReader::get_analyse_result(std::vector& analyse_r while (token_stream->next(&token)) { if (token.termLength() != 0) { - analyse_result.emplace_back( - std::string(token.termBuffer(), token.termLength())); + analyse_result.emplace_back(token.termBuffer(), token.termLength()); } } @@ -256,12 +255,17 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run inverted_index_ctx->char_filter_map = get_parser_char_filter_map_from_properties(_index_meta.properties()); auto analyzer = 
create_analyzer(inverted_index_ctx.get()); + auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties()); + if (lowercase == "true") { + analyzer->set_lowercase(true); + } else if (lowercase == "false") { + analyzer->set_lowercase(false); + } auto reader = create_reader(inverted_index_ctx.get(), search_str); inverted_index_ctx->analyzer = analyzer.get(); get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name, query_type); } - if (analyse_result.empty()) { auto msg = fmt::format( "token parser result is empty for query, " diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 851826027210fde..aab2a5a73f0736a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -192,6 +192,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer _analyzer = std::make_unique>(); } + auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties()); + if (lowercase == "true") { + _analyzer->set_lowercase(true); + } else if (lowercase == "false") { + _analyzer->set_lowercase(false); + } _index_writer = std::make_unique(_dir.get(), _analyzer.get(), create, true); _index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS); diff --git a/docs/en/docs/data-table/index/inverted-index.md b/docs/en/docs/data-table/index/inverted-index.md index f10b543807c13a0..75a8f6a3b1bc69d 100644 --- a/docs/en/docs/data-table/index/inverted-index.md +++ b/docs/en/docs/data-table/index/inverted-index.md @@ -92,6 +92,9 @@ The features for inverted index is as follows: - ignore_above: Controls whether strings are indexed. - Strings longer than the ignore_above setting will not be indexed. 
For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed. - default value is 256 bytes. + - lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching. + - true: Convert to lowercase + - false: Do not convert to lowercase - COMMENT is optional ```sql diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md index e3cba26ed8f4ab7..2f4c3f85bfee195 100644 --- a/docs/zh-CN/docs/data-table/index/inverted-index.md +++ b/docs/zh-CN/docs/data-table/index/inverted-index.md @@ -90,6 +90,9 @@ Doris倒排索引的功能简要介绍如下: - ignore_above:控制字符串是否建索引。 - 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。 - 默认为 256 字节 + - lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写 + - true: 转换小写 + - false: 不转换小写 - COMMENT 是可选的,用于指定注释 ```sql diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index daeecede096aaa0..2cd62692337ed1b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -45,6 +45,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above"; + public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case"; + public static String getInvertedIndexParser(Map properties) { String parser = properties == null ? 
null : properties.get(INVERTED_INDEX_PARSER_KEY); // default is "none" if not set @@ -111,6 +113,13 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c throw new AnalysisException("invalid index properties, ignore_above must be integer"); } } + String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE); + if (lowerCase != null) { + if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) { + throw new AnalysisException("invalid index properties, lowercase must be true or false"); + } + } + } // default is "none" if not set diff --git a/regression-test/data/inverted_index_p0/test_lowercase.out b/regression-test/data/inverted_index_p0/test_lowercase.out new file mode 100644 index 000000000000000..03c2f57468ffb0b --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_lowercase.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 + +-- !sql -- +2 HELLO 我爱你中国 + +-- !sql -- +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 
Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello world + +-- !sql -- +2 HELLO WORLD + +-- !sql -- +3 Hello World + +-- !sql -- +1 hello 我来到北京清华大学 + +-- !sql -- +2 HELLO 我爱你中国 + +-- !sql -- +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 + +-- !sql -- +2 HELLO 我爱你中国 + +-- !sql -- +3 Hello 人民可以得到更多实惠 + diff --git a/regression-test/suites/inverted_index_p0/test_lowercase.groovy b/regression-test/suites/inverted_index_p0/test_lowercase.groovy new file mode 100644 index 000000000000000..0670e05594af492 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_lowercase.groovy @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +suite("test_lowercase"){ + // prepare test table + def indexTblName = "lowercase_test1" + + sql "DROP TABLE IF EXISTS ${indexTblName}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="english") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName VALUES (1, 'hello world'), (2, 'HELLO WORLD'), (3, 'Hello World');" + qt_sql "SELECT * FROM $indexTblName WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName2 = "lowercase_test2" + + sql "DROP TABLE IF EXISTS ${indexTblName2}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName2}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName2 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');" + qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName3 = "lowercase_test3" + + sql "DROP TABLE IF EXISTS ${indexTblName3}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName3}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + 
); + """ + + sql "INSERT INTO $indexTblName3 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');" + qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName4 = "lowercase_test11" + + sql "DROP TABLE IF EXISTS ${indexTblName4}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName4}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="english","lower_case"="true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName4 VALUES (1, 'hello world'), (2, 'HELLO WORLD'), (3, 'Hello World');" + qt_sql "SELECT * FROM $indexTblName4 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName4 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName4 WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName5 = "lowercase_test12" + + sql "DROP TABLE IF EXISTS ${indexTblName5}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName5}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode","lower_case"="true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName5 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');" + qt_sql "SELECT * FROM $indexTblName5 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName5 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName5 WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName6 = "lowercase_test13" + + sql "DROP TABLE IF EXISTS 
${indexTblName6}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName6}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese","lower_case"="true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName6 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');" + qt_sql "SELECT * FROM $indexTblName6 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName6 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName6 WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName7 = "lowercase_test21" + + sql "DROP TABLE IF EXISTS ${indexTblName7}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName7}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="english","lower_case"="false") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName7 VALUES (1, 'hello world'), (2, 'HELLO WORLD'), (3, 'Hello World');" + qt_sql "SELECT * FROM $indexTblName7 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName7 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName7 WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName8 = "lowercase_test22" + + sql "DROP TABLE IF EXISTS ${indexTblName8}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName8}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode","lower_case"="false") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName8 VALUES 
(1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');" + qt_sql "SELECT * FROM $indexTblName8 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName8 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName8 WHERE c MATCH 'Hello' ORDER BY id"; + + def indexTblName9 = "lowercase_test23" + + sql "DROP TABLE IF EXISTS ${indexTblName9}" + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName9}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese","lower_case"="false") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName9 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');" + qt_sql "SELECT * FROM $indexTblName9 WHERE c MATCH 'hello' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName9 WHERE c MATCH 'HELLO' ORDER BY id"; + qt_sql "SELECT * FROM $indexTblName9 WHERE c MATCH 'Hello' ORDER BY id"; +}