diff --git a/be/src/clucene b/be/src/clucene
index 6f8a21ffe15bd7..ed92e1813103a5 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 6f8a21ffe15bd78a1cd3e685067ee5c9ed071827
+Subproject commit ed92e1813103a513aa0ee16730b94cc840daec73
diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp
index 5678a217b537f6..85e2f523dde86f 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -37,8 +37,6 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_
     default:
         return INVERTED_INDEX_PARSER_UNKNOWN;
     }
-
-    return INVERTED_INDEX_PARSER_UNKNOWN;
 }
 
 InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str) {
@@ -119,4 +117,18 @@ CharFilterMap get_parser_char_filter_map_from_properties(
     return char_filter_map;
 }
 
+// Returns the value stored under INVERTED_INDEX_PARSER_LOWERCASE_KEY in
+// `properties`, or an empty string when the key is absent. The reader and
+// writer call sites in this change act only on "true" / "false"; any other
+// value leaves the analyzer's own lowercase setting untouched.
+std::string get_parser_lowercase_from_properties(
+        const std::map& properties) {
+    // Single lookup: reuse the iterator instead of find() followed by at().
+    auto it = properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
+    if (it == properties.end()) {
+        return "";
+    }
+    return it->second;
+}
+
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
index 54455bddef8d22..a265c6289a77ee 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -69,6 +69,8 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";
 
+const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";
+
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);
 
 InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
@@ -82,4 +84,6 @@ std::string get_parser_phrase_support_string_from_properties(
CharFilterMap get_parser_char_filter_map_from_properties( const std::map& properties); +std::string get_parser_lowercase_from_properties( + const std::map& properties); } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 292884e631b939..6e1ebb53a9d216 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -162,8 +162,7 @@ void InvertedIndexReader::get_analyse_result(std::vector& analyse_r while (token_stream->next(&token)) { if (token.termLength() != 0) { - analyse_result.emplace_back( - std::string(token.termBuffer(), token.termLength())); + analyse_result.emplace_back(token.termBuffer(), token.termLength()); } } @@ -266,12 +265,17 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run inverted_index_ctx->char_filter_map = get_parser_char_filter_map_from_properties(_index_meta.properties()); auto analyzer = create_analyzer(inverted_index_ctx.get()); + auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties()); + if (lowercase == "true") { + analyzer->set_lowercase(true); + } else if (lowercase == "false") { + analyzer->set_lowercase(false); + } auto reader = create_reader(inverted_index_ctx.get(), search_str); inverted_index_ctx->analyzer = analyzer.get(); get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name, query_type); } - if (analyse_result.empty()) { auto msg = fmt::format( "token parser result is empty for query, " diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 040f9fc4d84f85..44d03723b1535f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -185,6 +185,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { // 
ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer _analyzer = std::make_unique>(); } + auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties()); + if (lowercase == "true") { + _analyzer->set_lowercase(true); + } else if (lowercase == "false") { + _analyzer->set_lowercase(false); + } _index_writer = std::make_unique(_dir.get(), _analyzer.get(), create, true); _index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS); diff --git a/docs/en/docs/data-table/index/inverted-index.md b/docs/en/docs/data-table/index/inverted-index.md index f86d47c8bbe167..789316bfaf3aba 100644 --- a/docs/en/docs/data-table/index/inverted-index.md +++ b/docs/en/docs/data-table/index/inverted-index.md @@ -89,6 +89,9 @@ The features for inverted index is as follows: - char_replace: replace each char in the pattern with a char in the replacement - char_filter_pattern: character array to be replaced - char_filter_replacement: replaced character array, can be left unset, defaults to a space character + - lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching. 
+ - true: Convert to lowercase + - false: Do not convert to lowercase - COMMENT is optional ```sql diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md index ad4c9a011d989e..3dcfd7895c9985 100644 --- a/docs/zh-CN/docs/data-table/index/inverted-index.md +++ b/docs/zh-CN/docs/data-table/index/inverted-index.md @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下: - char_replace 将pattern中每个char替换为一个replacement中的char - char_filter_pattern:需要被替换掉的字符数组 - char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符 + - lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写 + - true: 转换小写 + - false:不转换小写 - COMMENT 是可选的,用于指定注释 ```sql diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 4196f774e2b2d5..9e0ea2060012b7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -43,6 +43,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; + public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case"; + public static String getInvertedIndexParser(Map properties) { String parser = properties == null ? 
null : properties.get(INVERTED_INDEX_PARSER_KEY); // default is "none" if not set @@ -98,6 +100,13 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c if (parser == null && !properties.isEmpty()) { throw new AnalysisException("invalid index properties, please check the properties"); } + String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE); + if (lowerCase != null) { + if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) { + throw new AnalysisException("invalid index properties, lowercase must be true or false"); + } + } + } // default is "none" if not set diff --git a/regression-test/data/inverted_index_p0/test_lowercase.out b/regression-test/data/inverted_index_p0/test_lowercase.out new file mode 100644 index 00000000000000..03c2f57468ffb0 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_lowercase.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 + +-- !sql -- +2 HELLO 我爱你中国 + +-- !sql -- +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello world +2 HELLO WORLD +3 Hello World + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql 
-- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello world + +-- !sql -- +2 HELLO WORLD + +-- !sql -- +3 Hello World + +-- !sql -- +1 hello 我来到北京清华大学 + +-- !sql -- +2 HELLO 我爱你中国 + +-- !sql -- +3 Hello 人民可以得到更多实惠 + +-- !sql -- +1 hello 我来到北京清华大学 + +-- !sql -- +2 HELLO 我爱你中国 + +-- !sql -- +3 Hello 人民可以得到更多实惠 + diff --git a/regression-test/suites/inverted_index_p0/test_lowercase.groovy b/regression-test/suites/inverted_index_p0/test_lowercase.groovy new file mode 100644 index 00000000000000..0670e05594af49 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_lowercase.groovy @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+// Regression test: the inverted-index "lower_case" property with the english, unicode and chinese parsers.
+suite("test_lowercase"){
+    // Case 1 (prepare test table): "parser"="english", "lower_case" not set.
+    def indexTblName = "lowercase_test1"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="english") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Rows differ only in letter case. Recorded output (test_lowercase.out): every query returns all three rows.
+    sql "INSERT INTO $indexTblName VALUES (1, 'hello world'), (2, 'HELLO WORLD'), (3, 'Hello World');"
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 2: "parser"="unicode", "lower_case" not set.
+    def indexTblName2 = "lowercase_test2"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName2}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName2}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: every query returns all three rows.
+    sql "INSERT INTO $indexTblName2 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 3: "parser"="chinese", "lower_case" not set.
+    def indexTblName3 = "lowercase_test3"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName3}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName3}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: each query returns only the row whose "hello" casing matches the query term.
+    sql "INSERT INTO $indexTblName3 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 4: "parser"="english", "lower_case"="true".
+    def indexTblName4 = "lowercase_test11"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName4}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName4}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="english","lower_case"="true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: every query returns all three rows.
+    sql "INSERT INTO $indexTblName4 VALUES (1, 'hello world'), (2, 'HELLO WORLD'), (3, 'Hello World');"
+    qt_sql "SELECT * FROM $indexTblName4 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName4 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName4 WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 5: "parser"="unicode", "lower_case"="true".
+    def indexTblName5 = "lowercase_test12"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName5}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName5}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode","lower_case"="true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: every query returns all three rows.
+    sql "INSERT INTO $indexTblName5 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName5 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName5 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName5 WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 6: "parser"="chinese", "lower_case"="true".
+    def indexTblName6 = "lowercase_test13"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName6}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName6}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese","lower_case"="true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: every query returns all three rows.
+    sql "INSERT INTO $indexTblName6 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName6 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName6 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName6 WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 7: "parser"="english", "lower_case"="false".
+    def indexTblName7 = "lowercase_test21"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName7}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName7}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="english","lower_case"="false") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: each query returns only the row whose casing matches the query term.
+    sql "INSERT INTO $indexTblName7 VALUES (1, 'hello world'), (2, 'HELLO WORLD'), (3, 'Hello World');"
+    qt_sql "SELECT * FROM $indexTblName7 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName7 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName7 WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 8: "parser"="unicode", "lower_case"="false".
+    def indexTblName8 = "lowercase_test22"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName8}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName8}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode","lower_case"="false") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: each query returns only the row whose "hello" casing matches the query term.
+    sql "INSERT INTO $indexTblName8 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName8 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName8 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName8 WHERE c MATCH 'Hello' ORDER BY id";
+    // Case 9: "parser"="chinese", "lower_case"="false".
+    def indexTblName9 = "lowercase_test23"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName9}"
+    sql """
+      CREATE TABLE IF NOT EXISTS ${indexTblName9}(
+        `id`int(11)NULL,
+        `c` text NULL,
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese","lower_case"="false") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+    // Recorded output: each query returns only the row whose "hello" casing matches the query term.
+    sql "INSERT INTO $indexTblName9 VALUES (1, 'hello 我来到北京清华大学'), (2, 'HELLO 我爱你中国'), (3, 'Hello 人民可以得到更多实惠');"
+    qt_sql "SELECT * FROM $indexTblName9 WHERE c MATCH 'hello' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName9 WHERE c MATCH 'HELLO' ORDER BY id";
+    qt_sql "SELECT * FROM $indexTblName9 WHERE c MATCH 'Hello' ORDER BY id";
+}