Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature](inverted index) add lowercase option for inverted index analyzer #28704

Merged
merged 4 commits into from
Dec 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_
default:
return INVERTED_INDEX_PARSER_UNKNOWN;
}

return INVERTED_INDEX_PARSER_UNKNOWN;
}

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str) {
Expand Down Expand Up @@ -128,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties(
}
}

// Get the parser lower_case value from the index properties.
//
// Returns the raw string stored under INVERTED_INDEX_PARSER_LOWERCASE_KEY, or an
// empty string when the key is not present. The value is returned verbatim (no
// validation or normalization), so an empty result lets the caller tell
// "not configured" apart from an explicit setting.
std::string get_parser_lowercase_from_properties(
        const std::map<std::string, std::string>& properties) {
    // Single lookup: reuse the iterator instead of calling find() and then at().
    const auto it = properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
    if (it == properties.end()) {
        return "";
    }
    return it->second;
}

} // namespace doris
5 changes: 4 additions & 1 deletion be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_r
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
Expand All @@ -88,5 +90,6 @@ CharFilterMap get_parser_char_filter_map_from_properties(
// get parser ignore_above value from properties
std::string get_parser_ignore_above_value_from_properties(
const std::map<std::string, std::string>& properties);

// get parser lower_case value from properties; returns an empty string when the key is not set
std::string get_parser_lowercase_from_properties(
const std::map<std::string, std::string>& properties);
} // namespace doris
10 changes: 7 additions & 3 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,7 @@ void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r

while (token_stream->next(&token)) {
if (token.termLength<char>() != 0) {
analyse_result.emplace_back(
std::string(token.termBuffer<char>(), token.termLength<char>()));
analyse_result.emplace_back(token.termBuffer<char>(), token.termLength<char>());
}
}

Expand Down Expand Up @@ -256,12 +255,17 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
inverted_index_ctx->char_filter_map =
get_parser_char_filter_map_from_properties(_index_meta.properties());
auto analyzer = create_analyzer(inverted_index_ctx.get());
auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties());
if (lowercase == "true") {
analyzer->set_lowercase(true);
} else if (lowercase == "false") {
analyzer->set_lowercase(false);
}
auto reader = create_reader(inverted_index_ctx.get(), search_str);
inverted_index_ctx->analyzer = analyzer.get();
get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name,
query_type);
}

if (analyse_result.empty()) {
auto msg = fmt::format(
"token parser result is empty for query, "
Expand Down
6 changes: 6 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
_analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties());
if (lowercase == "true") {
_analyzer->set_lowercase(true);
} else if (lowercase == "false") {
_analyzer->set_lowercase(false);
}
airborne12 marked this conversation as resolved.
Show resolved Hide resolved
_index_writer = std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
create, true);
_index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS);
Expand Down
3 changes: 3 additions & 0 deletions docs/en/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ The features for inverted index is as follows:
- ignore_above: Controls whether strings are indexed.
- Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed.
- default value is 256 bytes.
- lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching.
- true: Convert to lowercase
- false: Do not convert to lowercase
- COMMENT is optional

```sql
Expand Down
3 changes: 3 additions & 0 deletions docs/zh-CN/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ Doris倒排索引的功能简要介绍如下:
- ignore_above:控制字符串是否建索引。
- 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。
- 默认为 256 字节
- lower_case:是否将分词进行小写转换,从而在匹配的时候实现忽略大小写
- true:转换小写
- false:不转换小写
- COMMENT 是可选的,用于指定注释

```sql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above";

public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case";

public static String getInvertedIndexParser(Map<String, String> properties) {
String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
Expand Down Expand Up @@ -111,6 +113,13 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
throw new AnalysisException("invalid index properties, ignore_above must be integer");
}
}
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE);
if (lowerCase != null) {
if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) {
throw new AnalysisException("invalid index properties, lowercase must be true or false");
}
}

}

// default is "none" if not set
Expand Down
112 changes: 112 additions & 0 deletions regression-test/data/inverted_index_p0/test_lowercase.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello world

-- !sql --
2 HELLO WORLD

-- !sql --
3 Hello World

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

Loading
Loading