Skip to content

Commit

Permalink
[Feature](inverted index) add lowercase option for inverted index ana…
Browse files Browse the repository at this point in the history
…lyzer (apache#28704)
  • Loading branch information
airborne12 authored and stephen committed Dec 28, 2023
1 parent e8955b4 commit 4a31812
Show file tree
Hide file tree
Showing 10 changed files with 372 additions and 7 deletions.
11 changes: 9 additions & 2 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_
default:
return INVERTED_INDEX_PARSER_UNKNOWN;
}

return INVERTED_INDEX_PARSER_UNKNOWN;
}

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str) {
Expand Down Expand Up @@ -128,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties(
}
}

// Returns the value stored under INVERTED_INDEX_PARSER_LOWERCASE_KEY in
// `properties`, or an empty string when the key is absent. The value is
// returned verbatim; interpreting it is left to the caller.
std::string get_parser_lowercase_from_properties(
        const std::map<std::string, std::string>& properties) {
    // Reuse the iterator from find() so the map is searched only once
    // (the previous find() + at() pair performed two lookups).
    const auto it = properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
    if (it != properties.end()) {
        return it->second;
    }
    return "";
}

} // namespace doris
5 changes: 4 additions & 1 deletion be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_r
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

// index property key that controls whether the analyzer lowercases tokens
const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
Expand All @@ -88,5 +90,6 @@ CharFilterMap get_parser_char_filter_map_from_properties(
// get parser ignore_above value from properties
std::string get_parser_ignore_above_value_from_properties(
const std::map<std::string, std::string>& properties);

// get parser lower_case value from properties; returns an empty string when the key is not set
std::string get_parser_lowercase_from_properties(
const std::map<std::string, std::string>& properties);
} // namespace doris
10 changes: 7 additions & 3 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,7 @@ void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r

while (token_stream->next(&token)) {
if (token.termLength<char>() != 0) {
analyse_result.emplace_back(
std::string(token.termBuffer<char>(), token.termLength<char>()));
analyse_result.emplace_back(token.termBuffer<char>(), token.termLength<char>());
}
}

Expand Down Expand Up @@ -256,12 +255,17 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
inverted_index_ctx->char_filter_map =
get_parser_char_filter_map_from_properties(_index_meta.properties());
auto analyzer = create_analyzer(inverted_index_ctx.get());
auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties());
if (lowercase == "true") {
analyzer->set_lowercase(true);
} else if (lowercase == "false") {
analyzer->set_lowercase(false);
}
auto reader = create_reader(inverted_index_ctx.get(), search_str);
inverted_index_ctx->analyzer = analyzer.get();
get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name,
query_type);
}

if (analyse_result.empty()) {
auto msg = fmt::format(
"token parser result is empty for query, "
Expand Down
6 changes: 6 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
_analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties());
if (lowercase == "true") {
_analyzer->set_lowercase(true);
} else if (lowercase == "false") {
_analyzer->set_lowercase(false);
}
_index_writer = std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
create, true);
_index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS);
Expand Down
3 changes: 3 additions & 0 deletions docs/en/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ The features of the inverted index are as follows:
- ignore_above: Controls whether strings are indexed.
- Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed.
- default value is 256 bytes.
- lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching.
- true: Convert to lowercase
- false: Do not convert to lowercase
- COMMENT is optional

```sql
Expand Down
3 changes: 3 additions & 0 deletions docs/zh-CN/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ Doris倒排索引的功能简要介绍如下:
- ignore_above:控制字符串是否建索引。
- 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。
- 默认为 256 字节
- lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写
- true: 转换小写
- false: 不转换小写
- COMMENT 是可选的,用于指定注释

```sql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above";

public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case";

public static String getInvertedIndexParser(Map<String, String> properties) {
String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
Expand Down Expand Up @@ -111,6 +113,13 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
throw new AnalysisException("invalid index properties, ignore_above must be integer");
}
}
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE);
if (lowerCase != null) {
if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) {
throw new AnalysisException("invalid index properties, lowercase must be true or false");
}
}

}

// default is "none" if not set
Expand Down
112 changes: 112 additions & 0 deletions regression-test/data/inverted_index_p0/test_lowercase.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello world

-- !sql --
2 HELLO WORLD

-- !sql --
3 Hello World

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

Loading

0 comments on commit 4a31812

Please sign in to comment.