Skip to content

Commit

Permalink
[Feature](inverted index) add lowercase option for inverted index ana…
Browse files Browse the repository at this point in the history
…lyzer (apache#28704)
  • Loading branch information
airborne12 committed Dec 25, 2023
1 parent 4534b8f commit 323a0ee
Show file tree
Hide file tree
Showing 10 changed files with 372 additions and 6 deletions.
2 changes: 1 addition & 1 deletion be/src/clucene
Submodule clucene updated 32 files
+2 −2 CMakeLists.txt
+3 −2 src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+13 −0 src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+1 −0 src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+7 −0 src/core/CLucene/analysis/AnalysisHeader.h
+11 −1 src/core/CLucene/analysis/Analyzers.cpp
+8 −5 src/core/CLucene/analysis/Analyzers.h
+3 −2 src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+10 −3 src/core/CLucene/analysis/standard95/StandardTokenizer.h
+6 −0 src/core/CLucene/index/DirectoryIndexReader.cpp
+2 −0 src/core/CLucene/index/DirectoryIndexReader.h
+6 −0 src/core/CLucene/index/IndexReader.h
+34 −47 src/core/CLucene/index/IndexWriter.cpp
+4 −7 src/core/CLucene/index/IndexWriter.h
+7 −0 src/core/CLucene/index/MultiReader.cpp
+1 −0 src/core/CLucene/index/MultiReader.h
+5 −0 src/core/CLucene/index/MultiSegmentReader.cpp
+19 −18 src/core/CLucene/index/SDocumentWriter.cpp
+4 −0 src/core/CLucene/index/SDocumentWriter.h
+59 −26 src/core/CLucene/index/SegmentTermEnum.cpp
+16 −11 src/core/CLucene/index/TermInfosReader.cpp
+24 −7 src/core/CLucene/index/TermInfosWriter.cpp
+1 −0 src/core/CLucene/index/_DocumentsWriter.h
+1 −0 src/core/CLucene/index/_MultiSegmentReader.h
+10 −5 src/core/CLucene/index/_SegmentTermEnum.h
+6 −4 src/core/CLucene/index/_TermInfosWriter.h
+2 −2 src/core/CLucene/search/IndexSearcher.cpp
+1 −1 src/core/CLucene/search/IndexSearcher.h
+3 −0 src/core/CLucene/store/IndexInput.h
+1 −1 src/core/CLucene/util/PFORUtil.cpp
+63 −0 src/core/CLucene/util/SSEUtil.h
+39 −0 src/core/CLucene/util/stringUtil.h
11 changes: 9 additions & 2 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_
default:
return INVERTED_INDEX_PARSER_UNKNOWN;
}

return INVERTED_INDEX_PARSER_UNKNOWN;
}

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str) {
Expand Down Expand Up @@ -119,4 +117,13 @@ CharFilterMap get_parser_char_filter_map_from_properties(
return char_filter_map;
}

// Reads the lowercase option from an inverted index's property map.
//
// @param properties  index properties, keyed by property name.
// @return the value stored under INVERTED_INDEX_PARSER_LOWERCASE_KEY, returned
//         verbatim, or an empty string when the key is not present.
std::string get_parser_lowercase_from_properties(
        const std::map<std::string, std::string>& properties) {
    // One lookup only: reuse the iterator rather than calling find() and then at().
    const auto it = properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
    if (it == properties.end()) {
        return "";
    }
    return it->second;
}

} // namespace doris
4 changes: 4 additions & 0 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";

// Index property key for the lowercase option (whether tokens are converted to lowercase).
const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
Expand All @@ -82,4 +84,6 @@ std::string get_parser_phrase_support_string_from_properties(
CharFilterMap get_parser_char_filter_map_from_properties(
const std::map<std::string, std::string>& properties);

// Returns the value stored under INVERTED_INDEX_PARSER_LOWERCASE_KEY in
// `properties`, or an empty string when that key is not present.
std::string get_parser_lowercase_from_properties(
const std::map<std::string, std::string>& properties);
} // namespace doris
10 changes: 7 additions & 3 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,7 @@ void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r

while (token_stream->next(&token)) {
if (token.termLength<char>() != 0) {
analyse_result.emplace_back(
std::string(token.termBuffer<char>(), token.termLength<char>()));
analyse_result.emplace_back(token.termBuffer<char>(), token.termLength<char>());
}
}

Expand Down Expand Up @@ -266,12 +265,17 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
inverted_index_ctx->char_filter_map =
get_parser_char_filter_map_from_properties(_index_meta.properties());
auto analyzer = create_analyzer(inverted_index_ctx.get());
auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties());
if (lowercase == "true") {
analyzer->set_lowercase(true);
} else if (lowercase == "false") {
analyzer->set_lowercase(false);
}
auto reader = create_reader(inverted_index_ctx.get(), search_str);
inverted_index_ctx->analyzer = analyzer.get();
get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name,
query_type);
}

if (analyse_result.empty()) {
auto msg = fmt::format(
"token parser result is empty for query, "
Expand Down
6 changes: 6 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
_analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties());
if (lowercase == "true") {
_analyzer->set_lowercase(true);
} else if (lowercase == "false") {
_analyzer->set_lowercase(false);
}
_index_writer = std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
create, true);
_index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS);
Expand Down
3 changes: 3 additions & 0 deletions docs/en/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ The features for inverted index is as follows:
- char_replace: replace each char in the pattern with a char in the replacement
- char_filter_pattern: character array to be replaced
- char_filter_replacement: replaced character array, can be left unset, defaults to a space character
- lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching.
- true: Convert to lowercase
- false: Do not convert to lowercase
- COMMENT is optional

```sql
Expand Down
3 changes: 3 additions & 0 deletions docs/zh-CN/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下:
- char_replace 将pattern中每个char替换为一个replacement中的char
- char_filter_pattern:需要被替换掉的字符数组
- char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符
- lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写
- true: 转换小写
- false: 不转换小写
- COMMENT 是可选的,用于指定注释

```sql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";

public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case";

public static String getInvertedIndexParser(Map<String, String> properties) {
String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
Expand Down Expand Up @@ -98,6 +100,13 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
if (parser == null && !properties.isEmpty()) {
throw new AnalysisException("invalid index properties, please check the properties");
}
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE);
if (lowerCase != null) {
if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) {
throw new AnalysisException("invalid index properties, lowercase must be true or false");
}
}

}

// default is "none" if not set
Expand Down
112 changes: 112 additions & 0 deletions regression-test/data/inverted_index_p0/test_lowercase.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello world
2 HELLO WORLD
3 Hello World

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学
2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello world

-- !sql --
2 HELLO WORLD

-- !sql --
3 Hello World

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

-- !sql --
1 hello 我来到北京清华大学

-- !sql --
2 HELLO 我爱你中国

-- !sql --
3 Hello 人民可以得到更多实惠

Loading

0 comments on commit 323a0ee

Please sign in to comment.