Skip to content

Commit

Permalink
[opt](inverted index) the "unicode" tokenizer can be configured to disable stop words apache#33982 (apache#34376)
Browse files Browse the repository at this point in the history
  • Loading branch information
zzzxl1993 authored and weixingyu12 committed May 20, 2024
1 parent 5e698b9 commit 4b06ac2
Show file tree
Hide file tree
Showing 14 changed files with 199 additions and 35 deletions.
9 changes: 9 additions & 0 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties(
}
}

// Returns the value stored under INVERTED_INDEX_PARSER_STOPWORDS_KEY in
// `properties`, or an empty string when the key is not present.
std::string get_parser_stopwords_from_properties(
        const std::map<std::string, std::string>& properties) {
    // One lookup only: reuse the iterator instead of find() followed by at().
    const auto it = properties.find(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
    if (it == properties.end()) {
        return "";
    }
    return it->second;
}

} // namespace doris
5 changes: 5 additions & 0 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

// Index property key that selects stop-word handling for the analyzer.
const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
Expand Down Expand Up @@ -111,4 +113,7 @@ std::string get_parser_lowercase_from_properties(
}
}

// Looks up INVERTED_INDEX_PARSER_STOPWORDS_KEY in `properties`.
// Returns the stored value, or an empty string when the key is not present.
std::string get_parser_stopwords_from_properties(
const std::map<std::string, std::string>& properties);

} // namespace doris
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ class CharReplaceCharFilter : public lucene::analysis::CharFilter {
public:
CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern,
const std::string& replacement);
virtual ~CharReplaceCharFilter() = default;
~CharReplaceCharFilter() override = default;

void init(const void* _value, int32_t _length, bool copyData) override;
int32_t read(const void** start, int32_t min, int32_t max) override;
int32_t readCopy(void* start, int32_t off, int32_t len) override;

size_t size() override { return _buf.size(); }

private:
void fill();
void process_pattern(std::string& buf);
Expand Down
30 changes: 24 additions & 6 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,12 +290,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
inverted_index_ctx->char_filter_map =
get_parser_char_filter_map_from_properties(_index_meta.properties());
auto analyzer = create_analyzer(inverted_index_ctx.get());
auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties());
if (lowercase == "true") {
analyzer->set_lowercase(true);
} else if (lowercase == "false") {
analyzer->set_lowercase(false);
}
setup_analyzer_lowercase(analyzer, _index_meta.properties());
setup_analyzer_use_stopwords(analyzer, _index_meta.properties());
auto reader = create_reader(inverted_index_ctx.get(), search_str);
inverted_index_ctx->analyzer = analyzer.get();
get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name,
Expand Down Expand Up @@ -597,6 +593,28 @@ InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}

// Applies the lowercase setting found in `properties` to `analyzer`.
// set_lowercase() is only called when the setting equals
// INVERTED_INDEX_PARSER_TRUE or INVERTED_INDEX_PARSER_FALSE; for any other
// value the analyzer is left untouched.
void FullTextIndexReader::setup_analyzer_lowercase(
        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
        const std::map<string, string>& properties) {
    const auto lowercase_setting = get_parser_lowercase_from_properties(properties);
    if (lowercase_setting == INVERTED_INDEX_PARSER_TRUE) {
        analyzer->set_lowercase(true);
        return;
    }
    if (lowercase_setting == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(false);
    }
}

// Configures the stop-word set of `analyzer` from `properties`.
// A stopwords value of "none" passes nullptr to set_stopwords(); any other
// value (including a missing property) installs
// lucene::analysis::standard95::stop_words.
void FullTextIndexReader::setup_analyzer_use_stopwords(
        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
        const std::map<string, string>& properties) {
    const auto stopwords_setting = get_parser_stopwords_from_properties(properties);
    if (stopwords_setting != "none") {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
        return;
    }
    analyzer->set_stopwords(nullptr);
}

Status StringTypeInvertedIndexReader::new_iterator(
OlapReaderStatistics* stats, RuntimeState* runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator) {
Expand Down
11 changes: 11 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ class FullTextIndexReader : public InvertedIndexReader {

InvertedIndexReaderType type() override;

// Applies the lowercase setting read from `properties` to `analyzer`.
// The analyzer is only modified when the setting is explicitly true or false.
static void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);

// Configures stop words on `analyzer` from `properties`: a stopwords value of
// "none" passes nullptr to set_stopwords(); otherwise
// lucene::analysis::standard95::stop_words is installed.
static void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);

private:
Status normal_index_search(OlapReaderStatistics* stats, InvertedIndexQueryType query_type,
const IndexSearcherPtr& index_searcher,
Expand Down Expand Up @@ -274,6 +280,11 @@ class BkdIndexReader : public InvertedIndexReader {
InvertedIndexReaderType type() override;
Status get_bkd_reader(std::shared_ptr<lucene::util::bkd::bkd_reader>* reader);

// NOTE(review): these two declarations have the same signatures as the
// FullTextIndexReader helpers, but no BkdIndexReader:: definition is visible
// next to them. Confirm they are defined and actually needed on this class;
// if not, they are dead declarations that will fail at link time when called.
static void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);
static void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);

private:
const TypeInfo* _type_info {};
const KeyCoder* _value_key_coder {};
Expand Down
26 changes: 20 additions & 6 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,12 +210,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
_analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
auto lowercase = get_parser_lowercase_from_properties<true>(_index_meta->properties());
if (lowercase == "true") {
_analyzer->set_lowercase(true);
} else if (lowercase == "false") {
_analyzer->set_lowercase(false);
}
setup_analyzer_lowercase(_analyzer);
setup_analyzer_use_stopwords(_analyzer);
} catch (CLuceneError& e) {
return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
Expand Down Expand Up @@ -248,6 +244,24 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
return Status::OK();
}

// Applies the lowercase setting from this writer's index properties to
// `analyzer`. set_lowercase() is only called when the setting equals
// INVERTED_INDEX_PARSER_TRUE or INVERTED_INDEX_PARSER_FALSE.
void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) {
    const auto lowercase_setting =
            get_parser_lowercase_from_properties<true>(_index_meta->properties());
    if (lowercase_setting == INVERTED_INDEX_PARSER_TRUE) {
        analyzer->set_lowercase(true);
        return;
    }
    if (lowercase_setting == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(false);
    }
}

// Configures the stop-word set of `analyzer` from this writer's index
// properties. A stopwords value of "none" passes nullptr to set_stopwords();
// any other value installs lucene::analysis::standard95::stop_words.
void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) {
    const auto stopwords_setting =
            get_parser_stopwords_from_properties(_index_meta->properties());
    if (stopwords_setting != "none") {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
        return;
    }
    analyzer->set_stopwords(nullptr);
}

Status add_document() {
try {
_index_writer->addDocument(_doc.get());
Expand Down
4 changes: 4 additions & 0 deletions be/src/vec/functions/function_tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "CLucene/StdHeader.h"
#include "CLucene/config/repl_wchar.h"
#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
Expand Down Expand Up @@ -149,6 +150,9 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
}
doris::segment_v2::FullTextIndexReader::setup_analyzer_lowercase(analyzer, properties);
doris::segment_v2::FullTextIndexReader::setup_analyzer_use_stopwords(analyzer,
properties);

inverted_index_ctx.analyzer = analyzer.get();
_do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";

public static String getInvertedIndexParser(Map<String, String> properties) {
String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
Expand Down Expand Up @@ -136,7 +138,8 @@ public static void checkInvertedIndexProperties(Map<String, String> properties)
INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN,
INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
INVERTED_INDEX_PARSER_LOWERCASE_KEY
INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_STOPWORDS_KEY
));

for (String key : properties.keySet()) {
Expand All @@ -152,6 +155,7 @@ public static void checkInvertedIndexProperties(Map<String, String> properties)
String charFilterPattern = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);

if (parser != null && !parser.matches("none|english|unicode|chinese|standard")) {
throw new AnalysisException("Invalid inverted index 'parser' value: " + parser
Expand Down Expand Up @@ -194,5 +198,10 @@ public static void checkInvertedIndexProperties(Map<String, String> properties)
throw new AnalysisException(
"Invalid inverted index 'lower_case' value: " + lowerCase + ", lower_case must be true or false");
}

if (stopWords != null && !stopWords.matches("none")) {
throw new AnalysisException("Invalid inverted index 'stopWords' value: " + stopWords
+ ", stopWords must be none");
}
}
}
23 changes: 23 additions & 0 deletions regression-test/data/inverted_index_p0/test_stopwords.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --

-- !sql --

-- !sql --

-- !sql --
1 华夏智胜新税股票A 华夏智胜新税股票A
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

6 changes: 6 additions & 0 deletions regression-test/data/inverted_index_p0/test_tokenize.out
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,9 @@
-- !tokenize_sql --
["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]

-- !tokenize_sql --
["华", "夏", "智", "胜", "新", "税", "股", "票"]

-- !tokenize_sql --
["华", "夏", "智", "胜", "新", "税", "股", "票", "a"]

Loading

0 comments on commit 4b06ac2

Please sign in to comment.