Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[opt](inverted index) the "unicode" tokenizer can be configured to select stop words #33982 #34376

Merged
merged 1 commit into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties(
}
}

// Fetches the stop-words setting from an inverted index property map.
//
// @param properties  index properties keyed by property name
// @return the value stored under INVERTED_INDEX_PARSER_STOPWORDS_KEY, or an
//         empty string when that key is not present
std::string get_parser_stopwords_from_properties(
        const std::map<std::string, std::string>& properties) {
    const auto entry = properties.find(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
    if (entry == properties.end()) {
        return "";
    }
    return entry->second;
}

} // namespace doris
5 changes: 5 additions & 0 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
Expand Down Expand Up @@ -111,4 +113,7 @@ std::string get_parser_lowercase_from_properties(
}
}

// Returns the value stored under INVERTED_INDEX_PARSER_STOPWORDS_KEY in
// `properties`, or an empty string when the key is not present.
std::string get_parser_stopwords_from_properties(
const std::map<std::string, std::string>& properties);

} // namespace doris
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ class CharReplaceCharFilter : public lucene::analysis::CharFilter {
public:
CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern,
const std::string& replacement);
virtual ~CharReplaceCharFilter() = default;
~CharReplaceCharFilter() override = default;

void init(const void* _value, int32_t _length, bool copyData) override;
int32_t read(const void** start, int32_t min, int32_t max) override;
int32_t readCopy(void* start, int32_t off, int32_t len) override;

size_t size() override { return _buf.size(); }

private:
void fill();
void process_pattern(std::string& buf);
Expand Down
30 changes: 24 additions & 6 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,12 +290,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
inverted_index_ctx->char_filter_map =
get_parser_char_filter_map_from_properties(_index_meta.properties());
auto analyzer = create_analyzer(inverted_index_ctx.get());
auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties());
if (lowercase == "true") {
analyzer->set_lowercase(true);
} else if (lowercase == "false") {
analyzer->set_lowercase(false);
}
setup_analyzer_lowercase(analyzer, _index_meta.properties());
setup_analyzer_use_stopwords(analyzer, _index_meta.properties());
auto reader = create_reader(inverted_index_ctx.get(), search_str);
inverted_index_ctx->analyzer = analyzer.get();
get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name,
Expand Down Expand Up @@ -597,6 +593,28 @@ InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}

// Applies the lowercase property found in `properties` to `analyzer`.
//
// The analyzer is only touched when the property equals INVERTED_INDEX_PARSER_TRUE
// or INVERTED_INDEX_PARSER_FALSE; any other value leaves its current setting as is.
void FullTextIndexReader::setup_analyzer_lowercase(
        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
        const std::map<string, string>& properties) {
    const auto lowercase_setting = get_parser_lowercase_from_properties(properties);
    const bool wants_lowercase = (lowercase_setting == INVERTED_INDEX_PARSER_TRUE);
    if (wants_lowercase || lowercase_setting == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(wants_lowercase);
    }
}

// Selects the stop-word set for `analyzer` from the stop-words property in `properties`.
//
// A property value of "none" hands the analyzer a null stop-word set; every other
// value hands it lucene::analysis::standard95::stop_words.
void FullTextIndexReader::setup_analyzer_use_stopwords(
        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
        const std::map<string, string>& properties) {
    const auto stopwords_setting = get_parser_stopwords_from_properties(properties);
    if (stopwords_setting != "none") {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
        return;
    }
    analyzer->set_stopwords(nullptr);
}

Status StringTypeInvertedIndexReader::new_iterator(
OlapReaderStatistics* stats, RuntimeState* runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator) {
Expand Down
11 changes: 11 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ class FullTextIndexReader : public InvertedIndexReader {

InvertedIndexReaderType type() override;

// Applies the lowercase index property from `properties` to `analyzer`.
// Static, so it can be called without a FullTextIndexReader instance.
static void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);

// Chooses the stop-word set for `analyzer` based on the stop-words index property
// in `properties`. Static, so it can be called without a FullTextIndexReader instance.
static void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);

private:
Status normal_index_search(OlapReaderStatistics* stats, InvertedIndexQueryType query_type,
const IndexSearcherPtr& index_searcher,
Expand Down Expand Up @@ -274,6 +280,11 @@ class BkdIndexReader : public InvertedIndexReader {
InvertedIndexReaderType type() override;
Status get_bkd_reader(std::shared_ptr<lucene::util::bkd::bkd_reader>* reader);

// NOTE(review): these two helpers mirror the FullTextIndexReader declarations, but no
// BkdIndexReader:: definitions are visible in this change. Confirm they are defined
// and actually needed here, otherwise any use would fail at link time.
static void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);
static void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);

private:
const TypeInfo* _type_info {};
const KeyCoder* _value_key_coder {};
Expand Down
26 changes: 20 additions & 6 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,12 +210,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
_analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
auto lowercase = get_parser_lowercase_from_properties<true>(_index_meta->properties());
if (lowercase == "true") {
_analyzer->set_lowercase(true);
} else if (lowercase == "false") {
_analyzer->set_lowercase(false);
}
setup_analyzer_lowercase(_analyzer);
setup_analyzer_use_stopwords(_analyzer);
} catch (CLuceneError& e) {
return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
Expand Down Expand Up @@ -248,6 +244,24 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
return Status::OK();
}

// Applies the lowercase property from this writer's index metadata to `analyzer`.
// Only the explicit INVERTED_INDEX_PARSER_TRUE / INVERTED_INDEX_PARSER_FALSE values
// change the analyzer; anything else leaves it untouched.
void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) {
    const auto lowercase_setting =
            get_parser_lowercase_from_properties<true>(_index_meta->properties());
    if (lowercase_setting == INVERTED_INDEX_PARSER_TRUE) {
        analyzer->set_lowercase(true);
        return;
    }
    if (lowercase_setting == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(false);
    }
}

// Selects the stop-word set for `analyzer` from this writer's index metadata.
// "none" hands the analyzer a null stop-word set; any other value hands it
// lucene::analysis::standard95::stop_words.
void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) {
    const auto stopwords_setting =
            get_parser_stopwords_from_properties(_index_meta->properties());
    if (stopwords_setting != "none") {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
        return;
    }
    analyzer->set_stopwords(nullptr);
}

Status add_document() {
try {
_index_writer->addDocument(_doc.get());
Expand Down
4 changes: 4 additions & 0 deletions be/src/vec/functions/function_tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "CLucene/StdHeader.h"
#include "CLucene/config/repl_wchar.h"
#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
Expand Down Expand Up @@ -149,6 +150,9 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
}
doris::segment_v2::FullTextIndexReader::setup_analyzer_lowercase(analyzer, properties);
doris::segment_v2::FullTextIndexReader::setup_analyzer_use_stopwords(analyzer,
properties);

inverted_index_ctx.analyzer = analyzer.get();
_do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";

public static String getInvertedIndexParser(Map<String, String> properties) {
String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
Expand Down Expand Up @@ -136,7 +138,8 @@ public static void checkInvertedIndexProperties(Map<String, String> properties)
INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN,
INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
INVERTED_INDEX_PARSER_LOWERCASE_KEY
INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_STOPWORDS_KEY
));

for (String key : properties.keySet()) {
Expand All @@ -152,6 +155,7 @@ public static void checkInvertedIndexProperties(Map<String, String> properties)
String charFilterPattern = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);

if (parser != null && !parser.matches("none|english|unicode|chinese|standard")) {
throw new AnalysisException("Invalid inverted index 'parser' value: " + parser
Expand Down Expand Up @@ -194,5 +198,10 @@ public static void checkInvertedIndexProperties(Map<String, String> properties)
throw new AnalysisException(
"Invalid inverted index 'lower_case' value: " + lowerCase + ", lower_case must be true or false");
}

if (stopWords != null && !stopWords.matches("none")) {
throw new AnalysisException("Invalid inverted index 'stopWords' value: " + stopWords
+ ", stopWords must be none");
}
}
}
23 changes: 23 additions & 0 deletions regression-test/data/inverted_index_p0/test_stopwords.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --

-- !sql --

-- !sql --

-- !sql --
1 华夏智胜新税股票A 华夏智胜新税股票A
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

-- !sql --
2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get.

6 changes: 6 additions & 0 deletions regression-test/data/inverted_index_p0/test_tokenize.out
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,9 @@
-- !tokenize_sql --
["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]

-- !tokenize_sql --
["华", "夏", "智", "胜", "新", "税", "股", "票"]

-- !tokenize_sql --
["华", "夏", "智", "胜", "新", "税", "股", "票", "a"]

Loading
Loading