Skip to content

Commit

Permalink
fix comment
Browse files Browse the repository at this point in the history
  • Loading branch information
airborne12 committed Sep 12, 2024
1 parent a0b94ba commit c314c82
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,10 @@ std::unique_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyz
return analyzer;
}

void InvertedIndexAnalyzer::get_analyse_result(std::vector<std::string>& analyse_result,
lucene::util::Reader* reader,
lucene::analysis::Analyzer* analyzer,
const std::string& field_name,
InvertedIndexQueryType query_type,
bool drop_duplicates) {
analyse_result.clear();
std::vector<std::string> InvertedIndexAnalyzer::get_analyse_result(
lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer,
const std::string& field_name, InvertedIndexQueryType query_type, bool drop_duplicates) {
std::vector<std::string> analyse_result;

std::wstring field_ws = StringUtil::string_to_wstring(field_name);
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
Expand All @@ -115,6 +112,7 @@ void InvertedIndexAnalyzer::get_analyse_result(std::vector<std::string>& analyse
std::set<std::string> unrepeated_result(analyse_result.begin(), analyse_result.end());
analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end());
}
return analyse_result;
}

} // namespace doris::segment_v2::inverted_index
10 changes: 5 additions & 5 deletions be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ class InvertedIndexAnalyzer {
static std::unique_ptr<lucene::analysis::Analyzer> create_analyzer(
const InvertedIndexCtx* inverted_index_ctx);

// Declaration of the tokenizing helper: runs `reader` through `analyzer` for the field
// `field_name` and produces the resulting terms as strings. `drop_duplicates` defaults to
// true in both forms.
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped; the removed
// declaration comes first and the added declaration follows it.
// NOTE(review): the visible part of the definition passes the tokens through a std::set
// before returning, which would also reorder them; the condition guarding that step is
// not visible here -- confirm against the full definition.
// Removed form: result is written into the caller-supplied `analyse_result` vector.
static void get_analyse_result(std::vector<std::string>& analyse_result,
lucene::util::Reader* reader,
lucene::analysis::Analyzer* analyzer,
const std::string& field_name, InvertedIndexQueryType query_type,
bool drop_duplicates = true);
// Added form: the token vector is returned by value; the out-parameter is gone.
static std::vector<std::string> get_analyse_result(lucene::util::Reader* reader,
lucene::analysis::Analyzer* analyzer,
const std::string& field_name,
InvertedIndexQueryType query_type,
bool drop_duplicates = true);
};
} // namespace doris::segment_v2::inverted_index
4 changes: 2 additions & 2 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
auto reader = inverted_index::InvertedIndexAnalyzer::create_reader(
inverted_index_ctx->char_filter_map);
reader->init(search_str.data(), search_str.size(), true);
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
query_info.terms, reader.get(), analyzer.get(), column_name, query_type);
query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
reader.get(), analyzer.get(), column_name, query_type);
}
if (query_info.terms.empty()) {
auto msg = fmt::format(
Expand Down
43 changes: 20 additions & 23 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,13 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
return open_index_directory();
}

// Creates a lucene::util::Reader by forwarding `char_filter_map` to
// InvertedIndexAnalyzer::create_reader, and maps a thrown CLuceneError to an
// INVERTED_INDEX_CLUCENE_ERROR status.
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped, so every
// changed line is shown twice -- the removed form first, the added form directly after.
// Removed signature: returns Status and hands the reader back through `string_reader`.
Status create_char_string_reader(std::unique_ptr<lucene::util::Reader>& string_reader,
CharFilterMap& char_filter_map) {
// Added signature: the reader (or the error) is carried in the returned Result.
Result<std::unique_ptr<lucene::util::Reader>> create_char_string_reader(
CharFilterMap& char_filter_map) {
try {
// Removed: assign to the out-parameter, then report success.
string_reader = inverted_index::InvertedIndexAnalyzer::create_reader(char_filter_map);
return Status::OK();
// Added: return the freshly created reader directly.
return inverted_index::InvertedIndexAnalyzer::create_reader(char_filter_map);
} catch (CLuceneError& e) {
// Removed: plain Status error.
return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"inverted index create string reader failed: {}", e.what());
// Added: same error code and message, wrapped in ResultError.
return ResultError(Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"inverted index create string reader failed: {}", e.what()));
}
}

Expand All @@ -162,18 +161,18 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
return Status::OK();
}

// Constructs a lucene::index::IndexWriter over `_dir` using `_analyzer`, then applies the
// RAM buffer size and max-buffered-docs values from `config`, the MAX_FIELD_LEN and
// MERGE_FACTOR constants, and passes false to setUseCompoundFile.
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped, so every
// changed line is shown twice -- the removed form first, the added form directly after.
// Removed signature: always returns Status::OK() and fills the `index_writer` out-parameter.
Status create_index_writer(std::unique_ptr<lucene::index::IndexWriter>& index_writer) {
// Added signature: no parameters; the new writer is the return value.
std::unique_ptr<lucene::index::IndexWriter> create_index_writer() {
// Named locals label the two boolean constructor arguments passed below.
bool create_index = true;
bool close_dir_on_shutdown = true;
// Removed: assigns into the out-parameter.
index_writer = std::make_unique<lucene::index::IndexWriter>(
// Added: declares a local that is returned at the end.
auto index_writer = std::make_unique<lucene::index::IndexWriter>(
_dir, _analyzer.get(), create_index, close_dir_on_shutdown);
index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size);
index_writer->setMaxBufferedDocs(config::inverted_index_max_buffered_docs);
index_writer->setMaxFieldLength(MAX_FIELD_LEN);
index_writer->setMergeFactor(MERGE_FACTOR);
index_writer->setUseCompoundFile(false);

// Removed return, then added return.
return Status::OK();
return index_writer;
}

Status create_field(lucene::document::Field** field) {
Expand All @@ -189,15 +188,13 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
return Status::OK();
}

// Creates the Lucene analyzer for this writer by passing the raw InvertedIndexCtx pointer
// to InvertedIndexAnalyzer::create_analyzer, and maps a thrown CLuceneError to an
// INVERTED_INDEX_ANALYZER_ERROR status.
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped, so every
// changed line is shown twice -- the removed form first, the added form directly after.
// Removed signature: returns Status and writes the analyzer through the `analyzer` out-parameter.
Status create_analyzer(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
std::shared_ptr<InvertedIndexCtx>& inverted_index_ctx) {
// Added signature: the analyzer (or the error) is carried in the returned Result.
Result<std::unique_ptr<lucene::analysis::Analyzer>> create_analyzer(
std::shared_ptr<InvertedIndexCtx>& inverted_index_ctx) {
try {
// Removed: assign to the out-parameter, then report success.
analyzer = inverted_index::InvertedIndexAnalyzer::create_analyzer(
inverted_index_ctx.get());
return Status::OK();
// Added: return the created analyzer directly.
return inverted_index::InvertedIndexAnalyzer::create_analyzer(inverted_index_ctx.get());
} catch (CLuceneError& e) {
// Removed: plain Status error.
return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
// Added: same error code and message, wrapped in ResultError.
return ResultError(Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what()));
}
}

Expand All @@ -210,10 +207,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
get_parser_lowercase_from_properties<true>(_index_meta->properties()),
get_parser_stopwords_from_properties(_index_meta->properties()));
RETURN_IF_ERROR(open_index_directory());
RETURN_IF_ERROR(create_char_string_reader(_char_string_reader,
_inverted_index_ctx->char_filter_map));
RETURN_IF_ERROR(create_analyzer(_analyzer, _inverted_index_ctx));
RETURN_IF_ERROR(create_index_writer(_index_writer));
_char_string_reader =
DORIS_TRY(create_char_string_reader(_inverted_index_ctx->char_filter_map));
_analyzer = DORIS_TRY(create_analyzer(_inverted_index_ctx));
_index_writer = create_index_writer();
_doc = std::make_unique<lucene::document::Document>();
if (_single_field) {
RETURN_IF_ERROR(create_field(&_field));
Expand Down Expand Up @@ -372,9 +369,9 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
// stream can not reuse for different field
bool own_token_stream = true;
bool own_reader = true;
std::unique_ptr<lucene::util::Reader> char_string_reader = nullptr;
RETURN_IF_ERROR(create_char_string_reader(
char_string_reader, _inverted_index_ctx->char_filter_map));
std::unique_ptr<lucene::util::Reader> char_string_reader =
DORIS_TRY(create_char_string_reader(
_inverted_index_ctx->char_filter_map));
char_string_reader->init(v->get_data(), v->get_size(), false);
_analyzer->set_ownReader(own_reader);
ts = _analyzer->tokenStream(new_field->name(),
Expand Down
8 changes: 4 additions & 4 deletions be/src/vec/functions/function_tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,10 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string,
inverted_index_ctx.char_filter_map);
reader->init(tokenize_str.data, tokenize_str.size, true);

std::vector<std::string> query_tokens;
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx.analyzer, "tokenize",
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
std::vector<std::string> query_tokens =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
reader.get(), inverted_index_ctx.analyzer, "tokenize",
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
for (auto token : query_tokens) {
const size_t old_size = column_string_chars.size();
const size_t split_part_size = token.length();
Expand Down
50 changes: 25 additions & 25 deletions be/src/vec/functions/match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,22 +175,22 @@ inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_ty
return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY;
}

// Turns the match query string into a list of query tokens.
// - When the context's parser type is PARSER_NONE, the whole `match_query_str` is the
//   single token.
// - Otherwise a reader is created from the context's char-filter map, initialised with the
//   query bytes, and tokenized by InvertedIndexAnalyzer::get_analyse_result using the
//   context's analyzer, `column_name`, and the query type from get_query_type_from_fn_name().
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped, so every
// changed line is shown twice -- the removed form first, the added form directly after.
// Removed signature: void, tokens appended through the `query_tokens` pointer.
void FunctionMatchBase::analyse_query_str_token(std::vector<std::string>* query_tokens,
InvertedIndexCtx* inverted_index_ctx,
const std::string& match_query_str,
const std::string& column_name) const {
// Added signature: tokens are returned by value.
std::vector<std::string> FunctionMatchBase::analyse_query_str_token(
InvertedIndexCtx* inverted_index_ctx, const std::string& match_query_str,
const std::string& column_name) const {
VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
<< inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
// Added: local result vector that replaces the out-parameter.
std::vector<std::string> query_tokens;
if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
// Removed pair (pointer access, bare return), then added pair (local, returned).
query_tokens->emplace_back(match_query_str);
return;
query_tokens.emplace_back(match_query_str);
return query_tokens;
}
auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
inverted_index_ctx->char_filter_map);
reader->init(match_query_str.data(), match_query_str.size(), true);
// Removed: analyzer fills the dereferenced out-parameter.
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
*query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
get_query_type_from_fn_name());
// Added: analyzer result is assigned to the local and returned.
query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
reader.get(), inverted_index_ctx->analyzer, column_name, get_query_type_from_fn_name());
return query_tokens;
}

inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
Expand All @@ -211,11 +211,10 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
inverted_index_ctx->char_filter_map);
reader->init(str_ref.data, str_ref.size, true);

std::vector<std::string> element_tokens;

doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
std::vector<std::string> element_tokens =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
reader.get(), inverted_index_ctx->analyzer, column_name, query_type,
false);
data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end());
}
} else {
Expand All @@ -226,9 +225,10 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
inverted_index_ctx->char_filter_map);
reader->init(str_ref.data, str_ref.size, true);
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
data_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
data_tokens =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
reader.get(), inverted_index_ctx->analyzer, column_name, query_type,
false);
}
}
return data_tokens;
Expand Down Expand Up @@ -256,8 +256,8 @@ Status FunctionMatchAny::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

std::vector<std::string> query_tokens;
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
std::vector<std::string> query_tokens =
analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
Expand Down Expand Up @@ -294,8 +294,8 @@ Status FunctionMatchAll::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

std::vector<std::string> query_tokens;
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
std::vector<std::string> query_tokens =
analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
Expand Down Expand Up @@ -338,8 +338,8 @@ Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::s
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

std::vector<std::string> query_tokens;
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
std::vector<std::string> query_tokens =
analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
Expand Down Expand Up @@ -397,8 +397,8 @@ Status FunctionMatchPhrasePrefix::execute_match(
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

std::vector<std::string> query_tokens;
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
std::vector<std::string> query_tokens =
analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
Expand Down
7 changes: 3 additions & 4 deletions be/src/vec/functions/match.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,9 @@ class FunctionMatchBase : public IFunction {

doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name() const;

// Declaration of the query tokenizer used by the match functions.
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped; the removed
// declaration comes first and the added declaration follows it.
// NOTE(review): the last parameter is named `field_name` here but `column_name` in the
// definition in match.cpp -- harmless to the compiler, but worth aligning.
// Removed form: void, tokens written through the `query_tokens` pointer.
void analyse_query_str_token(std::vector<std::string>* query_tokens,
InvertedIndexCtx* inverted_index_ctx,
const std::string& match_query_str,
const std::string& field_name) const;
// Added form: tokens returned by value.
std::vector<std::string> analyse_query_str_token(InvertedIndexCtx* inverted_index_ctx,
const std::string& match_query_str,
const std::string& field_name) const;

std::vector<std::string> analyse_data_token(const std::string& column_name,
InvertedIndexCtx* inverted_index_ctx,
Expand Down

0 comments on commit c314c82

Please sign in to comment.