From c314c82fdf311a4d7d31eb6616d5f0aa14d081e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E5=87=AF?= Date: Thu, 12 Sep 2024 20:49:25 +0800 Subject: [PATCH] fix comment --- .../inverted_index/analyzer/analyzer.cpp | 12 ++--- .../inverted_index/analyzer/analyzer.h | 10 ++-- .../segment_v2/inverted_index_reader.cpp | 4 +- .../segment_v2/inverted_index_writer.cpp | 43 ++++++++-------- be/src/vec/functions/function_tokenize.cpp | 8 +-- be/src/vec/functions/match.cpp | 50 +++++++++---------- be/src/vec/functions/match.h | 7 ++- 7 files changed, 64 insertions(+), 70 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp index fab0d0659d6d9d..8ad1abb322f01f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp @@ -86,13 +86,10 @@ std::unique_ptr InvertedIndexAnalyzer::create_analyz return analyzer; } -void InvertedIndexAnalyzer::get_analyse_result(std::vector& analyse_result, - lucene::util::Reader* reader, - lucene::analysis::Analyzer* analyzer, - const std::string& field_name, - InvertedIndexQueryType query_type, - bool drop_duplicates) { - analyse_result.clear(); +std::vector InvertedIndexAnalyzer::get_analyse_result( + lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer, + const std::string& field_name, InvertedIndexQueryType query_type, bool drop_duplicates) { + std::vector analyse_result; std::wstring field_ws = StringUtil::string_to_wstring(field_name); std::unique_ptr token_stream( @@ -115,6 +112,7 @@ void InvertedIndexAnalyzer::get_analyse_result(std::vector& analyse std::set unrepeated_result(analyse_result.begin(), analyse_result.end()); analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end()); } + return analyse_result; } } // namespace doris::segment_v2::inverted_index diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h index dea077ed337a1b..ad5d71a536420d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h @@ -39,10 +39,10 @@ class InvertedIndexAnalyzer { static std::unique_ptr create_analyzer( const InvertedIndexCtx* inverted_index_ctx); - static void get_analyse_result(std::vector& analyse_result, - lucene::util::Reader* reader, - lucene::analysis::Analyzer* analyzer, - const std::string& field_name, InvertedIndexQueryType query_type, - bool drop_duplicates = true); + static std::vector get_analyse_result(lucene::util::Reader* reader, + lucene::analysis::Analyzer* analyzer, + const std::string& field_name, + InvertedIndexQueryType query_type, + bool drop_duplicates = true); }; } // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 972fdcee1048d1..7b8504322d2687 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -282,8 +282,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run auto reader = inverted_index::InvertedIndexAnalyzer::create_reader( inverted_index_ctx->char_filter_map); reader->init(search_str.data(), search_str.size(), true); - inverted_index::InvertedIndexAnalyzer::get_analyse_result( - query_info.terms, reader.get(), analyzer.get(), column_name, query_type); + query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + reader.get(), analyzer.get(), column_name, query_type); } if (query_info.terms.empty()) { auto msg = fmt::format( diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 57aa5f10927a4d..8729bd0c590276 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -146,14 +146,13 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return open_index_directory(); } - Status create_char_string_reader(std::unique_ptr& string_reader, - CharFilterMap& char_filter_map) { + Result> create_char_string_reader( + CharFilterMap& char_filter_map) { try { - string_reader = inverted_index::InvertedIndexAnalyzer::create_reader(char_filter_map); - return Status::OK(); + return inverted_index::InvertedIndexAnalyzer::create_reader(char_filter_map); } catch (CLuceneError& e) { - return Status::Error( - "inverted index create string reader failed: {}", e.what()); + return ResultError(Status::Error( + "inverted index create string reader failed: {}", e.what())); } } @@ -162,10 +161,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } - Status create_index_writer(std::unique_ptr& index_writer) { + std::unique_ptr create_index_writer() { bool create_index = true; bool close_dir_on_shutdown = true; - index_writer = std::make_unique( + auto index_writer = std::make_unique( _dir, _analyzer.get(), create_index, close_dir_on_shutdown); index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size); index_writer->setMaxBufferedDocs(config::inverted_index_max_buffered_docs); @@ -173,7 +172,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { index_writer->setMergeFactor(MERGE_FACTOR); index_writer->setUseCompoundFile(false); - return Status::OK(); + return index_writer; } Status create_field(lucene::document::Field** field) { @@ -189,15 +188,13 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } - Status create_analyzer(std::unique_ptr& analyzer, - std::shared_ptr& inverted_index_ctx) { + Result> create_analyzer( + std::shared_ptr& inverted_index_ctx) { try { - analyzer = inverted_index::InvertedIndexAnalyzer::create_analyzer( - inverted_index_ctx.get()); - return Status::OK(); + return inverted_index::InvertedIndexAnalyzer::create_analyzer(inverted_index_ctx.get()); } catch (CLuceneError& e) { - return Status::Error( - "inverted index create analyzer failed: {}", e.what()); + return ResultError(Status::Error( + "inverted index create analyzer failed: {}", e.what())); } } @@ -210,10 +207,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { get_parser_lowercase_from_properties(_index_meta->properties()), get_parser_stopwords_from_properties(_index_meta->properties())); RETURN_IF_ERROR(open_index_directory()); - RETURN_IF_ERROR(create_char_string_reader(_char_string_reader, - _inverted_index_ctx->char_filter_map)); - RETURN_IF_ERROR(create_analyzer(_analyzer, _inverted_index_ctx)); - RETURN_IF_ERROR(create_index_writer(_index_writer)); + _char_string_reader = + DORIS_TRY(create_char_string_reader(_inverted_index_ctx->char_filter_map)); + _analyzer = DORIS_TRY(create_analyzer(_inverted_index_ctx)); + _index_writer = create_index_writer(); _doc = std::make_unique(); if (_single_field) { RETURN_IF_ERROR(create_field(&_field)); @@ -372,9 +369,9 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { // stream can not reuse for different field bool own_token_stream = true; bool own_reader = true; - std::unique_ptr char_string_reader = nullptr; - RETURN_IF_ERROR(create_char_string_reader( - char_string_reader, _inverted_index_ctx->char_filter_map)); + std::unique_ptr char_string_reader = + DORIS_TRY(create_char_string_reader( + _inverted_index_ctx->char_filter_map)); char_string_reader->init(v->get_data(), v->get_size(), false); _analyzer->set_ownReader(own_reader); ts = _analyzer->tokenStream(new_field->name(), diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index d5301856a0a106..be0eb5dddc960d 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -84,10 +84,10 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, inverted_index_ctx.char_filter_map); reader->init(tokenize_str.data, tokenize_str.size, true); - std::vector query_tokens; - doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( - query_tokens, reader.get(), inverted_index_ctx.analyzer, "tokenize", - doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); + std::vector query_tokens = + doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( + reader.get(), inverted_index_ctx.analyzer, "tokenize", + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); for (auto token : query_tokens) { const size_t old_size = column_string_chars.size(); const size_t split_part_size = token.length(); diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index d641878be771e5..e3909d766f2587 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -175,22 +175,22 @@ inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_ty return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY; } -void FunctionMatchBase::analyse_query_str_token(std::vector* query_tokens, - InvertedIndexCtx* inverted_index_ctx, - const std::string& match_query_str, - const std::string& column_name) const { +std::vector FunctionMatchBase::analyse_query_str_token( + InvertedIndexCtx* inverted_index_ctx, const std::string& match_query_str, + const std::string& column_name) const { VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: " << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type); + std::vector query_tokens; if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) { - query_tokens->emplace_back(match_query_str); - return; + query_tokens.emplace_back(match_query_str); + return query_tokens; } auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader( inverted_index_ctx->char_filter_map); reader->init(match_query_str.data(), match_query_str.size(), true); - doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( - *query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, - get_query_type_from_fn_name()); + query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( + reader.get(), inverted_index_ctx->analyzer, column_name, get_query_type_from_fn_name()); + return query_tokens; } inline std::vector FunctionMatchBase::analyse_data_token( @@ -211,11 +211,10 @@ inline std::vector FunctionMatchBase::analyse_data_token( inverted_index_ctx->char_filter_map); reader->init(str_ref.data, str_ref.size, true); - std::vector element_tokens; - - doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( - element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, - query_type, false); + std::vector element_tokens = + doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( + reader.get(), inverted_index_ctx->analyzer, column_name, query_type, + false); data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end()); } } else { @@ -226,9 +225,10 @@ inline std::vector FunctionMatchBase::analyse_data_token( auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader( inverted_index_ctx->char_filter_map); reader->init(str_ref.data, str_ref.size, true); - doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( - data_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, - query_type, false); + data_tokens = + doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result( + reader.get(), inverted_index_ctx->analyzer, column_name, query_type, + false); } } return data_tokens; @@ -256,8 +256,8 @@ Status FunctionMatchAny::execute_match(FunctionContext* context, const std::stri ColumnUInt8::Container& result) const { RETURN_IF_ERROR(check(context, name)); - std::vector query_tokens; - analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name); + std::vector query_tokens = + analyse_query_str_token(inverted_index_ctx, match_query_str, column_name); if (query_tokens.empty()) { VLOG_DEBUG << fmt::format( "token parser result is empty for query, " @@ -294,8 +294,8 @@ Status FunctionMatchAll::execute_match(FunctionContext* context, const std::stri ColumnUInt8::Container& result) const { RETURN_IF_ERROR(check(context, name)); - std::vector query_tokens; - analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name); + std::vector query_tokens = + analyse_query_str_token(inverted_index_ctx, match_query_str, column_name); if (query_tokens.empty()) { VLOG_DEBUG << fmt::format( "token parser result is empty for query, " @@ -338,8 +338,8 @@ Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::s ColumnUInt8::Container& result) const { RETURN_IF_ERROR(check(context, name)); - std::vector query_tokens; - analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name); + std::vector query_tokens = + analyse_query_str_token(inverted_index_ctx, match_query_str, column_name); if (query_tokens.empty()) { VLOG_DEBUG << fmt::format( "token parser result is empty for query, " @@ -397,8 +397,8 @@ Status FunctionMatchPhrasePrefix::execute_match( ColumnUInt8::Container& result) const { RETURN_IF_ERROR(check(context, name)); - std::vector query_tokens; - analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name); + std::vector query_tokens = + analyse_query_str_token(inverted_index_ctx, match_query_str, column_name); if (query_tokens.empty()) { VLOG_DEBUG << fmt::format( "token parser result is empty for query, " diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h index 3026e4a06cf7fd..85298d096b0e68 100644 --- a/be/src/vec/functions/match.h +++ b/be/src/vec/functions/match.h @@ -82,10 +82,9 @@ class FunctionMatchBase : public IFunction { doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name() const; - void analyse_query_str_token(std::vector* query_tokens, - InvertedIndexCtx* inverted_index_ctx, - const std::string& match_query_str, - const std::string& field_name) const; + std::vector analyse_query_str_token(InvertedIndexCtx* inverted_index_ctx, + const std::string& match_query_str, + const std::string& field_name) const; std::vector analyse_data_token(const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,