From 0a1908fda9e0c97ce413329e19dcb929059a2274 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 21 Nov 2023 18:56:59 +0800 Subject: [PATCH] [Improvement](inverted index) enable bkd index reader cache and refactor inverted index searcher cache --- .../segment_v2/inverted_index_cache.cpp | 150 +++++++- .../rowset/segment_v2/inverted_index_cache.h | 44 ++- .../segment_v2/inverted_index_query_type.h | 46 ++- .../segment_v2/inverted_index_reader.cpp | 327 ++++++++---------- .../rowset/segment_v2/inverted_index_reader.h | 39 +-- .../segment_v2/inverted_index_writer.cpp | 15 +- 6 files changed, 382 insertions(+), 239 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp b/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp index f399f752c36155..732ab344647ba6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp @@ -19,6 +19,7 @@ #include #include +#include // IWYU pragma: no_include #include // IWYU pragma: keep #include @@ -26,11 +27,13 @@ // IWYU pragma: no_include #include // IWYU pragma: keep #include +#include #include "common/logging.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/rowset/segment_v2/inverted_index_compound_reader.h" +#include "olap/rowset/segment_v2/inverted_index_desc.h" #include "runtime/exec_env.h" #include "runtime/thread_context.h" #include "util/defer_op.h" @@ -39,19 +42,78 @@ namespace doris { namespace segment_v2 { -IndexSearcherPtr InvertedIndexSearcherCache::build_index_searcher(const io::FileSystemSPtr& fs, - const std::string& index_dir, - const std::string& file_name) { +Status FulltextIndexSearcherBuilder::build(const io::FileSystemSPtr& fs, + const std::string& index_dir, + const std::string& file_name, + OptionalIndexSearcherPtr& output_searcher) { DorisCompoundReader* directory = new DorisCompoundReader(DorisCompoundDirectory::getDirectory(fs, index_dir.c_str()), file_name.c_str(), config::inverted_index_read_buffer_size); auto closeDirectory = true; auto index_searcher = std::make_shared(directory, closeDirectory); + if (!index_searcher) { + _CLDECDELETE(directory) + output_searcher = std::nullopt; + return Status::Error( + "FulltextIndexSearcherBuilder build index_searcher error."); + } // NOTE: need to cl_refcount-- here, so that directory will be deleted when // index_searcher is destroyed _CLDECDELETE(directory) - return index_searcher; + output_searcher = index_searcher; + return Status::OK(); +} + +Status BKDIndexSearcherBuilder::build(const io::FileSystemSPtr& fs, const std::string& index_dir, + const std::string& file_name, + OptionalIndexSearcherPtr& output_searcher) { + try { + auto compound_reader = std::make_unique( + DorisCompoundDirectory::getDirectory(fs, index_dir.c_str()), file_name.c_str(), + config::inverted_index_read_buffer_size); + + if (!compound_reader) { + LOG(ERROR) << "compound reader is null when get directory for:" << index_dir << "/" + << file_name; + output_searcher = std::nullopt; + return Status::Error( + "compound reader is null"); + } + CLuceneError err; + std::unique_ptr data_in; + std::unique_ptr meta_in; + std::unique_ptr index_in; + + if (!compound_reader->openInput( + InvertedIndexDescriptor::get_temporary_bkd_index_data_file_name().c_str(), + data_in, err) || + !compound_reader->openInput( + InvertedIndexDescriptor::get_temporary_bkd_index_meta_file_name().c_str(), + meta_in, err) || + !compound_reader->openInput( + InvertedIndexDescriptor::get_temporary_bkd_index_file_name().c_str(), index_in, + err)) { + // Consider logging the error or handling it more comprehensively + LOG(ERROR) << "open bkd index input error: {}" << err.what(); + output_searcher = std::nullopt; + return Status::Error( + "open bkd index input error"); + } + auto bkd_reader = std::make_shared(data_in.release()); + if (0 == bkd_reader->read_meta(meta_in.get())) { + VLOG_NOTICE << "bkd index file is empty:" << compound_reader->toString(); + output_searcher = std::nullopt; + return Status::EndOfFile("bkd index file is empty"); + } + + bkd_reader->read_index(index_in.get()); + output_searcher = IndexSearcherPtr {bkd_reader}; + return Status::OK(); + } catch (const CLuceneError& e) { + return Status::Error( + "BKDIndexSearcherBuilder build error: {}", e.what()); + } } InvertedIndexSearcherCache* InvertedIndexSearcherCache::create_global_instance( @@ -98,13 +160,18 @@ InvertedIndexSearcherCache::InvertedIndexSearcherCache(size_t capacity, uint32_t } } -Status InvertedIndexSearcherCache::get_index_searcher(const io::FileSystemSPtr& fs, - const std::string& index_dir, - const std::string& file_name, - InvertedIndexCacheHandle* cache_handle, - OlapReaderStatistics* stats, bool use_cache) { +Status InvertedIndexSearcherCache::get_index_searcher( + const io::FileSystemSPtr& fs, const std::string& index_dir, const std::string& file_name, + InvertedIndexCacheHandle* cache_handle, OlapReaderStatistics* stats, + InvertedIndexReaderType reader_type, bool use_cache) { auto file_path = index_dir + "/" + file_name; - + bool exists = false; + RETURN_IF_ERROR(fs->exists(file_path, &exists)); + if (!exists) { + LOG(WARNING) << "inverted index: " << file_path << " not exist."; + return Status::Error( + "inverted index input file {} not found", file_path); + } using namespace std::chrono; auto start_time = steady_clock::now(); Defer cost {[&]() { @@ -119,14 +186,40 @@ Status InvertedIndexSearcherCache::get_index_searcher(const io::FileSystemSPtr& } cache_handle->owned = !use_cache; - IndexSearcherPtr index_searcher = nullptr; + IndexSearcherPtr index_searcher; + std::unique_ptr index_builder = nullptr; auto mem_tracker = std::unique_ptr(new MemTracker("InvertedIndexSearcherCacheWithRead")); #ifndef BE_TEST { SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); SCOPED_CONSUME_MEM_TRACKER(mem_tracker.get()); - index_searcher = build_index_searcher(fs, index_dir, file_name); + switch (reader_type) { + case InvertedIndexReaderType::STRING_TYPE: + case InvertedIndexReaderType::FULLTEXT: { + index_builder = std::make_unique(); + break; + } + case InvertedIndexReaderType::BKD: { + index_builder = std::make_unique(); + break; + } + + default: + LOG(ERROR) << "InvertedIndexReaderType:" << reader_type_to_string(reader_type) + << " is not support for InvertedIndexSearcherCache"; + return Status::Error( + "InvertedIndexSearcherCache do not support reader type."); + } + OptionalIndexSearcherPtr result; + RETURN_IF_ERROR(index_builder->build(fs, index_dir, file_name, result)); + if (!result.has_value()) { + LOG(ERROR) << "InvertedIndexReaderType:" << reader_type_to_string(reader_type) + << " build for InvertedIndexSearcherCache error"; + return Status::Error( + "InvertedIndexSearcherCache build error."); + } + index_searcher = *result; } #endif @@ -144,7 +237,8 @@ Status InvertedIndexSearcherCache::get_index_searcher(const io::FileSystemSPtr& Status InvertedIndexSearcherCache::insert(const io::FileSystemSPtr& fs, const std::string& index_dir, - const std::string& file_name) { + const std::string& file_name, + InvertedIndexReaderType reader_type) { auto file_path = index_dir + "/" + file_name; using namespace std::chrono; @@ -156,13 +250,39 @@ Status InvertedIndexSearcherCache::insert(const io::FileSystemSPtr& fs, InvertedIndexSearcherCache::CacheKey cache_key(file_path); IndexCacheValuePtr cache_value = std::make_unique(); - IndexSearcherPtr index_searcher = nullptr; + IndexSearcherPtr index_searcher; + std::unique_ptr builder = nullptr; auto mem_tracker = std::unique_ptr(new MemTracker("InvertedIndexSearcherCacheWithInsert")); #ifndef BE_TEST { SCOPED_CONSUME_MEM_TRACKER(mem_tracker.get()); - index_searcher = build_index_searcher(fs, index_dir, file_name); + switch (reader_type) { + case InvertedIndexReaderType::STRING_TYPE: + case InvertedIndexReaderType::FULLTEXT: { + builder = std::make_unique(); + break; + } + case InvertedIndexReaderType::BKD: { + builder = std::make_unique(); + break; + } + + default: + LOG(ERROR) << "InvertedIndexReaderType:" << reader_type_to_string(reader_type) + << " is not support for InvertedIndexSearcherCache"; + return Status::Error( + "InvertedIndexSearcherCache do not support reader type."); + } + OptionalIndexSearcherPtr result; + RETURN_IF_ERROR(builder->build(fs, index_dir, file_name, result)); + if (!result.has_value()) { + LOG(ERROR) << "InvertedIndexReaderType:" << reader_type_to_string(reader_type) + << " build for InvertedIndexSearcherCache error"; + return Status::Error( + "InvertedIndexSearcherCache build error."); + } + index_searcher = *result; } #endif diff --git a/be/src/olap/rowset/segment_v2/inverted_index_cache.h b/be/src/olap/rowset/segment_v2/inverted_index_cache.h index 79439ac4621794..a3abf7d11dbc15 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_cache.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_cache.h @@ -37,6 +37,7 @@ #include #include #include +#include #include "common/config.h" #include "common/status.h" @@ -54,16 +55,44 @@ namespace lucene { namespace search { class IndexSearcher; } // namespace search +namespace util { +namespace bkd { +class bkd_reader; +} +} // namespace util } // namespace lucene namespace doris { struct OlapReaderStatistics; namespace segment_v2 { -using IndexSearcherPtr = std::shared_ptr; +using FulltextIndexSearcherPtr = std::shared_ptr; +using BKDIndexSearcherPtr = std::shared_ptr; +using IndexSearcherPtr = std::variant; +using OptionalIndexSearcherPtr = std::optional; class InvertedIndexCacheHandle; +class IndexSearcherBuilder { +public: + virtual Status build(const io::FileSystemSPtr& fs, const std::string& index_dir, + const std::string& file_name, + OptionalIndexSearcherPtr& output_searcher) = 0; + virtual ~IndexSearcherBuilder() = default; +}; + +class FulltextIndexSearcherBuilder : public IndexSearcherBuilder { +public: + Status build(const io::FileSystemSPtr& fs, const std::string& index_dir, + const std::string& file_name, OptionalIndexSearcherPtr& output_searcher) override; +}; + +class BKDIndexSearcherBuilder : public IndexSearcherBuilder { +public: + Status build(const io::FileSystemSPtr& fs, const std::string& index_dir, + const std::string& file_name, OptionalIndexSearcherPtr& output_searcher) override; +}; + class InvertedIndexSearcherCache : public LRUCachePolicy { public: // The cache key of index_searcher lru cache @@ -73,7 +102,7 @@ class InvertedIndexSearcherCache : public LRUCachePolicy { }; // The cache value of index_searcher lru cache. - // Holding a opened index_searcher. + // Holding an opened index_searcher. struct CacheValue : public LRUCacheValueBase { IndexSearcherPtr index_searcher; }; @@ -95,19 +124,16 @@ class InvertedIndexSearcherCache : public LRUCachePolicy { return ExecEnv::GetInstance()->get_inverted_index_searcher_cache(); } - static IndexSearcherPtr build_index_searcher(const io::FileSystemSPtr& fs, - const std::string& index_dir, - const std::string& file_name); - InvertedIndexSearcherCache(size_t capacity, uint32_t num_shards); Status get_index_searcher(const io::FileSystemSPtr& fs, const std::string& index_dir, const std::string& file_name, InvertedIndexCacheHandle* cache_handle, - OlapReaderStatistics* stats, bool use_cache = true); + OlapReaderStatistics* stats, InvertedIndexReaderType reader_type, + bool use_cache = true); // function `insert` called after inverted index writer close Status insert(const io::FileSystemSPtr& fs, const std::string& index_dir, - const std::string& file_name); + const std::string& file_name, InvertedIndexReaderType reader_type); // function `erase` called after compaction remove segment Status erase(const std::string& index_file_path); @@ -211,7 +237,7 @@ class InvertedIndexQueryCache : public LRUCachePolicy { key_buf.append("/"); key_buf.append(column_name); key_buf.append("/"); - auto query_type_str = InvertedIndexQueryType_toString(query_type); + auto query_type_str = query_type_to_string(query_type); if (query_type_str.empty()) { return ""; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h index 1ebfe6359181e9..64171c7739d2d6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h @@ -22,6 +22,50 @@ namespace doris { namespace segment_v2 { +enum class InvertedIndexReaderType { + UNKNOWN = -1, + FULLTEXT = 0, + STRING_TYPE = 1, + BKD = 2, +}; + +template +constexpr const char* InvertedIndexReaderTypeToString(); + +template <> +constexpr const char* InvertedIndexReaderTypeToString() { + return "UNKNOWN"; +} + +template <> +constexpr const char* InvertedIndexReaderTypeToString() { + return "FULLTEXT"; +} + +template <> +constexpr const char* InvertedIndexReaderTypeToString() { + return "STRING_TYPE"; +} + +template <> +constexpr const char* InvertedIndexReaderTypeToString() { + return "BKD"; +} + +inline std::string reader_type_to_string(InvertedIndexReaderType query_type) { + switch (query_type) { + case InvertedIndexReaderType::UNKNOWN: + return InvertedIndexReaderTypeToString(); + case InvertedIndexReaderType::FULLTEXT: + return InvertedIndexReaderTypeToString(); + case InvertedIndexReaderType::STRING_TYPE: + return InvertedIndexReaderTypeToString(); + case InvertedIndexReaderType::BKD: + return InvertedIndexReaderTypeToString(); + } + return ""; // Explicitly handle all cases +} + enum class InvertedIndexQueryType { UNKNOWN_QUERY = -1, EQUAL_QUERY = 0, @@ -34,7 +78,7 @@ enum class InvertedIndexQueryType { MATCH_PHRASE_QUERY = 7, }; -inline std::string InvertedIndexQueryType_toString(InvertedIndexQueryType query_type) { +inline std::string query_type_to_string(InvertedIndexQueryType query_type) { switch (query_type) { case InvertedIndexQueryType::UNKNOWN_QUERY: { return "UNKNOWN"; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index e870680ef50dbb..38d4b3a7eece1a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -42,7 +42,6 @@ #include #include -#include #include #include #include @@ -289,13 +288,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run "inverted index path: {} not exist.", index_file_path.string()); } - auto get_index_search = [this, &index_dir, &index_file_name, &stats]() { - InvertedIndexCacheHandle inverted_index_cache_handle; - static_cast(InvertedIndexSearcherCache::instance()->get_index_searcher( - _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats)); - return inverted_index_cache_handle.get_index_searcher(); - }; - std::unique_ptr query; std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); @@ -324,34 +316,41 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run term_match_bitmap = cache_handle.get_bitmap(); } else { stats->inverted_index_query_cache_miss++; + InvertedIndexCacheHandle inverted_index_cache_handle; + RETURN_IF_ERROR(InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, + stats, type())); + auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); + if (FulltextIndexSearcherPtr* searcher_ptr = + std::get_if(&searcher_variant)) { + term_match_bitmap = std::make_shared(); - auto index_searcher = get_index_search(); - - term_match_bitmap = std::make_shared(); - - Status res = Status::OK(); - if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { - auto* phrase_query = new lucene::search::PhraseQuery(); - for (auto& token : analyse_result) { - std::wstring wtoken = StringUtil::string_to_wstring(token); - auto* term = _CLNEW lucene::index::Term(field_ws.c_str(), wtoken.c_str()); - phrase_query->add(term); - _CLDECDELETE(term); + Status res = Status::OK(); + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + auto* phrase_query = new lucene::search::PhraseQuery(); + for (auto& token : analyse_result) { + std::wstring wtoken = StringUtil::string_to_wstring(token); + auto* term = + _CLNEW lucene::index::Term(field_ws.c_str(), wtoken.c_str()); + phrase_query->add(term); + _CLDECDELETE(term); + } + query.reset(phrase_query); + res = normal_index_search(stats, query_type, *searcher_ptr, + null_bitmap_already_read, query, + term_match_bitmap); + } else { + res = match_all_index_search(stats, runtime_state, field_ws, analyse_result, + *searcher_ptr, term_match_bitmap); + } + if (!res.ok()) { + return res; } - query.reset(phrase_query); - res = normal_index_search(stats, query_type, index_searcher, - null_bitmap_already_read, query, term_match_bitmap); - } else { - res = match_all_index_search(stats, runtime_state, field_ws, analyse_result, - index_searcher, term_match_bitmap); - } - if (!res.ok()) { - return res; - } - // add to cache - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handle); + // add to cache + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); + } } query_match_bitmap = *term_match_bitmap; } else { @@ -374,26 +373,31 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run term_match_bitmap = cache_handle.get_bitmap(); } else { stats->inverted_index_query_cache_miss++; - - auto index_searcher = get_index_search(); - - term_match_bitmap = std::make_shared(); - // unique_ptr with custom deleter - std::unique_ptr term { - _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()), - [](lucene::index::Term* term) { _CLDECDELETE(term); }}; - query.reset(new lucene::search::TermQuery(term.get())); - - Status res = - normal_index_search(stats, query_type, index_searcher, - null_bitmap_already_read, query, term_match_bitmap); - if (!res.ok()) { - return res; + InvertedIndexCacheHandle inverted_index_cache_handle; + RETURN_IF_ERROR(InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, + stats, type())); + auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); + if (FulltextIndexSearcherPtr* searcher_ptr = + std::get_if(&searcher_variant)) { + term_match_bitmap = std::make_shared(); + // unique_ptr with custom deleter + std::unique_ptr term { + _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()), + [](lucene::index::Term* term) { _CLDECDELETE(term); }}; + query.reset(new lucene::search::TermQuery(term.get())); + + Status res = normal_index_search(stats, query_type, *searcher_ptr, + null_bitmap_already_read, query, + term_match_bitmap); + if (!res.ok()) { + return res; + } + + // add to cache + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); } - - // add to cache - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handle); } // add to query_match_bitmap @@ -428,7 +432,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run Status FullTextIndexReader::normal_index_search( OlapReaderStatistics* stats, InvertedIndexQueryType query_type, - const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read, + const FulltextIndexSearcherPtr& index_searcher, bool& null_bitmap_already_read, const std::unique_ptr& query, const std::shared_ptr& term_match_bitmap) { check_null_bitmap(index_searcher, null_bitmap_already_read); @@ -463,7 +467,8 @@ Status FullTextIndexReader::normal_index_search( Status FullTextIndexReader::match_all_index_search( OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, - const std::vector& analyse_result, const IndexSearcherPtr& index_searcher, + const std::vector& analyse_result, + const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr& term_match_bitmap) { TQueryOptions queryOptions = runtime_state->query_options(); try { @@ -479,7 +484,7 @@ Status FullTextIndexReader::match_all_index_search( return Status::OK(); } -void FullTextIndexReader::check_null_bitmap(const IndexSearcherPtr& index_searcher, +void FullTextIndexReader::check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher, bool& null_bitmap_already_read) { // try to reuse index_searcher's directory to read null_bitmap to cache // to avoid open directory additionally for null_bitmap @@ -580,56 +585,60 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, roaring::Roaring result; InvertedIndexCacheHandle inverted_index_cache_handle; - static_cast(InvertedIndexSearcherCache::instance()->get_index_searcher( - _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats)); - auto index_searcher = inverted_index_cache_handle.get_index_searcher(); - - // try to reuse index_searcher's directory to read null_bitmap to cache - // to avoid open directory additionally for null_bitmap - InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - static_cast( - read_null_bitmap(&null_bitmap_cache_handle, index_searcher->getReader()->directory())); + RETURN_IF_ERROR(InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats, type())); + auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); + if (FulltextIndexSearcherPtr* index_searcher = + std::get_if(&searcher_variant)) { + // try to reuse index_searcher's directory to read null_bitmap to cache + // to avoid open directory additionally for null_bitmap + InvertedIndexQueryCacheHandle null_bitmap_cache_handle; + static_cast(read_null_bitmap(&null_bitmap_cache_handle, + index_searcher->getReader()->directory())); - try { - if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || - query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || - query_type == InvertedIndexQueryType::EQUAL_QUERY) { - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - index_searcher->_search(query.get(), [&result](DocRange* doc_range) { - if (doc_range->type_ == DocRangeType::kMany) { - result.addMany(doc_range->doc_many_size_, doc_range->doc_many->data()); - } else { - result.addRange(doc_range->doc_range.first, doc_range->doc_range.second); - } - }); - } else { - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - index_searcher->_search(query.get(), - [&result](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - result.add(docid); - }); - } - } catch (const CLuceneError& e) { - if (_is_range_query(query_type) && e.number() == CL_ERR_TooManyClauses) { - return Status::Error( - "range query term exceeds limits, try to downgrade from inverted index, column " - "name:{}, search_str:{}", - column_name, search_str); - } else { - return Status::Error( - "CLuceneError occured, error msg: {}, column name: {}, search_str: {}", - e.what(), column_name, search_str); + try { + if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || + query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || + query_type == InvertedIndexQueryType::EQUAL_QUERY) { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + (*index_searcher)->_search(query.get(), [&result](DocRange* doc_range) { + if (doc_range->type_ == DocRangeType::kMany) { + result.addMany(doc_range->doc_many_size_, doc_range->doc_many->data()); + } else { + result.addRange(doc_range->doc_range.first, doc_range->doc_range.second); + } + }); + } else { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + (*index_searcher) + ->_search(query.get(), + [&result](const int32_t docid, const float_t /*score*/) { + // docid equal to rowid in segment + result.add(docid); + }); + } + } catch (const CLuceneError& e) { + if (_is_range_query(query_type) && e.number() == CL_ERR_TooManyClauses) { + return Status::Error( + "range query term exceeds limits, try to downgrade from inverted index, " + "column " + "name:{}, search_str:{}", + column_name, search_str); + } else { + return Status::Error( + "CLuceneError occured, error msg: {}, column name: {}, search_str: {}", + e.what(), column_name, search_str); + } } - } - // add to cache - std::shared_ptr term_match_bitmap = - std::make_shared(result); - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handle); + // add to cache + std::shared_ptr term_match_bitmap = + std::make_shared(result); + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); - bit_map->swap(result); + bit_map->swap(result); + } return Status::OK(); } @@ -639,22 +648,11 @@ InvertedIndexReaderType StringTypeInvertedIndexReader::type() { BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const std::string& path, const TabletIndex* index_meta) - : InvertedIndexReader(fs, path, index_meta), _compoundReader(nullptr) { + : InvertedIndexReader(fs, path, index_meta) { io::Path io_path(_path); - auto index_dir = io_path.parent_path(); - auto index_file_name = InvertedIndexDescriptor::get_index_file_name(io_path.filename(), - index_meta->index_id()); - - // check index file existence - auto index_file = index_dir / index_file_name; - if (!indexExists(index_file)) { - LOG(WARNING) << "bkd index: " << index_file.string() << " not exist."; - return; - } - _file_full_path = index_file; - _compoundReader = std::make_unique( - DorisCompoundDirectory::getDirectory(fs, index_dir.c_str()), index_file_name.c_str(), - config::inverted_index_read_buffer_size); + _index_dir = io_path.parent_path(); + _index_file_name = InvertedIndexDescriptor::get_index_file_name(io_path.filename(), + index_meta->index_id()); } Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, @@ -701,12 +699,12 @@ Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& uint32_t* count) { auto visitor = std::make_unique(nullptr, query_type, true); std::shared_ptr r; - RETURN_IF_ERROR(get_bkd_reader(&r)); + RETURN_IF_ERROR(get_bkd_reader(r, stats)); std::string query_str; _value_key_coder->full_encode_ascending(query_value, &query_str); - InvertedIndexQueryCache::CacheKey cache_key {_file_full_path, column_name, query_type, - query_str}; + InvertedIndexQueryCache::CacheKey cache_key {_index_dir / _index_file_name, column_name, + query_type, query_str}; auto cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; roaring::Roaring bit_map; @@ -716,14 +714,7 @@ Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& return Status::OK(); } try { - auto st = bkd_query(stats, column_name, query_value, query_type, r, visitor.get()); - if (!st.ok()) { - if (st.code() == ErrorCode::END_OF_FILE) { - return Status::OK(); - } - LOG(WARNING) << "bkd_query for column " << column_name << " failed: " << st; - return st; - } + RETURN_IF_ERROR(bkd_query(stats, column_name, query_value, query_type, r, visitor.get())); *count = r->estimate_point_count(visitor.get()); } catch (const CLuceneError& e) { return Status::Error( @@ -753,16 +744,20 @@ Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_ const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, roaring::Roaring* bit_map) { SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); + /*if (_bkd_reader == nullptr) { + LOG(WARNING) << "bkd index input file {} not found" << _file_full_path; + return Status::EndOfFile("bkd index file is empty"); + }*/ auto visitor = std::make_unique(bit_map, query_type); std::shared_ptr r; - RETURN_IF_ERROR(get_bkd_reader(&r)); + RETURN_IF_ERROR(get_bkd_reader(r, stats)); std::string query_str; _value_key_coder->full_encode_ascending(query_value, &query_str); - InvertedIndexQueryCache::CacheKey cache_key {_file_full_path, column_name, query_type, - query_str}; + InvertedIndexQueryCache::CacheKey cache_key {_index_dir / _index_file_name, column_name, + query_type, query_str}; auto cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; auto cache_status = handle_cache(cache, cache_key, &cache_handler, stats, bit_map); @@ -771,14 +766,7 @@ Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_ } try { - auto st = bkd_query(stats, column_name, query_value, query_type, r, visitor.get()); - if (!st.ok()) { - if (st.code() == ErrorCode::END_OF_FILE) { - return Status::OK(); - } - LOG(WARNING) << "bkd_query for column " << column_name << " failed: " << st; - return st; - } + RETURN_IF_ERROR(bkd_query(stats, column_name, query_value, query_type, r, visitor.get())); r->intersect(visitor.get()); } catch (const CLuceneError& e) { return Status::Error( @@ -795,45 +783,34 @@ Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_ return Status::OK(); } -Status BkdIndexReader::get_bkd_reader(std::shared_ptr* bkdReader) { - // bkd file reader - if (_compoundReader == nullptr) { - return Status::Error( - "bkd index input file not found"); - } - CLuceneError err; - std::unique_ptr data_in; - std::unique_ptr meta_in; - std::unique_ptr index_in; - - if (!_compoundReader->openInput( - InvertedIndexDescriptor::get_temporary_bkd_index_data_file_name().c_str(), data_in, - err) || - !_compoundReader->openInput( - InvertedIndexDescriptor::get_temporary_bkd_index_meta_file_name().c_str(), meta_in, - err) || - !_compoundReader->openInput( - InvertedIndexDescriptor::get_temporary_bkd_index_file_name().c_str(), index_in, - err)) { - return Status::Error("bkd index input error: {}", - err.what()); - } - - *bkdReader = std::make_shared(data_in.release()); - if (0 == (*bkdReader)->read_meta(meta_in.get())) { - VLOG_NOTICE << "bkd index file is empty:" << _compoundReader->toString(); - return Status::EndOfFile("bkd index file is empty"); +Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, + OlapReaderStatistics* stats) { + InvertedIndexCacheHandle inverted_index_cache_handle; + auto st = InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, _index_dir.c_str(), _index_file_name, &inverted_index_cache_handle, stats, type()); + if (!st.ok()) { + // empty bkd index file, just return + if (st.code() == ErrorCode::END_OF_FILE) { + return Status::OK(); + } + LOG(WARNING) << "get_index_searcher for " << _index_dir / _index_file_name + << " failed: " << st; + return st; } - - (*bkdReader)->read_index(index_in.get()); - - _type_info = get_scalar_type_info((FieldType)(*bkdReader)->type); - if (_type_info == nullptr) { - return Status::Error( - "unsupported typeinfo, type={}", (*bkdReader)->type); + auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); + auto bkd_searcher = std::get_if(&searcher_variant); + if (bkd_searcher) { + _type_info = get_scalar_type_info((FieldType)(*bkd_searcher)->type); + if (_type_info == nullptr) { + return Status::Error( + "unsupported typeinfo, type={}", (*bkd_searcher)->type); + } + _value_key_coder = get_key_coder(_type_info->type()); + bkd_reader = *bkd_searcher; + return Status::OK(); } - _value_key_coder = get_key_coder(_type_info->type()); - return Status::OK(); + return Status::Error( + "get bkd reader from searcher cache builder error"); } InvertedIndexReaderType BkdIndexReader::type() { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 20c5c731f9eca8..b1801142c3fa15 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -59,15 +59,6 @@ namespace segment_v2 { class InvertedIndexIterator; class InvertedIndexQueryCacheHandle; -enum class InvertedIndexReaderType { - UNKNOWN = -1, - FULLTEXT = 0, - STRING_TYPE = 1, - BKD = 2, -}; - -using IndexSearcherPtr = std::shared_ptr; - class InvertedIndexReader : public std::enable_shared_from_this { public: explicit InvertedIndexReader(io::FileSystemSPtr fs, const std::string& path, @@ -141,7 +132,7 @@ class FullTextIndexReader : public InvertedIndexReader { private: Status normal_index_search(OlapReaderStatistics* stats, InvertedIndexQueryType query_type, - const IndexSearcherPtr& index_searcher, + const FulltextIndexSearcherPtr& index_searcher, bool& null_bitmap_already_read, const std::unique_ptr& query, const std::shared_ptr& term_match_bitmap); @@ -149,10 +140,11 @@ class FullTextIndexReader : public InvertedIndexReader { Status match_all_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, const std::vector& analyse_result, - const IndexSearcherPtr& index_searcher, + const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr& term_match_bitmap); - void check_null_bitmap(const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read); + void check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher, + bool& null_bitmap_already_read); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { @@ -216,25 +208,13 @@ class BkdIndexReader : public InvertedIndexReader { ENABLE_FACTORY_CREATOR(BkdIndexReader); private: - std::string _file_full_path; + std::string _index_file_name; + io::Path _index_dir; public: explicit BkdIndexReader(io::FileSystemSPtr fs, const std::string& path, const TabletIndex* index_meta); - ~BkdIndexReader() override { - if (_compoundReader != nullptr) { - try { - _compoundReader->close(); - } catch (const CLuceneError& e) { - // Handle exception, e.g., log it, but don't rethrow. - LOG(ERROR) << "Exception caught in BkdIndexReader destructor: " << e.what() - << std::endl; - } catch (...) { - // Handle all other exceptions, but don't rethrow. - LOG(ERROR) << "Unknown exception caught in BkdIndexReader destructor." << std::endl; - } - } - } + ~BkdIndexReader() override = default; Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr* iterator) override; @@ -256,12 +236,11 @@ class BkdIndexReader : public InvertedIndexReader { roaring::Roaring* bit_map); InvertedIndexReaderType type() override; - Status get_bkd_reader(std::shared_ptr* reader); + Status get_bkd_reader(BKDIndexSearcherPtr& reader, OlapReaderStatistics* stats); private: const TypeInfo* _type_info {}; const KeyCoder* _value_key_coder {}; - std::unique_ptr _compoundReader; }; class InvertedIndexIterator { @@ -274,7 +253,7 @@ class InvertedIndexIterator { Status read_from_inverted_index(const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t segment_num_rows, - roaring::Roaring* bit_map, bool skip_try = false); + roaring::Roaring* bit_map, bool skip_try = true); Status try_read_from_inverted_index(const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t* count); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 5c53a91c3d8cfc..a532708371b2e3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -118,8 +118,11 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { // open index searcher into cache auto index_file_name = InvertedIndexDescriptor::get_index_file_name( _segment_file_name, _index_meta->index_id()); - static_cast(InvertedIndexSearcherCache::instance()->insert(_fs, _directory, - index_file_name)); + auto st = InvertedIndexSearcherCache::instance()->insert( + _fs, _directory, index_file_name, InvertedIndexReaderType::FULLTEXT); + if (!st.ok()) { + LOG(ERROR) << "insert inverted index searcher cache error:" << st; + } } } } @@ -141,7 +144,6 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { auto index_path = InvertedIndexDescriptor::get_temporary_index_path( _directory + "/" + _segment_file_name, _index_meta->index_id()); - // LOG(INFO) << "inverted index path: " << index_path; bool exists = false; auto st = _fs->exists(index_path.c_str(), &exists); if (!st.ok()) { @@ -151,12 +153,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { } if (exists) { LOG(ERROR) << "try to init a directory:" << index_path << " already exists"; - return Status::InternalError("init_fulltext_index a directory already exists"); - //st = _fs->delete_directory(index_path.c_str()); - //if (!st.ok()) { - // LOG(ERROR) << "delete directory:" << index_path << " error:" << st; - // return st; - //} + return Status::InternalError("init_fulltext_index directory already exists"); } _char_string_reader = std::make_unique>();