From e298997d0e4444450c98365346222421f774a495 Mon Sep 17 00:00:00 2001 From: eldenmoon <15605149486@163.com> Date: Fri, 2 Aug 2024 19:23:10 +0800 Subject: [PATCH] nested --- be/src/olap/rowset/segment_v2/column_reader.h | 2 +- .../segment_v2/hierarchical_data_reader.h | 96 ++++++++++++++----- .../rowset/segment_v2/segment_iterator.cpp | 78 ++++++++++++++- .../olap/rowset/segment_v2/segment_iterator.h | 3 + be/src/vec/columns/column_object.cpp | 36 +------ be/src/vec/columns/subcolumn_tree.h | 65 ++++++++++--- .../vec/data_types/serde/data_type_serde.cpp | 22 +++++ be/src/vec/data_types/serde/data_type_serde.h | 3 + be/src/vec/json/path_in_data.cpp | 20 ++++ be/src/vec/json/path_in_data.h | 12 +++ 10 files changed, 266 insertions(+), 71 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index ca22f590342acfc..cbfc1668b31a036 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -742,7 +742,7 @@ class EmptyColumnIterator : public ColumnIterator { Status read_by_rowids(const rowid_t* rowids, const size_t count, vectorized::MutableColumnPtr& dst) override { - return Status::NotSupported("Not supported read_by_rowids"); + return Status::OK(); } ordinal_t get_current_ordinal() const override { diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h index 0424c4e72f78b48..7f36401a371e75a 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h @@ -32,6 +32,9 @@ #include "vec/columns/column_object.h" #include "vec/columns/subcolumn_tree.h" #include "vec/common/assert_cast.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/columns_with_type_and_name.h" +#include "vec/core/types.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_array.h" #include "vec/data_types/data_type_nullable.h" @@ -119,59 +122,104 @@ class HierarchicalDataReader : public ColumnIterator { auto type = root_var.get_root_type(); container_variant.add_sub_column({}, std::move(column), type); } - bool nested = false; + // parent -> subcolumns + std::map nested_subcolumns; + PathsWithColumnAndType non_nested_subcolumns; RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) { MutableColumnPtr column = node.data.column->get_ptr(); PathInData real_path = node.path.copy_pop_nfront(_path.get_parts().size()); - bool add = - container_variant.add_sub_column(real_path, std::move(column), node.data.type); - if (!add) { - return Status::InternalError("Duplicated {}, type {}", node.path.get_path(), - node.data.type->get_name()); - } + if (node.parent->is_nested()) { - nested = true; + CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()), + getTypeName(TypeIndex::Array)); + nested_subcolumns[real_path.copy_pop_back()].emplace_back( + real_path, column->get_ptr(), node.data.type); + } else { + non_nested_subcolumns.emplace_back(real_path, column->get_ptr(), node.data.type); } return Status::OK(); })); - if (nested) { - container_variant.finalize_if_not(); + for (auto& entry : non_nested_subcolumns) { + bool add = container_variant.add_sub_column(entry.path, entry.column->assume_mutable(), + entry.type); + if (!add) { + return Status::InternalError("Duplicated {}, type {}", entry.path.get_path(), + entry.type->get_name()); + } + } + for (auto& entry : nested_subcolumns) { MutableColumnPtr nested_object = ColumnObject::create(true, false); MutableColumnPtr offset = - check_and_get_column( - *remove_nullable(container_variant.get_subcolumns() - .get_leaves()[0] - ->data.get_finalized_column_ptr())) + check_and_get_column(*remove_nullable(entry.second[0].column)) ->get_offsets_ptr() ->assume_mutable(); auto* nested_object_ptr = assert_cast(nested_object.get()); // flatten nested arrays - for (const auto& entry : container_variant.get_subcolumns()) { - auto& column = entry->data.get_finalized_column_ptr(); - const auto& type = entry->data.get_least_common_type(); + for (const auto& subcolumn : entry.second) { + const auto& column = subcolumn.column; + const auto& type = subcolumn.type; if (!remove_nullable(column)->is_column_array()) { return Status::InvalidArgument( "Meet none array column when flatten nested array, path {}, type {}", - entry->path.get_path(), entry->data.get_finalized_column().get_name()); + subcolumn.path.get_path(), subcolumn.type->get_name()); } MutableColumnPtr flattend_column = - check_and_get_column( - remove_nullable(entry->data.get_finalized_column_ptr()).get()) + check_and_get_column(remove_nullable(subcolumn.column).get()) ->get_data_ptr() ->assume_mutable(); DataTypePtr flattend_type = check_and_get_data_type(remove_nullable(type).get()) ->get_nested_type(); - nested_object_ptr->add_sub_column(entry->path, std::move(flattend_column), - std::move(flattend_type)); + // add path without parent prefix + nested_object_ptr->add_sub_column( + subcolumn.path.copy_pop_nfront(entry.first.get_parts().size()), + std::move(flattend_column), std::move(flattend_type)); } nested_object = make_nullable(nested_object->get_ptr())->assume_mutable(); auto array = make_nullable(ColumnArray::create(std::move(nested_object), std::move(offset))); - container_variant.clear(); - container_variant.create_root(ColumnObject::NESTED_TYPE, array->assume_mutable()); + container_variant.add_sub_column(entry.first, array->assume_mutable(), + ColumnObject::NESTED_TYPE); } + // if (has_nested) { + // // rewrite nested nodes + // container_variant.finalize_if_not(); + // MutableColumnPtr nested_object = ColumnObject::create(true, false); + // MutableColumnPtr offset = + // check_and_get_column( + // *remove_nullable(container_variant.get_subcolumns() + // .get_leaves()[0] + // ->data.get_finalized_column_ptr())) + // ->get_offsets_ptr() + // ->assume_mutable(); + // auto* nested_object_ptr = assert_cast(nested_object.get()); + // // flatten nested arrays + // for (const auto& entry : container_variant.get_subcolumns()) { + // auto& column = entry->data.get_finalized_column_ptr(); + // const auto& type = entry->data.get_least_common_type(); + // if (!remove_nullable(column)->is_column_array()) { + // return Status::InvalidArgument( + // "Meet none array column when flatten nested array, path {}, type {}", + // entry->path.get_path(), entry->data.get_finalized_column().get_name()); + // } + // MutableColumnPtr flattend_column = + // check_and_get_column( + // remove_nullable(entry->data.get_finalized_column_ptr()).get()) + // ->get_data_ptr() + // ->assume_mutable(); + // DataTypePtr flattend_type = + // check_and_get_data_type(remove_nullable(type).get()) + // ->get_nested_type(); + // nested_object_ptr->add_sub_column(entry->path, std::move(flattend_column), + // std::move(flattend_type)); + // } + // nested_object = make_nullable(nested_object->get_ptr())->assume_mutable(); + // auto array = + // make_nullable(ColumnArray::create(std::move(nested_object), std::move(offset))); + // container_variant.clear(); + // container_variant.create_root(ColumnObject::NESTED_TYPE, array->assume_mutable()); + // } // TODO select v:b -> v.b / v.b.c but v.d maybe in v // copy container variant to dst variant, todo avoid copy diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index b0a947078cd1a50..cbe5b08355d090e 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -71,6 +71,7 @@ #include "util/key_util.h" #include "util/simd/bits.h" #include "vec/columns/column.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_object.h" @@ -81,10 +82,12 @@ #include "vec/common/schema_util.h" #include "vec/common/string_ref.h" #include "vec/common/typeid_cast.h" +#include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" #include "vec/core/field.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_array.h" #include "vec/data_types/data_type_factory.hpp" #include "vec/data_types/data_type_number.h" #include "vec/exprs/vexpr.h" @@ -92,6 +95,7 @@ #include "vec/exprs/vliteral.h" #include "vec/exprs/vslot_ref.h" #include "vec/functions/array/function_array_index.h" +#include "vec/functions/function_helpers.h" #include "vec/json/path_in_data.h" namespace doris { @@ -2316,7 +2320,6 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { auto status = [&]() { RETURN_IF_CATCH_EXCEPTION({ RETURN_IF_ERROR(_next_batch_internal(block)); - RETURN_IF_ERROR(_fill_missing_columns(block)); // reverse block row order if read_orderby_key_reverse is true for key topn // it should be processed for all success _next_batch_internal @@ -2394,7 +2397,78 @@ Status SegmentIterator::copy_column_data_by_selector(vectorized::IColumn* input_ return input_col_ptr->filter_by_selector(sel_rowid_idx, select_size, output_col); } +vectorized::SubcolumnsTree +SegmentIterator::_get_nested_array_offsets_columns(const vectorized::Block& block) { + vectorized::SubcolumnsTree offsets; + for (size_t i = 0; i < block.columns(); ++i) { + auto cid = _schema->column_id(i); + const auto* column_desc = _schema->column(cid); + if (column_desc->path() != nullptr && column_desc->path()->has_nested_part() && + column_desc->type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { + offsets.add(*column_desc->path(), + &vectorized::check_and_get_column( + remove_nullable(block.get_by_position(i).column).get()) + ->get_offsets()); + LOG(INFO) << "fuck"; + } + } + return offsets; +} + Status SegmentIterator::_fill_missing_columns(vectorized::Block* block) { + vectorized::SubcolumnsTree offsets = + _get_nested_array_offsets_columns(*block); + for (size_t i = 0; i < block->columns(); ++i) { + auto cid = _schema->column_id(i); + const auto* column_desc = _schema->column(cid); + int64_t current_size = block->get_by_position(i).column->size(); + if (column_desc->path() != nullptr && column_desc->path()->has_nested_part() && + current_size < block->rows()) { + const auto* leaf = offsets.get_leaf_of_the_same_nested( + *column_desc->path(), [](const auto& node) { return node.data->size(); }, + current_size); + if (!leaf) { + VLOG_DEBUG << "Not found any subcolumns column_desc: " + << column_desc->path()->get_path() << ", current_size: " << current_size + << ", block_rows: " << block->rows(); + block->get_by_position(i).column->assume_mutable()->insert_many_defaults( + block->rows() - current_size); + continue; + } + LOG(INFO) << "fuck"; + const vectorized::ColumnArray::Offsets64* offset = leaf->data; + int64_t nested_padding_size = offset->back() - (*offset)[current_size - 1]; + auto nested_column = + vectorized::check_and_get_data_type( + remove_nullable(block->get_by_position(i).type).get()) + ->get_nested_type() + ->create_column_const_with_default_value(nested_padding_size); + auto nested_new_offset = vectorized::ColumnArray::ColumnOffsets::create(); + nested_new_offset->reserve(block->rows() - current_size); + for (size_t i = current_size; i < block->rows(); ++i) { + nested_new_offset->get_data().push_back_without_reserve( + (*offset)[i] - (*offset)[current_size - 1]); + } + vectorized::ColumnPtr nested_padding_column = + vectorized::ColumnArray::create(nested_column, std::move(nested_new_offset)); + if (block->get_by_position(i).column->is_nullable()) { + nested_padding_column = vectorized::make_nullable(nested_padding_column); + } + block->get_by_position(i).column->assume_mutable()->insert_range_from( + *nested_padding_column, 0, nested_padding_column->size()); + } + } + +#ifndef NDEBUG + // check offsets aligned + for (size_t i = 0; i < block->columns(); ++i) { + auto cid = _schema->column_id(i); + const auto* column_desc = _schema->column(cid); + if (column_desc->path() != nullptr && column_desc->path()->has_nested_part() && + column_desc->type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { + } + } +#endif return Status::OK(); } @@ -2662,6 +2736,8 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { // shrink char_type suffix zero data block->shrink_char_type_column_suffix_zero(_char_type_idx); + RETURN_IF_ERROR(_fill_missing_columns(block)); + #ifndef NDEBUG size_t rows = block->rows(); for (const auto& entry : *block) { diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 14abe766514fef0..1dbbc575ca6fe57 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -46,6 +46,7 @@ #include "util/runtime_profile.h" #include "util/slice.h" #include "vec/columns/column.h" +#include "vec/columns/column_array.h" #include "vec/common/schema_util.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" @@ -226,6 +227,8 @@ class SegmentIterator : public RowwiseIterator { void _vec_init_char_column_id(vectorized::Block* block); bool _has_char_type(const Field& column_desc); + vectorized::SubcolumnsTree + _get_nested_array_offsets_columns(const vectorized::Block& block); Status _fill_missing_columns(vectorized::Block* block); uint32_t segment_id() const { return _segment->id(); } diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index d2b61b8c6831a4b..01a36c3b1a2c08b 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -1866,42 +1866,12 @@ ColumnObject::Subcolumn ColumnObject::Subcolumn::cut(size_t start, size_t length const ColumnObject::Subcolumns::Node* ColumnObject::get_leaf_of_the_same_nested( const Subcolumns::NodePtr& entry) const { - if (!entry->path.has_nested_part()) { - return nullptr; - } - - size_t old_size = entry->data.size(); - const auto* current_node = subcolumns.find_leaf(entry->path); - const Subcolumns::Node* leaf = nullptr; - - while (current_node) { - /// Try to find the first Nested up to the current node. - const auto* node_nested = Subcolumns::find_parent( - current_node, [](const auto& candidate) -> bool { return candidate.is_nested(); }); - - if (!node_nested) { - break; - } - - /// Find the leaf with subcolumn that contains values - /// for the last rows. - /// If there are no leaves, skip current node and find - /// the next node up to the current. - leaf = Subcolumns::find_leaf(node_nested, [&](const auto& candidate) { - return candidate.data.size() > old_size; - }); - - if (leaf) { - break; - } - - current_node = node_nested->parent; - } - + const auto* leaf = subcolumns.get_leaf_of_the_same_nested( + entry->path, [](const Subcolumns::Node& node) { return node.data.size(); }, + entry->data.size()); if (leaf && is_nothing(leaf->data.get_least_common_typeBase())) { return nullptr; } - return leaf; } diff --git a/be/src/vec/columns/subcolumn_tree.h b/be/src/vec/columns/subcolumn_tree.h index 9ac2e12043349f1..f71fb4870035bde 100644 --- a/be/src/vec/columns/subcolumn_tree.h +++ b/be/src/vec/columns/subcolumn_tree.h @@ -199,18 +199,6 @@ class SubcolumnsTree { /// Find node that matches the path the best. const Node* find_best_match(const PathInData& path) const { return find_impl(path, false); } - /// Find node that matches the path exactly. - const Node* find_exact(const PathInData& path) const { return find_impl(path, true); } - - /// Find leaf by path. - const Node* find_leaf(const PathInData& path) const { - const auto* candidate = find_exact(path); - if (!candidate || !candidate->is_scalar()) { - return nullptr; - } - return candidate; - } - using NodePredicate = std::function; /// Finds leaf that satisfies the predicate. @@ -218,6 +206,9 @@ class SubcolumnsTree { return find_leaf(root.get(), predicate); } + /// Find node that matches the path exactly. + const Node* find_exact(const PathInData& path) const { return find_impl(path, true); } + static const Node* find_leaf(const Node* node, const NodePredicate& predicate) { if (!node) { return nullptr; @@ -236,6 +227,53 @@ class SubcolumnsTree { return nullptr; } + /// Find leaf by path. + const Node* find_leaf(const PathInData& path) const { + const auto* candidate = find_exact(path); + if (!candidate || !candidate->is_scalar()) { + return nullptr; + } + return candidate; + } + + const Node* get_leaf_of_the_same_nested(const PathInData& path, + std::function subcolumn_size_fn, + size_t old_size) const { + if (!path.has_nested_part()) { + return nullptr; + } + + const auto* current_node = find_leaf(path); + const Node* leaf = nullptr; + + while (current_node) { + /// Try to find the first Nested up to the current node. + const auto* node_nested = find_parent(current_node, [](const auto& candidate) -> bool { + return candidate.is_nested(); + }); + + if (!node_nested) { + break; + } + + /// Find the leaf with subcolumn that contains values + /// for the last rows. + /// If there are no leaves, skip current node and find + /// the next node up to the current. + leaf = SubcolumnsTree::find_leaf(node_nested, [&](const auto& candidate) { + return subcolumn_size_fn(candidate) > old_size; + }); + + if (leaf) { + break; + } + + current_node = node_nested->parent; + } + + return leaf; + } + /// Find first parent node that satisfies the predicate. static const Node* find_parent(const Node* node, const NodePredicate& predicate) { while (node && !predicate(*node)) { @@ -244,6 +282,9 @@ class SubcolumnsTree { return node; } + // get the mutable parent node + static Node* get_mutable_parent(const Node* node) { return const_cast(node->parent); } + bool empty() const { return root == nullptr; } size_t size() const { return leaves.size(); } diff --git a/be/src/vec/data_types/serde/data_type_serde.cpp b/be/src/vec/data_types/serde/data_type_serde.cpp index 2bdd63aa989a260..0cb7d3d08cff5d4 100644 --- a/be/src/vec/data_types/serde/data_type_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_serde.cpp @@ -18,6 +18,7 @@ #include "runtime/descriptors.h" #include "vec/columns/column.h" +#include "vec/core/field.h" #include "vec/data_types/data_type.h" namespace doris { @@ -42,6 +43,22 @@ DataTypeSerDeSPtrs create_data_type_serdes(const std::vector& s return serdes; } +void DataTypeSerDe::convert_variant_map_to_rapidjson( + const vectorized::VariantMap& map, rapidjson::Value& target, + rapidjson::Document::AllocatorType& allocator) { + target.SetObject(); + for (const auto& item : map) { + if (item.second.is_null()) { + continue; + } + rapidjson::Value key; + key.SetString(item.first.data(), item.first.size()); + rapidjson::Value val; + convert_field_to_rapidjson(item.second, val, allocator); + target.AddMember(key, val, allocator); + } +} + void DataTypeSerDe::convert_array_to_rapidjson(const vectorized::Array& array, rapidjson::Value& target, rapidjson::Document::AllocatorType& allocator) { @@ -76,6 +93,11 @@ void DataTypeSerDe::convert_field_to_rapidjson(const vectorized::Field& field, convert_array_to_rapidjson(array, target, allocator); break; } + case vectorized::Field::Types::VariantMap: { + const vectorized::VariantMap& map = field.get(); + convert_variant_map_to_rapidjson(map, target, allocator); + break; + } default: CHECK(false) << "unkown field type: " << field.get_type_name(); break; diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 95c62b69ad7e35d..4e4154b0c26cf0c 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -335,6 +335,9 @@ class DataTypeSerDe { rapidjson::Document::AllocatorType& allocator); static void convert_array_to_rapidjson(const vectorized::Array& array, rapidjson::Value& target, rapidjson::Document::AllocatorType& allocator); + static void convert_variant_map_to_rapidjson(const vectorized::VariantMap& array, + rapidjson::Value& target, + rapidjson::Document::AllocatorType& allocator); }; /// Invert values since Arrow interprets 1 as a non-null value, while doris as a null diff --git a/be/src/vec/json/path_in_data.cpp b/be/src/vec/json/path_in_data.cpp index fb32b54a698b4bc..703a532c793387b 100644 --- a/be/src/vec/json/path_in_data.cpp +++ b/be/src/vec/json/path_in_data.cpp @@ -109,6 +109,10 @@ void PathInData::build_parts(const Parts& other_parts) { const char* begin = path.data(); for (const auto& part : other_parts) { has_nested |= part.is_nested; + if (has_nested) { + LOG(INFO) << "fuck"; + // CHECK(false); + } parts.emplace_back(std::string_view {begin, part.key.length()}, part.is_nested, part.anonymous_array_level); begin += part.key.length() + 1; @@ -124,6 +128,10 @@ void PathInData::from_protobuf(const segment_v2::ColumnPathInfo& pb) { for (const segment_v2::ColumnPathPartInfo& part_info : pb.path_part_infos()) { Part part; part.is_nested = part_info.is_nested(); + has_nested |= part.is_nested; + if (has_nested) { + LOG(INFO) << "fuck"; + } part.anonymous_array_level = part_info.anonymous_array_level(); // use string_view to ref data in path part.key = std::string_view {begin, part_info.key().length()}; @@ -170,6 +178,18 @@ PathInData PathInData::copy_pop_front() const { return copy_pop_nfront(1); } +PathInData PathInData::copy_pop_back() const { + if (parts.size() <= 1) { + return {}; + } + PathInData new_path; + Parts new_parts = parts; + new_parts.pop_back(); + new_path.build_path(new_parts); + new_path.build_parts(new_parts); + return new_path; +} + std::string PathInData::get_nested_prefix() const { if (!has_nested) { return ""; diff --git a/be/src/vec/json/path_in_data.h b/be/src/vec/json/path_in_data.h index a225f0253c6dcd3..1eb1005b8048262 100644 --- a/be/src/vec/json/path_in_data.h +++ b/be/src/vec/json/path_in_data.h @@ -29,15 +29,18 @@ #include #include "gen_cpp/segment_v2.pb.h" +#include "vec/columns/column.h" #include "vec/common/uint128.h" #include "vec/core/field.h" #include "vec/core/types.h" +#include "vec/data_types/data_type.h" namespace doris::vectorized { /// Class that represents path in document, e.g. JSON. class PathInData; using PathInDataPtr = std::shared_ptr; + class PathInData { public: struct Part { @@ -83,6 +86,7 @@ class PathInData { PathInData copy_pop_front() const; PathInData copy_pop_nfront(size_t n) const; + PathInData copy_pop_back() const; void to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const; void from_protobuf(const segment_v2::ColumnPathInfo& pb); @@ -144,4 +148,12 @@ struct PathInDataRef { bool operator==(const PathInDataRef& other) const { return *this->ref == *other.ref; } }; +struct PathWithColumnAndType { + PathInData path; + ColumnPtr column; + DataTypePtr type; +}; + +using PathsWithColumnAndType = std::vector; + } // namespace doris::vectorized