diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp index dcc082c22ae00c..f057214dde84cd 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp @@ -17,6 +17,8 @@ #include "olap/rowset/segment_v2/hierarchical_data_reader.h" +#include + #include "common/status.h" #include "io/io_common.h" #include "olap/rowset/segment_v2/column_reader.h" @@ -41,7 +43,7 @@ Status HierarchicalDataReader::create(std::unique_ptr* reader, vectorized::PathsInData leaves_paths; SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths); for (size_t i = 0; i < leaves_paths.size(); ++i) { - if (leaves_paths[i] == root->path) { + if (leaves_paths[i].empty()) { // use set_root to share instead continue; } diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h index 1d02685e445dfd..c665692ac8a2d0 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h @@ -127,7 +127,7 @@ class HierarchicalDataReader : public ColumnIterator { auto& container_variant = assert_cast(*container); // add root first - if (_path.get_parts().size() == 1) { + if (_path.get_parts().empty() && _root_reader) { auto& root_var = _root_reader->column->is_nullable() ? assert_cast( diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 894b46767ba84e..f6d87e5b8e5cbe 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -158,7 +158,9 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o const TabletColumn& col = read_options.tablet_schema->column(column_id); ColumnReader* reader = nullptr; if (col.is_extracted_column()) { - const auto* node = _sub_column_tree.find_exact(*col.path_info_ptr()); + auto relative_path = col.path_info_ptr()->copy_pop_front(); + int32_t unique_id = col.unique_id() > 0 ? col.unique_id() : col.parent_unique_id(); + const auto* node = _sub_column_tree[unique_id].find_exact(relative_path); reader = node != nullptr ? node->data.reader.get() : nullptr; } else { reader = _column_readers.contains(col.unique_id()) @@ -381,19 +383,27 @@ Status Segment::_load_index_impl() { // Return the storage datatype of related column to field. // Return nullptr meaning no such storage infomation for this column -vectorized::DataTypePtr Segment::get_data_type_of(vectorized::PathInDataPtr path, bool is_nullable, - bool ignore_children) const { +vectorized::DataTypePtr Segment::get_data_type_of(const ColumnIdentifier& identifier, + bool read_flat_leaves) const { // Path has higher priority - if (path != nullptr && !path->empty()) { - const auto* node = _sub_column_tree.find_leaf(*path); - const auto* sparse_node = _sparse_column_tree.find_exact(*path); + if (identifier.path != nullptr && !identifier.path->empty()) { + auto relative_path = identifier.path->copy_pop_front(); + int32_t unique_id = + identifier.unique_id > 0 ? identifier.unique_id : identifier.parent_unique_id; + const auto* node = _sub_column_tree.contains(unique_id) + ? _sub_column_tree.at(unique_id).find_leaf(relative_path) + : nullptr; + const auto* sparse_node = + _sparse_column_tree.contains(unique_id) + ? _sparse_column_tree.at(unique_id).find_exact(relative_path) + : nullptr; if (node) { - if (ignore_children || (node->children.empty() && sparse_node == nullptr)) { + if (read_flat_leaves || (node->children.empty() && sparse_node == nullptr)) { return node->data.file_column_type; } } // it contains children or column missing in storage, so treat it as variant - return is_nullable + return identifier.is_nullable ? vectorized::make_nullable(std::make_shared()) : std::make_shared(); } @@ -450,7 +460,9 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { if (!column.has_path_info()) { continue; } - auto iter = column_path_to_footer_ordinal.find(*column.path_info_ptr()); + auto path = column.has_path_info() ? *column.path_info_ptr() + : vectorized::PathInData(column.name_lower_case()); + auto iter = column_path_to_footer_ordinal.find(path); if (iter == column_path_to_footer_ordinal.end()) { continue; } @@ -460,11 +472,25 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { std::unique_ptr reader; RETURN_IF_ERROR( ColumnReader::create(opts, column_pb, footer.num_rows(), _file_reader, &reader)); - _sub_column_tree.add( - iter->first, - SubcolumnReader { - std::move(reader), - vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); + // root column use unique id, leaf column use parent_unique_id + int32_t unique_id = + column.parent_unique_id() > 0 ? column.parent_unique_id() : column.unique_id(); + auto relative_path = path.copy_pop_front(); + if (relative_path.empty()) { + // root column + _sub_column_tree[unique_id].create_root(SubcolumnReader { + std::move(reader), + vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); + } else { + // check the root is already a leaf node + DCHECK(_sub_column_tree[unique_id].get_leaves()[0]->path.empty()); + _sub_column_tree[unique_id].add( + relative_path, + SubcolumnReader { + std::move(reader), + vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); + } + // init sparse columns paths and type info for (uint32_t ordinal = 0; ordinal < column_pb.sparse_columns().size(); ++ordinal) { const auto& spase_column_pb = column_pb.sparse_columns(ordinal); @@ -472,8 +498,8 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { vectorized::PathInData path; path.from_protobuf(spase_column_pb.column_path_info()); // Read from root column, so reader is nullptr - _sparse_column_tree.add( - path, + _sparse_column_tree[column.unique_id()].add( + path.copy_pop_front(), SubcolumnReader {nullptr, vectorized::DataTypeFactory::instance().create_data_type( spase_column_pb)}); @@ -523,22 +549,22 @@ Status Segment::_new_iterator_with_variant_root(const TabletColumn& tablet_colum Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, std::unique_ptr* iter, const StorageReadOptions* opt) { - vectorized::PathInData root_path; - if (!tablet_column.has_path_info()) { - // Missing path info, but need read the whole variant column - root_path = vectorized::PathInData(tablet_column.name_lower_case()); - } else { - root_path = vectorized::PathInData({tablet_column.path_info_ptr()->get_parts()[0]}); + // root column use unique id, leaf column use parent_unique_id + int32_t unique_id = tablet_column.unique_id() > 0 ? tablet_column.unique_id() + : tablet_column.parent_unique_id(); + if (!_sub_column_tree.contains(unique_id)) { + // No such variant column in this segment, get a default one + RETURN_IF_ERROR(new_default_iterator(tablet_column, iter)); + return Status::OK(); } - const auto* root = _sub_column_tree.find_leaf(root_path); + auto relative_path = tablet_column.path_info_ptr()->copy_pop_front(); + const auto* root = _sub_column_tree[unique_id].get_root(); const auto* node = tablet_column.has_path_info() - ? _sub_column_tree.find_exact(*tablet_column.path_info_ptr()) + ? _sub_column_tree[unique_id].find_exact(relative_path) : nullptr; - const auto* sparse_node = - tablet_column.has_path_info() - ? _sparse_column_tree.find_exact(*tablet_column.path_info_ptr()) - : nullptr; - + const auto* sparse_node = tablet_column.has_path_info() + ? _sparse_column_tree[unique_id].find_exact(relative_path) + : nullptr; // Currently only compaction and checksum need to read flat leaves // They both use tablet_schema_with_merged_max_schema_version as read schema auto type_to_read_flat_leaves = [](ReaderType type) { @@ -552,7 +578,7 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, if (opt != nullptr && type_to_read_flat_leaves(opt->io_ctx.reader_type)) { // compaction need to read flat leaves nodes data to prevent from amplification const auto* node = tablet_column.has_path_info() - ? _sub_column_tree.find_leaf(*tablet_column.path_info_ptr()) + ? _sub_column_tree[unique_id].find_leaf(relative_path) : nullptr; if (!node) { // sparse_columns have this path, read from root @@ -574,15 +600,14 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, if (node->is_leaf_node() && sparse_node == nullptr) { // Node contains column without any child sub columns and no corresponding sparse columns // Direct read extracted columns - const auto* node = _sub_column_tree.find_leaf(*tablet_column.path_info_ptr()); + const auto* node = _sub_column_tree[unique_id].find_leaf(relative_path); ColumnIterator* it; RETURN_IF_ERROR(node->data.reader->new_iterator(&it)); iter->reset(it); } else { // Node contains column with children columns or has correspoding sparse columns // Create reader with hirachical data - RETURN_IF_ERROR(HierarchicalDataReader::create(iter, *tablet_column.path_info_ptr(), - node, root)); + RETURN_IF_ERROR(HierarchicalDataReader::create(iter, relative_path, node, root)); } } else { // No such node, read from either sparse column or default column @@ -648,8 +673,11 @@ Status Segment::new_column_iterator(int32_t unique_id, std::unique_ptrcopy_pop_front(); + int32_t unique_id = col.unique_id() > 0 ? col.unique_id() : col.parent_unique_id(); + const auto* node = col.has_path_info() + ? _sub_column_tree[unique_id].find_exact(relative_path) + : nullptr; if (node != nullptr) { return node->data.reader.get(); } @@ -810,14 +838,19 @@ Status Segment::read_key_by_rowid(uint32_t row_id, std::string* key) { } bool Segment::same_with_storage_type(int32_t cid, const Schema& schema, - bool ignore_children) const { - auto file_column_type = get_data_type_of(schema.column(cid)->path(), - schema.column(cid)->is_nullable(), ignore_children); - auto expected_type = Schema::get_data_type_ptr(*schema.column(cid)); + bool read_flat_leaves) const { + const auto* col = schema.column(cid); + auto file_column_type = + get_data_type_of(ColumnIdentifier {.unique_id = col->unique_id(), + .parent_unique_id = col->parent_unique_id(), + .path = col->path(), + .is_nullable = col->is_nullable()}, + read_flat_leaves); + auto expected_type = Schema::get_data_type_ptr(*col); #ifndef NDEBUG if (file_column_type && !file_column_type->equals(*expected_type)) { VLOG_DEBUG << fmt::format("Get column {}, file column type {}, exepected type {}", - schema.column(cid)->name(), file_column_type->get_name(), + col->name(), file_column_type->get_name(), expected_type->get_name()); } #endif @@ -843,7 +876,10 @@ Status Segment::seek_and_read_by_rowid(const TabletSchema& schema, SlotDescripto vectorized::PathInDataPtr path = std::make_shared( schema.column_by_uid(slot->col_unique_id()).name_lower_case(), slot->column_paths()); - auto storage_type = get_data_type_of(path, slot->is_nullable(), false); + auto storage_type = get_data_type_of(ColumnIdentifier {.unique_id = slot->col_unique_id(), + .path = path, + .is_nullable = slot->is_nullable()}, + false); vectorized::MutableColumnPtr file_storage_column = storage_type->create_column(); DCHECK(storage_type != nullptr); TabletColumn column = TabletColumn::create_materialized_variant_column( diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 1460cbd5950ddf..050255d7644ebd 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -150,14 +150,19 @@ class Segment : public std::enable_shared_from_this { void remove_from_segment_cache() const; + // Identify the column by unique id or path info + struct ColumnIdentifier { + int32_t unique_id = -1; + int32_t parent_unique_id = -1; + vectorized::PathInDataPtr path; + bool is_nullable = false; + }; // Get the inner file column's data type // ignore_chidren set to false will treat field as variant // when it contains children with field paths. // nullptr will returned if storage type does not contains such column - std::shared_ptr get_data_type_of(vectorized::PathInDataPtr path, - bool is_nullable, - bool ignore_children) const; - + std::shared_ptr get_data_type_of( + const ColumnIdentifier& identifier, bool read_flat_leaves) const; // Check is schema read type equals storage column type bool same_with_storage_type(int32_t cid, const Schema& schema, bool ignore_children) const; @@ -166,8 +171,12 @@ class Segment : public std::enable_shared_from_this { bool can_apply_predicate_safely(int cid, Predicate* pred, const Schema& schema, ReaderType read_type) const { const Field* col = schema.column(cid); - vectorized::DataTypePtr storage_column_type = get_data_type_of( - col->path(), col->is_nullable(), read_type != ReaderType::READER_QUERY); + vectorized::DataTypePtr storage_column_type = + get_data_type_of(ColumnIdentifier {.unique_id = col->unique_id(), + .parent_unique_id = col->parent_unique_id(), + .path = col->path(), + .is_nullable = col->is_nullable()}, + read_type != ReaderType::READER_QUERY); if (storage_column_type == nullptr) { // Default column iterator return true; @@ -239,10 +248,12 @@ class Segment : public std::enable_shared_from_this { // Each node in the tree represents the sub column reader and type // for variants. - SubcolumnColumnReaders _sub_column_tree; + // map column unique id --> it's sub column readers + std::map _sub_column_tree; // each sprase column's path and types info - SubcolumnColumnReaders _sparse_column_tree; + // map column unique id --> it's sparse sub column readers + std::map _sparse_column_tree; // used to guarantee that short key index will be loaded at most once in a thread-safe way DorisCallOnce _load_index_once; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index b9f9615f008fa2..90635bf04018f0 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -333,7 +333,12 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { const Field* col = _schema->column(i); if (col) { auto storage_type = _segment->get_data_type_of( - col->path(), col->is_nullable(), + Segment::ColumnIdentifier { + col->unique_id(), + col->parent_unique_id(), + col->path(), + col->is_nullable(), + }, _opts.io_ctx.reader_type != ReaderType::READER_QUERY); if (storage_type == nullptr) { storage_type = vectorized::DataTypeFactory::instance().create_data_type(*col); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 4b0952fb99cde5..44f71f23b02a93 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -269,7 +269,12 @@ class SegmentIterator : public RowwiseIterator { continue; } vectorized::DataTypePtr storage_type = _segment->get_data_type_of( - _schema->column(cid)->path(), _schema->column(cid)->is_nullable(), false); + Segment::ColumnIdentifier { + .unique_id = _schema->column(cid)->unique_id(), + .parent_unique_id = _schema->column(cid)->parent_unique_id(), + .path = _schema->column(cid)->path(), + .is_nullable = _schema->column(cid)->is_nullable()}, + false); if (storage_type && !storage_type->equals(*block->get_by_position(block_cid).type)) { // Do additional cast vectorized::MutableColumnPtr tmp = storage_type->create_column(); diff --git a/be/src/vec/columns/subcolumn_tree.h b/be/src/vec/columns/subcolumn_tree.h index 30d6c36ba134d9..4caaa1e94736cc 100644 --- a/be/src/vec/columns/subcolumn_tree.h +++ b/be/src/vec/columns/subcolumn_tree.h @@ -137,9 +137,15 @@ class SubcolumnsTree { /// flag, which is true if node already exists. using NodeCreator = std::function; + // create root as SCALAR node + void create_root(NodeData&& leaf_data) { + root = std::make_shared(Node::SCALAR, std::move(leaf_data)); + leaves.push_back(root); + } + // create root as SCALAR node void create_root(const NodeData& leaf_data) { - root = std::make_shared(Node::SCALAR, leaf_data); + root = std::make_shared(Node::SCALAR, std::move(leaf_data)); leaves.push_back(root); } diff --git a/regression-test/data/variant_p0/column_name.out b/regression-test/data/variant_p0/column_name.out index 7e1f23d4ac8b56..6ac882d29225d2 100644 --- a/regression-test/data/variant_p0/column_name.out +++ b/regression-test/data/variant_p0/column_name.out @@ -37,7 +37,7 @@ UPPER CASE lower case \N \N \N -\N +"" "" 1234566 16 diff --git a/regression-test/data/variant_p0/schema_change/rename.out b/regression-test/data/variant_p0/schema_change/rename.out new file mode 100644 index 00000000000000..a4e01839129fe5 --- /dev/null +++ b/regression-test/data/variant_p0/schema_change/rename.out @@ -0,0 +1,13 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +0 {"k1":1,"k2":"hello world","k3":[1234],"k4":1.1,"k5":[[123]]} + +-- !sql -- +0 {"k1":1,"k2":"hello world","k3":[1234],"k4":1.1,"k5":[[123]]} \N +2 {"xxxx":1234} {"yyyy":1.1111} + +-- !sql -- +0 {"k1":1,"k2":"hello world","k3":[1234],"k4":1.1,"k5":[[123]]} \N +2 {"xxxx":1234} \N +2 {"xxxx":1234} \N + diff --git a/regression-test/suites/variant_p0/schema_change/rename.groovy b/regression-test/suites/variant_p0/schema_change/rename.groovy new file mode 100644 index 00000000000000..5f0fdf9a846386 --- /dev/null +++ b/regression-test/suites/variant_p0/schema_change/rename.groovy @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("regression_test_variant_column_rename", "variant_type"){ + sql "DROP TABLE IF EXISTS variant_renam" + sql """ + CREATE TABLE IF NOT EXISTS variant_renam( + k bigint not null, + v variant not null + ) + DUPLICATE KEY(`k`) + DISTRIBUTED BY HASH(k) BUCKETS 4 + properties("replication_num" = "1"); + """ + + sql """INSERT INTO variant_renam SELECT *, '{"k1":1, "k2": "hello world", "k3" : [1234], "k4" : 1.10000, "k5" : [[123]]}' FROM numbers("number" = "1")""" + sql """alter table variant_renam rename column v va"""; + qt_sql """select * from variant_renam""" + + // drop column and add the same name column + sql """alter table variant_renam add column v2 variant default null""" + sql """insert into variant_renam values (2, '{"xxxx" : 1234}', '{"yyyy" : 1.1111}')""" + qt_sql "select * from variant_renam order by k" + sql """alter table variant_renam drop column v2""" + sql """insert into variant_renam values (2, '{"xxxx" : 1234}')""" + sql """alter table variant_renam add column v2 variant default null""" + qt_sql "select * from variant_renam order by k" +} \ No newline at end of file