Skip to content

Commit

Permalink
fix and test
Browse files Browse the repository at this point in the history
  • Loading branch information
eldenmoon committed Aug 8, 2024
1 parent 5820cda commit d0ab4c0
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 23 deletions.
5 changes: 5 additions & 0 deletions be/src/olap/rowset/segment_v2/column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1686,6 +1686,11 @@ static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst,
auto new_array = make_nullable(vectorized::ColumnArray::create(
new_nested->assume_mutable(), sibling_array->get_offsets_ptr()->assume_mutable()));
dst->insert_range_from(*new_array, 0, new_array->size());
#ifndef NDEBUG
if (!dst_array->has_equal_offsets(*sibling_array)) {
throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Expected same array offsets");
}
#endif
}

Status DefaultNestedColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst,
Expand Down
25 changes: 16 additions & 9 deletions be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,9 @@ class HierarchicalDataReader : public ColumnIterator {
}
for (auto& entry : nested_subcolumns) {
MutableColumnPtr nested_object = ColumnObject::create(true, false);
MutableColumnPtr offset =
check_and_get_column<ColumnArray>(*remove_nullable(entry.second[0].column))
->get_offsets_ptr()
->assume_mutable();
const auto* base_array =
check_and_get_column<ColumnArray>(remove_nullable(entry.second[0].column));
MutableColumnPtr offset = base_array->get_offsets_ptr()->assume_mutable();
auto* nested_object_ptr = assert_cast<ColumnObject*>(nested_object.get());
// flatten nested arrays
for (const auto& subcolumn : entry.second) {
Expand All @@ -164,10 +163,17 @@ class HierarchicalDataReader : public ColumnIterator {
"Meet none array column when flatten nested array, path {}, type {}",
subcolumn.path.get_path(), subcolumn.type->get_name());
}
MutableColumnPtr flattend_column =
check_and_get_column<ColumnArray>(remove_nullable(subcolumn.column).get())
->get_data_ptr()
->assume_mutable();
const auto* target_array =
check_and_get_column<ColumnArray>(remove_nullable(subcolumn.column).get());
if (!base_array->has_equal_offsets(*target_array)) {
return Status::InvalidArgument(
"Meet none equal offsets array when flatten nested array, path {}, "
"type {}",
subcolumn.path.get_path(), subcolumn.type->get_name());
}
MutableColumnPtr flattend_column = check_and_get_column<ColumnArray>(target_array)
->get_data_ptr()
->assume_mutable();
DataTypePtr flattend_type =
check_and_get_data_type<DataTypeArray>(remove_nullable(type).get())
->get_nested_type();
Expand All @@ -179,7 +185,8 @@ class HierarchicalDataReader : public ColumnIterator {
nested_object = make_nullable(nested_object->get_ptr())->assume_mutable();
auto array =
make_nullable(ColumnArray::create(std::move(nested_object), std::move(offset)));
container_variant.add_sub_column(entry.first, array->assume_mutable(),
PathInData path_without_nested(entry.first.get_path());
container_variant.add_sub_column(path_without_nested, array->assume_mutable(),
ColumnObject::NESTED_TYPE);
}

Expand Down
1 change: 1 addition & 0 deletions be/src/olap/rowset/segment_v2/segment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,7 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column,
const auto* parent = _sub_column_tree.find_best_match(*tablet_column.path_info_ptr());
VLOG_DEBUG << "find with path " << tablet_column.path_info_ptr()->get_path()
<< " parent " << (parent ? parent->path.get_path() : "nullptr") << ", type "
<< ", parent is nested " << (parent ? parent->is_nested() : false) << ", "
<< TabletColumn::get_string_by_field_type(tablet_column.type());
if (parent && parent->is_nested()) {
/// Find any leaf of Nested subcolumn.
Expand Down
25 changes: 15 additions & 10 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) {

void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
auto base_type = WhichDataType(info.scalar_type_id);
if (base_type.is_nothing()) {
if (base_type.is_nothing() && info.num_dimensions == 0) {
insertDefault();
return;
}
Expand Down Expand Up @@ -457,12 +457,11 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
}

// Builds the data type of an array with `num_dimensions` levels of nesting whose
// innermost element type is created from `type`.
//
// The element type and every array level are each wrapped with make_nullable().
// When `num_dimensions` is 0 the loop does not run and the result is just the
// nullable element type.
static DataTypePtr create_array(TypeIndex type, size_t num_dimensions) {
    DataTypePtr result_type = make_nullable(DataTypeFactory::instance().create_data_type(type));
    // Each pass nests the type built so far inside one more nullable array level,
    // so the nesting depth of the result equals `num_dimensions`.
    for (size_t i = 0; i < num_dimensions; ++i) {
        result_type = make_nullable(std::make_shared<DataTypeArray>(result_type));
    }
    return result_type;
}

Array create_empty_array_field(size_t num_dimensions) {
Expand All @@ -484,7 +483,7 @@ Array create_empty_array_field(size_t num_dimensions) {
// Recreates column with default scalar values and keeps sizes of arrays.
static ColumnPtr recreate_column_with_default_values(const ColumnPtr& column, TypeIndex scalar_type,
size_t num_dimensions) {
const auto* column_array = check_and_get_column<ColumnArray>(column.get());
const auto* column_array = check_and_get_column<ColumnArray>(remove_nullable(column).get());
if (column_array && num_dimensions) {
return make_nullable(ColumnArray::create(
recreate_column_with_default_values(column_array->get_data_ptr(), scalar_type,
Expand All @@ -503,9 +502,11 @@ ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(
new_subcolumn.least_common_type =
LeastCommonType {create_array(field_info.scalar_type_id, field_info.num_dimensions)};

for (auto& part : new_subcolumn.data) {
part = recreate_column_with_default_values(part, field_info.scalar_type_id,
field_info.num_dimensions);
for (int i = 0; i < new_subcolumn.data.size(); ++i) {
new_subcolumn.data[i] = recreate_column_with_default_values(
new_subcolumn.data[i], field_info.scalar_type_id, field_info.num_dimensions);
new_subcolumn.data_types[i] = create_array_of_type(field_info.scalar_type_id,
field_info.num_dimensions, is_nullable);
}

return new_subcolumn;
Expand Down Expand Up @@ -883,6 +884,10 @@ void ColumnObject::insert_from(const IColumn& src, size_t n) {

void ColumnObject::try_insert(const Field& field) {
if (field.get_type() != Field::Types::VariantMap) {
if (field.is_null()) {
insert_default();
return;
}
auto* root = get_subcolumn({});
// Inserting into an empty ColumnObject may leave the root null,
// so creating a root column of Variant is expected.
Expand Down Expand Up @@ -1016,7 +1021,7 @@ void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo&
/// We find node that represents the same Nested type as @key.
const auto* nested_node = subcolumns.find_best_match(key);

if (nested_node && !nested_node->path.empty()) {
if (nested_node && nested_node->is_nested()) {
/// Find any leaf of Nested subcolumn.
const auto* leaf = Subcolumns::find_leaf(nested_node, [&](const auto&) { return true; });
assert(leaf);
Expand Down
3 changes: 0 additions & 3 deletions be/src/vec/columns/subcolumn_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,6 @@ class SubcolumnsTree {
return node;
}

// Hands back the parent of `node` as a non-const pointer (constness is cast away).
static Node* get_mutable_parent(const Node* node) {
    const Node* parent = node->parent;
    return const_cast<Node*>(parent);
}

// Reports whether the tree currently has no root node.
bool empty() const {
    const bool has_no_root = (root == nullptr);
    return has_no_root;
}
// Number of entries currently held in `leaves`.
size_t size() const {
    return leaves.size();
}

Expand Down
4 changes: 3 additions & 1 deletion regression-test/suites/variant_github_events_p2/load.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,14 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
properties("replication_num" = "1", "disable_auto_compaction" = "false", "bloom_filter_columns" = "v");
"""
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
set_be_config.call("variant_enable_flatten_nested", "true")
// 2015
load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""")
load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-1.json'}""")
load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-2.json'}""")
load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-3.json'}""")

// build inverted index at middle of loading the data
// // build inverted index at middle of loading the data
// ADD INDEX
sql """ ALTER TABLE github_events ADD INDEX idx_var (`v`) USING INVERTED PROPERTIES("parser" = "chinese", "parser_mode" = "fine_grained", "support_phrase" = "true") """
wait_for_latest_op_on_table_finish("github_events", timeout)
Expand Down Expand Up @@ -219,4 +220,5 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
qt_sql """select cast(v["payload"]["pull_request"]["additions"] as int) from github_events where cast(v["repo"]["name"] as string) = 'xpressengine/xe-core' order by 1;"""
qt_sql """select * from github_events where cast(v["repo"]["name"] as string) = 'xpressengine/xe-core' order by 1 limit 10"""
// TODO add test case that some certain columns are materialized in some file while others are not materialized (sparse)
set_be_config.call("variant_enable_flatten_nested", "false")
}

0 comments on commit d0ab4c0

Please sign in to comment.