Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
eldenmoon committed Aug 9, 2024
1 parent eb75aba commit c62d5a6
Show file tree
Hide file tree
Showing 9 changed files with 175 additions and 111 deletions.
83 changes: 58 additions & 25 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,7 @@ bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) {
return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column;
}

void ColumnObject::Subcolumn::finalize() {
void ColumnObject::Subcolumn::finalize(FinalizeMode mode) {
if (is_finalized()) {
return;
}
Expand All @@ -666,8 +666,8 @@ void ColumnObject::Subcolumn::finalize() {
return;
}
DataTypePtr to_type = least_common_type.get();
if (is_root) {
// Root always JSONB type
if (mode == FinalizeMode::WRITE_MODE && is_root) {
// Root always JSONB type in write mode
to_type = is_nullable ? make_nullable(std::make_shared<MostCommonType>())
: std::make_shared<MostCommonType>();
least_common_type = LeastCommonType {to_type};
Expand Down Expand Up @@ -944,6 +944,10 @@ void ColumnObject::insert_default() {
}

void ColumnObject::Subcolumn::get(size_t n, Field& res) const {
if (least_common_type.get_base_type_id() == TypeIndex::Nothing) {
res = Null();
return;
}
if (is_finalized()) {
if (least_common_type.get_base_type_id() == TypeIndex::JSONB) {
// JsonbField is a special case
Expand All @@ -955,10 +959,6 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) const {

size_t ind = n;
if (ind < num_of_defaults_in_prefix) {
if (least_common_type.get_base_type_id() == TypeIndex::Nothing) {
res = Null();
return;
}
res = least_common_type.get()->get_default();
return;
}
Expand Down Expand Up @@ -1086,7 +1086,7 @@ void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t le
}
}
num_rows += length;
finalize();
finalize(FinalizeMode::READ_MODE);
#ifndef NDEBUG
check_consistency();
#endif
Expand Down Expand Up @@ -1314,7 +1314,7 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type, int row,
const PathInData& path) {
// skip nulls
if (nullable->is_null_at(row)) {
if (nullable && nullable->is_null_at(row)) {
return true;
}
// check if it is empty nested json array, then skip
Expand All @@ -1334,7 +1334,7 @@ bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type, in
}
}
// skip empty jsonb value
if ((path.empty() && nullable->get_data_at(row).empty())) {
if ((path.empty() && nullable && nullable->get_data_at(row).empty())) {
return true;
}
// skip nothing type
Expand All @@ -1355,7 +1355,7 @@ Status find_and_set_leave_value(const IColumn* column, const PathInData& path,
"failed to set value for path {}, expected type {}, but got {} at row {}",
path.get_path(), type->get_name(), column->get_name(), row);
}
const auto* nullable = assert_cast<const ColumnNullable*>(column);
const auto* nullable = check_and_get_column<ColumnNullable>(column);
if (skip_empty_json(nullable, type, row, path)) {
return Status::OK();
}
Expand Down Expand Up @@ -1416,7 +1416,7 @@ void get_json_by_column_tree(rapidjson::Value& root, rapidjson::Document::Alloca

Status ColumnObject::serialize_one_row_to_string(int row, std::string* output) const {
if (!is_finalized()) {
const_cast<ColumnObject*>(this)->finalize();
const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
}
rapidjson::StringBuffer buf;
if (is_scalar_variant()) {
Expand All @@ -1432,7 +1432,7 @@ Status ColumnObject::serialize_one_row_to_string(int row, std::string* output) c

Status ColumnObject::serialize_one_row_to_string(int row, BufferWritable& output) const {
if (!is_finalized()) {
const_cast<ColumnObject*>(this)->finalize();
const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
}
if (is_scalar_variant()) {
auto type = get_root_type();
Expand Down Expand Up @@ -1593,34 +1593,67 @@ Status ColumnObject::merge_sparse_to_root_column() {
return Status::OK();
}

/// Finalizes this column only when it has not been finalized yet;
/// an already-finalized column is left untouched.
void ColumnObject::finalize_if_not() {
    if (is_finalized()) {
        return;
    }
    finalize();
}

void ColumnObject::finalize(bool ignore_sparse) {
void ColumnObject::finalize(FinalizeMode mode) {
Subcolumns new_subcolumns;
// finalize root first
if (!ignore_sparse || !is_null_root()) {
if (mode == FinalizeMode::WRITE_MODE || !is_null_root()) {
new_subcolumns.create_root(subcolumns.get_root()->data);
new_subcolumns.get_mutable_root()->data.finalize();
new_subcolumns.get_mutable_root()->data.finalize(mode);
}
for (auto&& entry : subcolumns) {
const auto& least_common_type = entry->data.get_least_common_type();
/// Do not add subcolumns that consist only of NULLs
if (is_nothing(get_base_type_of_array(least_common_type))) {
continue;
}
entry->data.finalize();

// unnest all nested columns
if (mode == FinalizeMode::WRITE_MODE &&
least_common_type->equals(*ColumnObject::NESTED_TYPE)) {
entry->data.finalize(mode);
auto nested_column = entry->data.get_finalized_column_ptr()->assume_mutable();
auto* nested_column_nullable = assert_cast<ColumnNullable*>(nested_column.get());
auto* nested_column_array = assert_cast<ColumnArray*>(
nested_column_nullable->get_nested_column_ptr().get());
auto& offset = nested_column_array->get_offsets_ptr();

auto* nested_object_nullable = assert_cast<ColumnNullable*>(
nested_column_array->get_data_ptr()->assume_mutable().get());
auto& nested_object_column =
assert_cast<ColumnObject&>(nested_object_nullable->get_nested_column());
PathInData nested_path = entry->path;
// nested_path.set_nested(nested_path.get_parts().size() - 1);
for (auto& nested_entry : nested_object_column.subcolumns) {
if (nested_entry->data.least_common_type.get_base_type_id() == TypeIndex::Nothing) {
continue;
}
nested_entry->data.finalize(FinalizeMode::READ_MODE);
PathInDataBuilder path_builder;
path_builder.append(nested_path.get_parts(), false);
path_builder.append(nested_entry->path.get_parts(), true);
auto subnested_column = ColumnArray::create(
ColumnNullable::create(nested_entry->data.get_finalized_column_ptr(),
nested_object_nullable->get_null_map_column_ptr()),
offset);
auto nullable_subnested_column = ColumnNullable::create(
subnested_column, nested_column_nullable->get_null_map_column_ptr());
auto type = make_nullable(std::make_shared<DataTypeArray>(
nested_entry->data.least_common_type.get()));
Subcolumn subcolumn(nullable_subnested_column->assume_mutable(), type, is_nullable);
new_subcolumns.add(path_builder.build(), subcolumn);
}
continue;
}

entry->data.finalize(mode);
entry->data.wrapp_array_nullable();

if (entry->data.is_root) {
continue;
}

// Check and split sparse subcolumns
if (!ignore_sparse && (entry->data.check_if_sparse_column(num_rows))) {
if (mode == FinalizeMode::WRITE_MODE && (entry->data.check_if_sparse_column(num_rows))) {
// TODO separate ambiguous path
sparse_columns.add(entry->path, entry->data);
continue;
Expand All @@ -1634,7 +1667,7 @@ void ColumnObject::finalize(bool ignore_sparse) {
}

// No-argument finalize: delegates to the parameterized finalize overload.
// NOTE(review): the two calls below appear to be the before/after lines of the
// same diff hunk -- the old `finalize(true)` (bool overload) and its replacement
// `finalize(FinalizeMode::READ_MODE)`. Confirm that only one of them is present
// in the real file.
void ColumnObject::finalize() {
finalize(true);
finalize(FinalizeMode::READ_MODE);
}

void ColumnObject::ensure_root_node_type(const DataTypePtr& expected_root_type) {
Expand Down
12 changes: 7 additions & 5 deletions be/src/vec/columns/column_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
// and modified by Doris

#pragma once
#include <butil/compiler_specific.h>
#include <glog/logging.h>
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
Expand Down Expand Up @@ -94,6 +95,9 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
// Using jsonb type as most common type, since it's adopted all types of json
using MostCommonType = DataTypeJsonb;
constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
// Finalize mode for subcolumns. Write mode additionally deals with sparse columns and only takes effect when flushing a block to segments.
// Otherwise, read mode should be used as the default mode.
enum class FinalizeMode { WRITE_MODE, READ_MODE };
class Subcolumn {
public:
Subcolumn() = default;
Expand Down Expand Up @@ -151,7 +155,7 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {

/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
void finalize(FinalizeMode mode);

/// Returns last inserted field.
Field get_last_field() const;
Expand Down Expand Up @@ -364,7 +368,7 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
void remove_subcolumns(const std::unordered_set<std::string>& keys);

// use sparse_subcolumns_schema to record sparse column's path info and type
void finalize(bool ignore_sparser);
void finalize(FinalizeMode mode);

/// Finalizes all subcolumns.
void finalize() override;
Expand All @@ -373,12 +377,10 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {

// Obtains a mutable column via IColumn::mutate(get_ptr()), finalizes it as a
// ColumnObject, and returns it.
// NOTE(review): the two finalize calls below appear to be the before/after lines
// of the same diff hunk (old no-argument call, new READ_MODE call). Confirm that
// only one of them is present in the real file.
MutableColumnPtr clone_finalized() const {
auto finalized = IColumn::mutate(get_ptr());
static_cast<ColumnObject*>(finalized.get())->finalize();
static_cast<ColumnObject*>(finalized.get())->finalize(FinalizeMode::READ_MODE);
return finalized;
}

void finalize_if_not();

void clear() override;

void resize(size_t n) override;
Expand Down
15 changes: 0 additions & 15 deletions be/src/vec/common/schema_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,21 +534,6 @@ Status parse_variant_columns(Block& block, const std::vector<int>& variant_pos,
return Status::OK();
}

/// Finalizes every variant column of `block` whose position is listed in
/// `variant_pos`.
///
/// @param block          Block whose columns are finalized in place.
/// @param variant_pos    Positions (within `block`) of the variant columns.
/// @param ignore_sparse  Forwarded unchanged to ColumnObject::finalize(bool).
void finalize_variant_columns(Block& block, const std::vector<int>& variant_pos,
                              bool ignore_sparse) {
    // Range-for avoids the signed/unsigned comparison of an `int` index
    // against `variant_pos.size()`.
    for (const int pos : variant_pos) {
        auto& column_ref = block.get_by_position(pos).column->assume_mutable_ref();
        // A nullable column wraps the ColumnObject as its nested column; unwrap it first.
        auto& column =
                column_ref.is_nullable()
                        ? assert_cast<ColumnObject&>(
                                  assert_cast<ColumnNullable&>(column_ref).get_nested_column())
                        : assert_cast<ColumnObject&>(column_ref);
        column.finalize(ignore_sparse);
    }
}

Status encode_variant_sparse_subcolumns(ColumnObject& column) {
// Make sure the root node is jsonb storage type
auto expected_root_type = make_nullable(std::make_shared<ColumnObject::MostCommonType>());
Expand Down
2 changes: 0 additions & 2 deletions be/src/vec/common/schema_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,6 @@ struct ParseContext {
// 3. encode sparse sub columns
Status parse_variant_columns(Block& block, const std::vector<int>& variant_pos,
const ParseContext& ctx);
void finalize_variant_columns(Block& block, const std::vector<int>& variant_pos,
bool ignore_sparse = true);
Status encode_variant_sparse_subcolumns(ColumnObject& column);

// Pick the tablet schema with the highest schema version as the reference.
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/json/path_in_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class PathInData {
const vectorized::String& get_path() const { return path; }
const Parts& get_parts() const { return parts; }
bool is_nested(size_t i) const { return parts[i].is_nested; }
// Marks part `i` of the path as nested; always returns true.
// NOTE(review): this does not touch the `has_nested` flag reported by
// has_nested_part() -- confirm whether callers expect it to be updated.
bool set_nested(size_t i) {
    parts[i].is_nested = true;
    return true;
}
bool has_nested_part() const { return has_nested; }
bool operator==(const PathInData& other) const { return parts == other.parts; }
struct Hash {
Expand Down
2 changes: 1 addition & 1 deletion be/src/vec/olap/olap_data_convertor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1111,7 +1111,7 @@ void OlapBlockDataConvertor::OlapColumnDataConvertorVariant::set_source_column(
}
// ensure data finalized
_source_column_ptr = &const_cast<ColumnObject&>(variant);
_source_column_ptr->finalize(false);
_source_column_ptr->finalize(ColumnObject::FinalizeMode::WRITE_MODE);
_root_data_convertor = std::make_unique<OlapColumnDataConvertorVarChar>(true);
_root_data_convertor->set_source_column(
{_source_column_ptr->get_root()->get_ptr(), nullptr, ""}, row_pos, num_rows);
Expand Down
Loading

0 comments on commit c62d5a6

Please sign in to comment.