Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Performance](Variant) Improve load performance for variant type #33890

Merged
merged 5 commits into from
May 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 143 additions & 46 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "util/defer_op.h"
#include "util/simd/bits.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/aggregate_functions/helpers.h"
#include "vec/columns/column.h"
#include "vec/columns/column_array.h"
#include "vec/columns/column_nullable.h"
Expand All @@ -55,6 +56,7 @@
#include "vec/common/field_visitors.h"
#include "vec/common/schema_util.h"
#include "vec/common/string_buffer.hpp"
#include "vec/common/string_ref.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/core/field.h"
#include "vec/core/types.h"
Expand All @@ -67,6 +69,7 @@
#include "vec/data_types/data_type_nothing.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/get_least_supertype.h"
#include "vec/json/path_in_data.h"

#ifdef __AVX2__
#include "util/jsonb_parser_simd.h"
Expand All @@ -77,23 +80,22 @@
namespace doris::vectorized {
namespace {

// Builds the data type for a scalar `type` nested inside `num_dimensions`
// levels of array.
//  - The most common (JSONB) type is returned as-is, without any array
//    wrapping; `num_dimensions` is ignored for it.
//  - Any other type is wrapped `num_dimensions` times in DataTypeArray, and
//    every array level is additionally wrapped in nullable when `is_nullable`
//    is set.
// NOTE(review): this span looks like a diff rendered without +/- markers, so
// the DataTypePtr-based (old) and TypeIndex-based (new) variants appear
// interleaved line by line -- confirm against the real file before editing.
DataTypePtr create_array_of_type(DataTypePtr type, size_t num_dimensions, bool is_nullable) {
const DataTypeNullable* nullable = typeid_cast<const DataTypeNullable*>(type.get());
if ((nullable &&
typeid_cast<const ColumnObject::MostCommonType*>(nullable->get_nested_type().get())) ||
typeid_cast<const ColumnObject::MostCommonType*>(type.get())) {
DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool is_nullable) {
if (type == ColumnObject::MOST_COMMON_TYPE_ID) {
// The JSONB type MUST NOT be wrapped in an ARRAY column; it has to stay at
// the top level, so num_dimensions is ignored here.
return type;
return is_nullable ? make_nullable(std::make_shared<ColumnObject::MostCommonType>())
: std::make_shared<ColumnObject::MostCommonType>();
}
// Start from the (optionally nullable) scalar type and add one array level
// per dimension.
DataTypePtr result = DataTypeFactory::instance().create_data_type(type, is_nullable);
for (size_t i = 0; i < num_dimensions; ++i) {
type = std::make_shared<DataTypeArray>(std::move(type));
result = std::make_shared<DataTypeArray>(result);
if (is_nullable) {
// Wrap each array level with nullable as well.
type = make_nullable(type);
result = make_nullable(result);
}
}
return type;
return result;
}

DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
Expand Down Expand Up @@ -148,6 +150,63 @@ class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
}
};

// Visitor that determines the scalar type of a single, non-complex field.
// It is a faster alternative to FieldVisitorToScalarType, which it can replace
// only for fields that contain no complex values: visiting an Array throws.
//
// Every overload returns 1 (except the Array overload, which never returns).
// The detected type is reported through get_scalar_type() and stays
// TypeIndex::Nothing when only a Null has been visited.
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
public:
    // Arrays are complex fields and are not supported by this visitor.
    size_t operator()(const Array& /*x*/) {
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
    }

    // Picks the narrowest signed integer type whose maximum can hold `x`.
    // Anything above the Int32 maximum is reported as Int64.
    size_t operator()(const UInt64& x) {
        if (x <= std::numeric_limits<Int8>::max()) {
            type = TypeIndex::Int8;
        } else if (x <= std::numeric_limits<Int16>::max()) {
            type = TypeIndex::Int16;
        } else if (x <= std::numeric_limits<Int32>::max()) {
            type = TypeIndex::Int32;
        } else {
            type = TypeIndex::Int64;
        }
        return 1;
    }

    // Picks the narrowest signed integer type whose [min, max] range holds `x`.
    size_t operator()(const Int64& x) {
        if (x <= std::numeric_limits<Int8>::max() && x >= std::numeric_limits<Int8>::min()) {
            type = TypeIndex::Int8;
        } else if (x <= std::numeric_limits<Int16>::max() &&
                   x >= std::numeric_limits<Int16>::min()) {
            type = TypeIndex::Int16;
        } else if (x <= std::numeric_limits<Int32>::max() &&
                   x >= std::numeric_limits<Int32>::min()) {
            type = TypeIndex::Int32;
        } else {
            type = TypeIndex::Int64;
        }
        return 1;
    }

    size_t operator()(const JsonbField& /*x*/) {
        type = TypeIndex::JSONB;
        return 1;
    }

    // A Null only records that a null was seen; the type is left untouched.
    size_t operator()(const Null&) {
        have_nulls = true;
        return 1;
    }

    // Fallback for all remaining field types: use the type id of the nearest
    // field type.
    template <typename T>
    size_t operator()(const T&) {
        type = TypeId<NearestFieldType<T>>::value;
        return 1;
    }

    // Writes the detected type id to `*data_type`.
    void get_scalar_type(TypeIndex* data_type) const { *data_type = type; }

    // True if a Null field has been visited.
    bool contain_nulls() const { return have_nulls; }

    // A single scalar never needs conversion.
    bool need_convert_field() const { return false; }

private:
    TypeIndex type = TypeIndex::Nothing;
    // Must be initialized: only the Null overload ever assigns this flag, so
    // without an initializer contain_nulls() would read an indeterminate value
    // after visiting any non-null field.
    bool have_nulls = false;
};

/// Visitor that allows to get type of scalar field
/// or least common type of scalars in array.
/// More optimized version of FieldToDataType.
Expand Down Expand Up @@ -207,8 +266,10 @@ class FieldVisitorToScalarType : public StaticVisitor<size_t> {
type_indexes.insert(TypeId<NearestFieldType<T>>::value);
return 0;
}
void get_scalar_type(DataTypePtr* type) const {
get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, type);
void get_scalar_type(TypeIndex* type) const {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: method 'get_scalar_type' can be made static [readability-convert-member-functions-to-static]

Suggested change
void get_scalar_type(TypeIndex* type) const {
static void get_scalar_type(TypeIndex* type) {

DataTypePtr data_type;
get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, &data_type);
*type = data_type->get_type_id();
}
bool contain_nulls() const { return have_nulls; }
bool need_convert_field() const { return field_types.size() > 1; }
Expand All @@ -220,20 +281,30 @@ class FieldVisitorToScalarType : public StaticVisitor<size_t> {
};

} // namespace
void get_field_info(const Field& field, FieldInfo* info) {
FieldVisitorToScalarType to_scalar_type_visitor;

// Runs `Visitor` over `field` and stores the result in `*info`: the detected
// scalar type, whether nulls were seen, whether the field needs conversion,
// and the number of array dimensions (computed by a separate
// FieldVisitorToNumberOfDimensions pass).
// `Visitor` must be default-constructible and provide get_scalar_type(),
// contain_nulls() and need_convert_field().
// NOTE(review): this span looks like a diff rendered without +/- markers, so
// the DataTypePtr-based (old) and TypeIndex-based (new) lines are interleaved
// -- confirm against the real file before editing.
template <typename Visitor>
void get_field_info_impl(const Field& field, FieldInfo* info) {
Visitor to_scalar_type_visitor;
apply_visitor(to_scalar_type_visitor, field);
DataTypePtr type = nullptr;
to_scalar_type_visitor.get_scalar_type(&type);
// type_id is assigned by get_scalar_type() below.
TypeIndex type_id;
to_scalar_type_visitor.get_scalar_type(&type_id);
// The dimensions of array items may mismatch, e.g. [1, 2, [1, 2, 3]].
*info = {
type,
type_id,
to_scalar_type_visitor.contain_nulls(),
to_scalar_type_visitor.need_convert_field(),
apply_visitor(FieldVisitorToNumberOfDimensions(), field),
};
}

// Fills `*info` with the type information of `field`.
// Non-complex fields take the cheaper SimpleFieldVisitorToScalarType path;
// complex fields go through the general FieldVisitorToScalarType.
void get_field_info(const Field& field, FieldInfo* info) {
    if (!field.is_complex_field()) {
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
        return;
    }
    get_field_info_impl<FieldVisitorToScalarType>(field, info);
}

ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, bool is_nullable_,
bool is_root_)
: least_common_type(type), is_nullable(is_nullable_), is_root(is_root_) {
Expand Down Expand Up @@ -284,8 +355,8 @@ void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) {
}

void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
auto base_type = std::move(info.scalar_type);
if (is_nothing(base_type)) {
auto base_type = WhichDataType(info.scalar_type_id);
if (base_type.is_nothing()) {
insertDefault();
return;
}
Expand All @@ -294,7 +365,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
if (is_nothing(least_common_type.get_base())) {
column_dim = value_dim;
}
if (is_nothing(base_type)) {
if (base_type.is_nothing()) {
value_dim = column_dim;
}
bool type_changed = false;
Expand All @@ -304,29 +375,30 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
"Dimension of types mismatched between inserted value and column, "
"expected:{}, but meet:{} for type:{}",
column_dim, value_dim, least_common_type.get()->get_name());
base_type = std::make_shared<MostCommonType>();
base_type = MOST_COMMON_TYPE_ID;
value_dim = 0;
type_changed = true;
}
if (is_nullable && !is_nothing(base_type)) {
base_type = make_nullable(base_type);
}

const auto& least_common_base_type = least_common_type.get_base();
if (data.empty()) {
add_new_column_part(create_array_of_type(std::move(base_type), value_dim, is_nullable));
} else if (!least_common_base_type->equals(*base_type) && !is_nothing(base_type)) {
if (!schema_util::is_conversion_required_between_integers(*base_type,
*least_common_base_type)) {
add_new_column_part(create_array_of_type(base_type.idx, value_dim, is_nullable));
} else if (least_common_type.get_type_id() != base_type.idx && !base_type.is_nothing()) {
if (schema_util::is_conversion_required_between_integers(base_type.idx,
least_common_type.get_type_id())) {
LOG_EVERY_N(INFO, 100) << "Conversion between " << getTypeName(base_type.idx) << " and "
<< getTypeName(least_common_type.get_type_id());
DataTypePtr base_data_type;
TypeIndex base_data_type_id;
get_least_supertype<LeastSupertypeOnError::Jsonb>(
xiaokang marked this conversation as resolved.
Show resolved Hide resolved
DataTypes {std::move(base_type), least_common_base_type}, &base_type);
TypeIndexSet {base_type.idx, least_common_type.get_base_type_id()},
&base_data_type);
type_changed = true;
base_data_type_id = base_data_type->get_type_id();
if (is_nullable) {
base_type = make_nullable(base_type);
base_data_type = make_nullable(base_data_type);
}
if (!least_common_base_type->equals(*base_type)) {
if (!least_common_type.get_base()->equals(*base_data_type)) {
add_new_column_part(
create_array_of_type(std::move(base_type), value_dim, is_nullable));
create_array_of_type(base_data_type_id, value_dim, is_nullable));
}
}
}
Expand Down Expand Up @@ -577,6 +649,14 @@ ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_)
if (!WhichDataType(type).is_nothing()) {
least_common_type_serder = type->get_serde();
}
type_id = type->is_nullable() ? assert_cast<const DataTypeNullable*>(type.get())
->get_nested_type()
->get_type_id()
: type->get_type_id();
base_type_id = base_type->is_nullable() ? assert_cast<const DataTypeNullable*>(base_type.get())
->get_nested_type()
->get_type_id()
: base_type->get_type_id();
}

ColumnObject::ColumnObject(bool is_nullable_, bool create_root_)
Expand Down Expand Up @@ -676,14 +756,12 @@ void ColumnObject::try_insert(const Field& field) {
return;
}
const auto& object = field.get<const VariantMap&>();
phmap::flat_hash_set<std::string> inserted;
size_t old_size = size();
for (const auto& [key_str, value] : object) {
PathInData key;
if (!key_str.empty()) {
key = PathInData(key_str);
}
inserted.insert(key_str);
if (!has_subcolumn(key)) {
bool succ = add_sub_column(key, old_size);
if (!succ) {
Expand All @@ -699,7 +777,7 @@ void ColumnObject::try_insert(const Field& field) {
subcolumn->insert(value);
}
for (auto& entry : subcolumns) {
if (!inserted.contains(entry->path.get_path())) {
if (old_size == entry->data.size()) {
entry->data.insertDefault();
}
}
Expand Down Expand Up @@ -748,16 +826,6 @@ Status ColumnObject::try_insert_indices_from(const IColumn& src, const int* indi
return Status::OK();
}

// Describes this subcolumn's current least common type as a FieldInfo:
// its base type, whether that base type is nullable, and its number of
// dimensions. The result never requests a conversion.
FieldInfo ColumnObject::Subcolumn::get_subcolumn_field_info() const {
    const auto& base = least_common_type.get_base();
    const bool nullable_base = base->is_nullable();
    const auto dimensions = least_common_type.get_dimensions();
    return FieldInfo {
            .scalar_type = base,
            .have_nulls = nullable_base,
            .need_convert = false,
            .num_dimensions = dimensions,
    };
}

void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) {
#ifndef NDEBUG
check_consistency();
Expand Down Expand Up @@ -808,6 +876,33 @@ const ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key
return &node->data;
}

// Looks up the subcolumn for `key`, using `key_index` (the position of the key
// within the row being processed) as a slot in a per-position cache.
// The order of fields is almost always the same from row to row, so a quick
// comparison against the entry remembered for this position usually avoids
// the regular get_subcolumn() lookup.
// Returns nullptr when no subcolumn exists for `key`; misses are not cached.
const ColumnObject::Subcolumn* ColumnObject::get_subcolumn_with_cache(const PathInData& key,
                                                                      size_t key_index) const {
    // Fast path: the slot exists, holds a subcolumn, and was stored for this key.
    if (key_index < _prev_positions.size()) {
        const auto& cached = _prev_positions[key_index];
        if (cached.second != nullptr && key == cached.first) {
            return cached.second;
        }
    }

    // Slow path: regular lookup, then remember the result for this position.
    const auto* found = get_subcolumn(key);
    if (_prev_positions.size() <= key_index) {
        _prev_positions.resize(key_index + 1);
    }
    if (found != nullptr) {
        _prev_positions[key_index] = std::make_pair(key, found);
    }
    return found;
}

// Mutable overload of the position-cached lookup: delegates to
// get_subcolumn_with_cache() and strips constness from the result.
ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key, size_t key_index) {
    const ColumnObject::Subcolumn* cached_result = get_subcolumn_with_cache(key, key_index);
    return const_cast<ColumnObject::Subcolumn*>(cached_result);
}

// Const overload of the position-cached lookup; forwards directly to
// get_subcolumn_with_cache().
const ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key,
                                                           size_t key_index) const {
    const ColumnObject::Subcolumn* cached_result = get_subcolumn_with_cache(key, key_index);
    return cached_result;
}

ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key) {
const auto* node = subcolumns.find_leaf(key);
if (node == nullptr) {
Expand Down Expand Up @@ -1233,6 +1328,7 @@ void ColumnObject::finalize(bool ignore_sparse) {
}
std::swap(subcolumns, new_subcolumns);
doc_structure = nullptr;
_prev_positions.clear();
}

void ColumnObject::finalize() {
Expand Down Expand Up @@ -1351,6 +1447,7 @@ void ColumnObject::clear() {
Subcolumns empty;
std::swap(empty, subcolumns);
num_rows = 0;
_prev_positions.clear();
}

void ColumnObject::revise_to(int target_num_rows) {
Expand Down
Loading
Loading