Skip to content

Commit

Permalink
[Performance](Variant) Improve load performance for variant type (apa…
Browse files Browse the repository at this point in the history
…che#33890)

1. remove phmap for padding rows
2. add SimpleFieldVisitorToScalarType for short-circuit type deduction
3. correct type coercion for conflicting types between integers
4. improve nullable column performance
5. remove shared_ptr dependency for DataType; use TypeIndex instead
6. Optimization by caching the order of fields (which is almost always the same)
and a quick check to match the next expected field, instead of searching the hash table.

benchmark:
In clickbench data, load performance:
12m36.799s -> 7m10.934s, about a 43% latency reduction

In variant_p2/performance.groovy:
3min44s20 -> 1min15s80, about a 66% latency reduction
  • Loading branch information
eldenmoon authored and ByteYue committed May 15, 2024
1 parent 0a7f293 commit 6ca3bce
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 94 deletions.
189 changes: 143 additions & 46 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "util/defer_op.h"
#include "util/simd/bits.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/aggregate_functions/helpers.h"
#include "vec/columns/column.h"
#include "vec/columns/column_array.h"
#include "vec/columns/column_nullable.h"
Expand All @@ -56,6 +57,7 @@
#include "vec/common/field_visitors.h"
#include "vec/common/schema_util.h"
#include "vec/common/string_buffer.hpp"
#include "vec/common/string_ref.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/core/field.h"
#include "vec/core/types.h"
Expand All @@ -68,6 +70,7 @@
#include "vec/data_types/data_type_nothing.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/get_least_supertype.h"
#include "vec/json/path_in_data.h"

#ifdef __AVX2__
#include "util/jsonb_parser_simd.h"
Expand All @@ -78,23 +81,22 @@
namespace doris::vectorized {
namespace {

DataTypePtr create_array_of_type(DataTypePtr type, size_t num_dimensions, bool is_nullable) {
const DataTypeNullable* nullable = typeid_cast<const DataTypeNullable*>(type.get());
if ((nullable &&
typeid_cast<const ColumnObject::MostCommonType*>(nullable->get_nested_type().get())) ||
typeid_cast<const ColumnObject::MostCommonType*>(type.get())) {
DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool is_nullable) {
if (type == ColumnObject::MOST_COMMON_TYPE_ID) {
// JSONB type MUST NOT wrapped in ARRAY column, it should be top level.
// So we ignored num_dimensions.
return type;
return is_nullable ? make_nullable(std::make_shared<ColumnObject::MostCommonType>())
: std::make_shared<ColumnObject::MostCommonType>();
}
DataTypePtr result = DataTypeFactory::instance().create_data_type(type, is_nullable);
for (size_t i = 0; i < num_dimensions; ++i) {
type = std::make_shared<DataTypeArray>(std::move(type));
result = std::make_shared<DataTypeArray>(result);
if (is_nullable) {
// wrap array with nullable
type = make_nullable(type);
result = make_nullable(result);
}
}
return type;
return result;
}

DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
Expand Down Expand Up @@ -149,6 +151,63 @@ class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
}
};

// Visitor that deduces the type of a single scalar field.
// This is a faster alternative to FieldVisitorToScalarType for fields that are
// not complex: it records exactly one TypeIndex instead of collecting a set of
// candidate types and computing their least supertype.
// Array fields are not supported and are rejected with an exception.
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
public:
    // Arrays are out of scope for this visitor; always throws INVALID_ARGUMENT.
    size_t operator()(const Array&) {
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
    }

    // Unsigned integers are mapped to the narrowest signed integer type whose
    // maximum is >= x. Everything above the Int32 maximum is reported as Int64.
    size_t operator()(const UInt64& x) {
        if (x <= std::numeric_limits<Int8>::max()) {
            type = TypeIndex::Int8;
        } else if (x <= std::numeric_limits<Int16>::max()) {
            type = TypeIndex::Int16;
        } else if (x <= std::numeric_limits<Int32>::max()) {
            type = TypeIndex::Int32;
        } else {
            type = TypeIndex::Int64;
        }
        return 1;
    }

    // Signed integers are mapped to the narrowest signed integer type whose
    // [min, max] range contains x.
    size_t operator()(const Int64& x) {
        if (x <= std::numeric_limits<Int8>::max() && x >= std::numeric_limits<Int8>::min()) {
            type = TypeIndex::Int8;
        } else if (x <= std::numeric_limits<Int16>::max() &&
                   x >= std::numeric_limits<Int16>::min()) {
            type = TypeIndex::Int16;
        } else if (x <= std::numeric_limits<Int32>::max() &&
                   x >= std::numeric_limits<Int32>::min()) {
            type = TypeIndex::Int32;
        } else {
            type = TypeIndex::Int64;
        }
        return 1;
    }

    size_t operator()(const JsonbField&) {
        type = TypeIndex::JSONB;
        return 1;
    }

    // A Null field only sets the null flag; the deduced type is left untouched
    // (TypeIndex::Nothing unless another overload has already set it).
    size_t operator()(const Null&) {
        have_nulls = true;
        return 1;
    }

    // Fallback for every other field type: use the TypeIndex of its nearest field type.
    template <typename T>
    size_t operator()(const T&) {
        type = TypeId<NearestFieldType<T>>::value;
        return 1;
    }

    // Writes the deduced type into *data_type.
    void get_scalar_type(TypeIndex* data_type) const { *data_type = type; }

    // True only if a Null field has been visited.
    bool contain_nulls() const { return have_nulls; }

    // Always false for this visitor.
    bool need_convert_field() const { return false; }

private:
    TypeIndex type = TypeIndex::Nothing;
    // Must be initialized: contain_nulls() is read after visiting non-null
    // fields too, and reading an uninitialized bool is undefined behaviour.
    bool have_nulls = false;
};

/// Visitor that allows to get type of scalar field
/// or least common type of scalars in array.
/// More optimized version of FieldToDataType.
Expand Down Expand Up @@ -208,8 +267,10 @@ class FieldVisitorToScalarType : public StaticVisitor<size_t> {
type_indexes.insert(TypeId<NearestFieldType<T>>::value);
return 0;
}
void get_scalar_type(DataTypePtr* type) const {
get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, type);
void get_scalar_type(TypeIndex* type) const {
DataTypePtr data_type;
get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, &data_type);
*type = data_type->get_type_id();
}
bool contain_nulls() const { return have_nulls; }
bool need_convert_field() const { return field_types.size() > 1; }
Expand All @@ -221,20 +282,30 @@ class FieldVisitorToScalarType : public StaticVisitor<size_t> {
};

} // namespace
void get_field_info(const Field& field, FieldInfo* info) {
FieldVisitorToScalarType to_scalar_type_visitor;

template <typename Visitor>
void get_field_info_impl(const Field& field, FieldInfo* info) {
Visitor to_scalar_type_visitor;
apply_visitor(to_scalar_type_visitor, field);
DataTypePtr type = nullptr;
to_scalar_type_visitor.get_scalar_type(&type);
TypeIndex type_id;
to_scalar_type_visitor.get_scalar_type(&type_id);
// array item's dimension may mismatch, e.g. [1, 2, [1, 2, 3]]
*info = {
type,
type_id,
to_scalar_type_visitor.contain_nulls(),
to_scalar_type_visitor.need_convert_field(),
apply_visitor(FieldVisitorToNumberOfDimensions(), field),
};
}

// Fills *info with the type information deduced from `field`.
// Fields that are not complex take the cheaper SimpleFieldVisitorToScalarType
// path; complex fields go through the general FieldVisitorToScalarType.
void get_field_info(const Field& field, FieldInfo* info) {
    if (!field.is_complex_field()) {
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
        return;
    }
    get_field_info_impl<FieldVisitorToScalarType>(field, info);
}

ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, bool is_nullable_,
bool is_root_)
: least_common_type(type), is_nullable(is_nullable_), is_root(is_root_) {
Expand Down Expand Up @@ -285,8 +356,8 @@ void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) {
}

void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
auto base_type = std::move(info.scalar_type);
if (is_nothing(base_type)) {
auto base_type = WhichDataType(info.scalar_type_id);
if (base_type.is_nothing()) {
insertDefault();
return;
}
Expand All @@ -295,7 +366,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
if (is_nothing(least_common_type.get_base())) {
column_dim = value_dim;
}
if (is_nothing(base_type)) {
if (base_type.is_nothing()) {
value_dim = column_dim;
}
bool type_changed = false;
Expand All @@ -305,29 +376,30 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
"Dimension of types mismatched between inserted value and column, "
"expected:{}, but meet:{} for type:{}",
column_dim, value_dim, least_common_type.get()->get_name());
base_type = std::make_shared<MostCommonType>();
base_type = MOST_COMMON_TYPE_ID;
value_dim = 0;
type_changed = true;
}
if (is_nullable && !is_nothing(base_type)) {
base_type = make_nullable(base_type);
}

const auto& least_common_base_type = least_common_type.get_base();
if (data.empty()) {
add_new_column_part(create_array_of_type(std::move(base_type), value_dim, is_nullable));
} else if (!least_common_base_type->equals(*base_type) && !is_nothing(base_type)) {
if (!schema_util::is_conversion_required_between_integers(*base_type,
*least_common_base_type)) {
add_new_column_part(create_array_of_type(base_type.idx, value_dim, is_nullable));
} else if (least_common_type.get_type_id() != base_type.idx && !base_type.is_nothing()) {
if (schema_util::is_conversion_required_between_integers(base_type.idx,
least_common_type.get_type_id())) {
LOG_EVERY_N(INFO, 100) << "Conversion between " << getTypeName(base_type.idx) << " and "
<< getTypeName(least_common_type.get_type_id());
DataTypePtr base_data_type;
TypeIndex base_data_type_id;
get_least_supertype<LeastSupertypeOnError::Jsonb>(
DataTypes {std::move(base_type), least_common_base_type}, &base_type);
TypeIndexSet {base_type.idx, least_common_type.get_base_type_id()},
&base_data_type);
type_changed = true;
base_data_type_id = base_data_type->get_type_id();
if (is_nullable) {
base_type = make_nullable(base_type);
base_data_type = make_nullable(base_data_type);
}
if (!least_common_base_type->equals(*base_type)) {
if (!least_common_type.get_base()->equals(*base_data_type)) {
add_new_column_part(
create_array_of_type(std::move(base_type), value_dim, is_nullable));
create_array_of_type(base_data_type_id, value_dim, is_nullable));
}
}
}
Expand Down Expand Up @@ -578,6 +650,14 @@ ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_)
if (!WhichDataType(type).is_nothing()) {
least_common_type_serder = type->get_serde();
}
type_id = type->is_nullable() ? assert_cast<const DataTypeNullable*>(type.get())
->get_nested_type()
->get_type_id()
: type->get_type_id();
base_type_id = base_type->is_nullable() ? assert_cast<const DataTypeNullable*>(base_type.get())
->get_nested_type()
->get_type_id()
: base_type->get_type_id();
}

ColumnObject::ColumnObject(bool is_nullable_, bool create_root_)
Expand Down Expand Up @@ -677,14 +757,12 @@ void ColumnObject::try_insert(const Field& field) {
return;
}
const auto& object = field.get<const VariantMap&>();
phmap::flat_hash_set<std::string> inserted;
size_t old_size = size();
for (const auto& [key_str, value] : object) {
PathInData key;
if (!key_str.empty()) {
key = PathInData(key_str);
}
inserted.insert(key_str);
if (!has_subcolumn(key)) {
bool succ = add_sub_column(key, old_size);
if (!succ) {
Expand All @@ -700,7 +778,7 @@ void ColumnObject::try_insert(const Field& field) {
subcolumn->insert(value);
}
for (auto& entry : subcolumns) {
if (!inserted.contains(entry->path.get_path())) {
if (old_size == entry->data.size()) {
entry->data.insertDefault();
}
}
Expand Down Expand Up @@ -749,16 +827,6 @@ Status ColumnObject::try_insert_indices_from(const IColumn& src, const int* indi
return Status::OK();
}

// Builds a FieldInfo describing this subcolumn from its least common type:
// the base type, whether that base type is nullable, and the number of
// dimensions. need_convert is always false.
FieldInfo ColumnObject::Subcolumn::get_subcolumn_field_info() const {
    const auto& common_base = least_common_type.get_base();
    FieldInfo described {
            .scalar_type = common_base,
            .have_nulls = common_base->is_nullable(),
            .need_convert = false,
            .num_dimensions = least_common_type.get_dimensions(),
    };
    return described;
}

void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) {
#ifndef NDEBUG
check_consistency();
Expand Down Expand Up @@ -809,6 +877,33 @@ const ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key
return &node->data;
}

// Looks up the subcolumn for `key`, first consulting a positional cache.
// Fields usually arrive in the same order, so the entry remembered at
// `key_index` is checked before falling back to the regular lookup.
// Returns nullptr when no subcolumn exists for `key`; in that case the cache
// slot is created (if missing) but left unassigned.
const ColumnObject::Subcolumn* ColumnObject::get_subcolumn_with_cache(const PathInData& key,
                                                                      size_t key_index) const {
    // Fast path: the slot exists, holds a subcolumn, and was stored for the same key.
    if (key_index < _prev_positions.size()) {
        const auto& remembered = _prev_positions[key_index];
        if (remembered.second != nullptr && key == remembered.first) {
            return remembered.second;
        }
    }

    // Slow path: regular lookup, then remember the result at this position.
    const auto* found = get_subcolumn(key);
    if (_prev_positions.size() <= key_index) {
        _prev_positions.resize(key_index + 1);
    }
    if (found == nullptr) {
        return found;
    }
    _prev_positions[key_index] = std::make_pair(key, found);
    return found;
}

// Mutable overload of the cached lookup: delegates to get_subcolumn_with_cache()
// and strips constness from the returned pointer.
ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key, size_t key_index) {
    const ColumnObject::Subcolumn* located = get_subcolumn_with_cache(key, key_index);
    return const_cast<ColumnObject::Subcolumn*>(located);
}

// Const overload of the cached lookup: forwards to get_subcolumn_with_cache().
const ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key,
                                                           size_t key_index) const {
    const ColumnObject::Subcolumn* located = get_subcolumn_with_cache(key, key_index);
    return located;
}

ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key) {
const auto* node = subcolumns.find_leaf(key);
if (node == nullptr) {
Expand Down Expand Up @@ -1238,6 +1333,7 @@ void ColumnObject::finalize(bool ignore_sparse) {
}
std::swap(subcolumns, new_subcolumns);
doc_structure = nullptr;
_prev_positions.clear();
}

void ColumnObject::finalize() {
Expand Down Expand Up @@ -1356,6 +1452,7 @@ void ColumnObject::clear() {
Subcolumns empty;
std::swap(empty, subcolumns);
num_rows = 0;
_prev_positions.clear();
}

void ColumnObject::revise_to(int target_num_rows) {
Expand Down
Loading

0 comments on commit 6ca3bce

Please sign in to comment.