Skip to content

Commit

Permalink
[feature](agg) add aggregation function 'bitmap_agg' (#22768)
Browse files Browse the repository at this point in the history
This function can be used to replace bitmap_union(to_bitmap(expr)), because bitmap_union(to_bitmap(expr)) first needs to create many small bitmaps and then merge them into a single bitmap.
bitmap_agg converts the column values into a bitmap directly, so it performs better than bitmap_union(to_bitmap(expr)). In our test, the improvement was about 30%.
  • Loading branch information
mrhhsg authored Aug 10, 2023
1 parent 94d563f commit 57fb979
Show file tree
Hide file tree
Showing 9 changed files with 516 additions and 1 deletion.
56 changes: 56 additions & 0 deletions be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/aggregate_functions/aggregate_function_bitmap_agg.h"

#include "vec/aggregate_functions/aggregate_function_simple_factory.h"
#include "vec/aggregate_functions/helpers.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_nullable.h"

namespace doris::vectorized {

// Builds the bitmap_agg implementation whose value type matches the (nullable-stripped)
// type of the first argument.
//
// Template parameter:
//   nullable - forwarded to AggregateFunctionBitmapAgg as its arg_nullable flag.
// Parameters:
//   argument_types - argument types of the call; only argument_types[0] is inspected.
// Returns:
//   The created aggregate function, or nullptr (after logging a warning) when the
//   argument type matches none of the types enumerated by FOR_INTEGER_TYPES.
template <bool nullable>
AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_types) {
    auto type = remove_nullable(argument_types[0]);
    WhichDataType which(type);
// Expanded once per integer type: returns as soon as the type index matches.
#define DISPATCH(TYPE)                                                                        \
    if (which.idx == TypeIndex::TYPE) {                                                       \
        return std::make_shared<AggregateFunctionBitmapAgg<nullable, TYPE>>(argument_types); \
    }
    FOR_INTEGER_TYPES(DISPATCH)
#undef DISPATCH
    // The message names bitmap_agg (it previously named bitmap_union_int, which is a
    // different function and made the warning misleading).
    LOG(WARNING) << "with unknown type, failed in create_with_int_data_type bitmap_agg"
                 << " and type is: " << argument_types[0]->get_name();
    return nullptr;
}

// Factory entry point for bitmap_agg.
//
// Picks the nullable or non-nullable instantiation according to whether the first
// argument type reports itself as nullable. `name` and `result_is_nullable` are not
// used by this function.
AggregateFunctionPtr create_aggregate_function_bitmap_agg(const std::string& name,
                                                          const DataTypes& argument_types,
                                                          const bool result_is_nullable) {
    if (!argument_types[0]->is_nullable()) {
        return AggregateFunctionPtr(create_with_int_data_type<false>(argument_types));
    }
    return AggregateFunctionPtr(create_with_int_data_type<true>(argument_types));
}

// Registers the bitmap_agg creator with the given factory under the name "bitmap_agg",
// using the factory's register_function_both entry point.
void register_aggregate_function_bitmap_agg(AggregateFunctionSimpleFactory& factory) {
    const std::string function_name = "bitmap_agg";
    factory.register_function_both(function_name, create_aggregate_function_bitmap_agg);
}
} // namespace doris::vectorized
189 changes: 189 additions & 0 deletions be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <stddef.h>

#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
#include <memory>
#include <string>
#include <vector>

#include "util/bitmap_value.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/data_types/data_type_bitmap.h"

namespace doris {
namespace vectorized {
class Arena;
class BufferReadable;
class BufferWritable;
class IColumn;
} // namespace vectorized
} // namespace doris

namespace doris::vectorized {

// Aggregation state of bitmap_agg: a single BitmapValue that collects every value
// handed to add().
template <typename T>
struct AggregateFunctionBitmapAggData {
    // The bitmap accumulated so far.
    BitmapValue value;

    // Inserts one input value into the bitmap.
    void add(const T& element) {
        value.add(element);
    }

    // Clears the accumulated bitmap.
    void reset() {
        value.clear();
    }

    // Folds another state into this one with a bitmap OR.
    void merge(const AggregateFunctionBitmapAggData& other) {
        value |= other.value;
    }
};

// Aggregate function "bitmap_agg": collects the values of a ColumnVector<T> argument
// into a single BitmapValue.
//
// Template parameters:
//   arg_nullable - when true, the argument column is read as a ColumnNullable and rows
//                  flagged in its null map are skipped.
//   T            - element type of the (nested) argument column.
//
// Both the result type and the serialized (intermediate) type are DataTypeBitMap; the
// intermediate state is exchanged through ColumnBitmap columns.
template <bool arg_nullable, typename T>
class AggregateFunctionBitmapAgg final
        : public IAggregateFunctionDataHelper<AggregateFunctionBitmapAggData<T>,
                                              AggregateFunctionBitmapAgg<arg_nullable, T>> {
public:
    using ColVecType = ColumnVector<T>;
    using Data = AggregateFunctionBitmapAggData<T>;

    AggregateFunctionBitmapAgg(const DataTypes& argument_types_)
            : IAggregateFunctionDataHelper<Data, AggregateFunctionBitmapAgg<arg_nullable, T>>(
                      argument_types_) {}

    std::string get_name() const override { return "bitmap_agg"; }
    DataTypePtr get_return_type() const override { return std::make_shared<DataTypeBitMap>(); }

    // Adds the value at `row_num` of columns[0] to the state at `place`.
    // With arg_nullable, a row whose null-map entry is set is ignored.
    void add(AggregateDataPtr __restrict place, const IColumn** columns, size_t row_num,
             Arena* arena) const override {
        DCHECK_LT(row_num, columns[0]->size());
        if constexpr (arg_nullable) {
            auto& nullable_col = assert_cast<const ColumnNullable&>(*columns[0]);
            auto& nullable_map = nullable_col.get_null_map_data();
            if (!nullable_map[row_num]) {
                auto& col = assert_cast<const ColVecType&>(nullable_col.get_nested_column());
                this->data(place).add(col.get_data()[row_num]);
            }
        } else {
            auto& col = assert_cast<const ColVecType&>(*columns[0]);
            this->data(place).add(col.get_data()[row_num]);
        }
    }

    // Clears the bitmap held by the state at `place`.
    void reset(AggregateDataPtr place) const override { this->data(place).reset(); }

    // ORs the bitmap of `rhs` into the bitmap of `place`.
    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
               Arena* arena) const override {
        this->data(place).merge(this->data(rhs));
    }

    // Appends the accumulated bitmap to `to`, which must be a ColumnBitmap.
    void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override {
        auto& column = assert_cast<ColumnBitmap&>(to);
        column.get_data().push_back(this->data(place).value);
    }

    // Buffer-based serialization is not implemented and is marked unreachable; this class
    // provides the column-based overrides below instead.
    void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override {
        __builtin_unreachable();
    }

    // Buffer-based deserialization is not implemented and is marked unreachable; see
    // serialize().
    void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf,
                     Arena*) const override {
        __builtin_unreachable();
    }

    // Copies the first `num_rows` bitmaps of `column` into the states starting at `places`.
    // The states are indexed as a contiguous array of Data objects.
    void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena,
                                 size_t num_rows) const override {
        auto& col = assert_cast<const ColumnBitmap&>(column);
        DCHECK(col.size() >= num_rows) << "source column's size should greater than num_rows";
        auto* src = col.get_data().data();
        auto* data = &(this->data(places));
        for (size_t i = 0; i != num_rows; ++i) {
            data[i].value = src[i];
        }
    }

    // Resizes `dst` (a ColumnBitmap) to `num_rows` and writes into row i the bitmap of
    // the state located at places[i] + offset.
    void serialize_to_column(const std::vector<AggregateDataPtr>& places, size_t offset,
                             MutableColumnPtr& dst, const size_t num_rows) const override {
        auto& col = assert_cast<ColumnBitmap&>(*dst);
        col.resize(num_rows);
        auto* data = col.get_data().data();
        for (size_t i = 0; i != num_rows; ++i) {
            data[i] = this->data(places[i] + offset).value;
        }
    }

    // ORs every bitmap of `column` into the single state at `place`.
    void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column,
                                           Arena* arena) const override {
        auto& col = assert_cast<const ColumnBitmap&>(column);
        const size_t num_rows = column.size();
        auto* data = col.get_data().data();

        for (size_t i = 0; i != num_rows; ++i) {
            this->data(place).value |= data[i];
        }
    }

    // ORs the bitmaps at rows begin..end of `column` (both ends included by the loop)
    // into the single state at `place`.
    // NOTE(review): the DCHECK accepts end == column.size(), yet the loop reads data[end];
    // presumably callers always pass an inclusive last index < column.size() - confirm
    // against the callers before tightening either side.
    void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place,
                                                 const IColumn& column, size_t begin, size_t end,
                                                 Arena* arena) const override {
        DCHECK(end <= column.size() && begin <= end)
                << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size();
        auto& col = assert_cast<const ColumnBitmap&>(column);
        auto* data = col.get_data().data();
        for (size_t i = begin; i <= end; ++i) {
            this->data(place).value |= data[i];
        }
    }

    // ORs row i of `column` into the state located at places[i] + offset, for the first
    // `num_rows` rows. Although the parameter is typed ColumnString*, the column is
    // read as a ColumnBitmap (matching create_serialize_column()). `rhs` is unused.
    void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset,
                                   AggregateDataPtr rhs, const ColumnString* column, Arena* arena,
                                   const size_t num_rows) const override {
        auto& col = assert_cast<const ColumnBitmap&>(*assert_cast<const IColumn*>(column));
        auto* data = col.get_data().data();
        for (size_t i = 0; i != num_rows; ++i) {
            // `offset` must be applied, exactly as serialize_to_column() does; without it
            // the merge targets the wrong bytes whenever offset != 0.
            this->data(places[i] + offset).value |= data[i];
        }
    }

    // Same as deserialize_and_merge_vec(), but rows whose places[i] is null are skipped.
    void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset,
                                            AggregateDataPtr rhs, const ColumnString* column,
                                            Arena* arena, const size_t num_rows) const override {
        auto& col = assert_cast<const ColumnBitmap&>(*assert_cast<const IColumn*>(column));
        auto* data = col.get_data().data();
        for (size_t i = 0; i != num_rows; ++i) {
            if (places[i]) {
                // Apply `offset` to reach this function's state (see
                // deserialize_and_merge_vec()).
                this->data(places[i] + offset).value |= data[i];
            }
        }
    }

    // Appends the bitmap of the single state at `place` as one new row of `to`.
    void serialize_without_key_to_column(ConstAggregateDataPtr __restrict place,
                                         IColumn& to) const override {
        auto& col = assert_cast<ColumnBitmap&>(to);
        size_t old_size = col.size();
        col.resize(old_size + 1);
        col.get_data()[old_size] = this->data(place).value;
    }

    // The intermediate state is carried in a ColumnBitmap.
    [[nodiscard]] MutableColumnPtr create_serialize_column() const override {
        return ColumnBitmap::create();
    }

    // Data type that corresponds to the column created by create_serialize_column().
    [[nodiscard]] DataTypePtr get_serialized_type() const override {
        return std::make_shared<DataTypeBitMap>();
    }
};

} // namespace doris::vectorized
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ void register_aggregate_function_histogram(AggregateFunctionSimpleFactory& facto
void register_aggregate_function_count_old(AggregateFunctionSimpleFactory& factory);
void register_aggregate_function_sum_old(AggregateFunctionSimpleFactory& factory);
void register_aggregate_function_map_agg(AggregateFunctionSimpleFactory& factory);
void register_aggregate_function_bitmap_agg(AggregateFunctionSimpleFactory& factory);

AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() {
static std::once_flag oc;
Expand Down Expand Up @@ -97,6 +98,7 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() {
register_aggregate_function_avg_weighted(instance);
register_aggregate_function_histogram(instance);
register_aggregate_function_map_agg(instance);
register_aggregate_function_bitmap_agg(instance);

register_aggregate_function_stddev_variance_samp(instance);
register_aggregate_function_replace_reader_load(instance);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
{
"title": "BITMAP_AGG",
"language": "en"
}
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

## BITMAP_AGG
### description
#### Syntax

`BITMAP_AGG(expr)`


This aggregating function returns a bitmap that aggregates the values of expr, excluding any null values.
The type of expr needs to be TINYINT, SMALLINT, INT, or BIGINT.

### example
```
MySQL > select `n_nationkey`, `n_name`, `n_regionkey` from `nation`;
+-------------+----------------+-------------+
| n_nationkey | n_name | n_regionkey |
+-------------+----------------+-------------+
| 0 | ALGERIA | 0 |
| 1 | ARGENTINA | 1 |
| 2 | BRAZIL | 1 |
| 3 | CANADA | 1 |
| 4 | EGYPT | 4 |
| 5 | ETHIOPIA | 0 |
| 6 | FRANCE | 3 |
| 7 | GERMANY | 3 |
| 8 | INDIA | 2 |
| 9 | INDONESIA | 2 |
| 10 | IRAN | 4 |
| 11 | IRAQ | 4 |
| 12 | JAPAN | 2 |
| 13 | JORDAN | 4 |
| 14 | KENYA | 0 |
| 15 | MOROCCO | 0 |
| 16 | MOZAMBIQUE | 0 |
| 17 | PERU | 1 |
| 18 | CHINA | 2 |
| 19 | ROMANIA | 3 |
| 20 | SAUDI ARABIA | 4 |
| 21 | VIETNAM | 2 |
| 22 | RUSSIA | 3 |
| 23 | UNITED KINGDOM | 3 |
| 24 | UNITED STATES | 1 |
+-------------+----------------+-------------+
MySQL > select n_regionkey, bitmap_to_string(bitmap_agg(n_nationkey)) from nation group by n_regionkey;
+-------------+---------------------------------------------+
| n_regionkey | bitmap_to_string(bitmap_agg(`n_nationkey`)) |
+-------------+---------------------------------------------+
| 4 | 4,10,11,13,20 |
| 2 | 8,9,12,18,21 |
| 1 | 1,2,3,17,24 |
| 0 | 0,5,14,15,16 |
| 3 | 6,7,19,22,23 |
+-------------+---------------------------------------------+
MySQL > select bitmap_count(bitmap_agg(n_nationkey)) from nation;
+-----------------------------------------+
| bitmap_count(bitmap_agg(`n_nationkey`)) |
+-----------------------------------------+
| 25 |
+-----------------------------------------+
```
### keywords
BITMAP_AGG
Loading

0 comments on commit 57fb979

Please sign in to comment.