GH-39798: [C++] Optimize Take for fixed-size types including nested fixed-size lists #41297

Merged (12 commits) on May 3, 2024
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
@@ -501,6 +501,7 @@ set(ARROW_UTIL_SRCS
util/decimal.cc
util/delimiting.cc
util/dict_util.cc
util/fixed_width_internal.cc
util/float16.cc
util/formatting.cc
util/future.cc
30 changes: 17 additions & 13 deletions cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
@@ -40,6 +40,7 @@
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/fixed_width_internal.h"

namespace arrow {

@@ -158,9 +159,11 @@ class PrimitiveFilterImpl {
PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter,
FilterOptions::NullSelectionBehavior null_selection,
ArrayData* out_arr)
: byte_width_(values.type->byte_width()),
: byte_width_(util::FixedWidthInBytes(*values.type)),
values_is_valid_(values.buffers[0].data),
values_data_(values.buffers[1].data),
// No offset applied for boolean because it's a bitmap
values_data_(kIsBoolean ? values.buffers[1].data
: util::OffsetPointerOfFixedWidthValues(values)),
values_null_count_(values.null_count),
values_offset_(values.offset),
values_length_(values.length),
@@ -169,17 +172,13 @@
if constexpr (kByteWidth >= 0 && !kIsBoolean) {
DCHECK_EQ(kByteWidth, byte_width_);
}
if constexpr (!kIsBoolean) {
// No offset applied for boolean because it's a bitmap
values_data_ += values.offset * byte_width();
}

DCHECK_EQ(out_arr->offset, 0);
if (out_arr->buffers[0] != nullptr) {
// May be unallocated if neither filter nor values contain nulls
out_is_valid_ = out_arr->buffers[0]->mutable_data();
}
out_data_ = out_arr->buffers[1]->mutable_data();
DCHECK_EQ(out_arr->offset, 0);
out_data_ = util::MutableFixedWidthValuesPointer(out_arr);
out_length_ = out_arr->length;
out_position_ = 0;
}
@@ -416,7 +415,7 @@ class PrimitiveFilterImpl {
out_position_ += length;
}

constexpr int32_t byte_width() const {
constexpr int64_t byte_width() const {
if constexpr (kByteWidth >= 0) {
return kByteWidth;
} else {
@@ -425,7 +424,7 @@
}

private:
int32_t byte_width_;
int64_t byte_width_;
const uint8_t* values_is_valid_;
const uint8_t* values_data_;
int64_t values_null_count_;
Expand All @@ -439,6 +438,8 @@ class PrimitiveFilterImpl {
int64_t out_position_;
};

} // namespace

Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;
const ArraySpan& filter = batch[1].array;
@@ -468,9 +469,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
// validity bitmap.
const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero;

const int bit_width = values.type->bit_width();
RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width,
allocate_validity, out_arr));
DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false));
const int64_t bit_width = util::FixedWidthInBits(*values.type);
RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
ctx, output_length, /*source=*/values, allocate_validity, out_arr));

switch (bit_width) {
case 1:
@@ -505,6 +507,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
return Status::OK();
}

namespace {

// ----------------------------------------------------------------------
// Optimized filter for base binary types (32-bit and 64-bit)

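To see how the pieces above fit together: below is a minimal sketch (not code from this PR) that composes the new arrow/util/fixed_width_internal.h helpers the same way the updated PrimitiveFilterExec does. The helper signatures follow their uses in this diff; the wrapper name CopySelectedHead and its memcpy body are purely illustrative.

```cpp
// Sketch only, not code from this PR: composes the fixed-width helpers
// introduced here, mirroring how PrimitiveFilterExec now uses them.
#include <cstring>

#include "arrow/compute/kernels/vector_selection_internal.h"
#include "arrow/util/fixed_width_internal.h"
#include "arrow/util/logging.h"

namespace arrow {
namespace compute {
namespace internal {

// Hypothetical helper: copies the first `output_length` values of `values`
// into a freshly allocated `out_arr` (requires output_length <= values.length
// and a non-boolean, byte-aligned type; booleans are special-cased above).
Status CopySelectedHead(KernelContext* ctx, const ArraySpan& values,
                        int64_t output_length, ArrayData* out_arr) {
  // Accepts any "fixed-width-like" input, e.g. int32 or
  // fixed_size_list<int32, 3>, not only primitive types.
  DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false));

  // Width of one logical value in bytes (12 for fixed_size_list<int32, 3>).
  const int64_t byte_width = util::FixedWidthInBytes(*values.type);

  // Sizes the validity and data buffers from the *source* values, replacing
  // the old PreallocatePrimitiveArrayData(ctx, length, bit_width, ...) call.
  RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
      ctx, output_length, /*source=*/values, /*allocate_validity=*/true, out_arr));

  // Both pointers already account for offsets (out_arr->offset is 0 here).
  const uint8_t* src = util::OffsetPointerOfFixedWidthValues(values);
  uint8_t* dst = util::MutableFixedWidthValuesPointer(out_arr);
  std::memcpy(dst, src, static_cast<size_t>(byte_width * output_length));
  return Status::OK();
}

}  // namespace internal
}  // namespace compute
}  // namespace arrow
```

The point of the indirection is that the same kernel body now covers int32 (byte width 4) and fixed_size_list<int32, 3> (byte width 12) without separate code paths, which is what lets FSLFilterExec below reuse PrimitiveFilterExec.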
56 changes: 38 additions & 18 deletions cpp/src/arrow/compute/kernels/vector_selection_internal.cc
@@ -37,6 +37,7 @@
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/fixed_width_internal.h"
#include "arrow/util/int_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/ree_util.h"
@@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
DCHECK_OK(registry->AddFunction(std::move(func)));
}

Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
bool allocate_validity, ArrayData* out) {
// Preallocate memory
out->length = length;
out->buffers.resize(2);

if (allocate_validity) {
ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length));
}
if (bit_width == 1) {
ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length));
} else {
ARROW_ASSIGN_OR_RAISE(out->buffers[1],
ctx->Allocate(bit_util::BytesForBits(length * bit_width)));
}
return Status::OK();
}

namespace {

/// \brief Iterate over a REE filter, emitting ranges of a plain values array that
@@ -909,6 +892,20 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
}

Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;

// If a FixedSizeList wraps a fixed-width type we can, in some cases, use
// PrimitiveFilterExec for a fixed-size list array.
if (util::IsFixedWidthLike(values,
/*force_null_count=*/true,
/*exclude_dictionary=*/true)) {
const auto byte_width = util::FixedWidthInBytes(*values.type);
// 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec
// might not handle it correctly.
if (byte_width > 0) {
return PrimitiveFilterExec(ctx, batch, out);
}
}
return FilterExec<FSLSelectionImpl>(ctx, batch, out);
}

@@ -968,6 +965,29 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
}

Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;

// If a FixedSizeList wraps a fixed-width type we can, in some cases, use
// PrimitiveTakeExec for a fixed-size list array.
if (util::IsFixedWidthLike(values,
/*force_null_count=*/true,
/*exclude_dictionary=*/true)) {
const auto byte_width = util::FixedWidthInBytes(*values.type);
// Additionally, PrimitiveTakeExec is only implemented for specific byte widths.
// TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type.
switch (byte_width) {
case 1:
case 2:
case 4:
case 8:
case 16:
case 32:
return PrimitiveTakeExec(ctx, batch, out);
default:
break; // fallback to TakeExec<FSBSelectionImpl>
}
}

return TakeExec<FSLSelectionImpl>(ctx, batch, out);
}

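A small illustration of the dispatch rule above, assuming (as the PR title and the byte-width switch imply) that util::FixedWidthInBytes reports the flattened width of nested fixed-size lists. This example program is not part of the PR and pulls in the internal header purely for demonstration.

```cpp
// Sketch only: shows which fixed-size-list shapes hit the new fast path in
// FSLTakeExec above, under the flattened-byte-width assumption stated in the
// lead-in. Not a test from this PR.
#include <iostream>

#include "arrow/api.h"
#include "arrow/util/fixed_width_internal.h"

int main() {
  // 4 int32 values per slot: 4 * 4 = 16 bytes, one of the widths handled by
  // PrimitiveTakeExec, so FSLTakeExec forwards such arrays to the fast path.
  auto hits_fast_path = arrow::fixed_size_list(arrow::int32(), 4);

  // 2 x fixed_size_list<int16, 3> per slot: 2 * 3 * 2 = 12 bytes. Still
  // fixed-width-like, but 12 is not in {1, 2, 4, 8, 16, 32}, so FSLTakeExec
  // falls back to TakeExec<FSLSelectionImpl> (tracked by GH-41301).
  auto falls_back =
      arrow::fixed_size_list(arrow::fixed_size_list(arrow::int16(), 3), 2);

  std::cout << hits_fast_path->ToString() << ": "
            << arrow::util::FixedWidthInBytes(*hits_fast_path) << " bytes\n"
            << falls_back->ToString() << ": "
            << arrow::util::FixedWidthInBytes(*falls_back) << " bytes\n";
  return 0;
}
```

Note that FSLFilterExec only excludes the zero-byte-width case, while FSLTakeExec is restricted to the specific widths PrimitiveTakeExec implements; anything else keeps the existing FSLSelectionImpl path.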
7 changes: 1 addition & 6 deletions cpp/src/arrow/compute/kernels/vector_selection_internal.h
@@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
const FunctionOptions* default_options,
FunctionRegistry* registry);

/// \brief Allocate an ArrayData for a primitive array with a given length and bit width
///
/// \param[in] bit_width 1 or a multiple of 8
Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
bool allocate_validity, ArrayData* out);

/// \brief Callback type for VisitPlainxREEFilterOutputSegments.
///
/// position is the logical position in the values array relative to its offset.
@@ -70,6 +64,7 @@ void VisitPlainxREEFilterOutputSegments(
FilterOptions::NullSelectionBehavior null_selection,
const EmitREEFilterSegment& emit_segment);

Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
39 changes: 23 additions & 16 deletions cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
@@ -37,6 +37,7 @@
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/fixed_width_internal.h"
#include "arrow/util/int_util.h"
#include "arrow/util/ree_util.h"

@@ -323,7 +324,7 @@ namespace {
using TakeState = OptionsWrapper<TakeOptions>;

// ----------------------------------------------------------------------
// Implement optimized take for primitive types from boolean to 1/2/4/8-byte
// Implement optimized take for primitive types from boolean to 1/2/4/8/16/32-byte
// C-type based types. Use common implementation for every byte width and only
// generate code for unsigned integer indices, since after boundschecking to
// check for negative numbers in the indices we can safely reinterpret_cast
@@ -333,33 +334,36 @@ using TakeState = OptionsWrapper<TakeOptions>;
/// use the logical Arrow type but rather the physical C type. This way we
/// only generate one take function for each byte width.
///
/// This function assumes that the indices have been boundschecked.
/// Also note that this function can also handle fixed-size-list arrays if
/// they fit the criteria described in fixed_width_internal.h, so use the
/// function defined in that file to access values and destination pointers
/// and DO NOT ASSUME `values.type()` is a primitive type.
///
/// \pre the indices have been boundschecked
template <typename IndexCType, typename ValueWidthConstant>
struct PrimitiveTakeImpl {
static constexpr int kValueWidth = ValueWidthConstant::value;

static void Exec(const ArraySpan& values, const ArraySpan& indices,
ArrayData* out_arr) {
DCHECK_EQ(values.type->byte_width(), kValueWidth);
const auto* values_data =
values.GetValues<uint8_t>(1, 0) + kValueWidth * values.offset;
DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth);
const auto* values_data = util::OffsetPointerOfFixedWidthValues(values);
const uint8_t* values_is_valid = values.buffers[0].data;
auto values_offset = values.offset;

const auto* indices_data = indices.GetValues<IndexCType>(1);
const uint8_t* indices_is_valid = indices.buffers[0].data;
auto indices_offset = indices.offset;

auto out = out_arr->GetMutableValues<uint8_t>(1, 0) + kValueWidth * out_arr->offset;
DCHECK_EQ(out_arr->offset, 0);
auto* out = util::MutableFixedWidthValuesPointer(out_arr);
auto out_is_valid = out_arr->buffers[0]->mutable_data();
auto out_offset = out_arr->offset;
DCHECK_EQ(out_offset, 0);

// If either the values or indices have nulls, we preemptively zero out the
// out validity bitmap so that we don't have to use ClearBit in each
// iteration for nulls.
if (values.null_count != 0 || indices.null_count != 0) {
bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false);
bit_util::SetBitsTo(out_is_valid, 0, indices.length, false);
}

auto WriteValue = [&](int64_t position) {
@@ -386,7 +390,7 @@ struct PrimitiveTakeImpl {
valid_count += block.popcount;
if (block.popcount == block.length) {
// Fastest path: neither values nor index nulls
bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
bit_util::SetBitsTo(out_is_valid, position, block.length, true);
for (int64_t i = 0; i < block.length; ++i) {
WriteValue(position);
++position;
@@ -396,7 +400,7 @@
for (int64_t i = 0; i < block.length; ++i) {
if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
// index is not null
bit_util::SetBit(out_is_valid, out_offset + position);
bit_util::SetBit(out_is_valid, position);
WriteValue(position);
} else {
WriteZero(position);
@@ -416,7 +420,7 @@
values_offset + indices_data[position])) {
// value is not null
WriteValue(position);
bit_util::SetBit(out_is_valid, out_offset + position);
bit_util::SetBit(out_is_valid, position);
++valid_count;
} else {
WriteZero(position);
@@ -433,7 +437,7 @@
values_offset + indices_data[position])) {
// index is not null && value is not null
WriteValue(position);
bit_util::SetBit(out_is_valid, out_offset + position);
bit_util::SetBit(out_is_valid, position);
++valid_count;
} else {
WriteZero(position);
@@ -584,14 +588,17 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*

ArrayData* out_arr = out->array_data().get();

const int bit_width = values.type->bit_width();
DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false,
/*exclude_dictionary=*/true));
const int64_t bit_width = util::FixedWidthInBits(*values.type);

// TODO: When neither values nor indices contain nulls, we can skip
// allocating the validity bitmap altogether and save time and space. A
// streamlined PrimitiveTakeImpl would need to be written that skips all
// interactions with the output validity bitmap, though.
RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, indices.length, bit_width,
/*allocate_validity=*/true, out_arr));
RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
ctx, indices.length, /*source=*/values,
/*allocate_validity=*/true, out_arr));
switch (bit_width) {
case 1:
TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
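One pattern worth noting in PrimitiveTakeImpl above: since out_arr->offset is asserted to be 0, the output validity bitmap is zeroed once up front, so the hot loops only ever call SetBit for valid slots. Below is a stripped-down sketch of that idea (a scalar loop instead of the BitBlockCounter batching used in the real kernel, index validity only, and a hypothetical function name).

```cpp
// Sketch only: the "pre-zero, then SetBit" validity pattern used by
// PrimitiveTakeImpl. The real kernel walks indices with BitBlockCounter and
// also consults value validity; this version keeps just the bitmap handling.
#include <cstdint>

#include "arrow/util/bit_util.h"

void SetOutputValiditySketch(const uint8_t* indices_is_valid,
                             int64_t indices_offset, int64_t length,
                             uint8_t* out_is_valid) {
  namespace bit_util = arrow::bit_util;
  // out_arr->offset is 0, so the output bitmap starts at bit 0. Clearing it
  // once means the loop never needs ClearBit for null slots.
  bit_util::SetBitsTo(out_is_valid, 0, length, false);
  for (int64_t position = 0; position < length; ++position) {
    if (indices_is_valid == nullptr ||
        bit_util::GetBit(indices_is_valid, indices_offset + position)) {
      bit_util::SetBit(out_is_valid, position);  // only valid slots are touched
    }
  }
}
```

Dropping the `out_offset +` adjustments in the bitmap calls (as this diff does) is safe precisely because the output is always freshly allocated with offset 0.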