
GH-41301: [C++] Extract the kernel loops used for PrimitiveTakeExec and generalize to any fixed-width type #41373

Merged on Jun 10, 2024 (22 commits).

Commits:
27d5efb  GH-41188: [C++] Don't recursively produce nulls when appending nulls … (felipecrv, Apr 13, 2024)
0817d59  gather_internal.h: Introduce the Gather class helper (felipecrv, Feb 24, 2024)
35c409d  Take: Use Gather for PrimitiveTakeImpl (felipecrv, Apr 24, 2024)
bc75bfa  Take: Pass bit-width instead of byte-width (felipecrv, Mar 5, 2024)
9ee75fb  Take: Use a Gather specialization for boolean arrays (felipecrv, Apr 25, 2024)
b0614a0  Take: Skip allocating validity bitmap when values and filter don't ha… (felipecrv, Apr 25, 2024)
1fdc401  Take: Fold the implementation for FSB arrays into the one used for pr… (felipecrv, Apr 25, 2024)
b122382  Revert "GH-41188: [C++] Don't recursively produce nulls when appendin… (felipecrv, May 3, 2024)
97be842  gather_internal.h: Please the linter (felipecrv, May 3, 2024)
d77c758  gather_internal.h: Small fixup to top-level comment (felipecrv, Apr 27, 2024)
47ef815  gather_internal.h: Rename {->k}WithFactor and use static_assert (felipecrv, May 6, 2024)
5f635aa  gather_internal.h: Use int64_t instead of size_t for size factors (felipecrv, May 6, 2024)
54ca9a4  gather_internal.h: Fix assertion: src_validity.offset is not always r… (felipecrv, May 17, 2024)
9f5d834  fixup! gather_internal.h: Introduce the Gather class helper (felipecrv, Jun 4, 2024)
8a089c8  fixup! Take: Use a Gather specialization for boolean arrays (felipecrv, Jun 4, 2024)
8629277  fixup! gather_internal.h: Rename {->k}WithFactor and use static_assert (felipecrv, Jun 4, 2024)
efaeb9c  gather_internal.h: Remove inline nspc because force-inline annotation… (felipecrv, Jun 5, 2024)
8f884c6  Take: Add an ARROW_PREDICT_TRUE that reduces binary size (felipecrv, Jun 6, 2024)
a64a582  macros.h: Add ARROW_COMPILER_UNREACHABLE macro (felipecrv, Jun 6, 2024)
c31e5be  Take: Move a few branches around and reduce binary code size further (felipecrv, Jun 6, 2024)
075508c  Take: Add note about typename tparams in FixedWidthTakeImpl (felipecrv, Jun 6, 2024)
ca6937c  Remove ARROW_COMPILER_UNREACHABLE and use default+DCHECK on the switch (felipecrv, Jun 10, 2024)
306 changes: 306 additions & 0 deletions cpp/src/arrow/compute/kernels/gather_internal.h
@@ -0,0 +1,306 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cassert>
#include <cstddef>
#include <cstdint>

#include "arrow/array/data.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/macros.h"

// Implementation helpers for kernels that need to load/gather fixed-width
// data from multiple, arbitrary indices.
//
// https://en.wikipedia.org/wiki/Gather/scatter_(vector_addressing)

namespace arrow::internal {

// CRTP [1] base class for Gather that provides a gathering loop in terms of
// Write*() methods that must be implemented by the derived class.
//
// [1] https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern
template <class GatherImpl>
class GatherBaseCRTP {
public:
// Output offset is not supported by Gather and idx is supposed to have offset
// pre-applied. idx_validity parameters on functions can use the offset they
// carry to read the validity bitmap as bitmaps can't have pre-applied offsets
// (they might not align to byte boundaries).

GatherBaseCRTP() = default;
ARROW_DISALLOW_COPY_AND_ASSIGN(GatherBaseCRTP);
ARROW_DEFAULT_MOVE_AND_ASSIGN(GatherBaseCRTP);

protected:
ARROW_FORCE_INLINE int64_t ExecuteNoNulls(int64_t idx_length) {
auto* self = static_cast<GatherImpl*>(this);
for (int64_t position = 0; position < idx_length; position++) {
self->WriteValue(position);
}
return idx_length;
}

// See derived Gather classes below for the meaning of the parameters, pre and
// post-conditions.
//
// src_validity is not necessarily the source of the values that are being
// gathered (e.g. the source could be a nested fixed-size list array and the
// values being gathered are from the innermost buffer), so the ArraySpan is
// used solely to check for nulls in the source values and nothing else.
//
// idx_length is the number of elements in idx and consequently the number of
// bits that might be written to out_is_valid. Member `Write*()` functions will be
// called with positions from 0 to idx_length - 1.
//
// If `kOutputIsZeroInitialized` is true, then `WriteZero()` or `WriteZeroSegment()`
// doesn't have to be called for resulting null positions. A position is
// considered null if either the index or the source value is null at that
// position.
template <bool kOutputIsZeroInitialized, typename IndexCType>
ARROW_FORCE_INLINE int64_t ExecuteWithNulls(const ArraySpan& src_validity,
A review thread is attached to this line.

Member:

Is ARROW_FORCE_INLINE desirable here? Generally, the compiler is able to make more informed heuristics than us (but not always admittedly). This function is rather complex so I'm not sure force-inlining it would actually be beneficial (and could blow up code generation sizes).

felipecrv (Contributor, Author), Jun 4, 2024:

Yes. I checked code and benchmarks (a month or more ago).

Here's the overall approach with this header.

GatherBaseCRTP is a template that generates these templates:

template <typename Global, typename Params>
class Gather {
  // ...a few simple scalar members...
  
  void WriteValue(pos) { ... }
  void WriteZero(pos) { ... }
  void WriteZeroSegment(pos, len) { ... }
  
  int64_t ExecuteNoNulls(len) { ... }
  
  template<bool OutputIsZeroInitialized>
  int64_t Execute(src_validity, idx_validity, out_is_valid) { ... }
};

The intended usage is as follows: in the same function, instantiate the specific Gather and call the methods on it:

    const bool out_has_validity = values.MayHaveNulls() || indices.MayHaveNulls();

    const uint8_t* src;
    int64_t src_offset;
    std::tie(src_offset, src) = util::OffsetPointerOfFixedBitWidthValues(values);
    uint8_t* out = util::MutableFixedWidthValuesPointer(out_arr);
    int64_t valid_count = 0;
    arrow::internal::Gather<kValueWidthInBits, IndexCType, WithFactor::value> gather{
        /*src_length=*/values.length,
        src,
        src_offset,
        /*idx_length=*/indices.length,
        /*idx=*/indices.GetValues<IndexCType>(1),
        out,
        factor};
    if (out_has_validity) {
      DCHECK_EQ(out_arr->offset, 0);
      // out_is_valid must be zero-initialized, because Gather::Execute
      // saves time by not having to ClearBit on every null element.
      auto out_is_valid = out_arr->GetMutableValues<uint8_t>(0);
      memset(out_is_valid, 0, bit_util::BytesForBits(out_arr->length));
      valid_count = gather.template Execute<OutputIsZeroInitialized::value>(
          /*src_validity=*/values, /*idx_validity=*/indices, out_is_valid);
    } else {
      valid_count = gather.Execute();
    }

What does this achieve? Gather<kValueWidthInBits, ...> gather{...} isn't really the initialization of a chunk of stack memory, but of registers. Because the lifetime of gather is confined to a single function and all the method calls on it are inlined, the members of gather can live in registers from initialization to the end of the kernel loop. I did this to achieve the same performance we had when these loops were hand-written (more than once!) in vector_selection_take_internal.cc.

This idea of keeping variables in registers comes up in database implementation papers ("Efficiently Compiling Efficient Query Plans for Modern Hardware") and it's not just compiler-brain perfectionism (https://llvm.org/docs/Passes.html#sroa-scalar-replacement-of-aggregates).

By force-inlining the Execute methods I'm actually reducing code size (together with the fact that I instantiate Gather and these specific Execute calls from a single place). Without inlining Execute, there would be more binary code for construction/destruction and for loads to/from memory.

To make sure we only ever instantiate these templates once, I could wrap this code in templates outside the inline namespace. The templates would contain the initialization of Gather<...> gather and the calls to the two force-inlined Execute functions.
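
A minimal standalone sketch of the register/SROA point above (illustrative names only, not Arrow code): once every call on g is inlined into the surrounding function, scalar replacement of aggregates can keep g's members in registers for the whole loop, as if the loop had been hand-written against raw pointers.

#include <cstdint>

// Illustration only: a tiny gather object whose methods are all inlined into a
// single function, so its members never need to be spilled to the stack.
struct TinyGather {
  const int32_t* src;
  const int32_t* idx;
  int32_t* out;
  void WriteValue(int64_t position) { out[position] = src[idx[position]]; }
};

int64_t GatherAll(const int32_t* src, const int32_t* idx, int32_t* out, int64_t n) {
  TinyGather g{src, idx, out};
  for (int64_t i = 0; i < n; ++i) {
    g.WriteValue(i);  // inlined: g.src, g.idx, and g.out can stay in registers
  }
  return n;
}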

felipecrv (Contributor, Author):

Do you want me to build that functional façade to these classes before we concretely have more than two instantiations of these templates? I could create an issue to track this concern.
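
For reference, a minimal sketch of what such a façade could look like, assuming it sits next to gather_internal.h (so Gather and ArraySpan are in scope) and merely wraps the construction of Gather plus the force-inlined Execute calls in one out-of-line function template. The name GatherFixedWidth and its exact parameter list are illustrative, not part of this PR:

template <int kValueWidthInBits, typename IndexCType, bool kWithFactor,
          bool kOutputIsZeroInitialized = false>
int64_t GatherFixedWidth(int64_t src_length, const uint8_t* src, int64_t src_offset,
                         const ArraySpan* src_validity, int64_t idx_length,
                         const IndexCType* idx, const ArraySpan* idx_validity,
                         uint8_t* out, uint8_t* out_is_valid, int64_t factor) {
  // For the byte-width Gather, src_offset must already be folded into src (the
  // constructor asserts it is 0); the boolean specialization consumes it as a bit offset.
  Gather<kValueWidthInBits, IndexCType, kWithFactor> gather{
      src_length, src, src_offset, idx_length, idx, out, factor};
  if (out_is_valid == nullptr) {
    // Neither the values nor the indices have nulls.
    return gather.Execute();
  }
  // Precondition from gather_internal.h: out_is_valid must be zero-initialized.
  return gather.template Execute<kOutputIsZeroInitialized>(*src_validity, *idx_validity,
                                                           out_is_valid);
}

The point, per the discussion above, would be that the heavy inlined loops still get instantiated in exactly one place per combination of template arguments, while callers only see an ordinary function.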

Member:

I know using registers is critical for performance. I'm just surprised that the compiler wouldn't cache member variables in registers in a (non-inlined) method's body. Adding a couple memory lookups at the start and end of such a heavy function should not yield a noticeable slowdown.

Just because a compiler feels better with some particular organization of the source code doesn't mean we've unlocked a robust performance improvement. We may just have hit a sweeter spot in this optimizer's behaviour.

In any case, we can proceed with this. This is helper code that should only be used in a couple kernels, so the potential code size increase is hopefully (!) minor.

felipecrv (Contributor, Author):

I will remove the inline namespace annotation as well, since it seems to be a red herring.

felipecrv (Contributor, Author):

> I'm just surprised that the compiler wouldn't cache member variables in registers in a (non-inlined) method's body. Adding a couple memory lookups at the start and end of such a heavy function should not yield a noticeable slowdown.

Without inlining Execute, there has to be a this pointer pointing to the struct laid out in memory, and calls to Write* have to load members through that this pointer. Not an impossible optimization, but it wasn't happening.

Member:

> Without inlining Execute, there has to be a this pointer pointing to the struct laid out in memory, and calls to Write* have to load members through that this pointer. Not an impossible optimization, but it wasn't happening.

I see. Compilers are annoying :-)

(End of the review thread; the ExecuteWithNulls signature and body continue below.)

int64_t idx_length, const IndexCType* idx,
const ArraySpan& idx_validity,
uint8_t* out_is_valid) {
auto* self = static_cast<GatherImpl*>(this);
OptionalBitBlockCounter indices_bit_counter(idx_validity.buffers[0].data,
idx_validity.offset, idx_length);
int64_t position = 0;
int64_t valid_count = 0;
while (position < idx_length) {
BitBlockCount block = indices_bit_counter.NextBlock();
if (!src_validity.MayHaveNulls()) {
// Source values are never null, so things are easier
valid_count += block.popcount;
if (block.popcount == block.length) {
// Fastest path: neither source values nor indices have nulls
bit_util::SetBitsTo(out_is_valid, position, block.length, true);
for (int64_t i = 0; i < block.length; ++i) {
self->WriteValue(position);
++position;
}
} else if (block.popcount > 0) {
// Slow path: some indices but not all are null
for (int64_t i = 0; i < block.length; ++i) {
ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr);
if (idx_validity.IsValid(position)) {
// index is not null
bit_util::SetBit(out_is_valid, position);
self->WriteValue(position);
} else if constexpr (!kOutputIsZeroInitialized) {
self->WriteZero(position);
}
++position;
}
} else {
self->WriteZeroSegment(position, block.length);
position += block.length;
}
} else {
// Source values may be null, so we must do random access into src_validity
if (block.popcount == block.length) {
// Faster path: indices are not null but source values may be
for (int64_t i = 0; i < block.length; ++i) {
ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr);
if (src_validity.IsValid(idx[position])) {
// value is not null
self->WriteValue(position);
bit_util::SetBit(out_is_valid, position);
++valid_count;
} else if constexpr (!kOutputIsZeroInitialized) {
self->WriteZero(position);
}
++position;
}
} else if (block.popcount > 0) {
// Slow path: some but not all indices are null. Since we are doing
// random access in general we have to check the value nullness one by
// one.
for (int64_t i = 0; i < block.length; ++i) {
ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr);
ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr);
if (idx_validity.IsValid(position) && src_validity.IsValid(idx[position])) {
// index is not null && value is not null
self->WriteValue(position);
bit_util::SetBit(out_is_valid, position);
++valid_count;
} else if constexpr (!kOutputIsZeroInitialized) {
self->WriteZero(position);
}
++position;
}
} else {
if constexpr (!kOutputIsZeroInitialized) {
self->WriteZeroSegment(position, block.length);
}
position += block.length;
}
}
}
return valid_count;
}
};

// A gather primitive for primitive fixed-width types with an integral byte width. If
// `kWithFactor` is true, the actual width is a runtime multiple of `kValueWidthInBits`
// (this can be useful for fixed-size list inputs and other input types with unusual byte
// widths that don't deserve value specialization).
template <int kValueWidthInBits, typename IndexCType, bool kWithFactor>
class Gather : public GatherBaseCRTP<Gather<kValueWidthInBits, IndexCType, kWithFactor>> {
public:
static_assert(kValueWidthInBits >= 0 && kValueWidthInBits % 8 == 0);
static constexpr int kValueWidth = kValueWidthInBits / 8;

private:
const int64_t src_length_; // number of elements of kValueWidth bytes in src_
const uint8_t* src_;
const int64_t idx_length_; // number of IndexCType elements in idx_
const IndexCType* idx_;
uint8_t* out_;
int64_t factor_;

public:
void WriteValue(int64_t position) {
if constexpr (kWithFactor) {
const int64_t scaled_factor = kValueWidth * factor_;
memcpy(out_ + position * scaled_factor, src_ + idx_[position] * scaled_factor,
scaled_factor);
} else {
memcpy(out_ + position * kValueWidth, src_ + idx_[position] * kValueWidth,
kValueWidth);
}
}

void WriteZero(int64_t position) {
if constexpr (kWithFactor) {
const int64_t scaled_factor = kValueWidth * factor_;
memset(out_ + position * scaled_factor, 0, scaled_factor);
} else {
memset(out_ + position * kValueWidth, 0, kValueWidth);
}
}

void WriteZeroSegment(int64_t position, int64_t length) {
if constexpr (kWithFactor) {
const int64_t scaled_factor = kValueWidth * factor_;
memset(out_ + position * scaled_factor, 0, length * scaled_factor);
} else {
memset(out_ + position * kValueWidth, 0, length * kValueWidth);
}
}

public:
Gather(int64_t src_length, const uint8_t* src, int64_t zero_src_offset,
int64_t idx_length, const IndexCType* idx, uint8_t* out, int64_t factor)
: src_length_(src_length),
src_(src),
idx_length_(idx_length),
idx_(idx),
out_(out),
factor_(factor) {
assert(zero_src_offset == 0);
assert(src && idx && out);
assert((kWithFactor || factor == 1) &&
"When kWithFactor is false, the factor is assumed to be 1 at compile time");
}

ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); }

/// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized.
/// \pre Bits in out_is_valid have to always be zero initialized.
/// \post The bits for the valid elements (and only those) are set in out_is_valid.
/// \post If !kOutputIsZeroInitialized, then positions in this->out_ containing null
/// elements have 0s written to them. This might be less efficient than
/// zero-initializing first and calling this->Execute() afterwards.
/// \return The number of valid elements in out.
template <bool kOutputIsZeroInitialized = false>
ARROW_FORCE_INLINE int64_t Execute(const ArraySpan& src_validity,
const ArraySpan& idx_validity,
uint8_t* out_is_valid) {
assert(src_length_ == src_validity.length);
assert(idx_length_ == idx_validity.length);
assert(out_is_valid);
return this->template ExecuteWithNulls<kOutputIsZeroInitialized>(
src_validity, idx_length_, idx_, idx_validity, out_is_valid);
}
};

// A gather primitive for boolean inputs. Unlike its counterpart above,
// this does not support passing a non-trivial factor parameter.
template <typename IndexCType>
class Gather</*kValueWidthInBits=*/1, IndexCType, /*kWithFactor=*/false>
: public GatherBaseCRTP<Gather<1, IndexCType, false>> {
private:
const int64_t src_length_; // number of bit-sized elements in src_ after the offset
const uint8_t* src_; // the boolean array data buffer in bits
const int64_t src_offset_; // offset in bits
const int64_t idx_length_; // number of IndexCType elements in idx_
const IndexCType* idx_;
uint8_t* out_; // output boolean array data buffer in bits

public:
Gather(int64_t src_length, const uint8_t* src, int64_t src_offset, int64_t idx_length,
const IndexCType* idx, uint8_t* out, int64_t factor)
: src_length_(src_length),
src_(src),
src_offset_(src_offset),
idx_length_(idx_length),
idx_(idx),
out_(out) {
assert(src && idx && out);
assert(factor == 1 &&
"factor != 1 is not supported when Gather is used to gather bits/booleans");
}

void WriteValue(int64_t position) {
bit_util::SetBitTo(out_, position,
bit_util::GetBit(src_, src_offset_ + idx_[position]));
}

void WriteZero(int64_t position) { bit_util::ClearBit(out_, position); }

void WriteZeroSegment(int64_t position, int64_t block_length) {
bit_util::SetBitsTo(out_, position, block_length, false);
}

ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); }

/// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized.
/// \pre Bits in out_is_valid have to always be zero initialized.
/// \post The bits for the valid elements (and only those) are set in out_is_valid.
/// \post If !kOutputIsZeroInitialized, then positions in this->out_ containing null
/// elements have 0s written to them. This might be less efficient than
/// zero-initializing first and calling this->Execute() afterwards.
/// \return The number of valid elements in out.
template <bool kOutputIsZeroInitialized = false>
ARROW_FORCE_INLINE int64_t Execute(const ArraySpan& src_validity,
const ArraySpan& idx_validity,
uint8_t* out_is_valid) {
assert(src_length_ == src_validity.length);
assert(idx_length_ == idx_validity.length);
assert(out_is_valid);
return this->template ExecuteWithNulls<kOutputIsZeroInitialized>(
src_validity, idx_length_, idx_, idx_validity, out_is_valid);
}
};

} // namespace arrow::internal
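
A usage note, not part of the diff: a minimal sketch of driving the boolean specialization on a null-free input. The wrapper name TakeBooleanNoNulls is illustrative; src is a bit-packed buffer read starting at src_offset (in bits), out is a bit-packed output buffer with no offset, and factor must be 1 for this specialization.

template <typename IndexCType>
int64_t TakeBooleanNoNulls(const uint8_t* src, int64_t src_offset, int64_t src_length,
                           const IndexCType* idx, int64_t idx_length, uint8_t* out) {
  arrow::internal::Gather</*kValueWidthInBits=*/1, IndexCType, /*kWithFactor=*/false>
      gather{src_length, src, src_offset, idx_length, idx, out, /*factor=*/1};
  return gather.Execute();  // no validity bitmaps involved
}

When either side may have nulls, the kernel instead calls Execute<...>(src_validity, idx_validity, out_is_valid) after zero-initializing out_is_valid, as described in the review discussion above.
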
68 changes: 2 additions & 66 deletions cpp/src/arrow/compute/kernels/vector_selection_internal.cc
@@ -547,39 +547,6 @@ struct VarBinarySelectionImpl : public Selection<VarBinarySelectionImpl<Type>, T
}
};

struct FSBSelectionImpl : public Selection<FSBSelectionImpl, FixedSizeBinaryType> {
using Base = Selection<FSBSelectionImpl, FixedSizeBinaryType>;
LIFT_BASE_MEMBERS();

TypedBufferBuilder<uint8_t> data_builder;

FSBSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length,
ExecResult* out)
: Base(ctx, batch, output_length, out), data_builder(ctx->memory_pool()) {}

template <typename Adapter>
Status GenerateOutput() {
FixedSizeBinaryArray typed_values(this->values.ToArrayData());
int32_t value_size = typed_values.byte_width();

RETURN_NOT_OK(data_builder.Reserve(value_size * output_length));
Adapter adapter(this);
return adapter.Generate(
[&](int64_t index) {
auto val = typed_values.GetView(index);
data_builder.UnsafeAppend(reinterpret_cast<const uint8_t*>(val.data()),
value_size);
return Status::OK();
},
[&]() {
data_builder.UnsafeAppend(value_size, static_cast<uint8_t>(0x00));
return Status::OK();
});
}

Status Finish() override { return data_builder.Finish(&out->buffers[1]); }
};

template <typename Type>
struct ListSelectionImpl : public Selection<ListSelectionImpl<Type>, Type> {
using offset_type = typename Type::offset_type;
@@ -939,23 +906,6 @@ Status LargeVarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch,
return TakeExec<VarBinarySelectionImpl<LargeBinaryType>>(ctx, batch, out);
}

Status FSBTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;
const auto byte_width = values.type->byte_width();
// Use primitive Take implementation (presumably faster) for some byte widths
switch (byte_width) {
case 1:
case 2:
case 4:
case 8:
case 16:
case 32:
return PrimitiveTakeExec(ctx, batch, out);
default:
return TakeExec<FSBSelectionImpl>(ctx, batch, out);
}
}

Status ListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return TakeExec<ListSelectionImpl<ListType>>(ctx, batch, out);
}
@@ -968,26 +918,12 @@ Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& values = batch[0].array;

// If a FixedSizeList wraps a fixed-width type we can, in some cases, use
// PrimitiveTakeExec for a fixed-size list array.
// FixedWidthTakeExec for a fixed-size list array.
if (util::IsFixedWidthLike(values,
/*force_null_count=*/true,
/*exclude_bool_and_dictionary=*/true)) {
const auto byte_width = util::FixedWidthInBytes(*values.type);
// Additionally, PrimitiveTakeExec is only implemented for specific byte widths.
// TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type.
switch (byte_width) {
case 1:
case 2:
case 4:
case 8:
case 16:
case 32:
return PrimitiveTakeExec(ctx, batch, out);
default:
break; // fallback to TakeExec<FSBSelectionImpl>
}
return FixedWidthTakeExec(ctx, batch, out);
}

return TakeExec<FSLSelectionImpl>(ctx, batch, out);
}
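
To make the new dispatch concrete, a hedged sketch of how a fixed-size list can ride on the factor mechanism from gather_internal.h: for a fixed_size_list<int32, list_size> input, one plausible mapping is a 32-bit value width with a runtime factor of list_size, so each taken index copies 4 * list_size contiguous bytes from the flattened child buffer. The helper below is illustrative, not the PR's actual FixedWidthTakeExec code.

template <typename IndexCType>
int64_t TakeFixedSizeListOfInt32NoNulls(const uint8_t* child_values, int64_t num_lists,
                                        const IndexCType* idx, int64_t idx_length,
                                        int64_t list_size, uint8_t* out) {
  arrow::internal::Gather</*kValueWidthInBits=*/32, IndexCType, /*kWithFactor=*/true>
      gather{/*src_length=*/num_lists, child_values, /*zero_src_offset=*/0,
             idx_length, idx, out, /*factor=*/list_size};
  return gather.Execute();  // null-free fast path; the validity-aware overload handles nulls
}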
