Skip to content

Commit

Permalink
ARROW-8486: [C++] Fix BitArray failures on big-endian platforms
Browse files Browse the repository at this point in the history
This PR supports big-endian platforms of bit operations such as Bitmap, BitRead, BitWriter, and others. Ths PR can fix `BitArray.TestBool` and `BitArray.TestValue` in arrow-utility-test's [failures](https://travis-ci.org/github/apache/arrow/builds/684931696).

This PR consists of two parts.
1. Convert data layout to native-endian to process data. This PR basically inserts this conversion before and after `memcpy`. This is because because `memcpy` transfers data from/to variable using a memory layout in the buffer. This PR still assumes that data layout in buffer is in little-endian.
2. Add ``BitUtil::ByteSwap` for uint8.

Closes apache#7136 from kiszk/ARROW-8486

Authored-by: Kazuaki Ishizaki <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
kiszk authored and pitrou committed May 11, 2020
1 parent e215e89 commit b2cf62f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 17 deletions.
13 changes: 12 additions & 1 deletion cpp/src/arrow/util/bit_stream_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class BitWriter {
/// Writes v to the next aligned byte using num_bytes. If T is larger than
/// num_bytes, the extra high-order bytes will be ignored. Returns false if
/// there was not enough space.
/// Assume the v is stored in buffer_ as a litte-endian format
template <typename T>
bool PutAligned(T v, int num_bytes);

Expand Down Expand Up @@ -107,6 +108,7 @@ class BitReader {
: buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
int num_bytes = std::min(8, max_bytes_ - byte_offset_);
memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
}

BitReader()
Expand All @@ -123,6 +125,7 @@ class BitReader {
bit_offset_ = 0;
int num_bytes = std::min(8, max_bytes_ - byte_offset_);
memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
}

/// Gets the next value from the buffer. Returns true if 'v' could be read or false if
Expand All @@ -139,6 +142,7 @@ class BitReader {
/// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
/// be advanced to the start of the next byte before 'v' is read. Returns
/// false if there are not enough bytes left.
/// Assume the v was stored in buffer_ as a litte-endian format
template <typename T>
bool GetAligned(int num_bytes, T* v);

Expand Down Expand Up @@ -185,6 +189,7 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {

if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
// Flush buffered_values_ and write out bits of v that did not fit
buffered_values_ = arrow::BitUtil::ToLittleEndian(buffered_values_);
memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
buffered_values_ = 0;
byte_offset_ += 8;
Expand All @@ -198,7 +203,8 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
inline void BitWriter::Flush(bool align) {
int num_bytes = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes);
auto buffered_values = arrow::BitUtil::ToLittleEndian(buffered_values_);
memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);

if (align) {
buffered_values_ = 0;
Expand All @@ -220,6 +226,7 @@ template <typename T>
inline bool BitWriter::PutAligned(T val, int num_bytes) {
uint8_t* ptr = GetNextBytePtr(num_bytes);
if (ptr == NULL) return false;
val = arrow::BitUtil::ToLittleEndian(val);
memcpy(ptr, &val, num_bytes);
return true;
}
Expand Down Expand Up @@ -249,6 +256,7 @@ inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
} else {
memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
}
*buffered_values = arrow::BitUtil::FromLittleEndian(*buffered_values);
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4800 4805)
Expand Down Expand Up @@ -335,6 +343,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
} else {
memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
}
buffered_values = arrow::BitUtil::FromLittleEndian(buffered_values);

for (; i < batch_size; ++i) {
detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
Expand Down Expand Up @@ -362,6 +371,7 @@ inline bool BitReader::GetAligned(int num_bytes, T* v) {
// Advance byte_offset to next unread byte and read num_bytes
byte_offset_ += bytes_read;
memcpy(v, buffer_ + byte_offset_, num_bytes);
*v = arrow::BitUtil::FromLittleEndian(*v);
byte_offset_ += num_bytes;

// Reset buffered_values_
Expand All @@ -372,6 +382,7 @@ inline bool BitReader::GetAligned(int num_bytes, T* v) {
} else {
memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
}
buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
return true;
}

Expand Down
41 changes: 25 additions & 16 deletions cpp/src/arrow/util/bit_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ static inline int16_t ByteSwap(int16_t value) {
static inline uint16_t ByteSwap(uint16_t value) {
return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
}
static inline uint8_t ByteSwap(uint8_t value) { return value; }

// Write the swapped bytes into dst. Src and dst cannot overlap.
static inline void ByteSwap(void* dst, const void* src, int len) {
Expand Down Expand Up @@ -358,53 +359,61 @@ static inline void ByteSwap(void* dst, const void* src, int len) {

// Convert to little/big endian format from the machine's native endian format.
#if ARROW_LITTLE_ENDIAN
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T ToBigEndian(T value) {
return ByteSwap(value);
}

template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T ToLittleEndian(T value) {
return value;
}
#else
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T ToBigEndian(T value) {
return value;
}

template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T ToLittleEndian(T value) {
return ByteSwap(value);
}
#endif

// Convert from big/little endian format to the machine's native endian format.
#if ARROW_LITTLE_ENDIAN
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T FromBigEndian(T value) {
return ByteSwap(value);
}

template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T FromLittleEndian(T value) {
return value;
}
#else
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T FromBigEndian(T value) {
return value;
}

template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
uint32_t, int16_t, uint16_t>>
template <typename T,
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
int16_t, uint16_t, uint8_t>>
static inline T FromLittleEndian(T value) {
return ByteSwap(value);
}
Expand Down

0 comments on commit b2cf62f

Please sign in to comment.