Skip to content

Commit

Permalink
Use get_unchecked in unsafe block and add MAX_ELEMENT_SIZE const
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve committed Jun 23, 2024
1 parent 339c0ce commit 644355c
Showing 1 changed file with 25 additions and 10 deletions.
Original file line number Diff line number Diff line change
@@ -1,38 +1,42 @@
use crate::parquet::error::ParquetError;

const MAX_ELEMENT_SIZE: usize = 8;

/// Decodes using the [Byte Stream Split](https://github.com/apache/parquet-format/blob/master/Encodings.md#byte-stream-split-byte_stream_split--9) encoding.
/// # Implementation
/// A fixed size buffer is stored inline to support reading types of up to 8 bytes in size.
#[derive(Debug)]
pub struct Decoder<'a> {
values: &'a [u8],
buffer: [u8; 8],
buffer: [u8; MAX_ELEMENT_SIZE],
num_elements: usize,
position: usize,
element_size: usize,
}

impl<'a> Decoder<'a> {
pub fn try_new(values: &'a [u8], element_size: usize) -> Result<Self, ParquetError> {
if element_size > 8 {
if element_size > MAX_ELEMENT_SIZE {
// Since Parquet format version 2.11 it's valid to use byte stream split for fixed-length byte array data,
// which could be larger than 8 bytes, but Polars doesn't yet support reading byte stream split encoded FLBA data.
return Err(ParquetError::oos(
"Byte stream split decoding only supports up to 8 byte element sizes",
));
return Err(ParquetError::oos(format!(
"Byte stream split decoding only supports up to {} byte element sizes",
MAX_ELEMENT_SIZE
)));
}

let values_size = values.len();
if values_size % element_size != 0 {
return Err(ParquetError::oos(
"Values array length is not a multiple of the element size",
));
return Err(ParquetError::oos(format!(
"Values array length ({}) is not a multiple of the element size ({})",
values_size, element_size
)));
}
let num_elements = values.len() / element_size;

Ok(Self {
values,
buffer: [0; 8],
buffer: [0; MAX_ELEMENT_SIZE],
num_elements,
position: 0,
element_size,
Expand All @@ -44,8 +48,19 @@ impl<'a> Decoder<'a> {
return false;
}

debug_assert!(self.element_size <= MAX_ELEMENT_SIZE);
debug_assert!(self.values.len() >= self.num_elements * self.element_size);
for n in 0..self.element_size {
self.buffer[n] = self.values[(self.num_elements * n) + self.position]
unsafe {
// SAFETY:
// We have the invariants that element_size <= MAX_ELEMENT_SIZE,
// buffer.len() == MAX_ELEMENT_SIZE,
// position < num_elements and
// values.len() >= num_elements * element_size.
*self.buffer.get_unchecked_mut(n) = *self
.values
.get_unchecked((self.num_elements * n) + self.position)
}
}

self.position += 1;
Expand Down

0 comments on commit 644355c

Please sign in to comment.