Skip to content

Commit

Permalink
GH-41225: [C#] Slice value buffers when writing sliced list or binary…
Browse files Browse the repository at this point in the history
… arrays in IPC format (#41230)

### Rationale for this change

This reduces file sizes when writing sliced binary or list arrays to IPC format.

### What changes are included in this PR?

Changes `ArrowStreamWriter` to write only the subset of the values that is needed rather than the full value buffer when writing a `ListArray` or `BinaryArray`, and compute shifted value offset buffers.

### Are these changes tested?

This code is covered by existing tests and the change doesn't introduce any difference in the observed array values, so I haven't added new tests or checks.

I did change how list arrays are compared though as we can no longer compare the value and value offset buffers directly, so the tests now get list items as arrays and create a new `ArrayComparer` to compare them. This meant that array offsets are no longer always zero, so I've changed the offset assertions to only be used in strict mode.

### Are there any user-facing changes?

Yes, this might reduce IPC file sizes for users writing sliced data.
* GitHub Issue: #41225

Authored-by: Adam Reeve <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
  • Loading branch information
adamreeve authored Apr 16, 2024
1 parent 3c37848 commit 56186b9
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 30 deletions.
54 changes: 50 additions & 4 deletions csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,18 @@ public void Visit(BooleanArray array)
public void Visit(ListArray array)
{
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length));
_buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, array.Offset, array.Length + 1));
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length)));

VisitArray(array.Values);
int valuesOffset = array.ValueOffsets[0];
int valuesLength = array.ValueOffsets[array.Length] - valuesOffset;

var values = array.Values;
if (valuesOffset > 0 || valuesLength < values.Length)
{
values = ArrowArrayFactory.Slice(values, valuesOffset, valuesLength);
}

VisitArray(values);
}

public void Visit(ListViewArray array)
Expand Down Expand Up @@ -195,8 +204,12 @@ public void Visit(FixedSizeListArray array)
public void Visit(BinaryArray array)
{
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length));
_buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, array.Offset, array.Length + 1));
_buffers.Add(CreateBuffer(array.ValueBuffer));
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length)));

int valuesOffset = array.ValueOffsets[0];
int valuesLength = array.ValueOffsets[array.Length] - valuesOffset;

_buffers.Add(CreateSlicedBuffer<byte>(array.ValueBuffer, valuesOffset, valuesLength));
}

public void Visit(BinaryViewArray array)
Expand Down Expand Up @@ -263,6 +276,39 @@ public void Visit(NullArray array)
// There are no buffers for a NullArray
}

private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int arrayOffset, int arrayLength)
{
var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * (arrayLength + 1));

if (arrayOffset != 0)
{
// Array has been sliced, so we need to shift and adjust the offsets
var originalOffsets = valueOffsetsBuffer.Span.CastTo<int>().Slice(arrayOffset, arrayLength + 1);
var firstOffset = arrayLength > 0 ? originalOffsets[0] : 0;

var newValueOffsetsBuffer = _allocator.Allocate(requiredBytes);
var newValueOffsets = newValueOffsetsBuffer.Memory.Span.CastTo<int>();

for (int i = 0; i < arrayLength + 1; ++i)
{
newValueOffsets[i] = originalOffsets[i] - firstOffset;
}

return new ArrowBuffer(newValueOffsetsBuffer);
}
else if (valueOffsetsBuffer.Length > requiredBytes)
{
// Array may have been sliced but the offset is zero,
// so we can truncate the existing offsets
return new ArrowBuffer(valueOffsetsBuffer.Memory.Slice(0, requiredBytes));
}
else
{
// Use the full buffer
return valueOffsetsBuffer;
}
}

private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, int length)
{
if (buffer.IsEmpty)
Expand Down
1 change: 1 addition & 0 deletions csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ public async Task WritesFooterAlignedMultipleOf8Async()
[InlineData(0, 45)]
[InlineData(3, 45)]
[InlineData(16, 45)]
[InlineData(10, 0)]
public async Task WriteSlicedArrays(int sliceOffset, int sliceLength)
{
var originalBatch = TestData.CreateSampleRecordBatch(length: 100);
Expand Down
50 changes: 35 additions & 15 deletions csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,14 @@ public void Visit(StructArray array)

Assert.Equal(expectedArray.Length, array.Length);
Assert.Equal(expectedArray.NullCount, array.NullCount);
Assert.Equal(0, array.Offset);
Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length);
Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, array.Offset);
}

for (int i = 0; i < array.Fields.Count; i++)
{
array.Fields[i].Accept(new ArrayComparer(expectedArray.Fields[i], _strictCompare));
Expand All @@ -178,12 +182,12 @@ public void Visit(UnionArray array)
Assert.Equal(expectedArray.Mode, array.Mode);
Assert.Equal(expectedArray.Length, array.Length);
Assert.Equal(expectedArray.NullCount, array.NullCount);
Assert.Equal(0, array.Offset);
Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length);
Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, array.Offset);
Assert.True(expectedArray.TypeBuffer.Span.SequenceEqual(array.TypeBuffer.Span));
}
else
Expand Down Expand Up @@ -252,12 +256,12 @@ private void CompareBinaryArrays<T>(BinaryArray actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length)));
}
Expand All @@ -284,7 +288,11 @@ private void CompareVariadicArrays<T>(BinaryViewArray actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
}

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

Expand All @@ -309,12 +317,12 @@ private void CompareArrays(FixedSizeBinaryArray actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueBuffer.Span.Slice(0, expectedArray.Length).SequenceEqual(actualArray.ValueBuffer.Span.Slice(0, actualArray.Length)));
}
else
Expand All @@ -338,12 +346,12 @@ private void CompareArrays<T>(PrimitiveArray<T> actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length)));
}
else
Expand All @@ -370,12 +378,12 @@ private void CompareArrays(BooleanArray actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
int booleanByteCount = BitUtility.ByteCount(expectedArray.Length);
Assert.True(expectedArray.Values.Slice(0, booleanByteCount).SequenceEqual(actualArray.Values.Slice(0, booleanByteCount)));
}
Expand All @@ -397,22 +405,31 @@ private void CompareArrays(ListArray actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare));
}
else
{
int offsetsStart = (expectedArray.Offset) * sizeof(int);
int offsetsLength = (expectedArray.Length + 1) * sizeof(int);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(offsetsStart, offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, offsetsLength)));
for (int i = 0; i < actualArray.Length; ++i)
{
if (expectedArray.IsNull(i))
{
Assert.True(actualArray.IsNull(i));
}
else
{
var expectedList = expectedArray.GetSlicedValues(i);
var actualList = actualArray.GetSlicedValues(i);
actualList.Accept(new ArrayComparer(expectedList, _strictCompare));
}
}
}

actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare));
}

private void CompareArrays(ListViewArray actualArray)
Expand All @@ -424,12 +441,12 @@ private void CompareArrays(ListViewArray actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span));
}
Expand All @@ -453,7 +470,10 @@ private void CompareArrays(FixedSizeListArray actualArray)

Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
Assert.Equal(0, actualArray.Offset);
if (_strictCompare)
{
Assert.Equal(expectedArray.Offset, actualArray.Offset);
}

CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer);

Expand Down
47 changes: 36 additions & 11 deletions csharp/test/Apache.Arrow.Tests/TestData.cs
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,18 @@ public void Visit(StringType type)

for (var i = 0; i < Length; i++)
{
builder.Append(str);
switch (i % 3)
{
case 0:
builder.AppendNull();
break;
case 1:
builder.Append(str);
break;
case 2:
builder.Append(str + str);
break;
}
}

Array = builder.Build();
Expand Down Expand Up @@ -328,15 +339,21 @@ public void Visit(ListType type)
{
var builder = new ListArray.Builder(type.ValueField).Reserve(Length);

var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1);
var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length * 3 / 2);

for (var i = 0; i < Length; i++)
{
builder.Append();
valueBuilder.Append(i);
if (i % 10 == 2)
{
builder.AppendNull();
}
else
{
builder.Append();
var listLength = i % 4;
valueBuilder.AppendRange(Enumerable.Range(i, listLength).Select(x => (long)x));
}
}
//Add a value to check if Values.Length can exceed ListArray.Length
valueBuilder.Append(0);

Array = builder.Build();
}
Expand All @@ -352,8 +369,12 @@ public void Visit(ListViewType type)
builder.Append();
valueBuilder.Append(i);
}
//Add a value to check if Values.Length can exceed ListArray.Length
valueBuilder.Append(0);

if (Length > 0)
{
// Add a value to check if Values.Length can exceed ListArray.Length
valueBuilder.Append(0);
}

Array = builder.Build();
}
Expand Down Expand Up @@ -562,9 +583,13 @@ public void Visit(MapType type)
keyBuilder.Append(i.ToString());
valueBuilder.Append(i);
}
//Add a value to check if Values.Length can exceed MapArray.Length
keyBuilder.Append("0");
valueBuilder.Append(0);

if (Length > 0)
{
// Add a value to check if Values.Length can exceed MapArray.Length
keyBuilder.Append("0");
valueBuilder.Append(0);
}

Array = builder.Build();
}
Expand Down

0 comments on commit 56186b9

Please sign in to comment.