From 75249d40445be9ea8670ef61c35c2b75716bdc76 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 17 Apr 2024 00:25:49 +1200 Subject: [PATCH] GH-41225: [C#] Slice value buffers when writing sliced list or binary arrays in IPC format (#41230) ### Rationale for this change This reduces file sizes when writing sliced binary or list arrays to IPC format. ### What changes are included in this PR? Changes `ArrowStreamWriter` to write only the subset of the values that is needed rather than the full value buffer when writing a `ListArray` or `BinaryArray`, and compute shifted value offset buffers. ### Are these changes tested? This code is covered by existing tests and the change doesn't introduce any difference in the observed array values, so I haven't added new tests or checks. I did change how list arrays are compared though as we can no longer compare the value and value offset buffers directly, so the tests now get list items as arrays and create a new `ArrayComparer` to compare them. This meant that array offsets are no longer always zero, so I've changed the offset assertions to only be used in strict mode. ### Are there any user-facing changes? Yes, this might reduce IPC file sizes for users writing sliced data. * GitHub Issue: #41225 Authored-by: Adam Reeve Signed-off-by: Curt Hagenlocher --- .../src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 54 +++++++++++++++++-- .../ArrowFileWriterTests.cs | 1 + .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 50 +++++++++++------ csharp/test/Apache.Arrow.Tests/TestData.cs | 47 ++++++++++++---- 4 files changed, 122 insertions(+), 30 deletions(-) diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 6127c5a662dfe..1b83735925556 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -163,9 +163,18 @@ public void Visit(BooleanArray array) public void Visit(ListArray array) { _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); - _buffers.Add(CreateSlicedBuffer(array.ValueOffsetsBuffer, array.Offset, array.Length + 1)); + _buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length))); - VisitArray(array.Values); + int valuesOffset = array.ValueOffsets[0]; + int valuesLength = array.ValueOffsets[array.Length] - valuesOffset; + + var values = array.Values; + if (valuesOffset > 0 || valuesLength < values.Length) + { + values = ArrowArrayFactory.Slice(values, valuesOffset, valuesLength); + } + + VisitArray(values); } public void Visit(ListViewArray array) @@ -195,8 +204,12 @@ public void Visit(FixedSizeListArray array) public void Visit(BinaryArray array) { _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); - _buffers.Add(CreateSlicedBuffer(array.ValueOffsetsBuffer, array.Offset, array.Length + 1)); - _buffers.Add(CreateBuffer(array.ValueBuffer)); + _buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length))); + + int valuesOffset = array.ValueOffsets[0]; + int valuesLength = array.ValueOffsets[array.Length] - valuesOffset; + + _buffers.Add(CreateSlicedBuffer(array.ValueBuffer, valuesOffset, valuesLength)); } public void Visit(BinaryViewArray array) @@ -263,6 +276,39 @@ public void Visit(NullArray array) // There are no buffers for a NullArray } + private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int arrayOffset, int arrayLength) + { + var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * (arrayLength + 1)); + + if (arrayOffset != 0) + { + // Array has been sliced, so we need to shift and adjust the offsets + var originalOffsets = valueOffsetsBuffer.Span.CastTo().Slice(arrayOffset, arrayLength + 1); + var firstOffset = arrayLength > 0 ? originalOffsets[0] : 0; + + var newValueOffsetsBuffer = _allocator.Allocate(requiredBytes); + var newValueOffsets = newValueOffsetsBuffer.Memory.Span.CastTo(); + + for (int i = 0; i < arrayLength + 1; ++i) + { + newValueOffsets[i] = originalOffsets[i] - firstOffset; + } + + return new ArrowBuffer(newValueOffsetsBuffer); + } + else if (valueOffsetsBuffer.Length > requiredBytes) + { + // Array may have been sliced but the offset is zero, + // so we can truncate the existing offsets + return new ArrowBuffer(valueOffsetsBuffer.Memory.Slice(0, requiredBytes)); + } + else + { + // Use the full buffer + return valueOffsetsBuffer; + } + } + private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, int length) { if (buffer.IsEmpty) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs index faf650973d64c..baea4d61e5b66 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs @@ -113,6 +113,7 @@ public async Task WritesFooterAlignedMultipleOf8Async() [InlineData(0, 45)] [InlineData(3, 45)] [InlineData(16, 45)] + [InlineData(10, 0)] public async Task WriteSlicedArrays(int sliceOffset, int sliceLength) { var originalBatch = TestData.CreateSampleRecordBatch(length: 100); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 07c8aa3f56b3b..95972193219ac 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -160,10 +160,14 @@ public void Visit(StructArray array) Assert.Equal(expectedArray.Length, array.Length); Assert.Equal(expectedArray.NullCount, array.NullCount); - Assert.Equal(0, array.Offset); Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length); Assert.Equal(expectedArray.Fields.Count, array.Fields.Count); + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, array.Offset); + } + for (int i = 0; i < array.Fields.Count; i++) { array.Fields[i].Accept(new ArrayComparer(expectedArray.Fields[i], _strictCompare)); @@ -178,12 +182,12 @@ public void Visit(UnionArray array) Assert.Equal(expectedArray.Mode, array.Mode); Assert.Equal(expectedArray.Length, array.Length); Assert.Equal(expectedArray.NullCount, array.NullCount); - Assert.Equal(0, array.Offset); Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length); Assert.Equal(expectedArray.Fields.Count, array.Fields.Count); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, array.Offset); Assert.True(expectedArray.TypeBuffer.Span.SequenceEqual(array.TypeBuffer.Span)); } else @@ -252,12 +256,12 @@ private void CompareBinaryArrays(BinaryArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length))); } @@ -284,7 +288,11 @@ private void CompareVariadicArrays(BinaryViewArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); + + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, actualArray.Offset); + } CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); @@ -309,12 +317,12 @@ private void CompareArrays(FixedSizeBinaryArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueBuffer.Span.Slice(0, expectedArray.Length).SequenceEqual(actualArray.ValueBuffer.Span.Slice(0, actualArray.Length))); } else @@ -338,12 +346,12 @@ private void CompareArrays(PrimitiveArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length))); } else @@ -370,12 +378,12 @@ private void CompareArrays(BooleanArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); int booleanByteCount = BitUtility.ByteCount(expectedArray.Length); Assert.True(expectedArray.Values.Slice(0, booleanByteCount).SequenceEqual(actualArray.Values.Slice(0, booleanByteCount))); } @@ -397,22 +405,31 @@ private void CompareArrays(ListArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); + actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } else { - int offsetsStart = (expectedArray.Offset) * sizeof(int); - int offsetsLength = (expectedArray.Length + 1) * sizeof(int); - Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(offsetsStart, offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, offsetsLength))); + for (int i = 0; i < actualArray.Length; ++i) + { + if (expectedArray.IsNull(i)) + { + Assert.True(actualArray.IsNull(i)); + } + else + { + var expectedList = expectedArray.GetSlicedValues(i); + var actualList = actualArray.GetSlicedValues(i); + actualList.Accept(new ArrayComparer(expectedList, _strictCompare)); + } + } } - - actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } private void CompareArrays(ListViewArray actualArray) @@ -424,12 +441,12 @@ private void CompareArrays(ListViewArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span)); } @@ -453,7 +470,10 @@ private void CompareArrays(FixedSizeListArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, actualArray.Offset); + } CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs b/csharp/test/Apache.Arrow.Tests/TestData.cs index 29ddef2864862..3ea42ee0fbcb7 100644 --- a/csharp/test/Apache.Arrow.Tests/TestData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestData.cs @@ -294,7 +294,18 @@ public void Visit(StringType type) for (var i = 0; i < Length; i++) { - builder.Append(str); + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(str); + break; + case 2: + builder.Append(str + str); + break; + } } Array = builder.Build(); @@ -328,15 +339,21 @@ public void Visit(ListType type) { var builder = new ListArray.Builder(type.ValueField).Reserve(Length); - var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1); + var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length * 3 / 2); for (var i = 0; i < Length; i++) { - builder.Append(); - valueBuilder.Append(i); + if (i % 10 == 2) + { + builder.AppendNull(); + } + else + { + builder.Append(); + var listLength = i % 4; + valueBuilder.AppendRange(Enumerable.Range(i, listLength).Select(x => (long)x)); + } } - //Add a value to check if Values.Length can exceed ListArray.Length - valueBuilder.Append(0); Array = builder.Build(); } @@ -352,8 +369,12 @@ public void Visit(ListViewType type) builder.Append(); valueBuilder.Append(i); } - //Add a value to check if Values.Length can exceed ListArray.Length - valueBuilder.Append(0); + + if (Length > 0) + { + // Add a value to check if Values.Length can exceed ListArray.Length + valueBuilder.Append(0); + } Array = builder.Build(); } @@ -562,9 +583,13 @@ public void Visit(MapType type) keyBuilder.Append(i.ToString()); valueBuilder.Append(i); } - //Add a value to check if Values.Length can exceed MapArray.Length - keyBuilder.Append("0"); - valueBuilder.Append(0); + + if (Length > 0) + { + // Add a value to check if Values.Length can exceed MapArray.Length + keyBuilder.Append("0"); + valueBuilder.Append(0); + } Array = builder.Build(); }