From 11cd6e81072657126fdbf16b04f6439764aff6c2 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 19 Apr 2024 15:00:15 +1200 Subject: [PATCH] GH-41231: [C#] Slice values array when writing a sliced list view array to IPC format (#41255) ### Rationale for this change Reduces IPC file sizes when writing sliced list view arrays. ### What changes are included in this PR? Updates `ArrowSreamWriter` so it only writes the required range of values for a list view array, and adjusts the offset values accordingly. ### Are these changes tested? Yes, this is covered by existing tests and I've also added a new test to verify the behaviour with list view arrays that have unordered offsets. ### Are there any user-facing changes? Yes, this might reduce IPC file sizes for users writing sliced data. * GitHub Issue: #41231 Lead-authored-by: Adam Reeve Co-authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- .../src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 64 ++++++++++++++- .../ArrowFileWriterTests.cs | 77 +++++++++++++++++++ .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 20 +++-- 3 files changed, 153 insertions(+), 8 deletions(-) diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index a7e4c13525236..b11479c0d4460 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -184,11 +184,19 @@ public void Visit(ListArray array) public void Visit(ListViewArray array) { + var (valueOffsetsBuffer, minOffset, maxEnd) = GetZeroBasedListViewOffsets(array); + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); - _buffers.Add(CreateSlicedBuffer(array.ValueOffsetsBuffer, array.Offset, array.Length)); + _buffers.Add(CreateBuffer(valueOffsetsBuffer)); _buffers.Add(CreateSlicedBuffer(array.SizesBuffer, array.Offset, array.Length)); - VisitArray(array.Values); + IArrowArray values = array.Values; + if (minOffset != 0 || values.Length != maxEnd) + { + values = ArrowArrayFactory.Slice(values, minOffset, maxEnd - minOffset); + } + + VisitArray(values); } public void Visit(FixedSizeListArray array) @@ -319,6 +327,58 @@ private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int } } + private (ArrowBuffer Buffer, int minOffset, int maxEnd) GetZeroBasedListViewOffsets(ListViewArray array) + { + if (array.Length == 0) + { + return (ArrowBuffer.Empty, 0, 0); + } + + var offsets = array.ValueOffsets; + var sizes = array.Sizes; + + int minOffset = offsets[0]; + int maxEnd = offsets[array.Length - 1] + sizes[array.Length - 1]; + + // Min possible offset is zero, and max possible end is the values length. + // If these match the first offset and last end we don't need to do anything further, + // but otherwise we need to iterate over each index in case the offsets aren't ordered. + if (minOffset != 0 || maxEnd != array.Values.Length) + { + for (int i = 0; i < array.Length; ++i) + { + minOffset = Math.Min(minOffset, offsets[i]); + maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]); + } + } + + var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * array.Length); + + if (minOffset == 0) + { + // No need to adjust the offsets, but we may need to slice the offsets buffer. + ArrowBuffer buffer = array.ValueOffsetsBuffer; + if (array.Offset != 0 || buffer.Length > requiredBytes) + { + var byteOffset = sizeof(int) * array.Offset; + var sliceLength = Math.Min(requiredBytes, buffer.Length - byteOffset); + buffer = new ArrowBuffer(buffer.Memory.Slice(byteOffset, sliceLength)); + } + + return (buffer, minOffset, maxEnd); + } + + // Compute shifted offsets + var newOffsetsBuffer = _allocator.Allocate(requiredBytes); + var newOffsets = newOffsetsBuffer.Memory.Span.CastTo(); + for (int i = 0; i < array.Length; ++i) + { + newOffsets[i] = offsets[i] - minOffset; + } + + return (new ArrowBuffer(newOffsetsBuffer), minOffset, maxEnd); + } + private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, int length) { if (buffer.IsEmpty) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs index 297cb5e181cd3..7b25910114882 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs @@ -135,6 +135,71 @@ public async Task WriteSlicedArrays(int sliceOffset, int sliceLength) await ValidateRecordBatchFile(stream, slicedBatch, strictCompare: false); } + [Theory] + [InlineData(0, 100)] + [InlineData(0, 50)] + [InlineData(50, 50)] + [InlineData(25, 50)] + public async Task WriteListViewDataWithUnorderedOffsets(int sliceOffset, int sliceLength) + { + // A list-view array doesn't require that offsets are ordered, + // so verify that we can round trip a list-view array with out-of-order offsets. + const int length = 100; + var random = new Random(); + + var randomizedIndices = Enumerable.Range(0, length).ToArray(); + Shuffle(randomizedIndices, random); + + var offsetsBuilder = new ArrowBuffer.Builder().Resize(length); + var sizesBuilder = new ArrowBuffer.Builder().Resize(length); + var validityBuilder = new ArrowBuffer.BitmapBuilder().Reserve(length); + + var valuesLength = 0; + for (int i = 0; i < length; ++i) + { + var index = randomizedIndices[i]; + var listLength = random.Next(0, 10); + offsetsBuilder.Span[index] = valuesLength; + sizesBuilder.Span[index] = listLength; + valuesLength += listLength; + + validityBuilder.Append(random.NextDouble() < 0.9); + } + + var valuesBuilder = new Int64Array.Builder().Reserve(valuesLength); + for (int i = 0; i < valuesLength; ++i) + { + valuesBuilder.Append(random.Next(0, 1_000)); + } + + var type = new ListViewType(new Int64Type()); + var offsets = offsetsBuilder.Build(); + var sizes = sizesBuilder.Build(); + var values = valuesBuilder.Build(); + var nullCount = validityBuilder.UnsetBitCount; + var validityBuffer = validityBuilder.Build(); + + IArrowArray listViewArray = new ListViewArray( + type, length, offsets, sizes, values, validityBuffer, nullCount); + + if (sliceOffset != 0 || sliceLength != length) + { + listViewArray = ArrowArrayFactory.Slice(listViewArray, sliceOffset, sliceLength); + } + + var recordBatch = new RecordBatch.Builder().Append("x", true, listViewArray).Build(); + + var stream = new MemoryStream(); + var writer = new ArrowFileWriter(stream, recordBatch.Schema, leaveOpen: true); + + await writer.WriteRecordBatchAsync(recordBatch); + await writer.WriteEndAsync(); + + stream.Position = 0; + + await ValidateRecordBatchFile(stream, recordBatch, strictCompare: false); + } + private async Task ValidateRecordBatchFile(Stream stream, RecordBatch recordBatch, bool strictCompare = true) { var reader = new ArrowFileReader(stream); @@ -245,5 +310,17 @@ public async Task WriteListArrayWithEmptyOffsets() await ValidateRecordBatchFile(stream, recordBatch, strictCompare: false); } + + private static void Shuffle(int[] values, Random random) + { + var length = values.Length; + for (int i = 0; i < length - 1; ++i) + { + var j = random.Next(i, length); + var tmp = values[i]; + values[i] = values[j]; + values[j] = tmp; + } + } } } diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index f656979c22c86..360c3d71296c7 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -449,16 +449,24 @@ private void CompareArrays(ListViewArray actualArray) Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span)); + actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } else { - int start = expectedArray.Offset * sizeof(int); - int length = expectedArray.Length * sizeof(int); - Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(start, length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length))); - Assert.True(expectedArray.SizesBuffer.Span.Slice(start, length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length))); + for (int i = 0; i < actualArray.Length; ++i) + { + if (expectedArray.IsNull(i)) + { + Assert.True(actualArray.IsNull(i)); + } + else + { + var expectedList = expectedArray.GetSlicedValues(i); + var actualList = actualArray.GetSlicedValues(i); + actualList.Accept(new ArrayComparer(expectedList, _strictCompare)); + } + } } - - actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } private void CompareArrays(FixedSizeListArray actualArray)