Skip to content

Commit

Permalink
apacheGH-41164: [C#] Fix concatenation of sliced arrays (apache#41245)
Browse files Browse the repository at this point in the history
### Rationale for this change

Makes array concatenation work correctly when the input arrays have been sliced.

### What changes are included in this PR?

* Updates the array concatenation tests so that the `TestDataGenerator` can generate test cases with sliced input arrays. To avoid too much duplicated logic, I've added a new `GenerateTestData<TArray, TArrayBuilder>` method that works with builders that are not `IArrowArrayBuilder<T, TArray, TArrayBuilder>`, and simplified a lot of the data generation by using this new method. Only struct and union array test data generation still needs to duplicate the logic in `GenerateTestData`.
* Fixes `ArrayDataConcatenator` logic to handle sliced input arrays

### Are these changes tested?

Yes, I've added a new test for this.

### Are there any user-facing changes?

Yes, this is a user-facing bug fix.
* GitHub Issue: apache#41164

Authored-by: Adam Reeve <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
  • Loading branch information
adamreeve authored Apr 17, 2024
1 parent d49b62d commit 63c91ff
Show file tree
Hide file tree
Showing 4 changed files with 430 additions and 465 deletions.
197 changes: 164 additions & 33 deletions csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -107,38 +107,76 @@ public void Visit(ListViewType type)
{
CheckData(type, 3);
ArrowBuffer validityBuffer = ConcatenateValidityBuffer();
ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default);

var children = new List<ArrayData>(_arrayDataList.Count);
var offsetsBuilder = new ArrowBuffer.Builder<int>(_totalLength);
int baseOffset = 0;

foreach (ArrayData arrayData in _arrayDataList)
{
if (arrayData.Length > 0)
if (arrayData.Length == 0)
{
ReadOnlySpan<int> span = arrayData.Buffers[1].Span.CastTo<int>().Slice(0, arrayData.Length);
foreach (int offset in span)
{
offsetsBuilder.Append(baseOffset + offset);
}
continue;
}

var child = arrayData.Children[0];
ReadOnlySpan<int> offsets = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length);
ReadOnlySpan<int> sizes = arrayData.Buffers[2].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length);
var minOffset = offsets[0];
var maxEnd = 0;

for (int i = 0; i < arrayData.Length; ++i)
{
minOffset = Math.Min(minOffset, offsets[i]);
maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
}

baseOffset += arrayData.Children[0].Length;
foreach (int offset in offsets)
{
offsetsBuilder.Append(baseOffset + offset - minOffset);
}

var childLength = maxEnd - minOffset;
if (minOffset != 0 || childLength != child.Length)
{
child = child.Slice(minOffset, childLength);
}

baseOffset += childLength;
children.Add(child);
}

ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator);
ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default);
ArrayData child = Concatenate(SelectChildren(0), _allocator);
ArrayData combinedChild = Concatenate(children, _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { child });
Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { combinedChild });
}

public void Visit(FixedSizeListType type)
{
CheckData(type, 1);
var listSize = type.ListSize;
ArrowBuffer validityBuffer = ConcatenateValidityBuffer();
ArrayData child = Concatenate(SelectChildren(0), _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, new[] { child });
var children = new List<ArrayData>(_arrayDataList.Count);

foreach (ArrayData arrayData in _arrayDataList)
{
var offset = arrayData.Offset;
var length = arrayData.Length;
var child = arrayData.Children[0];
if (offset != 0 || child.Length != length * listSize)
{
child = child.Slice(offset * listSize, length * listSize);
}

children.Add(child);
}

ArrayData combinedChild = Concatenate(children, _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, new[] { combinedChild });
}

public void Visit(StructType type)
Expand All @@ -149,7 +187,7 @@ public void Visit(StructType type)

for (int i = 0; i < type.Fields.Count; i++)
{
children.Add(Concatenate(SelectChildren(i), _allocator));
children.Add(Concatenate(SelectSlicedChildren(i), _allocator));
}

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, children);
Expand All @@ -169,7 +207,11 @@ public void Visit(UnionType type)

for (int i = 0; i < type.Fields.Count; i++)
{
children.Add(Concatenate(SelectChildren(i), _allocator));
// For dense mode, the offsets aren't adjusted so are into the non-sliced child arrays
var fieldChildren = type.Mode == UnionMode.Sparse
? SelectSlicedChildren(i)
: SelectChildren(i);
children.Add(Concatenate(fieldChildren, _allocator));
}

ArrowBuffer[] buffers = new ArrowBuffer[bufferCount];
Expand Down Expand Up @@ -242,9 +284,30 @@ private void ConcatenateLists(NestedType type)
CheckData(type, 2);
ArrowBuffer validityBuffer = ConcatenateValidityBuffer();
ArrowBuffer offsetBuffer = ConcatenateOffsetBuffer();
ArrayData child = Concatenate(SelectChildren(0), _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { child });
var children = new List<ArrayData>(_arrayDataList.Count);
foreach (ArrayData arrayData in _arrayDataList)
{
if (arrayData.Length == 0)
{
continue;
}

var child = arrayData.Children[0];
ReadOnlySpan<int> offsets = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length + 1);
var firstOffset = offsets[0];
var lastOffset = offsets[arrayData.Length];
if (firstOffset != 0 || lastOffset != child.Length)
{
child = child.Slice(firstOffset, lastOffset - firstOffset);
}

children.Add(child);
}

ArrayData combinedChild = Concatenate(children, _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { combinedChild });
}

private ArrowBuffer ConcatenateValidityBuffer()
Expand All @@ -254,7 +317,43 @@ private ArrowBuffer ConcatenateValidityBuffer()
return ArrowBuffer.Empty;
}

return ConcatenateBitmapBuffer(0);
var builder = new ArrowBuffer.BitmapBuilder(_totalLength);

foreach (ArrayData arrayData in _arrayDataList)
{
int length = arrayData.Length;
int offset = arrayData.Offset;
ReadOnlySpan<byte> span = arrayData.Buffers[0].Span;

if (length > 0 && span.Length == 0)
{
if (arrayData.NullCount == 0)
{
builder.AppendRange(true , length);
}
else if (arrayData.NullCount == length)
{
builder.AppendRange(false , length);
}
else
{
throw new Exception("Array has no validity buffer and null count != 0 or length");
}
}
else if (offset == 0)
{
builder.Append(span, length);
}
else
{
for (int i = 0; i < length; ++i)
{
builder.Append(BitUtility.GetBit(span, offset + i));
}
}
}

return builder.Build(_allocator);
}

private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex)
Expand All @@ -264,9 +363,20 @@ private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex)
foreach (ArrayData arrayData in _arrayDataList)
{
int length = arrayData.Length;
int offset = arrayData.Offset;
ReadOnlySpan<byte> span = arrayData.Buffers[bufferIndex].Span;

builder.Append(span, length);
if (offset == 0)
{
builder.Append(span, length);
}
else
{
for (int i = 0; i < length; ++i)
{
builder.Append(BitUtility.GetBit(span, offset + i));
}
}
}

return builder.Build(_allocator);
Expand All @@ -279,10 +389,10 @@ private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(int bufferIndex, FixedW

foreach (ArrayData arrayData in _arrayDataList)
{
int length = arrayData.Length;
int byteLength = length * typeByteWidth;
int byteLength = arrayData.Length * typeByteWidth;
int byteOffset = arrayData.Offset * typeByteWidth;

builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(0, byteLength));
builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(byteOffset, byteLength));
}

return builder.Build(_allocator);
Expand All @@ -294,8 +404,10 @@ private ArrowBuffer ConcatenateVariableBinaryValueBuffer()

foreach (ArrayData arrayData in _arrayDataList)
{
int lastOffset = arrayData.Buffers[1].Span.CastTo<int>()[arrayData.Length];
builder.Append(arrayData.Buffers[2].Span.Slice(0, lastOffset));
var offsets = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length + 1);
var firstOffset = offsets[0];
var lastOffset = offsets[arrayData.Length];
builder.Append(arrayData.Buffers[2].Span.Slice(firstOffset, lastOffset - firstOffset));
}

return builder.Build(_allocator);
Expand All @@ -306,28 +418,27 @@ private ArrowBuffer ConcatenateOffsetBuffer()
var builder = new ArrowBuffer.Builder<int>(_totalLength + 1);
int baseOffset = 0;

builder.Append(0);

foreach (ArrayData arrayData in _arrayDataList)
{
if (arrayData.Length == 0)
{
continue;
}

// The first offset is always 0.
// It should be skipped because it duplicate to the last offset of builder.
ReadOnlySpan<int> span = arrayData.Buffers[1].Span.CastTo<int>().Slice(1, arrayData.Length);
ReadOnlySpan<int> span = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length + 1);
// First offset may be non-zero for sliced arrays
var firstOffset = span[0];

foreach (int offset in span)
foreach (int offset in span.Slice(0, arrayData.Length))
{
builder.Append(baseOffset + offset);
builder.Append(baseOffset + offset - firstOffset);
}

// The next offset must start from the current last offset.
baseOffset += span[arrayData.Length - 1];
baseOffset += span[arrayData.Length] - firstOffset;
}

builder.Append(baseOffset);

return builder.Build(_allocator);
}

Expand All @@ -342,7 +453,7 @@ private ArrowBuffer ConcatenateViewBuffer(out int variadicBufferCount)
continue;
}

ReadOnlySpan<BinaryView> span = arrayData.Buffers[1].Span.CastTo<BinaryView>().Slice(0, arrayData.Length);
ReadOnlySpan<BinaryView> span = arrayData.Buffers[1].Span.CastTo<BinaryView>().Slice(arrayData.Offset, arrayData.Length);
foreach (BinaryView view in span)
{
if (view.Length > BinaryView.MaxInlineLength)
Expand Down Expand Up @@ -412,6 +523,26 @@ private List<ArrayData> SelectChildren(int index)

return children;
}

private List<ArrayData> SelectSlicedChildren(int index)
{
var children = new List<ArrayData>(_arrayDataList.Count);

foreach (ArrayData arrayData in _arrayDataList)
{
var offset = arrayData.Offset;
var length = arrayData.Length;
var child = arrayData.Children[index];
if (offset != 0 || child.Length != length)
{
child = child.Slice(offset, length);
}

children.Add(child);
}

return children;
}
}
}
}
13 changes: 11 additions & 2 deletions csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,21 @@ internal static IArrowArrayBuilder<IArrowArray, IArrowArrayBuilder<IArrowArray>>
return new Decimal128Array.Builder(dataType as Decimal128Type);
case ArrowTypeId.Decimal256:
return new Decimal256Array.Builder(dataType as Decimal256Type);
case ArrowTypeId.Interval:
var intervalType = (IntervalType)dataType;
return intervalType.Unit switch
{
IntervalUnit.YearMonth => new YearMonthIntervalArray.Builder(),
IntervalUnit.DayTime => new DayTimeIntervalArray.Builder(),
IntervalUnit.MonthDayNanosecond => new MonthDayNanosecondIntervalArray.Builder(),
_ => throw new ArgumentOutOfRangeException($"unsupported interval unit <{intervalType.Unit}>")
};
case ArrowTypeId.Map:
return new MapArray.Builder(dataType as MapType);
case ArrowTypeId.Struct:
case ArrowTypeId.Union:
case ArrowTypeId.Dictionary:
case ArrowTypeId.FixedSizedBinary:
case ArrowTypeId.Interval:
case ArrowTypeId.Map:
default:
throw new NotSupportedException($"An ArrowArrayBuilder cannot be built for type {dataType.TypeId}.");
}
Expand Down

This file was deleted.

Loading

0 comments on commit 63c91ff

Please sign in to comment.