diff --git a/csharp/README.md b/csharp/README.md index b36eb899db2d5..663aaf8ab243c 100644 --- a/csharp/README.md +++ b/csharp/README.md @@ -129,7 +129,8 @@ for currently available features. - Types - Tensor - Arrays - - Large Arrays + - Large Arrays. There are large array types provided to help with interoperability with other libraries, + but these do not support buffers larger than 2 GiB and an exception will be raised if trying to import an array that is too large. - Large Binary - Large List - Large String diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs index 67c4b21a2e531..bd06c3a1b8b14 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs @@ -53,18 +53,24 @@ public static IArrowArray BuildArray(ArrayData data) return new StringArray(data); case ArrowTypeId.StringView: return new StringViewArray(data); + case ArrowTypeId.LargeString: + return new LargeStringArray(data); case ArrowTypeId.FixedSizedBinary: return new FixedSizeBinaryArray(data); case ArrowTypeId.Binary: return new BinaryArray(data); case ArrowTypeId.BinaryView: return new BinaryViewArray(data); + case ArrowTypeId.LargeBinary: + return new LargeBinaryArray(data); case ArrowTypeId.Timestamp: return new TimestampArray(data); case ArrowTypeId.List: return new ListArray(data); case ArrowTypeId.ListView: return new ListViewArray(data); + case ArrowTypeId.LargeList: + return new LargeListArray(data); case ArrowTypeId.Map: return new MapArray(data); case ArrowTypeId.Struct: diff --git a/csharp/src/Apache.Arrow/Arrays/LargeBinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/LargeBinaryArray.cs new file mode 100644 index 0000000000000..9eddbedab54ed --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/LargeBinaryArray.cs @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Types; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.CompilerServices; + +namespace Apache.Arrow; + +public class LargeBinaryArray : Array, IReadOnlyList, ICollection +{ + public LargeBinaryArray(ArrayData data) + : base(data) + { + data.EnsureDataType(ArrowTypeId.LargeBinary); + data.EnsureBufferCount(3); + } + + public LargeBinaryArray(ArrowTypeId typeId, ArrayData data) + : base(data) + { + data.EnsureDataType(typeId); + data.EnsureBufferCount(3); + } + + public LargeBinaryArray(IArrowType dataType, int length, + ArrowBuffer valueOffsetsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public ArrowBuffer ValueOffsetsBuffer => Data.Buffers[1]; + + public ArrowBuffer ValueBuffer => Data.Buffers[2]; + + public ReadOnlySpan ValueOffsets => ValueOffsetsBuffer.Span.CastTo().Slice(Offset, Length + 1); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw 
new ArgumentOutOfRangeException(nameof(index)); + } + if (!IsValid(index)) + { + return 0; + } + + ReadOnlySpan offsets = ValueOffsets; + return checked((int)(offsets[index + 1] - offsets[index])); + } + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. + /// + /// + /// Note that this method cannot reliably identify null values, which are indistinguishable from empty byte + /// collection values when seen in the context of this method's return type of . + /// Use the method or the overload instead + /// to reliably determine null values. + /// + /// Index at which to get bytes. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index) => GetBytes(index, out _); + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. + /// + /// Index at which to get bytes. + /// Set to if the value at the given index is null. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index, out bool isNull) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + isNull = IsNull(index); + + if (isNull) + { + // Note that `return null;` is valid syntax, but would be misleading as `null` in the context of a span + // is actually returned as an empty span. 
+ return ReadOnlySpan.Empty; + } + + var offset = checked((int)ValueOffsets[index]); + return ValueBuffer.Span.Slice(offset, GetValueLength(index)); + } + + int IReadOnlyCollection.Count => Length; + + byte[] IReadOnlyList.this[int index] => GetBytes(index).ToArray(); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetBytes(index).ToArray(); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(byte[] item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(byte[] item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(byte[] item) + { + for (int index = 0; index < Length; index++) + { + if (GetBytes(index).SequenceEqual(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(byte[][] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetBytes(srcIndex).ToArray(); + } + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/LargeListArray.cs b/csharp/src/Apache.Arrow/Arrays/LargeListArray.cs new file mode 100644 index 0000000000000..6e37aa4c63536 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/LargeListArray.cs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + public class LargeListArray : Array + { + public IArrowArray Values { get; } + + public ArrowBuffer ValueOffsetsBuffer => Data.Buffers[1]; + + public ReadOnlySpan ValueOffsets => ValueOffsetsBuffer.Span.CastTo().Slice(Offset, Length + 1); + + public LargeListArray(IArrowType dataType, int length, + ArrowBuffer valueOffsetsBuffer, IArrowArray values, + ArrowBuffer nullBitmapBuffer, int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer }, new[] { values.Data }), + values) + { + } + + public LargeListArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0])) + { + } + + private LargeListArray(ArrayData data, IArrowArray values) : base(data) + { + data.EnsureBufferCount(2); + data.EnsureDataType(ArrowTypeId.LargeList); + Values = values; + } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return 0; + } + + ReadOnlySpan offsets = ValueOffsets; + return checked((int)(offsets[index + 1] - offsets[index])); + } + + public IArrowArray GetSlicedValues(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return null; + } + + if (!(Values is Array array)) + { + return default; + } 
+ + return array.Slice(checked((int)ValueOffsets[index]), GetValueLength(index)); + } + + protected override void Dispose(bool disposing) + { + if (disposing) + { + Values?.Dispose(); + } + base.Dispose(disposing); + } + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/LargeStringArray.cs b/csharp/src/Apache.Arrow/Arrays/LargeStringArray.cs new file mode 100644 index 0000000000000..2a65b828acfa1 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/LargeStringArray.cs @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; +using Apache.Arrow.Types; + +namespace Apache.Arrow; + +public class LargeStringArray: LargeBinaryArray, IReadOnlyList, ICollection +{ + public static readonly Encoding DefaultEncoding = StringArray.DefaultEncoding; + + public LargeStringArray(ArrayData data) + : base(ArrowTypeId.LargeString, data) { } + + public LargeStringArray(int length, + ArrowBuffer valueOffsetsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(LargeStringType.Default, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + /// + /// Get the string value at the given index + /// + /// Input index + /// Optional: the string encoding, default is UTF8 + /// The string object at the given index + public string GetString(int index, Encoding encoding = default) + { + encoding ??= DefaultEncoding; + + ReadOnlySpan bytes = GetBytes(index, out bool isNull); + + if (isNull) + { + return null; + } + + if (bytes.Length == 0) + { + return string.Empty; + } + + unsafe + { + fixed (byte* data = &MemoryMarshal.GetReference(bytes)) + { + return encoding.GetString(data, bytes.Length); + } + } + } + + + int IReadOnlyCollection.Count => Length; + + string IReadOnlyList.this[int index] => GetString(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetString(index); + }; + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(string item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(string item) => throw new 
NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(string item) + { + for (int index = 0; index < Length; index++) + { + if (GetString(index) == item) + return true; + } + + return false; + } + + void ICollection.CopyTo(string[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetString(srcIndex); + } + } +} diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index abe02dcbb591f..68b67f3d7c620 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -162,6 +162,10 @@ private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) case ArrowTypeId.BinaryView: buffers = ImportByteArrayViewBuffers(cArray); break; + case ArrowTypeId.LargeString: + case ArrowTypeId.LargeBinary: + buffers = ImportLargeByteArrayBuffers(cArray); + break; case ArrowTypeId.List: children = ProcessListChildren(cArray, ((ListType)type).ValueDataType); buffers = ImportListBuffers(cArray); @@ -170,6 +174,10 @@ private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) children = ProcessListChildren(cArray, ((ListViewType)type).ValueDataType); buffers = ImportListViewBuffers(cArray); break; + case ArrowTypeId.LargeList: + children = ProcessListChildren(cArray, ((LargeListType)type).ValueDataType); + buffers = ImportLargeListBuffers(cArray); + break; case ArrowTypeId.FixedSizeList: children = ProcessListChildren(cArray, ((FixedSizeListType)type).ValueDataType); buffers = ImportFixedSizeListBuffers(cArray); @@ -313,6 +321,42 @@ private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportLargeByteArrayBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers != 3) + { + throw new 
InvalidOperationException("Large byte arrays are expected to have exactly three buffers"); + } + + const int maxLength = int.MaxValue / 8 - 1; + if (cArray->length > maxLength) + { + throw new OverflowException( + $"Cannot import large byte array. Array length {cArray->length} " + + $"is greater than the maximum supported large byte array length ({maxLength})"); + } + + int length = (int)cArray->length; + int offsetsLength = (length + 1) * 8; + long* offsets = (long*)cArray->buffers[1]; + Debug.Assert(offsets != null); + long valuesLength = offsets[length]; + + if (valuesLength > int.MaxValue) + { + throw new OverflowException( + $"Cannot import large byte array. Data length {valuesLength} " + + $"is greater than the maximum supported large byte array data length ({int.MaxValue})"); + } + + ArrowBuffer[] buffers = new ArrowBuffer[3]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); + buffers[2] = ImportCArrayBuffer(cArray, 2, (int)valuesLength); + + return buffers; + } + private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 2) @@ -348,6 +392,31 @@ private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportLargeListBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers != 2) + { + throw new InvalidOperationException("Large list arrays are expected to have exactly two buffers"); + } + + const int maxLength = int.MaxValue / 8 - 1; + if (cArray->length > maxLength) + { + throw new OverflowException( + $"Cannot import large list array. 
Array length {cArray->length} " + + $"is greater than the maximum supported large list array length ({maxLength})"); + } + + int length = (int)cArray->length; + int offsetsLength = (length + 1) * 8; + + ArrowBuffer[] buffers = new ArrowBuffer[2]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); + + return buffers; + } + private ArrowBuffer[] ImportFixedSizeListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 1) diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs index 3bb7134af3ba9..92d48a2d70880 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs @@ -168,8 +168,10 @@ private static string GetFormat(IArrowType datatype) // Binary case BinaryType _: return "z"; case BinaryViewType _: return "vz"; + case LargeBinaryType _: return "Z"; case StringType _: return "u"; case StringViewType _: return "vu"; + case LargeStringType _: return "U"; case FixedSizeBinaryType binaryType: return $"w:{binaryType.ByteWidth}"; // Date @@ -199,6 +201,7 @@ private static string GetFormat(IArrowType datatype) // Nested case ListType _: return "+l"; case ListViewType _: return "+vl"; + case LargeListType _: return "+L"; case FixedSizeListType fixedListType: return $"+w:{fixedListType.ListSize}"; case StructType _: return "+s"; @@ -208,7 +211,7 @@ private static string GetFormat(IArrowType datatype) case DictionaryType dictionaryType: return GetFormat(dictionaryType.IndexType); default: throw new NotImplementedException($"Exporting {datatype.Name} not implemented"); - }; + } } private static long GetFlags(IArrowType datatype, bool nullable = true) diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs index f1acc007bcef7..94177184dea00 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs +++ 
b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs @@ -165,7 +165,7 @@ public ArrowType GetAsType() } // Special handling for nested types - if (format == "+l" || format == "+vl") + if (format == "+l" || format == "+vl" || format == "+L") { if (_cSchema->n_children != 1) { @@ -180,7 +180,13 @@ public ArrowType GetAsType() Field childField = childSchema.GetAsField(); - return format[1] == 'v' ? new ListViewType(childField) : new ListType(childField); + return format[1] switch + { + 'l' => new ListType(childField), + 'v' => new ListViewType(childField), + 'L' => new LargeListType(childField), + _ => throw new InvalidDataException($"Invalid format for list: '{format}'"), + }; } else if (format == "+s") { @@ -304,10 +310,10 @@ public ArrowType GetAsType() // Binary data "z" => BinaryType.Default, "vz" => BinaryViewType.Default, - //"Z" => new LargeBinaryType() // Not yet implemented + "Z" => LargeBinaryType.Default, "u" => StringType.Default, "vu" => StringViewType.Default, - //"U" => new LargeStringType(), // Not yet implemented + "U" => LargeStringType.Default, // Date and time "tdD" => Date32Type.Default, "tdm" => Date64Type.Default, diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index a37c501072f4b..7e766677f8b28 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -291,6 +291,8 @@ private ArrayData LoadField( break; case ArrowTypeId.String: case ArrowTypeId.Binary: + case ArrowTypeId.LargeString: + case ArrowTypeId.LargeBinary: case ArrowTypeId.ListView: buffers = 3; break; diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs index 5583a58487bf5..12a2a17cf04e2 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs @@ -132,7 
+132,13 @@ protected ReadResult ReadMessage() Flatbuf.Message message = Flatbuf.Message.GetRootAsMessage(CreateByteBuffer(messageBuff)); - int bodyLength = checked((int)message.BodyLength); + if (message.BodyLength > int.MaxValue) + { + throw new OverflowException( + $"Arrow IPC message body length ({message.BodyLength}) is larger than " + + $"the maximum supported message size ({int.MaxValue})"); + } + int bodyLength = (int)message.BodyLength; IMemoryOwner bodyBuffOwner = _allocator.Allocate(bodyLength); Memory bodyBuff = bodyBuffOwner.Memory.Slice(0, bodyLength); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index c66569afeba85..eaa8471fa7bd3 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -57,11 +57,14 @@ private class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -199,6 +202,28 @@ public void Visit(ListViewArray array) VisitArray(values); } + public void Visit(LargeListArray array) + { + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateBuffer(GetZeroBasedLongValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length))); + + int valuesOffset = 0; + int valuesLength = 0; + if (array.Length > 0) + { + valuesOffset = checked((int)array.ValueOffsets[0]); + valuesLength = checked((int)array.ValueOffsets[array.Length] - valuesOffset); + } + + var values = array.Values; + if (valuesOffset > 0 || valuesLength < values.Length) + { + values = ArrowArrayFactory.Slice(values, valuesOffset, valuesLength); + } + + VisitArray(values); + } + public void Visit(FixedSizeListArray array) { 
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); @@ -214,6 +239,8 @@ public void Visit(FixedSizeListArray array) public void Visit(StringViewArray array) => Visit(array as BinaryViewArray); + public void Visit(LargeStringArray array) => Visit(array as LargeBinaryArray); + public void Visit(BinaryArray array) { _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); @@ -242,6 +269,22 @@ public void Visit(BinaryViewArray array) VariadicCounts.Add(array.DataBufferCount); } + public void Visit(LargeBinaryArray array) + { + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateBuffer(GetZeroBasedLongValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length))); + + int valuesOffset = 0; + int valuesLength = 0; + if (array.Length > 0) + { + valuesOffset = checked((int)array.ValueOffsets[0]); + valuesLength = checked((int)array.ValueOffsets[array.Length]) - valuesOffset; + } + + _buffers.Add(CreateSlicedBuffer(array.ValueBuffer, valuesOffset, valuesLength)); + } + public void Visit(FixedSizeBinaryArray array) { var itemSize = ((FixedSizeBinaryType)array.Data.DataType).ByteWidth; @@ -327,6 +370,39 @@ private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int } } + private ArrowBuffer GetZeroBasedLongValueOffsets(ArrowBuffer valueOffsetsBuffer, int arrayOffset, int arrayLength) + { + var requiredBytes = CalculatePaddedBufferLength(checked(sizeof(long) * (arrayLength + 1))); + + if (arrayOffset != 0) + { + // Array has been sliced, so we need to shift and adjust the offsets + var originalOffsets = valueOffsetsBuffer.Span.CastTo().Slice(arrayOffset, arrayLength + 1); + var firstOffset = arrayLength > 0 ? 
originalOffsets[0] : 0L; + + var newValueOffsetsBuffer = _allocator.Allocate(requiredBytes); + var newValueOffsets = newValueOffsetsBuffer.Memory.Span.CastTo(); + + for (int i = 0; i < arrayLength + 1; ++i) + { + newValueOffsets[i] = originalOffsets[i] - firstOffset; + } + + return new ArrowBuffer(newValueOffsetsBuffer); + } + else if (valueOffsetsBuffer.Length > requiredBytes) + { + // Array may have been sliced but the offset is zero, + // so we can truncate the existing offsets + return new ArrowBuffer(valueOffsetsBuffer.Memory.Slice(0, requiredBytes)); + } + else + { + // Use the full buffer + return valueOffsetsBuffer; + } + } + private (ArrowBuffer Buffer, int minOffset, int maxEnd) GetZeroBasedListViewOffsets(ListViewArray array) { if (array.Length == 0) diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs index 473e18968f8cb..adc229a051227 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs @@ -57,6 +57,7 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -65,9 +66,11 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -120,6 +123,14 @@ public void Visit(BinaryViewType type) Flatbuf.Type.BinaryView, offset); } + public void Visit(LargeBinaryType type) + { + Flatbuf.LargeBinary.StartLargeBinary(Builder); + Offset offset = Flatbuf.LargeBinary.EndLargeBinary(Builder); + Result = FieldType.Build( + Flatbuf.Type.LargeBinary, offset); + } + public void Visit(ListType type) { Flatbuf.List.StartList(Builder); @@ -136,6 +147,14 @@ public void Visit(ListViewType type) Flatbuf.ListView.EndListView(Builder)); } + public 
void Visit(LargeListType type) + { + Flatbuf.LargeList.StartLargeList(Builder); + Result = FieldType.Build( + Flatbuf.Type.LargeList, + Flatbuf.LargeList.EndLargeList(Builder)); + } + public void Visit(FixedSizeListType type) { Result = FieldType.Build( @@ -166,6 +185,14 @@ public void Visit(StringViewType type) Flatbuf.Type.Utf8View, offset); } + public void Visit(LargeStringType type) + { + Flatbuf.LargeUtf8.StartLargeUtf8(Builder); + Offset offset = Flatbuf.LargeUtf8.EndLargeUtf8(Builder); + Result = FieldType.Build( + Flatbuf.Type.LargeUtf8, offset); + } + public void Visit(TimestampType type) { StringOffset timezoneStringOffset = default; @@ -363,7 +390,7 @@ private static Flatbuf.IntervalUnit ToFlatBuffer(Types.IntervalUnit unit) Types.IntervalUnit.DayTime => Flatbuf.IntervalUnit.DAY_TIME, Types.IntervalUnit.MonthDayNanosecond => Flatbuf.IntervalUnit.MONTH_DAY_NANO, _ => throw new ArgumentException($"unsupported interval unit <{unit}>", nameof(unit)) - }; ; + }; } } } diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs index 0e6f330aef091..8e15632c517e1 100644 --- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs +++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs @@ -186,6 +186,8 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c return Types.StringType.Default; case Flatbuf.Type.Utf8View: return Types.StringViewType.Default; + case Flatbuf.Type.LargeUtf8: + return Types.LargeStringType.Default; case Flatbuf.Type.FixedSizeBinary: Flatbuf.FixedSizeBinary fixedSizeBinaryMetadata = field.Type().Value; return new Types.FixedSizeBinaryType(fixedSizeBinaryMetadata.ByteWidth); @@ -193,6 +195,8 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c return Types.BinaryType.Default; case Flatbuf.Type.BinaryView: return Types.BinaryViewType.Default; + case Flatbuf.Type.LargeBinary: + return Types.LargeBinaryType.Default; case Flatbuf.Type.List: 
if (childFields == null || childFields.Length != 1) { @@ -205,6 +209,12 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c throw new InvalidDataException($"List view type must have exactly one child."); } return new Types.ListViewType(childFields[0]); + case Flatbuf.Type.LargeList: + if (childFields == null || childFields.Length != 1) + { + throw new InvalidDataException($"Large list type must have exactly one child."); + } + return new Types.LargeListType(childFields[0]); case Flatbuf.Type.FixedSizeList: if (childFields == null || childFields.Length != 1) { diff --git a/csharp/src/Apache.Arrow/Types/IArrowType.cs b/csharp/src/Apache.Arrow/Types/IArrowType.cs index cf520391fe1e6..7a3159a1bbccd 100644 --- a/csharp/src/Apache.Arrow/Types/IArrowType.cs +++ b/csharp/src/Apache.Arrow/Types/IArrowType.cs @@ -53,6 +53,9 @@ public enum ArrowTypeId BinaryView, StringView, ListView, + LargeList, + LargeBinary, + LargeString, } public interface IArrowType diff --git a/csharp/src/Apache.Arrow/Types/LargeBinaryType.cs b/csharp/src/Apache.Arrow/Types/LargeBinaryType.cs new file mode 100644 index 0000000000000..e22c333824480 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/LargeBinaryType.cs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +namespace Apache.Arrow.Types; + +public class LargeBinaryType: ArrowType +{ + public static readonly LargeBinaryType Default = new LargeBinaryType(); + + public override ArrowTypeId TypeId => ArrowTypeId.LargeBinary; + + public override string Name => "large_binary"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); +} diff --git a/csharp/src/Apache.Arrow/Types/LargeListType.cs b/csharp/src/Apache.Arrow/Types/LargeListType.cs new file mode 100644 index 0000000000000..2fe8166972931 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/LargeListType.cs @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +namespace Apache.Arrow.Types +{ + public sealed class LargeListType : NestedType + { + public override ArrowTypeId TypeId => ArrowTypeId.LargeList; + + public override string Name => "large_list"; + + public Field ValueField => Fields[0]; + + public IArrowType ValueDataType => Fields[0].DataType; + + public LargeListType(Field valueField) + : base(valueField) { } + + public LargeListType(IArrowType valueDataType) + : this(new Field("item", valueDataType, true)) { } + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/src/Apache.Arrow/Types/LargeStringType.cs b/csharp/src/Apache.Arrow/Types/LargeStringType.cs new file mode 100644 index 0000000000000..8698ca4747a0e --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/LargeStringType.cs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +namespace Apache.Arrow.Types; + +public sealed class LargeStringType : ArrowType +{ + public static readonly LargeStringType Default = new LargeStringType(); + + public override ArrowTypeId TypeId => ArrowTypeId.LargeString; + + public override string Name => "large_utf8"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); +} diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index 7232f74b8bec6..c9e44b8d2f491 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -177,8 +177,10 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "decimal" => ToDecimalArrowType(type), "binary" => BinaryType.Default, "binaryview" => BinaryViewType.Default, + "largebinary" => LargeBinaryType.Default, "utf8" => StringType.Default, "utf8view" => StringViewType.Default, + "largeutf8" => LargeStringType.Default, "fixedsizebinary" => new FixedSizeBinaryType(type.ByteWidth), "date" => ToDateArrowType(type), "time" => ToTimeArrowType(type), @@ -188,6 +190,7 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "timestamp" => ToTimestampArrowType(type), "list" => ToListArrowType(type, children), "listview" => ToListViewArrowType(type, children), + "largelist" => ToLargeListArrowType(type, children), "fixedsizelist" => ToFixedSizeListArrowType(type, children), "struct" => ToStructArrowType(type, children), "union" => ToUnionArrowType(type, children), @@ -303,6 +306,11 @@ private static IArrowType ToListViewArrowType(JsonArrowType type, Field[] childr return new ListViewType(children[0]); } + private static IArrowType ToLargeListArrowType(JsonArrowType type, Field[] children) + { + return new LargeListType(children[0]); + } + private static IArrowType ToFixedSizeListArrowType(JsonArrowType type, Field[] children) { return new FixedSizeListType(children[0], 
type.ListSize); @@ -461,11 +469,14 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -696,6 +707,24 @@ public void Visit(StringViewType type) Array = new StringViewArray(arrayData); } + public void Visit(LargeStringType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + ArrowBuffer offsetBuffer = GetLargeOffsetBuffer(); + + var json = JsonFieldData.Data.GetRawText(); + string[] values = JsonSerializer.Deserialize(json, s_options); + + ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); + foreach (string value in values) + { + valueBuilder.Append(Encoding.UTF8.GetBytes(value)); + } + ArrowBuffer valueBuffer = valueBuilder.Build(default); + + Array = new LargeStringArray(JsonFieldData.Count, offsetBuffer, valueBuffer, validityBuffer, nullCount); + } + public void Visit(BinaryType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -747,6 +776,25 @@ public void Visit(BinaryViewType type) Array = new BinaryViewArray(arrayData); } + public void Visit(LargeBinaryType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + ArrowBuffer offsetBuffer = GetLargeOffsetBuffer(); + + var json = JsonFieldData.Data.GetRawText(); + string[] values = JsonSerializer.Deserialize(json, s_options); + + ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); + foreach (string value in values) + { + valueBuilder.Append(ConvertHexStringToByteArray(value)); + } + ArrowBuffer valueBuffer = valueBuilder.Build(default); + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, new[] { validityBuffer, offsetBuffer, valueBuffer }); + Array = new LargeBinaryArray(arrayData); + } + public void Visit(FixedSizeBinaryType type) { 
ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -796,6 +844,21 @@ public void Visit(ListViewType type) Array = new ListViewArray(arrayData); } + public void Visit(LargeListType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + ArrowBuffer offsetBuffer = GetLargeOffsetBuffer(); + + var data = JsonFieldData; + JsonFieldData = data.Children[0]; + type.ValueDataType.Accept(this); + JsonFieldData = data; + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, + new[] { validityBuffer, offsetBuffer }, new[] { Array.Data }); + Array = new LargeListArray(arrayData); + } + public void Visit(FixedSizeListType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -975,6 +1038,13 @@ private ArrowBuffer GetOffsetBuffer() return valueOffsets.Build(default); } + private ArrowBuffer GetLargeOffsetBuffer() + { + ArrowBuffer.Builder valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Count); + valueOffsets.AppendRange(JsonFieldData.LongOffset); + return valueOffsets.Build(default); + } + private ArrowBuffer GetSizeBuffer() { ArrowBuffer.Builder valueSizes = new ArrowBuffer.Builder(JsonFieldData.Size.Count); @@ -1039,6 +1109,12 @@ public IEnumerable IntOffset get { return Offset.Select(GetInt); } } + [JsonIgnore] + public IEnumerable LongOffset + { + get { return Offset.Select(GetLong); } + } + [JsonIgnore] public IEnumerable IntSize { @@ -1056,6 +1132,18 @@ static int GetInt(JsonNode node) return int.Parse(node.GetValue()); } } + + static long GetLong(JsonNode node) + { + try + { + return node.GetValue(); + } + catch + { + return long.Parse(node.GetValue()); + } + } } public class JsonView diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 5c33d1fd43986..85f7b75f931ef 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ 
-95,12 +95,15 @@ private class ArrayComparer : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -144,14 +147,17 @@ public ArrayComparer(IArrowArray expectedArray, bool strictCompare) public void Visit(MonthDayNanosecondIntervalArray array) => CompareArrays(array); public void Visit(ListArray array) => CompareArrays(array); public void Visit(ListViewArray array) => CompareArrays(array); + public void Visit(LargeListArray array) => CompareArrays(array); public void Visit(FixedSizeListArray array) => CompareArrays(array); public void Visit(FixedSizeBinaryArray array) => CompareArrays(array); public void Visit(Decimal128Array array) => CompareArrays(array); public void Visit(Decimal256Array array) => CompareArrays(array); public void Visit(StringArray array) => CompareBinaryArrays(array); public void Visit(StringViewArray array) => CompareVariadicArrays(array); + public void Visit(LargeStringArray array) => CompareLargeBinaryArrays(array); public void Visit(BinaryArray array) => CompareBinaryArrays(array); public void Visit(BinaryViewArray array) => CompareVariadicArrays(array); + public void Visit(LargeBinaryArray array) => CompareLargeBinaryArrays(array); public void Visit(StructArray array) { @@ -276,6 +282,40 @@ private void CompareBinaryArrays(BinaryArray actualArray) } } + private void CompareLargeBinaryArrays(LargeBinaryArray actualArray) + where T : IArrowArray + { + Assert.IsAssignableFrom(_expectedArray); + Assert.IsAssignableFrom(actualArray); + + var expectedArray = (LargeBinaryArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + + 
CompareValidityBuffer( + expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, + expectedArray.Offset, actualArray.NullBitmapBuffer, actualArray.Offset); + + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); + Assert.True(expectedArray.ValueBuffer.Span.Slice(0, checked((int)expectedArray.ValueOffsets[expectedArray.Length])).SequenceEqual(actualArray.ValueBuffer.Span.Slice(0, checked((int)actualArray.ValueOffsets[actualArray.Length])))); // slice by final byte offset, not element count, so every referenced value byte is compared + } + else + { + for (int i = 0; i < expectedArray.Length; i++) + { + Assert.True( + expectedArray.GetBytes(i).SequenceEqual(actualArray.GetBytes(i)), + $"LargeBinaryArray values do not match at index {i}."); + } + } + } + private void CompareVariadicArrays(BinaryViewArray actualArray) where T : IArrowArray { @@ -469,6 +509,44 @@ private void CompareArrays(ListViewArray actualArray) } } + private void CompareArrays(LargeListArray actualArray) + { + Assert.IsAssignableFrom(_expectedArray); + LargeListArray expectedArray = (LargeListArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + + CompareValidityBuffer( + expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, + expectedArray.Offset, actualArray.NullBitmapBuffer, actualArray.Offset); + + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); + actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); + } + else + { + for (int i = 0; i < actualArray.Length; ++i) + { + if (expectedArray.IsNull(i)) + { + Assert.True(actualArray.IsNull(i)); + } + else + { + var expectedList = expectedArray.GetSlicedValues(i); + var actualList = actualArray.GetSlicedValues(i); +
actualList.Accept(new ArrayComparer(expectedList, _strictCompare)); + } + } + } + } + private void CompareArrays(FixedSizeListArray actualArray) { Assert.IsAssignableFrom(_expectedArray); diff --git a/csharp/test/Apache.Arrow.Tests/LargeBinaryArrayTests.cs b/csharp/test/Apache.Arrow.Tests/LargeBinaryArrayTests.cs new file mode 100644 index 0000000000000..4ee1f1d0e0ffa --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/LargeBinaryArrayTests.cs @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Tests; + +public class LargeBinaryArrayTests +{ + [Fact] + public void GetBytesReturnsCorrectValue() + { + var byteArrays = new byte[][] + { + new byte[] {0, 1, 2, 255}, + new byte[] {3, 4, 5}, + new byte[] {}, + null, + new byte[] {254, 253, 252}, + }; + var array = BuildArray(byteArrays); + + Assert.Equal(array.Length, byteArrays.Length); + for (var i = 0; i < byteArrays.Length; ++i) + { + var byteSpan = array.GetBytes(i, out var isNull); + var byteArray = isNull ? 
null : byteSpan.ToArray(); + Assert.Equal(byteArrays[i], byteArray); + } + } + + [Fact] + public void GetBytesChecksForOffsetOverflow() + { + var valueBuffer = new ArrowBuffer.Builder(); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + offsetBuffer.Append(0); + offsetBuffer.Append((long)int.MaxValue + 1); + validityBuffer.Append(true); + + var array = new LargeBinaryArray( + LargeBinaryType.Default, length: 1, + offsetBuffer.Build(), valueBuffer.Build(), validityBuffer.Build(), + validityBuffer.UnsetBitCount); + + Assert.Throws(() => array.GetBytes(0)); + } + + private static LargeBinaryArray BuildArray(IReadOnlyCollection byteArrays) + { + var valueBuffer = new ArrowBuffer.Builder(); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + long offset = 0; + offsetBuffer.Append(offset); + foreach (var bytes in byteArrays) + { + if (bytes == null) + { + validityBuffer.Append(false); + offsetBuffer.Append(offset); + } + else + { + valueBuffer.Append(bytes); + offset += bytes.Length; + offsetBuffer.Append(offset); + validityBuffer.Append(true); + } + } + + return new LargeBinaryArray( + LargeBinaryType.Default, byteArrays.Count, + offsetBuffer.Build(), valueBuffer.Build(), validityBuffer.Build(), + validityBuffer.UnsetBitCount); + } +} diff --git a/csharp/test/Apache.Arrow.Tests/LargeListArrayTests.cs b/csharp/test/Apache.Arrow.Tests/LargeListArrayTests.cs new file mode 100644 index 0000000000000..1d35a8ffd62c5 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/LargeListArrayTests.cs @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Linq; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Tests; + +public class LargeListArrayTests +{ + [Fact] + public void GetSlicedValuesReturnsCorrectValues() + { + var values = new int?[][] + { + new int?[] {0, 1, 2}, + System.Array.Empty(), + null, + new int?[] {3, 4, null, 6}, + }; + + var array = BuildArray(values); + + Assert.Equal(values.Length, array.Length); + for (int i = 0; i < values.Length; ++i) + { + Assert.Equal(values[i] == null, array.IsNull(i)); + var arrayItem = (Int32Array) array.GetSlicedValues(i); + if (values[i] == null) + { + Assert.Null(arrayItem); + } + else + { + Assert.Equal(values[i], arrayItem.ToArray()); + } + } + } + + [Fact] + public void GetSlicedValuesChecksForOffsetOverflow() + { + var valuesArray = new Int32Array.Builder().Build(); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + offsetBuffer.Append(0); + offsetBuffer.Append((long)int.MaxValue + 1); + validityBuffer.Append(true); + + var array = new LargeListArray( + new LargeListType(new Int32Type()), length: 1, + offsetBuffer.Build(), valuesArray, validityBuffer.Build(), + validityBuffer.UnsetBitCount); + + Assert.Throws(() => array.GetSlicedValues(0)); + } + + private static LargeListArray BuildArray(int?[][] values) + { + var valuesBuilder = new Int32Array.Builder(); + var offsetBuffer = new 
ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + long offset = 0; + offsetBuffer.Append(offset); + foreach (var listValue in values) + { + if (listValue == null) + { + validityBuffer.Append(false); + offsetBuffer.Append(offset); + } + else + { + foreach (var value in listValue) + { + valuesBuilder.Append(value); + } + offset += listValue.Length; + offsetBuffer.Append(offset); + validityBuffer.Append(true); + } + } + + return new LargeListArray( + new LargeListType(new Int32Type()), values.Length, + offsetBuffer.Build(), valuesBuilder.Build(), validityBuffer.Build(), + validityBuffer.UnsetBitCount); + } +} diff --git a/csharp/test/Apache.Arrow.Tests/LargeStringArrayTests.cs b/csharp/test/Apache.Arrow.Tests/LargeStringArrayTests.cs new file mode 100644 index 0000000000000..aba97ba338c75 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/LargeStringArrayTests.cs @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System; +using System.Collections.Generic; +using Xunit; + +namespace Apache.Arrow.Tests; + +public class LargeStringArrayTests +{ + [Fact] + public void GetStringReturnsCorrectValue() + { + var strings = new string[] + { + "abc", + "defg", + "", + null, + "123", + }; + var array = BuildArray(strings); + + Assert.Equal(strings.Length, array.Length); + for (var i = 0; i < strings.Length; ++i) + { + Assert.Equal(strings[i], array.GetString(i)); + } + } + + [Fact] + public void GetStringChecksForOffsetOverflow() + { + var valueBuffer = new ArrowBuffer.Builder(); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + offsetBuffer.Append(0); + offsetBuffer.Append((long)int.MaxValue + 1); + validityBuffer.Append(true); + + var array = new LargeStringArray( + length: 1, offsetBuffer.Build(), valueBuffer.Build(), validityBuffer.Build(), + validityBuffer.UnsetBitCount); + + Assert.Throws(() => array.GetString(0)); + } + + private static LargeStringArray BuildArray(IReadOnlyCollection strings) + { + var valueBuffer = new ArrowBuffer.Builder(); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + long offset = 0; + offsetBuffer.Append(offset); + foreach (var value in strings) + { + if (value == null) + { + validityBuffer.Append(false); + offsetBuffer.Append(offset); + } + else + { + var bytes = LargeStringArray.DefaultEncoding.GetBytes(value); + valueBuffer.Append(bytes); + offset += bytes.Length; // offsets are positions in the encoded byte buffer, not UTF-16 char counts + offsetBuffer.Append(offset); + validityBuffer.Append(true); + } + } + + return new LargeStringArray( + strings.Count, offsetBuffer.Build(), valueBuffer.Build(), validityBuffer.Build(), + validityBuffer.UnsetBitCount); + } +} diff --git a/csharp/test/Apache.Arrow.Tests/TableTests.cs b/csharp/test/Apache.Arrow.Tests/TableTests.cs index 83c88265d172b..35fbe7cba68f1 100644 --- a/csharp/test/Apache.Arrow.Tests/TableTests.cs +++
b/csharp/test/Apache.Arrow.Tests/TableTests.cs @@ -63,9 +63,9 @@ public void TestTableFromRecordBatches() Table table1 = Table.TableFromRecordBatches(recordBatch1.Schema, recordBatches); Assert.Equal(20, table1.RowCount); #if NET5_0_OR_GREATER - Assert.Equal(35, table1.ColumnCount); + Assert.Equal(38, table1.ColumnCount); #else - Assert.Equal(34, table1.ColumnCount); + Assert.Equal(37, table1.ColumnCount); #endif Assert.Equal("ChunkedArray: Length=20, DataType=list", table1.Column(0).Data.ToString()); diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs b/csharp/test/Apache.Arrow.Tests/TestData.cs index 3ea42ee0fbcb7..36969766aeae0 100644 --- a/csharp/test/Apache.Arrow.Tests/TestData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestData.cs @@ -49,6 +49,7 @@ void AddField(Field field) { AddField(CreateField(new ListType(Int64Type.Default), i)); AddField(CreateField(new ListViewType(Int64Type.Default), i)); + AddField(CreateField(new LargeListType(Int64Type.Default), i)); AddField(CreateField(BooleanType.Default, i)); AddField(CreateField(UInt8Type.Default, i)); AddField(CreateField(Int8Type.Default, i)); @@ -84,6 +85,8 @@ void AddField(Field field) AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); AddField(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); + AddField(CreateField(new LargeBinaryType(), i)); + AddField(CreateField(new LargeStringType(), i)); } Schema schema = builder.Build(); @@ -144,8 +147,10 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -154,6 +159,7 @@ private class ArrayCreator : 
IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -335,6 +341,45 @@ public void Visit(StringViewType type) Array = builder.Build(); } + public void Visit(LargeStringType type) + { + var str = "hello"; + var valueBuffer = new ArrowBuffer.Builder(); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + long offset = 0; + offsetBuffer.Append(offset); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + offsetBuffer.Append(offset); + validityBuffer.Append(false); + break; + case 1: + valueBuffer.Append(LargeStringArray.DefaultEncoding.GetBytes(str)); + offset += str.Length; + offsetBuffer.Append(offset); + validityBuffer.Append(true); + break; + case 2: + valueBuffer.Append(LargeStringArray.DefaultEncoding.GetBytes(str + str)); + offset += str.Length * 2; + offsetBuffer.Append(offset); + validityBuffer.Append(true); + break; + } + } + + var validity = validityBuffer.UnsetBitCount > 0 ? 
validityBuffer.Build() : ArrowBuffer.Empty; + Array = new LargeStringArray( + Length, offsetBuffer.Build(), valueBuffer.Build(), validity, + validityBuffer.UnsetBitCount); + } + public void Visit(ListType type) { var builder = new ListArray.Builder(type.ValueField).Reserve(Length); @@ -379,6 +424,37 @@ public void Visit(ListViewType type) Array = builder.Build(); } + public void Visit(LargeListType type) + { + var valueBuilder = new Int64Array.Builder().Reserve(Length * 3 / 2); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + offsetBuffer.Append(0); + + for (var i = 0; i < Length; i++) + { + if (i % 10 == 2) + { + offsetBuffer.Append(valueBuilder.Length); + validityBuffer.Append(false); + } + else + { + var listLength = i % 4; + valueBuilder.AppendRange(Enumerable.Range(i, listLength).Select(x => (long)x)); + offsetBuffer.Append(valueBuilder.Length); + validityBuffer.Append(true); + } + } + + var validity = validityBuffer.UnsetBitCount > 0 ? 
validityBuffer.Build() : ArrowBuffer.Empty; + Array = new LargeListArray( + new LargeListType(new Int64Type()), Length, + offsetBuffer.Build(), valueBuilder.Build(), validity, + validityBuffer.UnsetBitCount); + } + public void Visit(FixedSizeListType type) { var builder = new FixedSizeListArray.Builder(type.ValueField, type.ListSize).Reserve(Length); @@ -554,6 +630,48 @@ public void Visit(BinaryViewType type) Array = builder.Build(); } + public void Visit(LargeBinaryType type) + { + ReadOnlySpan shortData = new[] { (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9 }; + ReadOnlySpan longData = new[] + { + (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, + (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16, (byte)17, (byte)18, (byte)19 + }; + var valueBuffer = new ArrowBuffer.Builder(); + var offsetBuffer = new ArrowBuffer.Builder(); + var validityBuffer = new ArrowBuffer.BitmapBuilder(); + + offsetBuffer.Append(0L); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + offsetBuffer.Append(valueBuffer.Length); + validityBuffer.Append(false); + break; + case 1: + valueBuffer.Append(shortData); + offsetBuffer.Append(valueBuffer.Length); + validityBuffer.Append(true); + break; + case 2: + valueBuffer.Append(longData); + offsetBuffer.Append(valueBuffer.Length); + validityBuffer.Append(true); + break; + } + } + + var validity = validityBuffer.UnsetBitCount > 0 ? 
validityBuffer.Build() : ArrowBuffer.Empty; + Array = new LargeBinaryArray( + LargeBinaryType.Default, Length, + offsetBuffer.Build(), valueBuffer.Build(), validity, + validityBuffer.UnsetBitCount); + } + public void Visit(FixedSizeBinaryType type) { ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index b51f3d876f820..47310c905a9ff 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1872,8 +1872,7 @@ def _temp_path(): generate_primitive_case([17, 20], name='primitive'), generate_primitive_case([0, 0, 0], name='primitive_zerolength'), - generate_primitive_large_offsets_case([17, 20]) - .skip_tester('C#'), + generate_primitive_large_offsets_case([17, 20]), generate_null_case([10, 0]), @@ -1906,7 +1905,6 @@ def _temp_path(): generate_recursive_nested_case(), generate_nested_large_offsets_case() - .skip_tester('C#') .skip_tester('JS'), generate_unions_case(), diff --git a/docs/source/status.rst b/docs/source/status.rst index 266381175608a..c232aa280befb 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -62,11 +62,11 @@ Data Types +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Binary | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Large Binary | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | ✓ | +| Large Binary | ✓ | ✓ | ✓ | ✓ | \(4) | ✓ | ✓ | | ✓ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | ✓ | +| Large Utf8 | ✓ | ✓ | ✓ | ✓ | \(4) | ✓ | ✓ | | ✓ | 
+-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Binary View | ✓ | | ✓ | | ✓ | | | | | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ @@ -85,7 +85,7 @@ Data Types +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | List | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | ✓ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Large List | ✓ | ✓ | ✓ | | | ✓ | ✓ | | ✓ | +| Large List | ✓ | ✓ | ✓ | | \(4) | ✓ | ✓ | | ✓ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | List View | ✓ | | ✓ | | ✓ | | | | | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ @@ -125,6 +125,8 @@ Notes: * \(1) Casting to/from Float16 in Java is not supported. * \(2) Float16 support in C# is only available when targeting .NET 6+. * \(3) Nested dictionaries not supported +* \(4) C# large array types are provided to help with interoperability with other libraries, + but these do not support buffers larger than 2 GiB and an exception will be raised if trying to import an array that is too large. .. seealso:: The :ref:`format_columnar` and the