diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index af77fe1b1a83d..a3ec596adc7ba 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -13,12 +13,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -using Apache.Arrow.Types; using System; using System.Collections; using System.Collections.Generic; using System.Runtime.InteropServices; using System.Text; +using Apache.Arrow.Types; namespace Apache.Arrow { @@ -26,6 +26,8 @@ public class StringArray: BinaryArray, IReadOnlyList { public static readonly Encoding DefaultEncoding = Encoding.UTF8; + private Dictionary materializedStringStore; + public new class Builder : BuilderBase { public Builder() : base(StringType.Default) { } @@ -71,16 +73,28 @@ public StringArray(int length, public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + /// + /// Get the string value at the given index + /// + /// Input index + /// Optional: the string encoding, default is UTF8 + /// The string object at the given index public string GetString(int index, Encoding encoding = default) { encoding ??= DefaultEncoding; + if (materializedStringStore != null && materializedStringStore.TryGetValue(encoding, out string[] materializedStrings)) + { + return materializedStrings[index]; + } + ReadOnlySpan bytes = GetBytes(index, out bool isNull); if (isNull) { return null; } + if (bytes.Length == 0) { return string.Empty; @@ -93,6 +107,50 @@ public string GetString(int index, Encoding encoding = default) } } + /// + /// Materialize the array for the given encoding to accelerate the string access + /// + /// Optional: the string encoding, default is UTF8 + /// This method is not thread safe when it is called in parallel with GetString or Materialize. 
+ public void Materialize(Encoding encoding = default) + { + encoding ??= DefaultEncoding; + + if (IsMaterialized(encoding)) + { + return; + } + + if (materializedStringStore == null) + { + materializedStringStore = new Dictionary(); + } + + var stringStore = new string[Length]; + for (int i = 0; i < Length; i++) + { + stringStore[i] = GetString(i, encoding); + } + + materializedStringStore[encoding] = stringStore; + } + + /// + /// Check if the array has been materialized for the given encoding + /// + /// Optional: the string encoding, default is UTF8 + /// True or false, depending on whether the array has been materialized + public bool IsMaterialized(Encoding encoding = default) + { + if (materializedStringStore == null) + { + return false; + } + + encoding ??= DefaultEncoding; + return materializedStringStore.ContainsKey(encoding); + } + int IReadOnlyCollection.Count => Length; string IReadOnlyList.this[int index] => GetString(index); diff --git a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs index 0fd3d3d105a70..b19731535a29d 100644 --- a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs @@ -49,6 +49,37 @@ public void ReturnsAppendedValue(string firstValue, string secondValue) // Assert Assert.Equal(firstValue, retrievedValue); } + + [Theory] + [InlineData(null, null)] + [InlineData(null, "")] + [InlineData(null, "value")] + [InlineData("", null)] + [InlineData("", "")] + [InlineData("", "value")] + [InlineData("value", null)] + [InlineData("value", "")] + [InlineData("value", "value")] + public void ReturnsAppendedValueMaterialize(string firstValue, string secondValue) + { + // Arrange + // Create an array with two elements. The second element being null, + // empty, or non-empty may influence the underlying BinaryArray + // storage such that retrieving an empty first element could result + // in an empty span or a 0-length span backed by storage. 
+ var array = new StringArray.Builder() + .Append(firstValue) + .Append(secondValue) + .Build(); + + // Act + array.Materialize(); + var retrievedValue = array.GetString(0); + + // Assert + Assert.True(array.IsMaterialized()); + Assert.Equal(firstValue, retrievedValue); + } } } }