Skip to content

Commit

Permalink
apacheGH-41047: [C#] Address performance issue of reading from String…
Browse files Browse the repository at this point in the history
…Array (apache#41048)

### Rationale for this change

The motivation here is to address apache#41047. There is a severe performance drawback when reading a StringArray as the value array of a DictionaryArray, because the same values are repeatedly and unnecessarily decoded from UTF-8.

### What changes are included in this PR?

- Added a new method Materialize() that decodes all values once into a cached string array (one per encoding). Once the array is materialized, GetString() returns strings from that cache directly instead of decoding the bytes again.
- Added test coverage.

### Are these changes tested?

Yes

### Are there any user-facing changes?

No. This change maintains backwards compatibility on the API surface. It is up to the client application to decide whether to materialize the array and gain performance. 

* GitHub Issue: apache#41047

Authored-by: Keshuang Shen <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
  • Loading branch information
keshen-msft authored and tolleybot committed May 2, 2024
1 parent 7806e6c commit c7cf3c0
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 1 deletion.
60 changes: 59 additions & 1 deletion csharp/src/Apache.Arrow/Arrays/StringArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.

using Apache.Arrow.Types;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
using Apache.Arrow.Types;

namespace Apache.Arrow
{
public class StringArray: BinaryArray, IReadOnlyList<string>
{
public static readonly Encoding DefaultEncoding = Encoding.UTF8;

// Cache of already-decoded strings, keyed by the encoding used to decode them.
// Left null until Materialize creates it; GetString consults it before decoding bytes.
private Dictionary<Encoding, string[]> materializedStringStore;

public new class Builder : BuilderBase<StringArray, Builder>
{
public Builder() : base(StringType.Default) { }
Expand Down Expand Up @@ -71,16 +73,28 @@ public StringArray(int length,

public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor);

/// <summary>
/// Get the string value at the given index
/// </summary>
/// <param name="index">Input index</param>
/// <param name="encoding">Optional: the string encoding, default is UTF8</param>
/// <returns>The string object at the given index</returns>
public string GetString(int index, Encoding encoding = default)
{
encoding ??= DefaultEncoding;

if (materializedStringStore != null && materializedStringStore.TryGetValue(encoding, out string[] materializedStrings))
{
return materializedStrings[index];
}

ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);

if (isNull)
{
return null;
}

if (bytes.Length == 0)
{
return string.Empty;
Expand All @@ -93,6 +107,50 @@ public string GetString(int index, Encoding encoding = default)
}
}

/// <summary>
/// Decode every element of the array once with the given encoding and cache the
/// resulting strings, so that later <see cref="GetString(int, Encoding)"/> calls
/// for the same encoding are served from the cache.
/// </summary>
/// <param name="encoding">Optional: the string encoding; UTF8 is used when omitted</param>
/// <remarks>
/// Calling this method again for an encoding that is already materialized is a no-op.
/// This method is not thread safe when it is called in parallel with
/// <see cref="GetString(int, Encoding)"/> or <see cref="Materialize(Encoding)"/>.
/// </remarks>
public void Materialize(Encoding encoding = default)
{
    encoding ??= DefaultEncoding;

    // Nothing to do when this encoding has already been cached.
    if (IsMaterialized(encoding))
    {
        return;
    }

    // Create the per-encoding cache lazily, on first use.
    materializedStringStore ??= new Dictionary<Encoding, string[]>();

    string[] decoded = new string[Length];
    for (int position = 0; position < decoded.Length; position++)
    {
        decoded[position] = GetString(position, encoding);
    }

    // Publish the cache entry only after every element has been decoded.
    materializedStringStore[encoding] = decoded;
}

/// <summary>
/// Report whether decoded strings are already cached for the given encoding
/// </summary>
/// <param name="encoding">Optional: the string encoding; UTF8 is used when omitted</param>
/// <returns>True if the array has been materialized for the encoding; otherwise false</returns>
public bool IsMaterialized(Encoding encoding = default)
{
    // Without a store there is nothing cached for any encoding.
    return materializedStringStore != null
        && materializedStringStore.ContainsKey(encoding ?? DefaultEncoding);
}

// Explicit IReadOnlyCollection<string> implementation: the element count is the array's Length.
int IReadOnlyCollection<string>.Count => Length;

// Explicit IReadOnlyList<string> indexer: forwards to GetString with the default encoding.
string IReadOnlyList<string>.this[int index] => GetString(index);
Expand Down
31 changes: 31 additions & 0 deletions csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,37 @@ public void ReturnsAppendedValue(string firstValue, string secondValue)
// Assert
Assert.Equal(firstValue, retrievedValue);
}

[Theory]
[InlineData(null, null)]
[InlineData(null, "")]
[InlineData(null, "value")]
[InlineData("", null)]
[InlineData("", "")]
[InlineData("", "value")]
[InlineData("value", null)]
[InlineData("value", "")]
[InlineData("value", "value")]
public void ReturnsAppendedValueMaterialize(string firstValue, string secondValue)
{
    // Arrange
    // Build a two-element array. Whether the second element is null, empty,
    // or non-empty may change how the underlying BinaryArray lays out its
    // storage, so an empty first element could come back either as an empty
    // span or as a 0-length span backed by storage.
    var builder = new StringArray.Builder()
        .Append(firstValue)
        .Append(secondValue);
    var stringArray = builder.Build();

    // Act
    stringArray.Materialize();
    string actual = stringArray.GetString(0);

    // Assert
    Assert.True(stringArray.IsMaterialized());
    Assert.Equal(firstValue, actual);
}
}
}
}

0 comments on commit c7cf3c0

Please sign in to comment.