Skip to content

Commit

Permalink
Improve Ascii (and Utf8) encoding (#85266)
Browse files Browse the repository at this point in the history
* Improve writing of lower vector part in ASCII conversion

* from 10/17 instructions down to 1 instruction on 64/32-bit x86

* Add [MethodImpl(MethodImplOptions.AggressiveInlining)] to NarrowUtf16ToAscii_Intrinsified

* rewrite StoreLower without Sse2.StoreScalar

* move helper to Vector128 and call in case conversion

* remove unused helpers
  • Loading branch information
Daniel-Svensson authored May 12, 2023
1 parent 0c423af commit f1819bd
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2718,6 +2718,24 @@ public static unsafe void StoreAligned<T>(this Vector128<T> source, T* destinati
public static unsafe void StoreAlignedNonTemporal<T>(this Vector128<T> source, T* destination)
where T : unmanaged => source.StoreAligned(destination);

/// <summary>
/// Stores the lower 64 bits of <paramref name="source"/> to memory at <paramref name="destination"/>[<paramref name="elementOffset"/>].
/// </summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="source">The vector whose lower 64 bits will be stored.</param>
/// <param name="destination">The reference to which <paramref name="elementOffset" /> is added before the store takes place.</param>
/// <param name="elementOffset">The offset, counted in elements of <typeparamref name="T"/>, from <paramref name="destination" /> at which the store begins.</param>
/// <remarks>
/// The value is written as a double rather than a long so that the store is a single instruction
/// instead of going through a temporary in a general purpose register (or on the stack).
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void StoreLowerUnsafe<T>(this Vector128<T> source, ref T destination, nuint elementOffset = 0)
    where T : struct
{
    // Reinterpret the vector as two doubles; element 0 carries the lower 64 bits.
    double lowerBits = source.AsDouble().ToScalar();

    // Advance by whole elements of T first, then view the target as raw bytes for the unaligned write.
    ref T target = ref Unsafe.Add(ref destination, elementOffset);
    Unsafe.WriteUnaligned(ref Unsafe.As<T, byte>(ref target), lowerBits);
}

/// <summary>Stores a vector at the given destination.</summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="source">The vector that will be stored.</param>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -463,41 +463,6 @@ private static unsafe nuint ChangeCase<TFrom, TTo, TCasing>(TFrom* pSrc, TTo* pD
return i;
}

/// <summary>
/// Zero-extends the 16 bytes of <paramref name="narrowVector"/> to 16-bit elements and writes
/// the 16 results to <paramref name="pDest"/> starting at element <paramref name="destOffset"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void Widen8To16AndAndWriteTo(Vector128<byte> narrowVector, char* pDest, nuint destOffset)
{
    // The destination is addressed as ushort elements in both code paths.
    ref ushort destination = ref *(ushort*)pDest;

    if (!Vector256.IsHardwareAccelerated)
    {
        // No 256-bit support: widen each half separately and emit two 128-bit stores.
        Vector128<ushort> lowerHalf = Vector128.WidenLower(narrowVector);
        Vector128<ushort> upperHalf = Vector128.WidenUpper(narrowVector);
        lowerHalf.StoreUnsafe(ref destination, destOffset);
        upperHalf.StoreUnsafe(ref destination, destOffset + 8);
        return;
    }

    // Only the lower 128 bits of the 256-bit vector are read by WidenLower,
    // so the undefined upper bits from ToVector256Unsafe never reach the output.
    Vector256<byte> extended = narrowVector.ToVector256Unsafe();
    Vector256<ushort> widened = Vector256.WidenLower(extended);
    widened.StoreUnsafe(ref destination, destOffset);
}

/// <summary>
/// Narrows the eight 16-bit elements of <paramref name="wideVector"/> to bytes and writes those
/// eight bytes to <paramref name="pDest"/> starting at byte offset <paramref name="destOffset"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void Narrow16To8AndAndWriteTo(Vector128<ushort> wideVector, byte* pDest, nuint destOffset)
{
    // The same input is passed twice, so both halves of the result hold the same eight bytes;
    // only the lower half is written out below.
    Vector128<byte> packed = Vector128.Narrow(wideVector, wideVector);

    if (Sse2.IsSupported)
    {
        // MOVQ is supported even on x86, unaligned accesses allowed
        ulong* target = (ulong*)(pDest + destOffset);
        Sse2.StoreScalar(target, packed.AsUInt64());
        return;
    }

    if (Vector64.IsHardwareAccelerated)
    {
        Vector64<byte> lowerHalf = packed.GetLower();
        lowerHalf.StoreUnsafe(ref *pDest, destOffset);
        return;
    }

    // Fallback path: extract the lower 64 bits as a scalar and write them without assuming alignment.
    ulong lowerBits = packed.AsUInt64().ToScalar();
    Unsafe.WriteUnaligned<ulong>(pDest + destOffset, lowerBits);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void ChangeWidthAndWriteTo<TFrom, TTo>(Vector128<TFrom> vector, TTo* pDest, nuint elementOffset)
where TFrom : unmanaged
Expand All @@ -524,12 +489,9 @@ private static unsafe void ChangeWidthAndWriteTo<TFrom, TTo>(Vector128<TFrom> ve
}
else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1)
{
// narrowing operation required
// since we know data is all-ASCII, special-case SSE2 to avoid unneeded PAND in Narrow call
Vector128<byte> narrow = (Sse2.IsSupported)
? Sse2.PackUnsignedSaturate(vector.AsInt16(), vector.AsInt16())
: Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16());
narrow.GetLower().StoreUnsafe(ref *(byte*)pDest, elementOffset);
// narrowing operation required, we know data is all-ASCII so use extract helper
Vector128<byte> narrow = ExtractAsciiVector(vector.AsUInt16(), vector.AsUInt16());
narrow.StoreLowerUnsafe(ref *(byte*)pDest, elementOffset);
}
else
{
Expand All @@ -556,25 +518,6 @@ private static unsafe Vector128<T> SignedLessThan<T>(Vector128<T> left, Vector12
}
}

/// <summary>
/// Converts <paramref name="vector"/> between 8-bit and 16-bit element widths, chosen by the sizes
/// of <typeparamref name="TFrom"/> and <typeparamref name="TTo"/>: 1 -> 2 bytes widens the lower
/// half, 2 -> 1 bytes narrows the vector paired with itself.
/// </summary>
/// <exception cref="NotSupportedException">The element sizes are any other combination.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe Vector128<TTo> NarrowOrWidenLowerVectorUnsigned<TFrom, TTo>(Vector128<TFrom> vector)
    where TFrom : unmanaged
    where TTo : unmanaged
{
    if (sizeof(TFrom) == 1 && sizeof(TTo) == 2)
    {
        // 8-bit -> 16-bit: zero-extend the lower eight bytes.
        Vector128<ushort> widened = Vector128.WidenLower(vector.AsByte());
        return widened.As<ushort, TTo>();
    }

    if (sizeof(TFrom) == 2 && sizeof(TTo) == 1)
    {
        // 16-bit -> 8-bit: the input is used for both operands, so the narrowed bytes appear twice.
        Vector128<ushort> words = vector.AsUInt16();
        Vector128<byte> narrowed = Vector128.Narrow(words, words);
        return narrowed.As<byte, TTo>();
    }

    throw new NotSupportedException();
}

// Empty marker type with no members. Presumably supplied as the TCasing type argument of
// ChangeCase to select conversion to upper case -- NOTE(review): confirm against the callers.
private struct ToUpperConversion { }
// Empty marker type with no members. Presumably supplied as the TCasing type argument of
// ChangeCase to select conversion to lower case -- NOTE(review): confirm against the callers.
private struct ToLowerConversion { }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1518,6 +1518,7 @@ private static Vector128<byte> ExtractAsciiVector(Vector128<ushort> vectorFirst,
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
// This method contains logic optimized using vector instructions for both x64 and Arm64.
Expand Down Expand Up @@ -1550,7 +1551,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer,

ref byte asciiBuffer = ref *pAsciiBuffer;
Vector128<byte> asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
asciiVector.GetLower().StoreUnsafe(ref asciiBuffer);
asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0);
nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far

// We're going to get the best performance when we have aligned writes, so we'll take the
Expand All @@ -1577,7 +1578,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer,

// Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
}

// Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
Expand Down Expand Up @@ -1630,7 +1631,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer,

Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
currentOffsetInElements += SizeOfVector128 / 2;

goto Finish;
Expand Down

0 comments on commit f1819bd

Please sign in to comment.