Skip to content

Commit

Permalink
System.Text.Encodings.Web refactoring and code modernization (#49373)
Browse files Browse the repository at this point in the history
- Unify workhorse implementations across all inbox encoders
- Refactor most unsafe code from TextEncoder workhorse routines into standalone helpers
- Fix bounds check logic in workhorse routines
- SSSE3-optimize central workhorse routine
- Remove vestigial code from the library and unit test project
- Add significant unit test coverage for the workhorse routines and unsafe helpers
- Ref: CVE-2021-26701 (MSRC 62749)
  • Loading branch information
GrabYourPitchforks committed Mar 19, 2021
1 parent 9f93bcb commit c569bc1
Show file tree
Hide file tree
Showing 62 changed files with 5,267 additions and 5,728 deletions.
60 changes: 38 additions & 22 deletions src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs
Original file line number Diff line number Diff line change
Expand Up @@ -994,19 +994,27 @@ public static bool TryCreate(uint value, out Rune result)
/// The <see cref="Utf16SequenceLength"/> property can be queried ahead of time to determine
/// the required size of the <paramref name="destination"/> buffer.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryEncodeToUtf16(Span<char> destination, out int charsWritten)
{
if (destination.Length >= 1)
// The Rune type fits cleanly into a register, so pass byval rather than byref
// to avoid stack-spilling the 'this' parameter.
return TryEncodeToUtf16(this, destination, out charsWritten);
}

private static bool TryEncodeToUtf16(Rune value, Span<char> destination, out int charsWritten)
{
if (!destination.IsEmpty)
{
if (IsBmp)
if (value.IsBmp)
{
destination[0] = (char)_value;
destination[0] = (char)value._value;
charsWritten = 1;
return true;
}
else if (destination.Length >= 2)
else if (1 < (uint)destination.Length)
{
UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar((uint)value._value, out destination[0], out destination[1]);
charsWritten = 2;
return true;
}
Expand All @@ -1030,49 +1038,57 @@ public bool TryEncodeToUtf16(Span<char> destination, out int charsWritten)
/// The <see cref="Utf8SequenceLength"/> property can be queried ahead of time to determine
/// the required size of the <paramref name="destination"/> buffer.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryEncodeToUtf8(Span<byte> destination, out int bytesWritten)
{
// The Rune type fits cleanly into a register, so pass byval rather than byref
// to avoid stack-spilling the 'this' parameter.
return TryEncodeToUtf8(this, destination, out bytesWritten);
}

private static bool TryEncodeToUtf8(Rune value, Span<byte> destination, out int bytesWritten)
{
// The bit patterns below come from the Unicode Standard, Table 3-6.

if (destination.Length >= 1)
if (!destination.IsEmpty)
{
if (IsAscii)
if (value.IsAscii)
{
destination[0] = (byte)_value;
destination[0] = (byte)value._value;
bytesWritten = 1;
return true;
}

if (destination.Length >= 2)
if (1 < (uint)destination.Length)
{
if (_value <= 0x7FFu)
if (value.Value <= 0x7FFu)
{
// Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
destination[0] = (byte)((value._value + (0b110u << 11)) >> 6);
destination[1] = (byte)((value._value & 0x3Fu) + 0x80u);
bytesWritten = 2;
return true;
}

if (destination.Length >= 3)
if (2 < (uint)destination.Length)
{
if (_value <= 0xFFFFu)
if (value.Value <= 0xFFFFu)
{
// Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
destination[0] = (byte)((_value + (0b1110 << 16)) >> 12);
destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
destination[0] = (byte)((value._value + (0b1110 << 16)) >> 12);
destination[1] = (byte)(((value._value & (0x3Fu << 6)) >> 6) + 0x80u);
destination[2] = (byte)((value._value & 0x3Fu) + 0x80u);
bytesWritten = 3;
return true;
}

if (destination.Length >= 4)
if (3 < (uint)destination.Length)
{
// Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
destination[0] = (byte)((_value + (0b11110 << 21)) >> 18);
destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
destination[0] = (byte)((value._value + (0b11110 << 21)) >> 18);
destination[1] = (byte)(((value._value & (0x3Fu << 12)) >> 12) + 0x80u);
destination[2] = (byte)(((value._value & (0x3Fu << 6)) >> 6) + 0x80u);
destination[3] = (byte)((value._value & 0x3Fu) + 0x80u);
bytesWritten = 4;
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ namespace System.Text
{
internal static class UnicodeDebug
{
[Conditional("DEBUG")]
internal static void AssertIsBmpCodePoint(uint codePoint)
{
if (!UnicodeUtility.IsBmpCodePoint(codePoint))
{
Debug.Fail($"The value {ToHexString(codePoint)} is not a valid BMP code point.");
}
}

[Conditional("DEBUG")]
internal static void AssertIsHighSurrogateCodePoint(uint codePoint)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>$(NetCoreAppCurrent);netstandard2.0;net461</TargetFrameworks>
<TargetFrameworks>$(NetCoreAppCurrent);netcoreapp3.1;netstandard2.0;net461</TargetFrameworks>
<ExcludeCurrentNetCoreAppFromPackage>true</ExcludeCurrentNetCoreAppFromPackage>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<!-- Only CLS-compliant members can be abstract -->
Expand All @@ -11,12 +11,13 @@
<Compile Include="System.Text.Encodings.Web.cs" />
</ItemGroup>
<ItemGroup Condition="'$(TargetFramework)' == '$(NetCoreAppCurrent)'">
<ProjectReference Include="$(LibrariesProjectRoot)System.Memory\ref\System.Memory.csproj" />
<ProjectReference Include="$(LibrariesProjectRoot)System.Runtime\ref\System.Runtime.csproj" />
<ProjectReference Include="$(LibrariesProjectRoot)System.Runtime.Extensions\ref\System.Runtime.Extensions.csproj" />
</ItemGroup>
<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0' or
$(TargetFramework.StartsWith('net4'))">
<ItemGroup Condition="'$(TargetFramework)' == 'netcoreapp3.1'">
<Reference Include="System.Runtime" />
<Reference Include="System.Runtime.Extensions" />
</ItemGroup>
<ItemGroup Condition="'$(TargetFramework)' != '$(NetCoreAppCurrent)' and '$(TargetFramework)' != 'netcoreapp3.1'">
<PackageReference Include="System.Memory" Version="$(SystemMemoryVersion)" />
</ItemGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// Contains a polyfill implementation of System.Numerics.BitOperations that works on netstandard2.0.
// Implementation copied from:
// https://github.com/dotnet/runtime/blob/6072e4d3a7a2a1493f514cdf4be75a3d56580e84/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs
//
// Some routines inspired by the Stanford Bit Twiddling Hacks by Sean Eron Anderson:
// http://graphics.stanford.edu/~seander/bithacks.html

namespace System.Numerics
{
internal static class BitOperations
{
private static ReadOnlySpan<byte> Log2DeBruijn => new byte[32]
{
00, 09, 01, 10, 13, 21, 02, 29,
11, 14, 16, 18, 22, 25, 03, 30,
08, 12, 20, 28, 15, 17, 24, 07,
19, 27, 23, 06, 26, 05, 04, 31
};

/// <summary>
/// Returns the integer (floor) log of the specified value, base 2.
/// Note that by convention, input value 0 returns 0 since log(0) is undefined.
/// </summary>
/// <param name="value">The value.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int Log2(uint value)
{
// Fallback contract is 0->0
return Log2SoftwareFallback(value | 1);
}

/// <summary>
/// Returns the integer (floor) log of the specified value, base 2.
/// Note that by convention, input value 0 returns 0 since Log(0) is undefined.
/// Does not directly use any hardware intrinsics, nor does it incur branching.
/// </summary>
/// <param name="value">The value.</param>
private static int Log2SoftwareFallback(uint value)
{
// No AggressiveInlining due to large method size
// Has conventional contract 0->0 (Log(0) is undefined)

// Fill trailing zeros with ones, eg 00010010 becomes 00011111
value |= value >> 01;
value |= value >> 02;
value |= value >> 04;
value |= value >> 08;
value |= value >> 16;

// uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check
return Unsafe.AddByteOffset(
// Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_1100_0100_1010_1100_1101_1101u
ref MemoryMarshal.GetReference(Log2DeBruijn),
// uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here
(nint)((value * 0x07C4ACDDu) >> 27));
}
}
}
Loading

0 comments on commit c569bc1

Please sign in to comment.