Skip to content

Commit

Permalink
Implement vpblendvb optimization in the JIT (#69509)
Browse files Browse the repository at this point in the history
* Move conditonal select handling to lowering, reproduce current behavior

* WIP: adding optimization

* Add BlendVariable optimization to lowering

* Add new test that is optimized by this change

* Ran jit-format

* Fix test failures due to incorrect ordering of operands

* Address PR feedback

* Ran jit-format
  • Loading branch information
dakersnar authored May 25, 2022
1 parent f2364ad commit 10d8a36
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 78 deletions.
22 changes: 2 additions & 20 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20383,27 +20383,9 @@ GenTree* Compiler::gtNewSimdCndSelNode(var_types type,
NamedIntrinsic intrinsic = NI_Illegal;

#if defined(TARGET_XARCH)
// TODO-XARCH-CQ: It's likely beneficial to have a dedicated CndSel node so we
// can special case when the condition is the result of various compare operations.
//
// When it is, the condition is AllBitsSet or Zero on a per-element basis and we
// could change this to be a Blend operation in lowering as an optimization.

assert((simdSize != 32) || compIsaSupportedDebugOnly(InstructionSet_AVX));
CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType);

GenTree* op1Dup;
op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL,
nullptr DEBUGARG("Clone op1 for vector conditional select"));

// op2 = op2 & op1
op2 = gtNewSimdBinOpNode(GT_AND, type, op2, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);

// op3 = op3 & ~op1Dup
op3 = gtNewSimdBinOpNode(GT_AND_NOT, type, op3, op1Dup, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);

// result = op2 | op3
return gtNewSimdBinOpNode(GT_OR, type, op2, op3, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
intrinsic = (simdSize == 32) ? NI_Vector256_ConditionalSelect : NI_Vector128_ConditionalSelect;
return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
#elif defined(TARGET_ARM64)
return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, NI_AdvSimd_BitwiseSelect, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
Expand Down
19 changes: 18 additions & 1 deletion src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,12 @@ enum HWIntrinsicFlag : unsigned int
// NoContainment
// the intrinsic cannot be handled by containment,
// all the intrinsic that have explicit memory load/store semantics should have this flag
HW_Flag_NoContainment = 0x10000
HW_Flag_NoContainment = 0x10000,

// Returns Per-Element Mask
// the intrinsic returns a vector containing elements that are either "all bits set" or "all bits clear"
// this output can be used as a per-element mask
HW_Flag_ReturnsPerElementMask = 0x20000

#elif defined(TARGET_ARM64)
// The intrinsic has an immediate operand
Expand Down Expand Up @@ -641,6 +646,18 @@ struct HWIntrinsicInfo
#endif
}

static bool ReturnsPerElementMask(NamedIntrinsic id)
{
HWIntrinsicFlag flags = lookupFlags(id);
#if defined(TARGET_XARCH)
return (flags & HW_Flag_ReturnsPerElementMask) != 0;
#elif defined(TARGET_ARM64)
unreached();
#else
#error Unsupported platform
#endif
}

static bool BaseTypeFromFirstArg(NamedIntrinsic id)
{
HWIntrinsicFlag flags = lookupFlags(id);
Expand Down
114 changes: 57 additions & 57 deletions src/coreclr/jit/hwintrinsiclistxarch.h

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ class Lowering final : public Phase
#ifdef FEATURE_HW_INTRINSICS
void LowerHWIntrinsic(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
void LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp);
void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicDot(GenTreeHWIntrinsic* node);
Expand Down
146 changes: 146 additions & 0 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,13 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)

switch (intrinsicId)
{
case NI_Vector128_ConditionalSelect:
case NI_Vector256_ConditionalSelect:
{
LowerHWIntrinsicCndSel(node);
break;
}

case NI_Vector128_Create:
case NI_Vector256_Create:
{
Expand Down Expand Up @@ -1453,6 +1460,145 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
LowerNode(node);
}

//----------------------------------------------------------------------------------------------
// Lowering::LowerHWIntrinsicCndSel: Lowers a Vector128 or Vector256 Conditional Select call
//
// Arguments:
// node - The hardware intrinsic node.
//
void Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
{
var_types simdType = node->gtType;
CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
var_types simdBaseType = node->GetSimdBaseType();
unsigned simdSize = node->GetSimdSize();
bool isSimdAsHWIntrinsic = node->IsSimdAsHWIntrinsic();

assert(varTypeIsSIMD(simdType));
assert(varTypeIsArithmetic(simdBaseType));
assert(simdSize != 0);

// Get the three arguments to ConditionalSelect we stored in node
// op1: the condition vector
// op2: the left vector
// op3: the right vector
GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
GenTree* op3 = node->Op(3);

// If the condition vector comes from a hardware intrinsic that
// returns a per-element mask (marked with HW_Flag_ReturnsPerElementMask),
// we can optimize the entire conditional select to
// a single BlendVariable instruction (if supported by the architecture)

// First, determine if the condition is a per-element mask
if (op1->OperIsHWIntrinsic() && HWIntrinsicInfo::ReturnsPerElementMask(op1->AsHWIntrinsic()->GetHWIntrinsicId()))
{
// Next, determine if the target architecture supports BlendVariable
NamedIntrinsic blendVariableId = NI_Illegal;

// For Vector256 (simdSize == 32), BlendVariable for floats/doubles is available on AVX, whereas other types
// require AVX2
if (simdSize == 32)
{
if (varTypeIsFloating(simdBaseType))
{
// This should have already been confirmed
assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX));
blendVariableId = NI_AVX_BlendVariable;
}
else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
blendVariableId = NI_AVX2_BlendVariable;
}
}
// For Vector128, BlendVariable is available on SSE41
else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
blendVariableId = NI_SSE41_BlendVariable;
}

// If blendVariableId has been set, the architecture supports BlendVariable, so we can optimize
if (blendVariableId != NI_Illegal)
{
// result = BlendVariable op3 (right) op2 (left) op1 (mask)
node->ResetHWIntrinsicId(blendVariableId, comp, op3, op2, op1);
return;
}
}

// We cannot optimize, so produce unoptimized instructions

// We will be constructing the following parts:
// /--* op1 simd16
// * STORE_LCL_VAR simd16
// op1 = LCL_VAR simd16
// tmp1 = LCL_VAR simd16
// ...

GenTree* tmp1;
GenTree* tmp2;
GenTree* tmp3;

LIR::Use op1Use(BlockRange(), &node->Op(1), node);
ReplaceWithLclVar(op1Use);
op1 = node->Op(1);

tmp1 = comp->gtClone(op1);
BlockRange().InsertAfter(op1, tmp1);

// ...
// tmp2 = op1 & op2
// ...
tmp2 = comp->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
BlockRange().InsertAfter(op2, tmp2);
LowerNode(tmp2);

// ...
// tmp3 = op3 & ~tmp1
// ...
tmp3 = comp->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
BlockRange().InsertAfter(op3, tmp3);
LowerNode(tmp3);

// determine which Or intrinsic to use, depending on target architecture
NamedIntrinsic orIntrinsic = NI_Illegal;

if (simdSize == 32)
{
assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX));

if (varTypeIsFloating(simdBaseType))
{
orIntrinsic = NI_AVX_Or;
}
else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
orIntrinsic = NI_AVX2_Or;
}
else
{
// Since this is a bitwise operation, we can still support it by lying
// about the type and doing the operation using a supported instruction
orIntrinsic = NI_AVX_Or;
simdBaseJitType = CORINFO_TYPE_FLOAT;
}
}
else if (simdBaseType == TYP_FLOAT)
{
orIntrinsic = NI_SSE_Or;
}
else
{
orIntrinsic = NI_SSE2_Or;
}

// ...
// result = tmp2 | tmp3
node->ResetHWIntrinsicId(orIntrinsic, tmp2, tmp3);
node->SetSimdBaseJitType(simdBaseJitType);
}

//----------------------------------------------------------------------------------------------
// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 Create call
//
Expand Down
26 changes: 26 additions & 0 deletions src/tests/JIT/Regression/JitBlue/Runtime_67039/Runtime_67039.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

public class Runtime_67039
{
public static int Main()
{
Vector128<float> left = Vector128.Create(1.0f, 2, 3, 4);
Vector128<float> right = Vector128.Create(4.0f, 3, 2, 1);

var result = Vector128.ConditionalSelect(Vector128.GreaterThan(left, right), left, right);

Vector128<float> expectedResult = Vector128.Create(4.0f, 3, 3, 4);
if (result == expectedResult)
{
return 100;
}
else
{
return 0;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<Optimize>True</Optimize>
</PropertyGroup>
<ItemGroup>
<Compile Include="$(MSBuildProjectName).cs" />
</ItemGroup>
</Project>

0 comments on commit 10d8a36

Please sign in to comment.