Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT ARM64-SVE: Allow LCL_VARs to store as mask #99608

Merged
merged 28 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
6628904
JIT ARM64-SVE: Allow LCL_VARs to store as mask
a74nh Mar 8, 2024
ed574f9
Remove FEATURE_MASKED_SIMD
a74nh Mar 13, 2024
02fa227
More generic ifdefs
a74nh Mar 13, 2024
2e2e174
Add varTypeIsSIMDOrMask
a74nh Mar 13, 2024
fcdb18a
Add extra type checks
a74nh Mar 13, 2024
687af37
Merge main
a74nh Mar 13, 2024
1fc8d5b
Fix use of isValidSimm9, and add extra uses
a74nh Mar 13, 2024
9dbfe63
Rename mask conversion functions to gtNewSimdConvert*
a74nh Mar 13, 2024
85f09bf
Add OperIs functions
a74nh Mar 13, 2024
7945d51
Mark untested uses of mov
a74nh Mar 14, 2024
bd5d951
Add INS_SCALABLE_OPTS_PREDICATE_DEST
a74nh Mar 14, 2024
ce61a40
Valuenum fixes for tier 1
a74nh Mar 14, 2024
b5502a6
Remove importer changes
a74nh Mar 14, 2024
39c02d0
XARCH versions of OperIsConvertMaskToVector
a74nh Mar 14, 2024
d8dea0e
Revert "Remove importer changes"
a74nh Mar 14, 2024
8ec8e38
Add tests for emitIns_S_R and emitIns_R_S
a74nh Mar 14, 2024
3ec441c
Fix formatting
a74nh Mar 15, 2024
f569512
Reapply "Remove importer changes"
a74nh Mar 15, 2024
0110170
Use dummy mask ldr and str
a74nh Mar 18, 2024
ec05e34
Refactor emitIns_S_R and emitIns_R_S
a74nh Mar 19, 2024
71bcb48
Move str_mask/ldr_mask
a74nh Mar 19, 2024
24cd68b
Fix formatting
a74nh Mar 19, 2024
5b995ae
Set imm
a74nh Mar 19, 2024
3a82d5d
fix assert
a74nh Mar 19, 2024
8baee38
Fix assert (2)
a74nh Mar 20, 2024
b22755a
Fix assert (3)
a74nh Mar 20, 2024
bd8db6e
nop
a74nh Mar 20, 2024
e359c93
Merge branch 'main' into lcl_var_mask_github
a74nh Mar 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/coreclr/jit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,14 @@ function(create_standalone_jit)
if ((TARGETDETAILS_ARCH STREQUAL "x64") OR (TARGETDETAILS_ARCH STREQUAL "arm64") OR ((TARGETDETAILS_ARCH STREQUAL "x86") AND NOT (TARGETDETAILS_OS STREQUAL "unix")))
target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_SIMD)
target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_HW_INTRINSICS)
target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_MASKED_HW_INTRINSICS)
endif ()
endfunction()

if (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR (CLR_CMAKE_TARGET_ARCH_I386 AND NOT CLR_CMAKE_HOST_UNIX))
add_compile_definitions($<$<NOT:$<BOOL:$<TARGET_PROPERTY:IGNORE_DEFAULT_TARGET_ARCH>>>:FEATURE_SIMD>)
add_compile_definitions($<$<NOT:$<BOOL:$<TARGET_PROPERTY:IGNORE_DEFAULT_TARGET_ARCH>>>:FEATURE_HW_INTRINSICS>)
add_compile_definitions($<$<NOT:$<BOOL:$<TARGET_PROPERTY:IGNORE_DEFAULT_TARGET_ARCH>>>:FEATURE_MASKED_HW_INTRINSICS>)
endif ()

# JIT_BUILD disables certain PAL_TRY debugging features
Expand Down
29 changes: 22 additions & 7 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2767,11 +2767,19 @@ void CodeGen::genCodeForLclVar(GenTreeLclVar* tree)
// targetType must be a normal scalar type and not a TYP_STRUCT
assert(targetType != TYP_STRUCT);

instruction ins = ins_Load(targetType);
emitAttr attr = emitActualTypeSize(targetType);
instruction ins = ins_Load(targetType);
emitAttr attr = emitActualTypeSize(targetType);
insScalableOpts sopt = INS_SCALABLE_OPTS_NONE;
emitter* emit = GetEmitter();

// TODO-SVE: Removable once REG_V0 and REG_P0 are distinct
if (varTypeUsesMaskReg(targetType))
{
sopt = INS_SCALABLE_OPTS_PREDICATE_DEST;
}

emit->emitIns_R_S(ins, attr, tree->GetRegNum(), varNum, 0, sopt);

emitter* emit = GetEmitter();
emit->emitIns_R_S(ins, attr, tree->GetRegNum(), varNum, 0);
genProduceReg(tree);
}
}
Expand Down Expand Up @@ -2953,10 +2961,17 @@ void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode)
{
inst_set_SV_var(lclNode);

instruction ins = ins_StoreFromSrc(dataReg, targetType);
emitAttr attr = emitActualTypeSize(targetType);
instruction ins = ins_StoreFromSrc(dataReg, targetType);
emitAttr attr = emitActualTypeSize(targetType);
insScalableOpts sopt = INS_SCALABLE_OPTS_NONE;

// TODO-SVE: Removable once REG_V0 and REG_P0 are distinct
if (varTypeUsesMaskReg(targetType))
{
sopt = INS_SCALABLE_OPTS_PREDICATE_DEST;
}

emit->emitIns_S_R(ins, attr, dataReg, varNum, /* offset */ 0);
emit->emitIns_S_R(ins, attr, dataReg, varNum, /* offset */ 0, sopt);
}
else // store into register (i.e move into register)
{
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3446,6 +3446,11 @@ class Compiler

GenTreeIndir* gtNewMethodTableLookup(GenTree* obj);

#if defined(TARGET_ARM64)
GenTree* gtNewSimdConvertVectorToMaskNode(var_types type, GenTree* node, CorInfoType simdBaseJitType, unsigned simdSize);
GenTree* gtNewSimdConvertMaskToVectorNode(GenTreeHWIntrinsic* node, var_types type);
#endif

//------------------------------------------------------------------------
// Other GenTree functions

Expand Down Expand Up @@ -4556,11 +4561,6 @@ class Compiler
NamedIntrinsic intrinsic, GenTree* immOp, bool mustExpand, int immLowerBound, int immUpperBound);
GenTree* addRangeCheckForHWIntrinsic(GenTree* immOp, int immLowerBound, int immUpperBound);

#if defined(TARGET_ARM64)
GenTree* convertHWIntrinsicToMask(var_types type, GenTree* node, CorInfoType simdBaseJitType, unsigned simdSize);
GenTree* convertHWIntrinsicFromMask(GenTreeHWIntrinsic* node, var_types type);
#endif

#endif // FEATURE_HW_INTRINSICS
GenTree* impArrayAccessIntrinsic(CORINFO_CLASS_HANDLE clsHnd,
CORINFO_SIG_INFO* sig,
Expand Down
137 changes: 108 additions & 29 deletions src/coreclr/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5932,7 +5932,7 @@ emitter::code_t emitter::emitInsCodeSve(instruction ins, insFormat fmt)
if (imm == 0)
return true; // Encodable using IF_LS_2A

if ((imm >= -256) && (imm <= 255))
if (isValidSimm<9>(imm))
return true; // Encodable using IF_LS_2C (or possibly IF_LS_2B)

if (imm < 0)
Expand Down Expand Up @@ -10727,7 +10727,7 @@ void emitter::emitIns_R_R_I(instruction ins,
}
else if (insOptsIndexed(opt) || unscaledOp || (imm < 0) || ((imm & mask) != 0))
{
if ((imm >= -256) && (imm <= 255))
if (isValidSimm<9>(imm))
{
fmt = IF_LS_2C;
}
Expand Down Expand Up @@ -17311,13 +17311,19 @@ void emitter::emitIns_S(instruction ins, emitAttr attr, int varx, int offs)
*
* Add an instruction referencing a register and a stack-based local variable.
*/
void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs)
void emitter::emitIns_R_S(instruction ins,
emitAttr attr,
regNumber reg1,
int varx,
int offs,
insScalableOpts sopt /* = INS_SCALABLE_OPTS_NONE */)
{
emitAttr size = EA_SIZE(attr);
insFormat fmt = IF_NONE;
int disp = 0;
unsigned scale = 0;
bool isLdrStr = false;
emitAttr size = EA_SIZE(attr);
insFormat fmt = IF_NONE;
int disp = 0;
unsigned scale = 0;
bool isLdrStr = false;
bool isScalable = false;

assert(offs >= 0);

Expand Down Expand Up @@ -17353,16 +17359,43 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
scale = 0;
break;

case INS_sve_ldr:
assert(isVectorRegister(reg1) || isPredicateRegister(reg1));
isScalable = true;

// TODO-SVE: This should probably be set earlier in the caller
size = EA_SCALABLE;
attr = size;

// TODO-SVE: Use register number instead of enum
// TODO-SVE: Don't assume 128bit vectors
if (sopt == INS_SCALABLE_OPTS_PREDICATE_DEST)
{
assert(isPredicateRegister(reg1));
fmt = IF_SVE_ID_2A;
// Predicate size is vector length / 8
scale = NaturalScale_helper(EA_2BYTE);
}
else
{
assert(insScalableOptsNone(sopt));
assert(isVectorRegister(reg1));
fmt = IF_SVE_IE_2A;
scale = NaturalScale_helper(EA_16BYTE);
}
break;

default:
NYI("emitIns_R_S"); // FP locals?
return;

} // end switch (ins)

/* Figure out the variable's frame position */
ssize_t imm;
int base;
bool FPbased;
ssize_t imm;
int base;
bool FPbased;
insFormat scalarfmt = fmt;

base = emitComp->lvaFrameAddress(varx, &FPbased);
disp = base + offs;
Expand All @@ -17387,13 +17420,13 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va

if (imm <= 0x0fff)
{
fmt = IF_DI_2A; // add reg1,reg2,#disp
scalarfmt = IF_DI_2A; // add reg1,reg2,#disp
}
else
{
regNumber rsvdReg = codeGen->rsGetRsvdReg();
codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
fmt = IF_DR_3A; // add reg1,reg2,rsvdReg
scalarfmt = IF_DR_3A; // add reg1,reg2,rsvdReg
}
}
else
Expand All @@ -17402,13 +17435,13 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
imm = disp;
if (imm == 0)
{
fmt = IF_LS_2A;
scalarfmt = IF_LS_2A;
}
else if ((imm < 0) || ((imm & mask) != 0))
{
if ((imm >= -256) && (imm <= 255))
if (isValidSimm<9>(imm))
{
fmt = IF_LS_2C;
scalarfmt = IF_LS_2C;
}
else
{
Expand All @@ -17417,11 +17450,13 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
}
else if (imm > 0)
{
// TODO: We should be able to scale values <0 for all variants.

if (((imm & mask) == 0) && ((imm >> scale) < 0x1000))
{
imm >>= scale; // The immediate is scaled by the size of the ld/st

fmt = IF_LS_2B;
scalarfmt = IF_LS_2B;
}
else
{
Expand All @@ -17433,10 +17468,15 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
{
regNumber rsvdReg = codeGen->rsGetRsvdReg();
codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
fmt = IF_LS_3A;
scalarfmt = IF_LS_3A;
}
}

// Set the format based on the immediate encoding
if (!isScalable)
{
fmt = scalarfmt;
}
assert(fmt != IF_NONE);

// Try to optimize a load/store with an alternative instruction.
Expand Down Expand Up @@ -17564,7 +17604,12 @@ void emitter::emitIns_R_R_S_S(
*
* Add an instruction referencing a stack-based local variable and a register
*/
void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs)
void emitter::emitIns_S_R(instruction ins,
emitAttr attr,
regNumber reg1,
int varx,
int offs,
insScalableOpts sopt /* = INS_SCALABLE_OPTS_NONE */)
{
assert(offs >= 0);
emitAttr size = EA_SIZE(attr);
Expand All @@ -17573,6 +17618,7 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
unsigned scale = 0;
bool isVectorStore = false;
bool isStr = false;
bool isScalable = false;

// TODO-ARM64-CQ: use unscaled loads?
/* Figure out the encoding format of the instruction */
Expand Down Expand Up @@ -17604,6 +17650,32 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
isStr = true;
break;

case INS_sve_str:
assert(isVectorRegister(reg1) || isPredicateRegister(reg1));
isScalable = true;

// TODO-SVE: This should probably be set in the caller
size = EA_SCALABLE;
attr = size;

// TODO-SVE: Use register number instead of enum
// TODO-SVE: Don't assume 128bit vectors
if (sopt == INS_SCALABLE_OPTS_PREDICATE_DEST)
{
assert(isPredicateRegister(reg1));
fmt = IF_SVE_JG_2A;
// Predicate size is vector length / 8
scale = NaturalScale_helper(EA_2BYTE);
}
else
{
assert(insScalableOptsNone(sopt));
assert(isVectorRegister(reg1));
fmt = IF_SVE_JH_2A;
scale = NaturalScale_helper(EA_16BYTE);
}
break;

default:
NYI("emitIns_S_R"); // FP locals?
return;
Expand All @@ -17617,7 +17689,7 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
base = emitComp->lvaFrameAddress(varx, &FPbased);
disp = base + offs;
assert(scale >= 0);
if (isVectorStore)
if (isVectorStore || isScalable)
{
assert(scale <= 4);
}
Expand All @@ -17630,18 +17702,19 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
regNumber reg2 = FPbased ? REG_FPBASE : REG_SPBASE;
reg2 = encodingSPtoZR(reg2);

bool useRegForImm = false;
ssize_t imm = disp;
ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
bool useRegForImm = false;
ssize_t imm = disp;
ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
insFormat scalarfmt = fmt;
if (imm == 0)
{
fmt = IF_LS_2A;
scalarfmt = IF_LS_2A;
}
else if ((imm < 0) || ((imm & mask) != 0))
{
if ((imm >= -256) && (imm <= 255))
if (isValidSimm<9>(imm))
{
fmt = IF_LS_2C;
scalarfmt = IF_LS_2C;
}
else
{
Expand All @@ -17650,11 +17723,12 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
}
else if (imm > 0)
{
// TODO: We should be able to scale values <0 for all variants.

if (((imm & mask) == 0) && ((imm >> scale) < 0x1000))
{
imm >>= scale; // The immediate is scaled by the size of the ld/st

fmt = IF_LS_2B;
scalarfmt = IF_LS_2B;
}
else
{
Expand All @@ -17668,9 +17742,14 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
// It is instead implicit when idSetIsLclVar() is set, with this encoding format.
regNumber rsvdReg = codeGen->rsGetRsvdReg();
codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
fmt = IF_LS_3A;
scalarfmt = IF_LS_3A;
}

// Set the format based on the immediate encoding
if (!isScalable)
{
fmt = scalarfmt;
}
assert(fmt != IF_NONE);

// Try to optimize a store with an alternative instruction.
Expand Down
6 changes: 4 additions & 2 deletions src/coreclr/jit/emitarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -1604,7 +1604,8 @@ void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int

void emitIns_S(instruction ins, emitAttr attr, int varx, int offs);

void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs);
void emitIns_S_R(
instruction ins, emitAttr attr, regNumber ireg, int varx, int offs, insScalableOpts sopt = INS_SCALABLE_OPTS_NONE);

void emitIns_S_S_R_R(
instruction ins, emitAttr attr, emitAttr attr2, regNumber ireg, regNumber ireg2, int varx, int offs);
Expand All @@ -1622,7 +1623,8 @@ void emitIns_R_R_R_I_LdStPair(instruction ins,
int offs2 = -1 DEBUG_ARG(unsigned var1RefsOffs = BAD_IL_OFFSET)
DEBUG_ARG(unsigned var2RefsOffs = BAD_IL_OFFSET));

void emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs);
void emitIns_R_S(
instruction ins, emitAttr attr, regNumber ireg, int varx, int offs, insScalableOpts sopt = INS_SCALABLE_OPTS_NONE);

void emitIns_R_R_S_S(
instruction ins, emitAttr attr, emitAttr attr2, regNumber ireg, regNumber ireg2, int varx, int offs);
Expand Down
Loading
Loading