Skip to content

Commit

Permalink
[Arm64] Keep unrolling InitBlock and CopyBlock up to 128 bytes (#63422)
Browse files Browse the repository at this point in the history
* Add INITBLK_LCL_UNROLL_LIMIT and CPBLK_LCL_UNROLL_LIMIT of 128 bytes in src/coreclr/jit/targetarm64.h

* In src/coreclr/jit/lowerarmarch.cpp, keep unrolling InitBlock up to INITBLK_LCL_UNROLL_LIMIT bytes when dstAddr points to the stack

* In src/coreclr/jit/lowerarmarch.cpp, keep unrolling CopyBlock up to CPBLK_LCL_UNROLL_LIMIT bytes when both srcAddr and dstAddr point to the stack
  • Loading branch information
echesakov authored Jan 20, 2022
1 parent a5158df commit f1c8b10
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 10 deletions.
49 changes: 41 additions & 8 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
GenTree* src = blkNode->Data();
unsigned size = blkNode->Size();

const bool isDstAddrLocal = dstAddr->OperIsLocalAddr();

if (blkNode->OperIsInitBlkOp())
{
if (src->OperIs(GT_INIT_VAL))
Expand All @@ -306,7 +308,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isDstAddrLocal)
{
// Since dstAddr points to the stack, CodeGen can use more efficient
// quad-word SIMD store instructions for InitBlock.
initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
}
#endif

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -353,27 +366,47 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
assert(src->OperIs(GT_IND, GT_LCL_VAR, GT_LCL_FLD));
src->SetContained();

bool isSrcAddrLocal = false;

if (src->OperIs(GT_IND))
{
GenTree* srcAddr = src->AsIndir()->Addr();
// TODO-Cleanup: Make sure that GT_IND lowering didn't mark the source address as contained.
// Sometimes the GT_IND type is a non-struct type and then GT_IND lowering may contain the
// address, not knowing that GT_IND is part of a block op that has containment restrictions.
src->AsIndir()->Addr()->ClearContained();
srcAddr->ClearContained();
isSrcAddrLocal = srcAddr->OperIsLocalAddr();
}
else
{
isSrcAddrLocal = true;

if (src->OperIs(GT_LCL_VAR))
{
// TODO-1stClassStructs: for now we can't work with STORE_BLOCK source in register.
const unsigned srcLclNum = src->AsLclVar()->GetLclNum();
comp->lvaSetVarDoNotEnregister(srcLclNum DEBUGARG(DoNotEnregisterReason::BlockOp));
}
}
else if (src->OperIs(GT_LCL_VAR))

unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isSrcAddrLocal && isDstAddrLocal)
{
// TODO-1stClassStructs: for now we can't work with STORE_BLOCK source in register.
const unsigned srcLclNum = src->AsLclVar()->GetLclNum();
comp->lvaSetVarDoNotEnregister(srcLclNum DEBUGARG(DoNotEnregisterReason::BlockOp));
// Since both srcAddr and dstAddr point to the stack, CodeGen can use more efficient
// quad-word SIMD load and store instructions for CopyBlock.
copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
}
#endif

if (blkNode->OperIs(GT_STORE_OBJ))
{
if (!blkNode->AsObj()->GetLayout()->HasGCPtr())
{
blkNode->SetOper(GT_STORE_BLK);
}
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
else if (isDstAddrLocal && (size <= copyBlockUnrollLimit))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
Expand All @@ -389,7 +422,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)

blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= copyBlockUnrollLimit))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down
6 changes: 4 additions & 2 deletions src/coreclr/jit/targetarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk.
#define CPBLK_UNROLL_LIMIT 64 // Upper bound, in bytes, on the block size for which the code generator unrolls CpBlk
#define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound, in bytes, on the block size for which the code generator unrolls CpBlk when both srcAddr and dstAddr point to the stack
#define INITBLK_UNROLL_LIMIT 64 // Upper bound, in bytes, on the block size for which the code generator unrolls InitBlk
#define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound, in bytes, on the block size for which the code generator unrolls InitBlk when dstAddr points to the stack

#ifdef FEATURE_SIMD
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
Expand Down

0 comments on commit f1c8b10

Please sign in to comment.