From 2e26efaf9bb9d780c10cc20531b5d3a37cda954f Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 9 Nov 2021 17:48:11 -0800 Subject: [PATCH 01/16] Use SIMD operations in CodeGen::genCodeForInitBlkUnroll() --- src/coreclr/jit/codegenarmarch.cpp | 433 ++++++++++++++++++++++++++++- src/coreclr/jit/lsraarmarch.cpp | 6 + 2 files changed, 428 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index d53014650dc6b..096a41e89b08e 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1879,6 +1879,368 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode) } } +#ifdef TARGET_ARM64 + +class CountingStream +{ +public: + CountingStream() + { + instrCount = 0; + } + + void LoadPairRegs(int offset, int regSizeBytes) + { + instrCount++; + } + + void StorePairRegs(int offset, int regSizeBytes) + { + instrCount++; + } + + void LoadReg(int offset, int regSizeBytes) + { + instrCount++; + } + + void StoreReg(int offset, int regSizeBytes) + { + instrCount++; + } + + int InstructionCount() const + { + return instrCount; + } + +private: + int instrCount; +}; + +class StoreBlockUnrollHelper +{ +public: + static int AlignUp(int offset, int alignment) + { + assert(((alignment - 1) & alignment) == 0); + return (offset + alignment - 1) & (-alignment); + } + + static int GetRegSizeAtLeastBytes(int byteCount) + { + assert(byteCount > 0); + assert(byteCount < 16); + + int regSizeBytes = byteCount; + + if (byteCount > 8) + { + regSizeBytes = 16; + } + else if (byteCount > 4) + { + regSizeBytes = 8; + } + else if (byteCount > 2) + { + regSizeBytes = 4; + } + + return regSizeBytes; + } +}; + +class VerifyingStream +{ +public: + VerifyingStream() + { + canEncodeAllLoads = true; + canEncodeAllStores = true; + } + + void LoadPairRegs(int offset, int regSizeBytes) + { + canEncodeAllLoads = canEncodeAllLoads && CanEncodeLoadOrStorePairRegs(offset, regSizeBytes); + } + + void StorePairRegs(int 
offset, int regSizeBytes) + { + canEncodeAllStores = canEncodeAllStores && CanEncodeLoadOrStorePairRegs(offset, regSizeBytes); + } + + void LoadReg(int offset, int regSizeBytes) + { + canEncodeAllLoads = canEncodeAllLoads && CanEncodeLoadOrStoreReg(offset, regSizeBytes); + } + + void StoreReg(int offset, int regSizeBytes) + { + canEncodeAllStores = canEncodeAllStores && CanEncodeLoadOrStoreReg(offset, regSizeBytes); + } + + bool CanEncodeAllLoads() const + { + return canEncodeAllLoads; + } + + bool CanEncodeAllStores() const + { + return canEncodeAllStores; + } + +private: + static bool CanEncodeLoadOrStorePairRegs(int offset, int regSizeBytes) + { + const bool canEncodeWithSignedOffset = + (offset % regSizeBytes == 0) && (offset >= -64 * regSizeBytes) && (offset < 64 * regSizeBytes); + return canEncodeWithSignedOffset; + } + + static bool CanEncodeLoadOrStoreReg(int offset, int regSizeBytes) + { + const bool canEncodeWithUnsignedOffset = + (offset % regSizeBytes == 0) && (offset >= 0) && (offset < regSizeBytes * 4096); + const bool canEncodeWithUnscaledOffset = (offset >= -256) && (offset <= 255); + + return canEncodeWithUnsignedOffset || canEncodeWithUnscaledOffset; + } + + bool canEncodeAllLoads; + bool canEncodeAllStores; +}; + +class ProducingStream +{ +public: + ProducingStream(regNumber intReg1, + regNumber intReg2, + regNumber simdReg1, + regNumber simdReg2, + regNumber addrReg, + emitter* emitter) + : intReg1(intReg1), intReg2(intReg2), simdReg1(simdReg1), simdReg2(simdReg2), addrReg(addrReg), emitter(emitter) + { + } + + void LoadPairRegs(int offset, int regSizeBytes) + { + assert((regSizeBytes == 8) || (regSizeBytes == 16)); + + regNumber tempReg1; + regNumber tempReg2; + + if ((regSizeBytes == 16) || (intReg2 == REG_NA)) + { + tempReg1 = simdReg1; + tempReg2 = simdReg2; + } + else + { + tempReg1 = intReg1; + tempReg2 = intReg2; + } + + emitter->emitIns_R_R_R_I(INS_ldp, EA_SIZE(regSizeBytes), tempReg1, tempReg2, addrReg, offset); + } + + void 
StorePairRegs(int offset, int regSizeBytes) + { + assert((regSizeBytes == 8) || (regSizeBytes == 16)); + + regNumber tempReg1; + regNumber tempReg2; + + if ((regSizeBytes == 16) || (intReg2 == REG_NA)) + { + tempReg1 = simdReg1; + tempReg2 = simdReg2; + } + else + { + tempReg1 = intReg1; + tempReg2 = intReg2; + } + + emitter->emitIns_R_R_R_I(INS_stp, EA_SIZE(regSizeBytes), tempReg1, tempReg2, addrReg, offset); + } + + void LoadReg(int offset, int regSizeBytes) + { + instruction ins = INS_ldr; + regNumber tempReg = intReg1; + + if ((intReg1 == REG_NA) || (regSizeBytes == 16)) + { + tempReg = simdReg1; + } + else if (regSizeBytes == 1) + { + ins = INS_ldrb; + } + else if (regSizeBytes == 2) + { + ins = INS_ldrh; + } + + emitter->emitIns_R_R_I(ins, EA_SIZE(regSizeBytes), tempReg, addrReg, offset); + } + + void StoreReg(int offset, int regSizeBytes) + { + instruction ins = INS_str; + regNumber tempReg = intReg1; + + if ((intReg1 == REG_NA) || (regSizeBytes == 16)) + { + tempReg = simdReg1; + } + else if (regSizeBytes == 1) + { + ins = INS_strb; + } + else if (regSizeBytes == 2) + { + ins = INS_strh; + } + + emitter->emitIns_R_R_I(ins, EA_SIZE(regSizeBytes), tempReg, addrReg, offset); + } + +private: + const regNumber intReg1; + const regNumber intReg2; + const regNumber simdReg1; + const regNumber simdReg2; + const regNumber addrReg; + emitter* const emitter; +}; + +class InitBlockUnrollHelper +{ +public: + InitBlockUnrollHelper(int dstOffset, int byteCount) : dstStartOffset(dstOffset), byteCount(byteCount) + { + } + + int GetDstOffset() const + { + return dstStartOffset; + } + + void SetDstOffset(int dstOffset) + { + dstStartOffset = dstOffset; + } + + bool CanEncodeAllOffsets(int regSizeBytes) const + { + VerifyingStream instrStream; + UnrollInitBlock(instrStream, regSizeBytes); + + return instrStream.CanEncodeAllStores(); + } + + int InstructionCount(int regSizeBytes) const + { + CountingStream instrStream; + UnrollInitBlock(instrStream, regSizeBytes); + + return 
instrStream.InstructionCount(); + } + + void Unroll(int regSizeBytes, regNumber intReg, regNumber simdReg, regNumber addrReg, emitter* emitter) const + { + ProducingStream instrStream(intReg, intReg, simdReg, simdReg, addrReg, emitter); + UnrollInitBlock(instrStream, regSizeBytes); + } + +private: + template + void UnrollInitBlock(InstructionStream& instrStream, int regSizeBytes) const + { + assert((regSizeBytes == 8) || (regSizeBytes == 16)); + + int offset = dstStartOffset; + const int endOffset = offset + byteCount; + + const int storePairRegsAlignment = regSizeBytes; + const int storePairRegsWritesBytes = 2 * regSizeBytes; + + const int offsetAligned = StoreBlockUnrollHelper::AlignUp(offset, storePairRegsAlignment); + const int storePairRegsInstrCount = (endOffset - offsetAligned) / storePairRegsWritesBytes; + + if (storePairRegsInstrCount > 0) + { + if (offset != offsetAligned) + { + const int firstRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(offsetAligned - offset); + instrStream.StoreReg(offset, firstRegSizeBytes); + offset = offsetAligned; + } + + while (endOffset - offset >= storePairRegsWritesBytes) + { + instrStream.StorePairRegs(offset, regSizeBytes); + offset += storePairRegsWritesBytes; + } + + if (endOffset - offset >= regSizeBytes) + { + instrStream.StoreReg(offset, regSizeBytes); + offset += regSizeBytes; + } + + if (offset != endOffset) + { + const int lastRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endOffset - offset); + instrStream.StoreReg(endOffset - lastRegSizeBytes, lastRegSizeBytes); + } + } + else + { + bool isSafeToWriteBehind = false; + + while (endOffset - offset >= regSizeBytes) + { + instrStream.StoreReg(offset, regSizeBytes); + offset += regSizeBytes; + isSafeToWriteBehind = true; + } + + assert(endOffset - offset < regSizeBytes); + + while (offset != endOffset) + { + if (isSafeToWriteBehind) + { + assert(endOffset - offset < regSizeBytes); + const int lastRegSizeBytes = 
StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endOffset - offset); + instrStream.StoreReg(endOffset - lastRegSizeBytes, lastRegSizeBytes); + break; + } + + if (offset + regSizeBytes > endOffset) + { + regSizeBytes = regSizeBytes / 2; + } + else + { + instrStream.StoreReg(offset, regSizeBytes); + offset += regSizeBytes; + isSafeToWriteBehind = true; + } + } + } + } + + int dstStartOffset; + const int byteCount; +}; + +#endif // TARGET_ARM64 + //---------------------------------------------------------------------------------- // genCodeForInitBlkUnroll: Generate unrolled block initialization code. // @@ -1947,19 +2309,70 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) assert(dstOffset < INT32_MAX - static_cast(size)); #ifdef TARGET_ARM64 - for (unsigned regSize = 2 * REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize) + InitBlockUnrollHelper helper(dstOffset, size); + + regNumber dstReg = dstAddrBaseReg; + int dstRegAddrAlignment = -1; + + if (dstLclNum != BAD_VAR_NUM) { - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_S_R_R(INS_stp, EA_8BYTE, EA_8BYTE, srcReg, srcReg, dstLclNum, dstOffset); - } - else + bool FPbased; + const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &FPbased); + + dstReg = FPbased ? REG_FPBASE : REG_SPBASE; + dstRegAddrAlignment = FPbased ? 
(genSPtoFPdelta() % 16) : 0; + + helper.SetDstOffset(baseAddr + dstOffset); + } + + if (!helper.CanEncodeAllOffsets(REGSIZE_BYTES)) + { + const regNumber tempReg = rsGetRsvdReg(); + + int dstOffsetAdj = helper.GetDstOffset(); + + if (dstRegAddrAlignment != -1) { - emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, srcReg, srcReg, dstAddrBaseReg, dstOffset); + dstOffsetAdj = helper.GetDstOffset() - dstRegAddrAlignment; + dstRegAddrAlignment = 0; } + + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg, dstReg, dstOffsetAdj, tempReg); + dstReg = tempReg; + + helper.SetDstOffset(helper.GetDstOffset() - dstOffsetAdj); } -#endif + const bool hasAvailableSimdReg = (node->AvailableTempRegCount(RBM_ALLFLOAT) != 0); + const bool canUse16ByteWideInstrs = hasAvailableSimdReg && helper.CanEncodeAllOffsets(FP_REGSIZE_BYTES); + + bool shouldUse16ByteWideInstrs = false; + + // Store operations that cross a 16-byte boundary reduce bandwidth or incur additional latency. + if (canUse16ByteWideInstrs && (dstRegAddrAlignment != -1) && + (((dstRegAddrAlignment + helper.GetDstOffset()) % 16) == 0)) + { + shouldUse16ByteWideInstrs = + ((helper.InstructionCount(FP_REGSIZE_BYTES) + 1) < helper.InstructionCount(REGSIZE_BYTES)); + } + + const regNumber intReg = srcReg; + regNumber simdReg = REG_NA; + int regSizeBytes = REGSIZE_BYTES; + + if (shouldUse16ByteWideInstrs) + { + simdReg = node->GetSingleTempReg(RBM_ALLFLOAT); + regSizeBytes = FP_REGSIZE_BYTES; + + const int initValue = (src->AsIntCon()->IconValue() & 0xFF); + emit->emitIns_R_I(INS_movi, EA_16BYTE, simdReg, initValue, INS_OPTS_16B); + } + + helper.Unroll(regSizeBytes, intReg, simdReg, dstReg, GetEmitter()); +#endif // TARGET_ARM64 + +#ifdef TARGET_ARM for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize) { while (regSize > size) @@ -1981,9 +2394,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) attr = EA_4BYTE; break; case 4: -#ifdef TARGET_ARM64 - case 8: -#endif storeIns = INS_str; attr = 
EA_ATTR(regSize); break; @@ -2000,6 +2410,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emit->emitIns_R_R_I(storeIns, attr, srcReg, dstAddrBaseReg, dstOffset); } } +#endif // TARGET_ARM } //---------------------------------------------------------------------------------- diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index c37e4a328d731..b6cbb1a86cb65 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -619,6 +619,12 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) switch (blkNode->gtBlkOpKind) { case GenTreeBlk::BlkOpKindUnroll: +#ifdef TARGET_ARM64 + if (size > 2 * FP_REGSIZE_BYTES) + { + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + } +#endif // TARGET_ARM64 break; case GenTreeBlk::BlkOpKindHelper: From d6520141fc135d0f7bc28c24b1853b053d265f27 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 9 Nov 2021 17:48:41 -0800 Subject: [PATCH 02/16] Use SIMD operations in CodeGen::genCodeForCpBlkUnroll() --- src/coreclr/jit/codegenarmarch.cpp | 396 ++++++++++++++++++++++++++--- src/coreclr/jit/lsraarmarch.cpp | 3 + 2 files changed, 369 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 096a41e89b08e..017f6297f3d65 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2239,6 +2239,185 @@ class InitBlockUnrollHelper const int byteCount; }; +class CopyBlockUnrollHelper +{ +public: + CopyBlockUnrollHelper(int srcOffset, int dstOffset, int byteCount) + : srcStartOffset(srcOffset), dstStartOffset(dstOffset), byteCount(byteCount) + { + } + + int GetSrcOffset() const + { + return srcStartOffset; + } + + int GetDstOffset() const + { + return dstStartOffset; + } + + void SetSrcOffset(int srcOffset) + { + srcStartOffset = srcOffset; + } + + void SetDstOffset(int dstOffset) + { + dstStartOffset = dstOffset; + } + + int InstructionCount(int 
regSizeBytes) const + { + CountingStream instrStream; + UnrollCopyBlock(instrStream, instrStream, regSizeBytes); + + return instrStream.InstructionCount(); + } + + bool CanEncodeAllOffsets(int regSizeBytes) const + { + bool canEncodeAllLoads = true; + bool canEncodeAllStores = true; + + TryEncodeAllOffsets(regSizeBytes, &canEncodeAllLoads, &canEncodeAllStores); + return canEncodeAllLoads && canEncodeAllStores; + } + + void TryEncodeAllOffsets(int regSizeBytes, bool* pCanEncodeAllLoads, bool* pCanEncodeAllStores) const + { + assert(pCanEncodeAllLoads != nullptr); + assert(pCanEncodeAllStores != nullptr); + + VerifyingStream instrStream; + UnrollCopyBlock(instrStream, instrStream, regSizeBytes); + + *pCanEncodeAllLoads = instrStream.CanEncodeAllLoads(); + *pCanEncodeAllStores = instrStream.CanEncodeAllStores(); + } + + void Unroll(int regSizeBytes, + regNumber intReg1, + regNumber intReg2, + regNumber simdReg1, + regNumber simdReg2, + regNumber srcAddrReg, + regNumber dstAddrReg, + emitter* emitter) const + { + ProducingStream loadStream(intReg1, intReg2, simdReg1, simdReg2, srcAddrReg, emitter); + ProducingStream storeStream(intReg1, intReg2, simdReg1, simdReg2, dstAddrReg, emitter); + UnrollCopyBlock(loadStream, storeStream, regSizeBytes); + } + +private: + template + void UnrollCopyBlock(InstructionStream& loadStream, InstructionStream& storeStream, int regSizeBytes) const + { + assert((regSizeBytes == 8) || (regSizeBytes == 16)); + + int srcOffset = srcStartOffset; + int dstOffset = dstStartOffset; + + const int endSrcOffset = srcOffset + byteCount; + const int endDstOffset = dstOffset + byteCount; + + const int storePairRegsAlignment = regSizeBytes; + const int storePairRegsWritesBytes = 2 * regSizeBytes; + + const int dstOffsetAligned = StoreBlockUnrollHelper::AlignUp(dstOffset, storePairRegsAlignment); + + if (endDstOffset - dstOffsetAligned >= storePairRegsWritesBytes) + { + const int dstBytesToAlign = dstOffsetAligned - dstOffset; + + if (dstBytesToAlign != 
0) + { + const int firstRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(dstBytesToAlign); + + loadStream.LoadReg(srcOffset, firstRegSizeBytes); + storeStream.StoreReg(dstOffset, firstRegSizeBytes); + + srcOffset = srcOffset + dstBytesToAlign; + dstOffset = dstOffsetAligned; + } + + while (endDstOffset - dstOffset >= storePairRegsWritesBytes) + { + loadStream.LoadPairRegs(srcOffset, regSizeBytes); + storeStream.StorePairRegs(dstOffset, regSizeBytes); + + srcOffset += storePairRegsWritesBytes; + dstOffset += storePairRegsWritesBytes; + } + + if (endDstOffset - dstOffset >= regSizeBytes) + { + loadStream.LoadReg(srcOffset, regSizeBytes); + storeStream.StoreReg(dstOffset, regSizeBytes); + + srcOffset += regSizeBytes; + dstOffset += regSizeBytes; + } + + if (dstOffset != endDstOffset) + { + const int lastRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endDstOffset - dstOffset); + + loadStream.LoadReg(endSrcOffset - lastRegSizeBytes, lastRegSizeBytes); + storeStream.StoreReg(endDstOffset - lastRegSizeBytes, lastRegSizeBytes); + } + } + else + { + bool isSafeToWriteBehind = false; + + while (endDstOffset - dstOffset >= regSizeBytes) + { + loadStream.LoadReg(srcOffset, regSizeBytes); + storeStream.StoreReg(dstOffset, regSizeBytes); + + srcOffset += regSizeBytes; + dstOffset += regSizeBytes; + isSafeToWriteBehind = true; + } + + assert(endSrcOffset - srcOffset < regSizeBytes); + + while (dstOffset != endDstOffset) + { + if (isSafeToWriteBehind) + { + const int lastRegSizeBytes = + StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endDstOffset - dstOffset); + + loadStream.LoadReg(endSrcOffset - lastRegSizeBytes, lastRegSizeBytes); + storeStream.StoreReg(endDstOffset - lastRegSizeBytes, lastRegSizeBytes); + break; + } + + if (dstOffset + regSizeBytes > endDstOffset) + { + regSizeBytes = regSizeBytes / 2; + } + else + { + loadStream.LoadReg(srcOffset, regSizeBytes); + storeStream.StoreReg(dstOffset, regSizeBytes); + + srcOffset += regSizeBytes; + 
dstOffset += regSizeBytes; + isSafeToWriteBehind = true; + } + } + } + } + + int srcStartOffset; + int dstStartOffset; + const int byteCount; +}; + #endif // TARGET_ARM64 //---------------------------------------------------------------------------------- @@ -2441,15 +2620,6 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) } else { - // TODO-ARM-CQ: If the local frame offset is too large to be encoded, the emitter automatically - // loads the offset into a reserved register (see CodeGen::rsGetRsvdReg()). If we generate - // multiple store instructions we'll also generate multiple offset loading instructions. - // We could try to detect such cases, compute the base destination address in this reserved - // and use it in all store instructions we generate. This would effectively undo the effect - // of local address containment done by lowering. - // - // The same issue also occurs in source address case below and in genCodeForInitBlkUnroll. - assert(dstAddr->OperIsLocalAddr()); dstLclNum = dstAddr->AsLclVarCommon()->GetLclNum(); dstOffset = dstAddr->AsLclVarCommon()->GetLclOffs(); @@ -2505,34 +2675,202 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) regNumber tempReg = node->ExtractTempReg(RBM_ALLINT); #ifdef TARGET_ARM64 - if (size >= 2 * REGSIZE_BYTES) + CopyBlockUnrollHelper helper(srcOffset, dstOffset, size); + + regNumber srcReg = srcAddrBaseReg; + int srcRegAddrAlignment = -1; + + if (srcLclNum != BAD_VAR_NUM) + { + bool FPbased; + const int baseAddr = compiler->lvaFrameAddress(srcLclNum, &FPbased); + + srcReg = FPbased ? REG_FPBASE : REG_SPBASE; + srcRegAddrAlignment = FPbased ? (genSPtoFPdelta() % 16) : 0; + + helper.SetSrcOffset(baseAddr + srcOffset); + } + + regNumber dstReg = dstAddrBaseReg; + int dstRegAddrAlignment = -1; + + if (dstLclNum != BAD_VAR_NUM) { - regNumber tempReg2 = node->ExtractTempReg(RBM_ALLINT); + bool FPbased; + const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &FPbased); + + dstReg = FPbased ? 
REG_FPBASE : REG_SPBASE; + dstRegAddrAlignment = FPbased ? (genSPtoFPdelta() % 16) : 0; + + helper.SetDstOffset(baseAddr + dstOffset); + } + + bool canEncodeAllLoads = true; + bool canEncodeAllStores = true; + helper.TryEncodeAllOffsets(REGSIZE_BYTES, &canEncodeAllLoads, &canEncodeAllStores); + + srcOffset = helper.GetSrcOffset(); + dstOffset = helper.GetDstOffset(); - for (unsigned regSize = 2 * REGSIZE_BYTES; size >= regSize; - size -= regSize, srcOffset += regSize, dstOffset += regSize) + int srcOffsetAdj = 0; + int dstOffsetAdj = 0; + + if (!canEncodeAllLoads && !canEncodeAllStores) + { + srcOffsetAdj = srcOffset; + dstOffsetAdj = dstOffset; + } + else if (!canEncodeAllLoads) + { + srcOffsetAdj = srcOffset - dstOffset; + } + else if (!canEncodeAllStores) + { + dstOffsetAdj = dstOffset - srcOffset; + } + + helper.SetSrcOffset(srcOffset - srcOffsetAdj); + helper.SetDstOffset(dstOffset - dstOffsetAdj); + + bool shouldUse16ByteWideInstrs = false; + + // Quad-word load operations that are not 16-byte aligned, and store operations that cross a 16-byte boundary + // can reduce bandwidth or incur additional latency. + // Therefore, the JIT would attempt to use 16-byte variants of such instructions when both conditions are met: + // 1) the base address stored in dstReg has known alignment (modulo 16 bytes) and + // 2) the base address stored in srcReg has the same alignment as the address in dstReg. + // + // When both addresses are 16-byte aligned the CopyBlock instruction sequence looks like + // + // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset] + // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset] + // ldp Q_simdReg1, Q_simdReg2, [srcReg, #dstOffset+32] + // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+32] + // ... + // + // When both addresses are not 16-byte aligned the CopyBlock instruction sequence starts with padding + // str instruction. 
For example, when both addresses are 8-byte aligned the instruction sequence looks like + // + // ldr D_simdReg1, [srcReg, #srcOffset] + // str D_simdReg1, [dstReg, #dstOffset] + // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+8] + // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+8] + // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+40] + // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+40] + // ... + + if ((dstRegAddrAlignment != -1) && (srcRegAddrAlignment == dstRegAddrAlignment)) + { + bool canEncodeAll16ByteWideLoads = false; + bool canEncodeAll16ByteWideStores = false; + helper.TryEncodeAllOffsets(FP_REGSIZE_BYTES, &canEncodeAll16ByteWideLoads, &canEncodeAll16ByteWideStores); + + if (canEncodeAll16ByteWideLoads && canEncodeAll16ByteWideStores) { - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_R_S_S(INS_ldp, EA_8BYTE, EA_8BYTE, tempReg, tempReg2, srcLclNum, srcOffset); - } - else - { - emit->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tempReg, tempReg2, srcAddrBaseReg, srcOffset); - } + // No further adjustments for srcOffset and dstOffset are needed. + // The JIT should use 16-byte loads and stores when the resulting sequence has fewer number of instructions. - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_S_R_R(INS_stp, EA_8BYTE, EA_8BYTE, tempReg, tempReg2, dstLclNum, dstOffset); - } - else + shouldUse16ByteWideInstrs = + (helper.InstructionCount(FP_REGSIZE_BYTES) < helper.InstructionCount(REGSIZE_BYTES)); + } + else if (canEncodeAllLoads && canEncodeAllStores && + (canEncodeAll16ByteWideLoads || canEncodeAll16ByteWideStores)) + { + // In order to use 16-byte instructions the JIT needs to adjust either srcOffset or dstOffset. + // The JIT should use 16-byte loads and stores when the resulting sequence (incl. an additional add + // instruction) + // has fewer number of instructions. 
+ + if (helper.InstructionCount(FP_REGSIZE_BYTES) + 1 < helper.InstructionCount(REGSIZE_BYTES)) { - emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, tempReg, tempReg2, dstAddrBaseReg, dstOffset); + shouldUse16ByteWideInstrs = true; + + if (!canEncodeAll16ByteWideLoads) + { + srcOffsetAdj = srcOffset - dstOffset; + } + else + { + dstOffsetAdj = dstOffset - srcOffset; + } + + helper.SetSrcOffset(srcOffset - srcOffsetAdj); + helper.SetDstOffset(dstOffset - dstOffsetAdj); } } } + +#ifdef DEBUG + if (shouldUse16ByteWideInstrs) + { + assert(helper.CanEncodeAllOffsets(FP_REGSIZE_BYTES)); + } + else + { + assert(helper.CanEncodeAllOffsets(REGSIZE_BYTES)); + } #endif + regNumber intReg1 = REG_NA; + regNumber intReg2 = REG_NA; + + // On Arm64 an internal integer register is always allocated for a CopyBlock node + // in addition to a reserved register. + // However, if it is determined that both srcOffset and dstOffset need adjustments + // than both registers will be used for storing the adjusted addresses. + // In that case, the JIT uses SIMD register(s) and instructions for copying. 
+ + const regNumber tempReg1 = rsGetRsvdReg(); + const regNumber tempReg2 = tempReg; + + if ((srcOffsetAdj != 0) && (dstOffsetAdj != 0)) + { + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, srcReg, srcOffsetAdj, tempReg1); + srcReg = tempReg1; + + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg2, dstReg, dstOffsetAdj, tempReg2); + dstReg = tempReg2; + + if (size >= 2 * REGSIZE_BYTES) + { + intReg1 = node->ExtractTempReg(RBM_ALLINT); + } + } + else + { + if (srcOffsetAdj != 0) + { + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, srcReg, srcOffsetAdj, tempReg1); + srcReg = tempReg1; + } + else if (dstOffsetAdj != 0) + { + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, dstReg, dstOffsetAdj, tempReg1); + dstReg = tempReg1; + } + + intReg1 = tempReg2; + + if (size >= 2 * REGSIZE_BYTES) + { + intReg2 = node->ExtractTempReg(RBM_ALLINT); + } + } + + const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); + regNumber simdReg2 = REG_NA; + + if (size >= FP_REGSIZE_BYTES) + { + simdReg2 = node->ExtractTempReg(RBM_ALLFLOAT); + } + + const int regSizeBytes = shouldUse16ByteWideInstrs ? 
FP_REGSIZE_BYTES : REGSIZE_BYTES; + + helper.Unroll(regSizeBytes, intReg1, intReg2, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); +#endif // TARGET_ARM64 + +#ifdef TARGET_ARM for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) { while (regSize > size) @@ -2557,9 +2895,6 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) attr = EA_4BYTE; break; case 4: -#ifdef TARGET_ARM64 - case 8: -#endif loadIns = INS_ldr; storeIns = INS_str; attr = EA_ATTR(regSize); @@ -2586,6 +2921,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) emit->emitIns_R_R_I(storeIns, attr, tempReg, dstAddrBaseReg, dstOffset); } } +#endif // TARGET_ARM if (node->IsVolatile()) { diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index b6cbb1a86cb65..acdd2d7a39288 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -681,11 +681,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) case GenTreeBlk::BlkOpKindUnroll: buildInternalIntRegisterDefForNode(blkNode); #ifdef TARGET_ARM64 + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + if (size >= 2 * REGSIZE_BYTES) { // We will use ldp/stp to reduce code size and improve performance // so we need to reserve an extra internal register buildInternalIntRegisterDefForNode(blkNode); + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); } #endif break; From c3566b1823a4092831de05daf3e55a89b36c4889 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Wed, 1 Dec 2021 17:19:19 -0800 Subject: [PATCH 03/16] Remove restrictions on offset in Lowering::ContainBlockStoreAddress() --- src/coreclr/jit/lowerarmarch.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 345edc402cc3f..c1fd3ddf1984b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ 
b/src/coreclr/jit/lowerarmarch.cpp @@ -442,24 +442,14 @@ void Lowering::ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenT GenTreeIntCon* offsetNode = addr->AsOp()->gtGetOp2()->AsIntCon(); ssize_t offset = offsetNode->IconValue(); - // All integer load/store instructions on both ARM32 and ARM64 support - // offsets in range -255..255. Of course, this is a rather conservative - // check. For example, if the offset and size are a multiple of 8 we - // could allow a combined offset of up to 32760 on ARM64. +#ifdef TARGET_ARM + // All integer load/store instructions on Arm support offsets in range -255..255. + // Of course, this is a rather conservative check. if ((offset < -255) || (offset > 255) || (offset + static_cast(size) > 256)) { return; } - -#ifdef TARGET_ARM64 - // If we're going to use LDP/STP we need to ensure that the offset is - // a multiple of 8 since these instructions do not have an unscaled - // offset variant. - if ((size >= 2 * REGSIZE_BYTES) && (offset % REGSIZE_BYTES != 0)) - { - return; - } -#endif +#endif // TARGET_ARM if (!IsSafeToContainMem(blkNode, addr)) { From c7d9d085103272bcbbb3fb9c6156841447227c69 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 20 Dec 2021 16:01:52 -0800 Subject: [PATCH 04/16] Remove unused macros in src/coreclr/jit/instr.h --- src/coreclr/jit/instr.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 1e9302cf503e8..89bb5da348949 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -321,7 +321,6 @@ enum emitAttr : unsigned #define EA_ATTR(x) ((emitAttr)(x)) #define EA_SIZE(x) ((emitAttr)(((unsigned)(x)) & EA_SIZE_MASK)) #define EA_SIZE_IN_BYTES(x) ((UNATIVE_OFFSET)(EA_SIZE(x))) -#define EA_SET_SIZE(x, sz) ((emitAttr)((((unsigned)(x)) & ~EA_SIZE_MASK) | (sz))) #define EA_SET_FLG(x, flg) ((emitAttr)(((unsigned)(x)) | (flg))) #define EA_REMOVE_FLG(x, flg) ((emitAttr)(((unsigned)(x)) & ~(flg))) #define EA_4BYTE_DSP_RELOC 
(EA_SET_FLG(EA_4BYTE, EA_DSP_RELOC_FLG)) @@ -336,8 +335,6 @@ enum emitAttr : unsigned #define EA_IS_RELOC(x) (EA_IS_DSP_RELOC(x) || EA_IS_CNS_RELOC(x)) #define EA_TYPE(x) ((emitAttr)(((unsigned)(x)) & ~(EA_OFFSET_FLG | EA_DSP_RELOC_FLG | EA_CNS_RELOC_FLG))) -#define EmitSize(x) (EA_ATTR(genTypeSize(TypeGet(x)))) - // clang-format on /*****************************************************************************/ From 52e22d7875aa74ba8a3d89ab1ec6b8eed6ad8754 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 16 Dec 2021 10:52:25 -0800 Subject: [PATCH 05/16] Rename FPbased -> fpBased in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 017f6297f3d65..1ffbbfed63d74 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2495,11 +2495,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) if (dstLclNum != BAD_VAR_NUM) { - bool FPbased; - const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &FPbased); + bool fpBased; + const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased); - dstReg = FPbased ? REG_FPBASE : REG_SPBASE; - dstRegAddrAlignment = FPbased ? (genSPtoFPdelta() % 16) : 0; + dstReg = fpBased ? REG_FPBASE : REG_SPBASE; + dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0; helper.SetDstOffset(baseAddr + dstOffset); } @@ -2682,11 +2682,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (srcLclNum != BAD_VAR_NUM) { - bool FPbased; - const int baseAddr = compiler->lvaFrameAddress(srcLclNum, &FPbased); + bool fpBased; + const int baseAddr = compiler->lvaFrameAddress(srcLclNum, &fpBased); - srcReg = FPbased ? REG_FPBASE : REG_SPBASE; - srcRegAddrAlignment = FPbased ? (genSPtoFPdelta() % 16) : 0; + srcReg = fpBased ? REG_FPBASE : REG_SPBASE; + srcRegAddrAlignment = fpBased ? 
(genSPtoFPdelta() % 16) : 0; helper.SetSrcOffset(baseAddr + srcOffset); } @@ -2696,11 +2696,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (dstLclNum != BAD_VAR_NUM) { - bool FPbased; - const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &FPbased); + bool fpBased; + const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased); - dstReg = FPbased ? REG_FPBASE : REG_SPBASE; - dstRegAddrAlignment = FPbased ? (genSPtoFPdelta() % 16) : 0; + dstReg = fpBased ? REG_FPBASE : REG_SPBASE; + dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0; helper.SetDstOffset(baseAddr + dstOffset); } From c9be2de644a8c7319640b57fc8429e170b34b433 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 16 Dec 2021 11:03:26 -0800 Subject: [PATCH 06/16] Split Arm32 and Arm64 logic around srcReg in CodeGen::genCodeForInitBlkUnroll() in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 31 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 1ffbbfed63d74..21e2f13c5e2a6 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2453,8 +2453,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) dstOffset = dstAddr->AsLclVarCommon()->GetLclOffs(); } - regNumber srcReg; - GenTree* src = node->Data(); + GenTree* src = node->Data(); if (src->OperIs(GT_INIT_VAL)) { @@ -2462,20 +2461,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) src = src->gtGetOp1(); } - if (!src->isContained()) - { - srcReg = genConsumeReg(src); - } - else - { -#ifdef TARGET_ARM64 - assert(src->IsIntegralConst(0)); - srcReg = REG_ZR; -#else - unreached(); -#endif - } - if (node->IsVolatile()) { instGen_MemoryBarrier(); @@ -2490,6 +2475,18 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #ifdef TARGET_ARM64 InitBlockUnrollHelper helper(dstOffset, size); + regNumber srcReg; + + if 
(!src->isContained()) + { + srcReg = genConsumeReg(src); + } + else + { + assert(src->IsIntegralConst(0)); + srcReg = REG_ZR; + } + regNumber dstReg = dstAddrBaseReg; int dstRegAddrAlignment = -1; @@ -2552,6 +2549,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #endif // TARGET_ARM64 #ifdef TARGET_ARM + const regNumber srcReg = genConsumeReg(src); + for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize) { while (regSize > size) From e7cff76674e99c6950d0c59648fc023c55153e28 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 16 Dec 2021 12:33:55 -0800 Subject: [PATCH 07/16] Change int -> unsigned for InstrCount in CountingStream in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 21e2f13c5e2a6..745370d75fb48 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1909,13 +1909,13 @@ class CountingStream instrCount++; } - int InstructionCount() const + unsigned InstructionCount() const { return instrCount; } private: - int instrCount; + unsigned instrCount; }; class StoreBlockUnrollHelper From fb143b0bd5321c3f6cbd324c089a6c8fdf9c623c Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 16 Dec 2021 12:51:02 -0800 Subject: [PATCH 08/16] Change int -> unsigned for InstrCount in InitBlockUnrollHelper and CopyBlockUnrollHelper in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 745370d75fb48..b9e0bd1147645 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2142,7 +2142,7 @@ class InitBlockUnrollHelper return instrStream.CanEncodeAllStores(); } - int InstructionCount(int regSizeBytes) const + unsigned 
InstructionCount(int regSizeBytes) const { CountingStream instrStream; UnrollInitBlock(instrStream, regSizeBytes); @@ -2267,7 +2267,7 @@ class CopyBlockUnrollHelper dstStartOffset = dstOffset; } - int InstructionCount(int regSizeBytes) const + unsigned InstructionCount(int regSizeBytes) const { CountingStream instrStream; UnrollCopyBlock(instrStream, instrStream, regSizeBytes); From 9eb1f85274b1ff99933af02195e973bb1a78546c Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 16 Dec 2021 16:19:35 -0800 Subject: [PATCH 09/16] Allocate an integer register in LSRA instead of relying on rsGetRsvdReg for InitBlock in src/coreclr/jit/lsraarmarch.cpp --- src/coreclr/jit/lsraarmarch.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index acdd2d7a39288..285c1bbcd3a6e 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -620,12 +620,24 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) { case GenTreeBlk::BlkOpKindUnroll: #ifdef TARGET_ARM64 - if (size > 2 * FP_REGSIZE_BYTES) + { + if (dstAddr->isContained()) + { + // Since the dstAddr is contained the address will be computed in CodeGen. + // This might require an integer register to store the value. + buildInternalIntRegisterDefForNode(blkNode); + } + + const bool isDstRegAddrAlignmentKnown = dstAddr->OperIsLocalAddr(); + + if (isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES)) { + // For larger block sizes CodeGen can choose to use 16-byte SIMD instructions. 
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); } + } #endif // TARGET_ARM64 - break; + break; case GenTreeBlk::BlkOpKindHelper: assert(!src->isContained()); From ad771d3a5c2b0baf5f90c726cee5eb8225aa9b1f Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 17 Dec 2021 12:11:40 -0800 Subject: [PATCH 10/16] Allocate an integer register in LSRA instead of relying on rsGetRsvdReg for CopyBlock in src/coreclr/jit/lsraarmarch.cpp --- src/coreclr/jit/lsraarmarch.cpp | 51 +++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index 285c1bbcd3a6e..aa01f8d0e6a8c 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -691,19 +691,60 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) switch (blkNode->gtBlkOpKind) { case GenTreeBlk::BlkOpKindUnroll: + { buildInternalIntRegisterDefForNode(blkNode); #ifdef TARGET_ARM64 - buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + const bool isSrcRegAddrAlignmentKnown = + src->OperIs(GT_LCL_VAR, GT_LCL_FLD) || + ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr()); + const bool isDstRegAddrAlignmentKnown = dstAddr->OperIsLocalAddr(); + + bool willUseSimdInstrs = false; + + if (isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown) + { + // The following allocates an additional integer register in a case + // when neither loads nor stores can be encoded using unsigned instruction offsets. 
+ buildInternalIntRegisterDefForNode(blkNode); - if (size >= 2 * REGSIZE_BYTES) + if (size >= FP_REGSIZE_BYTES) + { + willUseSimdInstrs = true; + } + } + else if (dstAddr->isContained() && (src->OperIs(GT_LCL_VAR, GT_LCL_FLD) || + (srcAddrOrFill != nullptr) && srcAddrOrFill->isContained())) { - // We will use ldp/stp to reduce code size and improve performance - // so we need to reserve an extra internal register + // Both srcAddr and dstAddr are contained - the addresses will be computed in CodeGen. + // This might take up to two integer registers to store both values. + // The following allocates an additional integer register to store an address. buildInternalIntRegisterDefForNode(blkNode); + } + + // The following allocates temporary register(s) for copying from source to destination. + + if (willUseSimdInstrs) + { + // Note that the following allocates a pair of SIMD registers. + // CodeGen might end up needing both when two integer registers (allocated above) + // are occupied with source and destination addresses. + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); } + else + { + buildInternalIntRegisterDefForNode(blkNode); + + // CodeGen will use load-pair/store-pair instructions to reduce code size + // and improve performance. LSRA needs to reserve an extra internal register for these. 
+ if (size >= 2 * REGSIZE_BYTES) + { + buildInternalIntRegisterDefForNode(blkNode); + } + } #endif - break; + } + break; case GenTreeBlk::BlkOpKindHelper: dstAddrRegMask = RBM_ARG_0; From fbee8d60c18a78e9a637c22abd5f1d3c38d130e8 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 20 Dec 2021 13:47:11 -0800 Subject: [PATCH 11/16] Change int -> unsigned for regSizeBytes in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index b9e0bd1147645..3904e3602e0ac 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1889,22 +1889,22 @@ class CountingStream instrCount = 0; } - void LoadPairRegs(int offset, int regSizeBytes) + void LoadPairRegs(int offset, unsigned regSizeBytes) { instrCount++; } - void StorePairRegs(int offset, int regSizeBytes) + void StorePairRegs(int offset, unsigned regSizeBytes) { instrCount++; } - void LoadReg(int offset, int regSizeBytes) + void LoadReg(int offset, unsigned regSizeBytes) { instrCount++; } - void StoreReg(int offset, int regSizeBytes) + void StoreReg(int offset, unsigned regSizeBytes) { instrCount++; } @@ -1960,22 +1960,22 @@ class VerifyingStream canEncodeAllStores = true; } - void LoadPairRegs(int offset, int regSizeBytes) + void LoadPairRegs(int offset, unsigned regSizeBytes) { canEncodeAllLoads = canEncodeAllLoads && CanEncodeLoadOrStorePairRegs(offset, regSizeBytes); } - void StorePairRegs(int offset, int regSizeBytes) + void StorePairRegs(int offset, unsigned regSizeBytes) { canEncodeAllStores = canEncodeAllStores && CanEncodeLoadOrStorePairRegs(offset, regSizeBytes); } - void LoadReg(int offset, int regSizeBytes) + void LoadReg(int offset, unsigned regSizeBytes) { canEncodeAllLoads = canEncodeAllLoads && CanEncodeLoadOrStoreReg(offset, regSizeBytes); } - void StoreReg(int offset, int 
regSizeBytes) + void StoreReg(int offset, unsigned regSizeBytes) { canEncodeAllStores = canEncodeAllStores && CanEncodeLoadOrStoreReg(offset, regSizeBytes); } @@ -1991,17 +1991,17 @@ class VerifyingStream } private: - static bool CanEncodeLoadOrStorePairRegs(int offset, int regSizeBytes) + static bool CanEncodeLoadOrStorePairRegs(int offset, unsigned regSizeBytes) { const bool canEncodeWithSignedOffset = - (offset % regSizeBytes == 0) && (offset >= -64 * regSizeBytes) && (offset < 64 * regSizeBytes); + (offset % regSizeBytes == 0) && (offset >= -64 * (int)regSizeBytes) && (offset < 64 * (int)regSizeBytes); return canEncodeWithSignedOffset; } - static bool CanEncodeLoadOrStoreReg(int offset, int regSizeBytes) + static bool CanEncodeLoadOrStoreReg(int offset, unsigned regSizeBytes) { const bool canEncodeWithUnsignedOffset = - (offset % regSizeBytes == 0) && (offset >= 0) && (offset < regSizeBytes * 4096); + (offset % regSizeBytes == 0) && (offset >= 0) && (offset < (int)regSizeBytes * 4096); const bool canEncodeWithUnscaledOffset = (offset >= -256) && (offset <= 255); return canEncodeWithUnsignedOffset || canEncodeWithUnscaledOffset; @@ -2024,7 +2024,7 @@ class ProducingStream { } - void LoadPairRegs(int offset, int regSizeBytes) + void LoadPairRegs(int offset, unsigned regSizeBytes) { assert((regSizeBytes == 8) || (regSizeBytes == 16)); @@ -2045,7 +2045,7 @@ class ProducingStream emitter->emitIns_R_R_R_I(INS_ldp, EA_SIZE(regSizeBytes), tempReg1, tempReg2, addrReg, offset); } - void StorePairRegs(int offset, int regSizeBytes) + void StorePairRegs(int offset, unsigned regSizeBytes) { assert((regSizeBytes == 8) || (regSizeBytes == 16)); @@ -2066,7 +2066,7 @@ class ProducingStream emitter->emitIns_R_R_R_I(INS_stp, EA_SIZE(regSizeBytes), tempReg1, tempReg2, addrReg, offset); } - void LoadReg(int offset, int regSizeBytes) + void LoadReg(int offset, unsigned regSizeBytes) { instruction ins = INS_ldr; regNumber tempReg = intReg1; @@ -2087,7 +2087,7 @@ class ProducingStream 
emitter->emitIns_R_R_I(ins, EA_SIZE(regSizeBytes), tempReg, addrReg, offset); } - void StoreReg(int offset, int regSizeBytes) + void StoreReg(int offset, unsigned regSizeBytes) { instruction ins = INS_str; regNumber tempReg = intReg1; From dbc75b52efe7a6334b74f07ede022379ff13ca53 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 20 Dec 2021 16:03:27 -0800 Subject: [PATCH 12/16] Move encoding-related logic to the emitter - add emitter::canEncodeLoadOrStorePairOffset() helper in src/coreclr/jit/emitarm64.h src/coreclr/jit/emitarm64.cpp --- src/coreclr/jit/emitarm64.cpp | 8 ++++++++ src/coreclr/jit/emitarm64.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 004871baf9fe2..eb18e345d528e 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -2359,6 +2359,14 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) return false; // not encodable } +// true if 'imm' can be encoded as an offset in a ldp/stp instruction +/*static*/ bool emitter::canEncodeLoadOrStorePairOffset(INT64 imm, emitAttr attr) +{ + assert((attr == EA_4BYTE) || (attr == EA_8BYTE) || (attr == EA_16BYTE)); + const int size = EA_SIZE_IN_BYTES(attr); + return (imm % size == 0) && (imm >= -64 * size) && (imm < 64 * size); +} + /************************************************************************ * * A helper method to return the natural scale for an EA 'size' diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 05909f6d2e175..692f4b9a18aa5 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -485,6 +485,9 @@ static bool emitIns_valid_imm_for_alu(INT64 imm, emitAttr size); // true if this 'imm' can be encoded as the offset in a ldr/str instruction static bool emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr size); +// true if 'imm' can be encoded as an offset in a ldp/stp instruction +static bool 
canEncodeLoadOrStorePairOffset(INT64 imm, emitAttr size); + // true if 'imm' can use the left shifted by 12 bits encoding static bool canEncodeWithShiftImmBy12(INT64 imm); From dcf079965dc77218d98e54299d6a8dd2b85ee6ce Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 20 Dec 2021 18:02:57 -0800 Subject: [PATCH 13/16] Re-structure Streams and Helpers and describe the design in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 418 +++++++++++++++++------------ 1 file changed, 251 insertions(+), 167 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 3904e3602e0ac..aa4491299b855 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1881,6 +1881,45 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode) #ifdef TARGET_ARM64 +// The following classes +// - InitBlockUnrollHelper +// - CopyBlockUnrollHelper +// encapsulate algorithms that produce instruction sequences for inlined equivalents of memset() and memcpy() functions. +// +// Each class has a private template function that accepts an "InstructionStream" as a template class argument: +// - InitBlockUnrollHelper::UnrollInitBlock(startDstOffset, byteCount, initValue) +// - CopyBlockUnrollHelper::UnrollCopyBlock(startSrcOffset, startDstOffset, byteCount) +// +// The design goal is to separate optimization approaches implemented by the algorithms +// from the target platform specific details. +// +// InstructionStream is a "stream" of load/store instructions (i.e. ldr/ldp/str/stp) that represents an instruction +// sequence that will initialize a memory region with some value or copy values from one memory region to another. 
+//
+// As far as UnrollInitBlock and UnrollCopyBlock are concerned, InstructionStream implements the following class member
+// functions:
+// - LoadPairRegs(offset, regSizeBytes)
+// - StorePairRegs(offset, regSizeBytes)
+// - LoadReg(offset, regSizeBytes)
+// - StoreReg(offset, regSizeBytes)
+//
+// There are three implementations of InstructionStream:
+// - CountingStream that counts how many instructions were pushed out of the stream
+// - VerifyingStream that validates that all the instructions in the stream are encodable on Arm64
+// - ProducingStream that maps the function to corresponding emitter functions
+//
+// The idea behind the design is that the decision regarding what instruction sequence to emit
+// (scalar instructions vs. SIMD instructions) is made by executing an algorithm producing an instruction sequence
+// while counting the number of produced instructions and verifying that all the instructions are encodable.
+//
+// For example, using SIMD instructions might produce a shorter sequence but require "spilling" a value of a starting
+// address
+// to an integer register (due to stricter offset alignment rules for 16-byte wide SIMD instructions).
+// Thus the CodeGen can take this fact into account before emitting an instruction sequence.
+//
+// An alternative design might have had VerifyingStream and ProducingStream fused into one class
+// that would allow undoing an instruction if the sequence is not fully encodable.
+ class CountingStream { public: @@ -1918,39 +1957,6 @@ class CountingStream unsigned instrCount; }; -class StoreBlockUnrollHelper -{ -public: - static int AlignUp(int offset, int alignment) - { - assert(((alignment - 1) & alignment) == 0); - return (offset + alignment - 1) & (-alignment); - } - - static int GetRegSizeAtLeastBytes(int byteCount) - { - assert(byteCount > 0); - assert(byteCount < 16); - - int regSizeBytes = byteCount; - - if (byteCount > 8) - { - regSizeBytes = 16; - } - else if (byteCount > 4) - { - regSizeBytes = 8; - } - else if (byteCount > 2) - { - regSizeBytes = 4; - } - - return regSizeBytes; - } -}; - class VerifyingStream { public: @@ -1962,22 +1968,25 @@ class VerifyingStream void LoadPairRegs(int offset, unsigned regSizeBytes) { - canEncodeAllLoads = canEncodeAllLoads && CanEncodeLoadOrStorePairRegs(offset, regSizeBytes); + canEncodeAllLoads = canEncodeAllLoads && emitter::canEncodeLoadOrStorePairOffset(offset, EA_SIZE(regSizeBytes)); } void StorePairRegs(int offset, unsigned regSizeBytes) { - canEncodeAllStores = canEncodeAllStores && CanEncodeLoadOrStorePairRegs(offset, regSizeBytes); + canEncodeAllStores = + canEncodeAllStores && emitter::canEncodeLoadOrStorePairOffset(offset, EA_SIZE(regSizeBytes)); } void LoadReg(int offset, unsigned regSizeBytes) { - canEncodeAllLoads = canEncodeAllLoads && CanEncodeLoadOrStoreReg(offset, regSizeBytes); + canEncodeAllLoads = + canEncodeAllLoads && emitter::emitIns_valid_imm_for_ldst_offset(offset, EA_SIZE(regSizeBytes)); } void StoreReg(int offset, unsigned regSizeBytes) { - canEncodeAllStores = canEncodeAllStores && CanEncodeLoadOrStoreReg(offset, regSizeBytes); + canEncodeAllStores = + canEncodeAllStores && emitter::emitIns_valid_imm_for_ldst_offset(offset, EA_SIZE(regSizeBytes)); } bool CanEncodeAllLoads() const @@ -1991,97 +2000,118 @@ class VerifyingStream } private: - static bool CanEncodeLoadOrStorePairRegs(int offset, unsigned regSizeBytes) - { - const bool canEncodeWithSignedOffset = - 
(offset % regSizeBytes == 0) && (offset >= -64 * (int)regSizeBytes) && (offset < 64 * (int)regSizeBytes); - return canEncodeWithSignedOffset; - } - - static bool CanEncodeLoadOrStoreReg(int offset, unsigned regSizeBytes) - { - const bool canEncodeWithUnsignedOffset = - (offset % regSizeBytes == 0) && (offset >= 0) && (offset < (int)regSizeBytes * 4096); - const bool canEncodeWithUnscaledOffset = (offset >= -256) && (offset <= 255); - - return canEncodeWithUnsignedOffset || canEncodeWithUnscaledOffset; - } - bool canEncodeAllLoads; bool canEncodeAllStores; }; -class ProducingStream +class ProducingStreamBaseInstrs { public: - ProducingStream(regNumber intReg1, - regNumber intReg2, - regNumber simdReg1, - regNumber simdReg2, - regNumber addrReg, - emitter* emitter) - : intReg1(intReg1), intReg2(intReg2), simdReg1(simdReg1), simdReg2(simdReg2), addrReg(addrReg), emitter(emitter) + ProducingStreamBaseInstrs(regNumber intReg1, regNumber intReg2, regNumber addrReg, emitter* emitter) + : intReg1(intReg1), intReg2(intReg2), addrReg(addrReg), emitter(emitter) { } void LoadPairRegs(int offset, unsigned regSizeBytes) { - assert((regSizeBytes == 8) || (regSizeBytes == 16)); + assert(regSizeBytes == 8); + + emitter->emitIns_R_R_R_I(INS_ldp, EA_SIZE(regSizeBytes), intReg1, intReg2, addrReg, offset); + } - regNumber tempReg1; - regNumber tempReg2; + void StorePairRegs(int offset, unsigned regSizeBytes) + { + assert(regSizeBytes == 8); - if ((regSizeBytes == 16) || (intReg2 == REG_NA)) + emitter->emitIns_R_R_R_I(INS_stp, EA_SIZE(regSizeBytes), intReg1, intReg2, addrReg, offset); + } + + void LoadReg(int offset, unsigned regSizeBytes) + { + instruction ins = INS_ldr; + + if (regSizeBytes == 1) { - tempReg1 = simdReg1; - tempReg2 = simdReg2; + ins = INS_ldrb; } - else + else if (regSizeBytes == 2) { - tempReg1 = intReg1; - tempReg2 = intReg2; + ins = INS_ldrh; } - emitter->emitIns_R_R_R_I(INS_ldp, EA_SIZE(regSizeBytes), tempReg1, tempReg2, addrReg, offset); + 
emitter->emitIns_R_R_I(ins, EA_SIZE(regSizeBytes), intReg1, addrReg, offset); } - void StorePairRegs(int offset, unsigned regSizeBytes) + void StoreReg(int offset, unsigned regSizeBytes) { - assert((regSizeBytes == 8) || (regSizeBytes == 16)); + instruction ins = INS_str; - regNumber tempReg1; - regNumber tempReg2; - - if ((regSizeBytes == 16) || (intReg2 == REG_NA)) + if (regSizeBytes == 1) { - tempReg1 = simdReg1; - tempReg2 = simdReg2; + ins = INS_strb; } - else + else if (regSizeBytes == 2) { - tempReg1 = intReg1; - tempReg2 = intReg2; + ins = INS_strh; } - emitter->emitIns_R_R_R_I(INS_stp, EA_SIZE(regSizeBytes), tempReg1, tempReg2, addrReg, offset); + emitter->emitIns_R_R_I(ins, EA_SIZE(regSizeBytes), intReg1, addrReg, offset); + } + +private: + const regNumber intReg1; + const regNumber intReg2; + const regNumber addrReg; + emitter* const emitter; +}; + +class ProducingStream +{ +public: + ProducingStream(regNumber intReg1, regNumber simdReg1, regNumber simdReg2, regNumber addrReg, emitter* emitter) + : intReg1(intReg1), simdReg1(simdReg1), simdReg2(simdReg2), addrReg(addrReg), emitter(emitter) + { + } + + void LoadPairRegs(int offset, unsigned regSizeBytes) + { + assert((regSizeBytes == 8) || (regSizeBytes == 16)); + + emitter->emitIns_R_R_R_I(INS_ldp, EA_SIZE(regSizeBytes), simdReg1, simdReg2, addrReg, offset); + } + + void StorePairRegs(int offset, unsigned regSizeBytes) + { + assert((regSizeBytes == 8) || (regSizeBytes == 16)); + + emitter->emitIns_R_R_R_I(INS_stp, EA_SIZE(regSizeBytes), simdReg1, simdReg2, addrReg, offset); } void LoadReg(int offset, unsigned regSizeBytes) { - instruction ins = INS_ldr; - regNumber tempReg = intReg1; + instruction ins = INS_ldr; - if ((intReg1 == REG_NA) || (regSizeBytes == 16)) + // Note that 'intReg1' can be unavailable. + // If that is the case, then use SIMD instruction ldr and + // 'simdReg1' as a temporary register. 
+ regNumber tempReg; + + if ((regSizeBytes == 16) || (intReg1 == REG_NA)) { tempReg = simdReg1; } - else if (regSizeBytes == 1) - { - ins = INS_ldrb; - } - else if (regSizeBytes == 2) + else { - ins = INS_ldrh; + tempReg = intReg1; + + if (regSizeBytes == 1) + { + ins = INS_ldrb; + } + else if (regSizeBytes == 2) + { + ins = INS_ldrh; + } } emitter->emitIns_R_R_I(ins, EA_SIZE(regSizeBytes), tempReg, addrReg, offset); @@ -2089,20 +2119,29 @@ class ProducingStream void StoreReg(int offset, unsigned regSizeBytes) { - instruction ins = INS_str; - regNumber tempReg = intReg1; + instruction ins = INS_str; + + // Note that 'intReg1' can be unavailable. + // If that is the case, then use SIMD instruction ldr and + // 'simdReg1' as a temporary register. + regNumber tempReg; - if ((intReg1 == REG_NA) || (regSizeBytes == 16)) + if ((regSizeBytes == 16) || (intReg1 == REG_NA)) { tempReg = simdReg1; } - else if (regSizeBytes == 1) - { - ins = INS_strb; - } - else if (regSizeBytes == 2) + else { - ins = INS_strh; + tempReg = intReg1; + + if (regSizeBytes == 1) + { + ins = INS_strb; + } + else if (regSizeBytes == 2) + { + ins = INS_strh; + } } emitter->emitIns_R_R_I(ins, EA_SIZE(regSizeBytes), tempReg, addrReg, offset); @@ -2110,17 +2149,50 @@ class ProducingStream private: const regNumber intReg1; - const regNumber intReg2; const regNumber simdReg1; const regNumber simdReg2; const regNumber addrReg; emitter* const emitter; }; +class BlockUnrollHelper +{ +public: + // The following function returns a 'size' bytes that + // 1) is greater or equal to 'byteCount' and + // 2) can be read or written by a single instruction on Arm64. + // For example, Arm64 ISA has ldrb/strb and ldrh/strh that + // load/store 1 or 2 bytes, correspondingly. + // However, there are no instructions that can load/store 3 bytes and + // the next "smallest" instruction is ldr/str that operates on 4 byte granularity. 
+ static unsigned GetRegSizeAtLeastBytes(unsigned byteCount) + { + assert(byteCount != 0); + assert(byteCount < 16); + + unsigned regSizeBytes = byteCount; + + if (byteCount > 8) + { + regSizeBytes = 16; + } + else if (byteCount > 4) + { + regSizeBytes = 8; + } + else if (byteCount > 2) + { + regSizeBytes = 4; + } + + return regSizeBytes; + } +}; + class InitBlockUnrollHelper { public: - InitBlockUnrollHelper(int dstOffset, int byteCount) : dstStartOffset(dstOffset), byteCount(byteCount) + InitBlockUnrollHelper(int dstOffset, unsigned byteCount) : dstStartOffset(dstOffset), byteCount(byteCount) { } @@ -2150,51 +2222,57 @@ class InitBlockUnrollHelper return instrStream.InstructionCount(); } - void Unroll(int regSizeBytes, regNumber intReg, regNumber simdReg, regNumber addrReg, emitter* emitter) const + void Unroll(regNumber intReg, regNumber simdReg, regNumber addrReg, emitter* emitter) const { - ProducingStream instrStream(intReg, intReg, simdReg, simdReg, addrReg, emitter); - UnrollInitBlock(instrStream, regSizeBytes); + ProducingStream instrStream(intReg, simdReg, simdReg, addrReg, emitter); + UnrollInitBlock(instrStream, FP_REGSIZE_BYTES); + } + + void UnrollBaseInstrs(regNumber intReg, regNumber addrReg, emitter* emitter) const + { + ProducingStreamBaseInstrs instrStream(intReg, intReg, addrReg, emitter); + UnrollInitBlock(instrStream, REGSIZE_BYTES); } private: template - void UnrollInitBlock(InstructionStream& instrStream, int regSizeBytes) const + void UnrollInitBlock(InstructionStream& instrStream, int initialRegSizeBytes) const { - assert((regSizeBytes == 8) || (regSizeBytes == 16)); + assert((initialRegSizeBytes == 8) || (initialRegSizeBytes == 16)); int offset = dstStartOffset; const int endOffset = offset + byteCount; - const int storePairRegsAlignment = regSizeBytes; - const int storePairRegsWritesBytes = 2 * regSizeBytes; + const int storePairRegsAlignment = initialRegSizeBytes; + const int storePairRegsWritesBytes = 2 * initialRegSizeBytes; - const 
int offsetAligned = StoreBlockUnrollHelper::AlignUp(offset, storePairRegsAlignment); + const int offsetAligned = AlignUp((UINT)offset, storePairRegsAlignment); const int storePairRegsInstrCount = (endOffset - offsetAligned) / storePairRegsWritesBytes; if (storePairRegsInstrCount > 0) { if (offset != offsetAligned) { - const int firstRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(offsetAligned - offset); + const int firstRegSizeBytes = BlockUnrollHelper::GetRegSizeAtLeastBytes(offsetAligned - offset); instrStream.StoreReg(offset, firstRegSizeBytes); offset = offsetAligned; } while (endOffset - offset >= storePairRegsWritesBytes) { - instrStream.StorePairRegs(offset, regSizeBytes); + instrStream.StorePairRegs(offset, initialRegSizeBytes); offset += storePairRegsWritesBytes; } - if (endOffset - offset >= regSizeBytes) + if (endOffset - offset >= initialRegSizeBytes) { - instrStream.StoreReg(offset, regSizeBytes); - offset += regSizeBytes; + instrStream.StoreReg(offset, initialRegSizeBytes); + offset += initialRegSizeBytes; } if (offset != endOffset) { - const int lastRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endOffset - offset); + const int lastRegSizeBytes = BlockUnrollHelper::GetRegSizeAtLeastBytes(endOffset - offset); instrStream.StoreReg(endOffset - lastRegSizeBytes, lastRegSizeBytes); } } @@ -2202,47 +2280,47 @@ class InitBlockUnrollHelper { bool isSafeToWriteBehind = false; - while (endOffset - offset >= regSizeBytes) + while (endOffset - offset >= initialRegSizeBytes) { - instrStream.StoreReg(offset, regSizeBytes); - offset += regSizeBytes; + instrStream.StoreReg(offset, initialRegSizeBytes); + offset += initialRegSizeBytes; isSafeToWriteBehind = true; } - assert(endOffset - offset < regSizeBytes); + assert(endOffset - offset < initialRegSizeBytes); while (offset != endOffset) { if (isSafeToWriteBehind) { - assert(endOffset - offset < regSizeBytes); - const int lastRegSizeBytes = 
StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endOffset - offset); + assert(endOffset - offset < initialRegSizeBytes); + const int lastRegSizeBytes = BlockUnrollHelper::GetRegSizeAtLeastBytes(endOffset - offset); instrStream.StoreReg(endOffset - lastRegSizeBytes, lastRegSizeBytes); break; } - if (offset + regSizeBytes > endOffset) + if (offset + initialRegSizeBytes > endOffset) { - regSizeBytes = regSizeBytes / 2; + initialRegSizeBytes = initialRegSizeBytes / 2; } else { - instrStream.StoreReg(offset, regSizeBytes); - offset += regSizeBytes; + instrStream.StoreReg(offset, initialRegSizeBytes); + offset += initialRegSizeBytes; isSafeToWriteBehind = true; } } } } - int dstStartOffset; - const int byteCount; + int dstStartOffset; + const unsigned byteCount; }; class CopyBlockUnrollHelper { public: - CopyBlockUnrollHelper(int srcOffset, int dstOffset, int byteCount) + CopyBlockUnrollHelper(int srcOffset, int dstOffset, unsigned byteCount) : srcStartOffset(srcOffset), dstStartOffset(dstOffset), byteCount(byteCount) { } @@ -2296,25 +2374,32 @@ class CopyBlockUnrollHelper *pCanEncodeAllStores = instrStream.CanEncodeAllStores(); } - void Unroll(int regSizeBytes, - regNumber intReg1, - regNumber intReg2, + void Unroll(unsigned initialRegSizeBytes, + regNumber intReg, regNumber simdReg1, regNumber simdReg2, regNumber srcAddrReg, regNumber dstAddrReg, emitter* emitter) const { - ProducingStream loadStream(intReg1, intReg2, simdReg1, simdReg2, srcAddrReg, emitter); - ProducingStream storeStream(intReg1, intReg2, simdReg1, simdReg2, dstAddrReg, emitter); - UnrollCopyBlock(loadStream, storeStream, regSizeBytes); + ProducingStream loadStream(intReg, simdReg1, simdReg2, srcAddrReg, emitter); + ProducingStream storeStream(intReg, simdReg1, simdReg2, dstAddrReg, emitter); + UnrollCopyBlock(loadStream, storeStream, initialRegSizeBytes); + } + + void UnrollBaseInstrs( + regNumber intReg1, regNumber intReg2, regNumber srcAddrReg, regNumber dstAddrReg, emitter* emitter) const + { + 
ProducingStreamBaseInstrs loadStream(intReg1, intReg2, srcAddrReg, emitter); + ProducingStreamBaseInstrs storeStream(intReg1, intReg2, dstAddrReg, emitter); + UnrollCopyBlock(loadStream, storeStream, REGSIZE_BYTES); } private: template - void UnrollCopyBlock(InstructionStream& loadStream, InstructionStream& storeStream, int regSizeBytes) const + void UnrollCopyBlock(InstructionStream& loadStream, InstructionStream& storeStream, int initialRegSizeBytes) const { - assert((regSizeBytes == 8) || (regSizeBytes == 16)); + assert((initialRegSizeBytes == 8) || (initialRegSizeBytes == 16)); int srcOffset = srcStartOffset; int dstOffset = dstStartOffset; @@ -2322,10 +2407,10 @@ class CopyBlockUnrollHelper const int endSrcOffset = srcOffset + byteCount; const int endDstOffset = dstOffset + byteCount; - const int storePairRegsAlignment = regSizeBytes; - const int storePairRegsWritesBytes = 2 * regSizeBytes; + const int storePairRegsAlignment = initialRegSizeBytes; + const int storePairRegsWritesBytes = 2 * initialRegSizeBytes; - const int dstOffsetAligned = StoreBlockUnrollHelper::AlignUp(dstOffset, storePairRegsAlignment); + const int dstOffsetAligned = AlignUp((UINT)dstOffset, storePairRegsAlignment); if (endDstOffset - dstOffsetAligned >= storePairRegsWritesBytes) { @@ -2333,7 +2418,7 @@ class CopyBlockUnrollHelper if (dstBytesToAlign != 0) { - const int firstRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(dstBytesToAlign); + const int firstRegSizeBytes = BlockUnrollHelper::GetRegSizeAtLeastBytes(dstBytesToAlign); loadStream.LoadReg(srcOffset, firstRegSizeBytes); storeStream.StoreReg(dstOffset, firstRegSizeBytes); @@ -2344,25 +2429,25 @@ class CopyBlockUnrollHelper while (endDstOffset - dstOffset >= storePairRegsWritesBytes) { - loadStream.LoadPairRegs(srcOffset, regSizeBytes); - storeStream.StorePairRegs(dstOffset, regSizeBytes); + loadStream.LoadPairRegs(srcOffset, initialRegSizeBytes); + storeStream.StorePairRegs(dstOffset, initialRegSizeBytes); srcOffset 
+= storePairRegsWritesBytes; dstOffset += storePairRegsWritesBytes; } - if (endDstOffset - dstOffset >= regSizeBytes) + if (endDstOffset - dstOffset >= initialRegSizeBytes) { - loadStream.LoadReg(srcOffset, regSizeBytes); - storeStream.StoreReg(dstOffset, regSizeBytes); + loadStream.LoadReg(srcOffset, initialRegSizeBytes); + storeStream.StoreReg(dstOffset, initialRegSizeBytes); - srcOffset += regSizeBytes; - dstOffset += regSizeBytes; + srcOffset += initialRegSizeBytes; + dstOffset += initialRegSizeBytes; } if (dstOffset != endDstOffset) { - const int lastRegSizeBytes = StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endDstOffset - dstOffset); + const int lastRegSizeBytes = BlockUnrollHelper::GetRegSizeAtLeastBytes(endDstOffset - dstOffset); loadStream.LoadReg(endSrcOffset - lastRegSizeBytes, lastRegSizeBytes); storeStream.StoreReg(endDstOffset - lastRegSizeBytes, lastRegSizeBytes); @@ -2372,50 +2457,49 @@ class CopyBlockUnrollHelper { bool isSafeToWriteBehind = false; - while (endDstOffset - dstOffset >= regSizeBytes) + while (endDstOffset - dstOffset >= initialRegSizeBytes) { - loadStream.LoadReg(srcOffset, regSizeBytes); - storeStream.StoreReg(dstOffset, regSizeBytes); + loadStream.LoadReg(srcOffset, initialRegSizeBytes); + storeStream.StoreReg(dstOffset, initialRegSizeBytes); - srcOffset += regSizeBytes; - dstOffset += regSizeBytes; + srcOffset += initialRegSizeBytes; + dstOffset += initialRegSizeBytes; isSafeToWriteBehind = true; } - assert(endSrcOffset - srcOffset < regSizeBytes); + assert(endSrcOffset - srcOffset < initialRegSizeBytes); while (dstOffset != endDstOffset) { if (isSafeToWriteBehind) { - const int lastRegSizeBytes = - StoreBlockUnrollHelper::GetRegSizeAtLeastBytes(endDstOffset - dstOffset); + const int lastRegSizeBytes = BlockUnrollHelper::GetRegSizeAtLeastBytes(endDstOffset - dstOffset); loadStream.LoadReg(endSrcOffset - lastRegSizeBytes, lastRegSizeBytes); storeStream.StoreReg(endDstOffset - lastRegSizeBytes, lastRegSizeBytes); break; } - if 
(dstOffset + regSizeBytes > endDstOffset) + if (dstOffset + initialRegSizeBytes > endDstOffset) { - regSizeBytes = regSizeBytes / 2; + initialRegSizeBytes = initialRegSizeBytes / 2; } else { - loadStream.LoadReg(srcOffset, regSizeBytes); - storeStream.StoreReg(dstOffset, regSizeBytes); + loadStream.LoadReg(srcOffset, initialRegSizeBytes); + storeStream.StoreReg(dstOffset, initialRegSizeBytes); - srcOffset += regSizeBytes; - dstOffset += regSizeBytes; + srcOffset += initialRegSizeBytes; + dstOffset += initialRegSizeBytes; isSafeToWriteBehind = true; } } } } - int srcStartOffset; - int dstStartOffset; - const int byteCount; + int srcStartOffset; + int dstStartOffset; + const unsigned byteCount; }; #endif // TARGET_ARM64 From 8eabe6b8b527b01ffcb4f7c82e5060d541a0e75b Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 16 Dec 2021 12:39:50 -0800 Subject: [PATCH 14/16] Re-structure CodeGen::genCodeForInitBlkUnroll() in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 62 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index aa4491299b855..3dd92e7a3d1d5 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2571,65 +2571,67 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) srcReg = REG_ZR; } - regNumber dstReg = dstAddrBaseReg; - int dstRegAddrAlignment = -1; + regNumber dstReg = dstAddrBaseReg; + int dstRegAddrAlignment = 0; + bool isDstRegAddrAlignmentKnown = false; if (dstLclNum != BAD_VAR_NUM) { bool fpBased; const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased); - dstReg = fpBased ? REG_FPBASE : REG_SPBASE; - dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0; + dstReg = fpBased ? REG_FPBASE : REG_SPBASE; + dstRegAddrAlignment = fpBased ? 
(genSPtoFPdelta() % 16) : 0; + isDstRegAddrAlignmentKnown = true; helper.SetDstOffset(baseAddr + dstOffset); } if (!helper.CanEncodeAllOffsets(REGSIZE_BYTES)) { - const regNumber tempReg = rsGetRsvdReg(); - - int dstOffsetAdj = helper.GetDstOffset(); - - if (dstRegAddrAlignment != -1) - { - dstOffsetAdj = helper.GetDstOffset() - dstRegAddrAlignment; - dstRegAddrAlignment = 0; - } + // If dstRegAddrAlignment is known and non-zero the following ensures that the adjusted value of dstReg is at + // 16-byte aligned boundary. + // This is done to potentially allow more cases where the JIT can use 16-byte stores. + const int dstOffsetAdjustment = helper.GetDstOffset() - dstRegAddrAlignment; + dstRegAddrAlignment = 0; - genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg, dstReg, dstOffsetAdj, tempReg); + const regNumber tempReg = node->ExtractTempReg(RBM_ALLINT); + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg, dstReg, dstOffsetAdjustment, tempReg); dstReg = tempReg; - helper.SetDstOffset(helper.GetDstOffset() - dstOffsetAdj); + helper.SetDstOffset(helper.GetDstOffset() - dstOffsetAdjustment); } - const bool hasAvailableSimdReg = (node->AvailableTempRegCount(RBM_ALLFLOAT) != 0); - const bool canUse16ByteWideInstrs = hasAvailableSimdReg && helper.CanEncodeAllOffsets(FP_REGSIZE_BYTES); - bool shouldUse16ByteWideInstrs = false; // Store operations that cross a 16-byte boundary reduce bandwidth or incur additional latency. - if (canUse16ByteWideInstrs && (dstRegAddrAlignment != -1) && - (((dstRegAddrAlignment + helper.GetDstOffset()) % 16) == 0)) + // The following condition prevents using 16-byte stores when dstRegAddrAlignment is: + // 1) unknown (i.e. dstReg is neither FP nor SP) or + // 2) non-zero (i.e. dstRegAddr is not 16-byte aligned). 
+ const bool hasAvailableSimdReg = isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES); + const bool canUse16ByteWideInstrs = + hasAvailableSimdReg && (dstRegAddrAlignment == 0) && helper.CanEncodeAllOffsets(FP_REGSIZE_BYTES); + + if (canUse16ByteWideInstrs) { - shouldUse16ByteWideInstrs = - ((helper.InstructionCount(FP_REGSIZE_BYTES) + 1) < helper.InstructionCount(REGSIZE_BYTES)); + // The JIT would need to initialize a SIMD register with "movi simdReg.16B, #initValue". + const unsigned instrCount16ByteWide = helper.InstructionCount(FP_REGSIZE_BYTES) + 1; + shouldUse16ByteWideInstrs = instrCount16ByteWide < helper.InstructionCount(REGSIZE_BYTES); } - const regNumber intReg = srcReg; - regNumber simdReg = REG_NA; - int regSizeBytes = REGSIZE_BYTES; - if (shouldUse16ByteWideInstrs) { - simdReg = node->GetSingleTempReg(RBM_ALLFLOAT); - regSizeBytes = FP_REGSIZE_BYTES; + const regNumber simdReg = node->GetSingleTempReg(RBM_ALLFLOAT); const int initValue = (src->AsIntCon()->IconValue() & 0xFF); emit->emitIns_R_I(INS_movi, EA_16BYTE, simdReg, initValue, INS_OPTS_16B); - } - helper.Unroll(regSizeBytes, intReg, simdReg, dstReg, GetEmitter()); + helper.Unroll(srcReg, simdReg, dstReg, GetEmitter()); + } + else + { + helper.UnrollBaseInstrs(srcReg, dstReg, GetEmitter()); + } #endif // TARGET_ARM64 #ifdef TARGET_ARM From 96ee5bf886f9880b143ef802dd9471acb0120013 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 20 Dec 2021 18:10:41 -0800 Subject: [PATCH 15/16] Re-structure CodeGen::genCodeForCpBlkUnroll() in src/coreclr/jit/codegenarmarch.cpp --- src/coreclr/jit/codegenarmarch.cpp | 149 +++++++++++++++++------------ 1 file changed, 86 insertions(+), 63 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 3dd92e7a3d1d5..f70a8af1693eb 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2757,35 +2757,37 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) assert(srcOffset < 
INT32_MAX - static_cast(size)); assert(dstOffset < INT32_MAX - static_cast(size)); - regNumber tempReg = node->ExtractTempReg(RBM_ALLINT); - #ifdef TARGET_ARM64 CopyBlockUnrollHelper helper(srcOffset, dstOffset, size); - regNumber srcReg = srcAddrBaseReg; - int srcRegAddrAlignment = -1; + regNumber srcReg = srcAddrBaseReg; + int srcRegAddrAlignment = 0; + bool isSrcRegAddrAlignmentKnown = false; if (srcLclNum != BAD_VAR_NUM) { bool fpBased; const int baseAddr = compiler->lvaFrameAddress(srcLclNum, &fpBased); - srcReg = fpBased ? REG_FPBASE : REG_SPBASE; - srcRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0; + srcReg = fpBased ? REG_FPBASE : REG_SPBASE; + srcRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0; + isSrcRegAddrAlignmentKnown = true; helper.SetSrcOffset(baseAddr + srcOffset); } - regNumber dstReg = dstAddrBaseReg; - int dstRegAddrAlignment = -1; + regNumber dstReg = dstAddrBaseReg; + int dstRegAddrAlignment = 0; + bool isDstRegAddrAlignmentKnown = false; if (dstLclNum != BAD_VAR_NUM) { bool fpBased; const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased); - dstReg = fpBased ? REG_FPBASE : REG_SPBASE; - dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0; + dstReg = fpBased ? REG_FPBASE : REG_SPBASE; + dstRegAddrAlignment = fpBased ? 
(genSPtoFPdelta() % 16) : 0; + isDstRegAddrAlignmentKnown = true; helper.SetDstOffset(baseAddr + dstOffset); } @@ -2797,27 +2799,25 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) srcOffset = helper.GetSrcOffset(); dstOffset = helper.GetDstOffset(); - int srcOffsetAdj = 0; - int dstOffsetAdj = 0; + int srcOffsetAdjustment = 0; + int dstOffsetAdjustment = 0; if (!canEncodeAllLoads && !canEncodeAllStores) { - srcOffsetAdj = srcOffset; - dstOffsetAdj = dstOffset; + srcOffsetAdjustment = srcOffset; + dstOffsetAdjustment = dstOffset; } else if (!canEncodeAllLoads) { - srcOffsetAdj = srcOffset - dstOffset; + srcOffsetAdjustment = srcOffset - dstOffset; } else if (!canEncodeAllStores) { - dstOffsetAdj = dstOffset - srcOffset; + dstOffsetAdjustment = dstOffset - srcOffset; } - helper.SetSrcOffset(srcOffset - srcOffsetAdj); - helper.SetDstOffset(dstOffset - dstOffsetAdj); - - bool shouldUse16ByteWideInstrs = false; + helper.SetSrcOffset(srcOffset - srcOffsetAdjustment); + helper.SetDstOffset(dstOffset - dstOffsetAdjustment); // Quad-word load operations that are not 16-byte aligned, and store operations that cross a 16-byte boundary // can reduce bandwidth or incur additional latency. @@ -2844,7 +2844,17 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+40] // ... - if ((dstRegAddrAlignment != -1) && (srcRegAddrAlignment == dstRegAddrAlignment)) + // LSRA allocates a pair of SIMD registers when alignments of both source and destination base addresses are + // known and the block size is larger than a single SIMD register size (i.e. when using SIMD instructions can + // be profitable). 
+ const bool hasAvailableSimdRegs = + isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown && (size >= FP_REGSIZE_BYTES); + + const bool canUse16ByteWideInstrs = hasAvailableSimdRegs && (srcRegAddrAlignment == dstRegAddrAlignment); + + bool shouldUse16ByteWideInstrs = false; + + if (canUse16ByteWideInstrs) { bool canEncodeAll16ByteWideLoads = false; bool canEncodeAll16ByteWideStores = false; @@ -2872,15 +2882,15 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (!canEncodeAll16ByteWideLoads) { - srcOffsetAdj = srcOffset - dstOffset; + srcOffsetAdjustment = srcOffset - dstOffset; } else { - dstOffsetAdj = dstOffset - srcOffset; + dstOffsetAdjustment = dstOffset - srcOffset; } - helper.SetSrcOffset(srcOffset - srcOffsetAdj); - helper.SetDstOffset(dstOffset - dstOffsetAdj); + helper.SetSrcOffset(srcOffset - srcOffsetAdjustment); + helper.SetDstOffset(dstOffset - dstOffsetAdjustment); } } } @@ -2896,66 +2906,79 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) } #endif - regNumber intReg1 = REG_NA; - regNumber intReg2 = REG_NA; - - // On Arm64 an internal integer register is always allocated for a CopyBlock node - // in addition to a reserved register. - // However, if it is determined that both srcOffset and dstOffset need adjustments - // than both registers will be used for storing the adjusted addresses. - // In that case, the JIT uses SIMD register(s) and instructions for copying. 
- - const regNumber tempReg1 = rsGetRsvdReg(); - const regNumber tempReg2 = tempReg; - - if ((srcOffsetAdj != 0) && (dstOffsetAdj != 0)) + if ((srcOffsetAdjustment != 0) && (dstOffsetAdjustment != 0)) { - genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, srcReg, srcOffsetAdj, tempReg1); + const regNumber tempReg1 = node->ExtractTempReg(RBM_ALLINT); + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, srcReg, srcOffsetAdjustment, tempReg1); srcReg = tempReg1; - genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg2, dstReg, dstOffsetAdj, tempReg2); + const regNumber tempReg2 = node->ExtractTempReg(RBM_ALLINT); + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg2, dstReg, dstOffsetAdjustment, tempReg2); dstReg = tempReg2; + } + else if (srcOffsetAdjustment != 0) + { + const regNumber tempReg = node->ExtractTempReg(RBM_ALLINT); + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg, srcReg, srcOffsetAdjustment, tempReg); + srcReg = tempReg; + } + else if (dstOffsetAdjustment != 0) + { + const regNumber tempReg = node->ExtractTempReg(RBM_ALLINT); + genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg, dstReg, dstOffsetAdjustment, tempReg); + dstReg = tempReg; + } - if (size >= 2 * REGSIZE_BYTES) + regNumber simdReg1 = REG_NA; + regNumber simdReg2 = REG_NA; + + regNumber intReg1 = REG_NA; + regNumber intReg2 = REG_NA; + + if (hasAvailableSimdRegs) + { + simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); + simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT); + + if ((srcOffsetAdjustment == 0) || (dstOffsetAdjustment == 0)) { intReg1 = node->ExtractTempReg(RBM_ALLINT); + + if ((srcOffsetAdjustment == 0) && (dstOffsetAdjustment == 0)) + { + intReg2 = node->GetSingleTempReg(RBM_ALLINT); + } } - } - else - { - if (srcOffsetAdj != 0) + + if (shouldUse16ByteWideInstrs || (intReg2 == REG_NA)) { - genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, srcReg, srcOffsetAdj, tempReg1); - srcReg = tempReg1; + const unsigned initialRegSizeBytes = shouldUse16ByteWideInstrs ? 
FP_REGSIZE_BYTES : REGSIZE_BYTES; + + helper.Unroll(initialRegSizeBytes, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); } - else if (dstOffsetAdj != 0) + else { - genInstrWithConstant(INS_add, EA_PTRSIZE, tempReg1, dstReg, dstOffsetAdj, tempReg1); - dstReg = tempReg1; - } + assert(intReg1 != REG_NA); - intReg1 = tempReg2; + helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter()); + } + } + else + { + intReg1 = node->ExtractTempReg(RBM_ALLINT); if (size >= 2 * REGSIZE_BYTES) { intReg2 = node->ExtractTempReg(RBM_ALLINT); } - } - - const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); - regNumber simdReg2 = REG_NA; - if (size >= FP_REGSIZE_BYTES) - { - simdReg2 = node->ExtractTempReg(RBM_ALLFLOAT); + helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter()); } - - const int regSizeBytes = shouldUse16ByteWideInstrs ? FP_REGSIZE_BYTES : REGSIZE_BYTES; - - helper.Unroll(regSizeBytes, intReg1, intReg2, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); #endif // TARGET_ARM64 #ifdef TARGET_ARM + const regNumber tempReg = node->ExtractTempReg(RBM_ALLINT); + for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) { while (regSize > size) From e457e9247e1726a27947ff6932fe24ee68e24614 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Wed, 22 Dec 2021 17:56:48 -0800 Subject: [PATCH 16/16] Change how LSRA allocates int/SIMD registers when both srcAddr/dstAddr are contained and can potentially occupy up to 2 int registers --- src/coreclr/jit/codegenarmarch.cpp | 71 ++++++++++++++++-------------- src/coreclr/jit/lsraarmarch.cpp | 55 +++++++++-------------- 2 files changed, 58 insertions(+), 68 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index f70a8af1693eb..26e69c5c3f9fe 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2847,10 +2847,9 @@ void 
CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // LSRA allocates a pair of SIMD registers when alignments of both source and destination base addresses are // known and the block size is larger than a single SIMD register size (i.e. when using SIMD instructions can // be profitable). - const bool hasAvailableSimdRegs = - isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown && (size >= FP_REGSIZE_BYTES); - const bool canUse16ByteWideInstrs = hasAvailableSimdRegs && (srcRegAddrAlignment == dstRegAddrAlignment); + const bool canUse16ByteWideInstrs = isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown && + (size >= FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment); bool shouldUse16ByteWideInstrs = false; @@ -2929,50 +2928,56 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) dstReg = tempReg; } - regNumber simdReg1 = REG_NA; - regNumber simdReg2 = REG_NA; - regNumber intReg1 = REG_NA; regNumber intReg2 = REG_NA; - if (hasAvailableSimdRegs) - { - simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); - simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT); + const unsigned intRegCount = node->AvailableTempRegCount(RBM_ALLINT); - if ((srcOffsetAdjustment == 0) || (dstOffsetAdjustment == 0)) - { + switch (intRegCount) + { + case 1: + intReg1 = node->GetSingleTempReg(RBM_ALLINT); + break; + case 2: intReg1 = node->ExtractTempReg(RBM_ALLINT); + intReg2 = node->GetSingleTempReg(RBM_ALLINT); + break; + default: + break; + } - if ((srcOffsetAdjustment == 0) && (dstOffsetAdjustment == 0)) - { - intReg2 = node->GetSingleTempReg(RBM_ALLINT); - } - } + regNumber simdReg1 = REG_NA; + regNumber simdReg2 = REG_NA; - if (shouldUse16ByteWideInstrs || (intReg2 == REG_NA)) - { - const unsigned initialRegSizeBytes = shouldUse16ByteWideInstrs ? 
FP_REGSIZE_BYTES : REGSIZE_BYTES; + const unsigned simdRegCount = node->AvailableTempRegCount(RBM_ALLFLOAT); - helper.Unroll(initialRegSizeBytes, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); - } - else - { - assert(intReg1 != REG_NA); + switch (simdRegCount) + { + case 1: + simdReg1 = node->GetSingleTempReg(RBM_ALLFLOAT); + break; + case 2: + simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); + simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT); + break; + default: + break; + } - helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter()); - } + if (shouldUse16ByteWideInstrs) + { + helper.Unroll(FP_REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); } else { - intReg1 = node->ExtractTempReg(RBM_ALLINT); - - if (size >= 2 * REGSIZE_BYTES) + if (intReg2 == REG_NA) { - intReg2 = node->ExtractTempReg(RBM_ALLINT); + helper.Unroll(REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); + } + else + { + helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter()); } - - helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter()); } #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index aa01f8d0e6a8c..ef7a2d0440aa4 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -694,54 +694,39 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) { buildInternalIntRegisterDefForNode(blkNode); #ifdef TARGET_ARM64 - const bool isSrcRegAddrAlignmentKnown = - src->OperIs(GT_LCL_VAR, GT_LCL_FLD) || - ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr()); - const bool isDstRegAddrAlignmentKnown = dstAddr->OperIsLocalAddr(); + const bool dstAddrMayNeedReg = dstAddr->isContained(); + const bool srcAddrMayNeedReg = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) || + ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained()); - bool willUseSimdInstrs = false; - - if (isSrcRegAddrAlignmentKnown && 
isDstRegAddrAlignmentKnown) + if (srcAddrMayNeedReg && dstAddrMayNeedReg) { // The following allocates an additional integer register in a case - // when neither loads nor stores can be encoded using unsigned instruction offsets. + // when a load instruction and a store instruction cannot be encoded. buildInternalIntRegisterDefForNode(blkNode); - - if (size >= FP_REGSIZE_BYTES) + // In this case, CodeGen will use a SIMD register for copying. + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + // And in case of a larger block size, two SIMD registers. + if (size >= 2 * REGSIZE_BYTES) { - willUseSimdInstrs = true; + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); } } - else if (dstAddr->isContained() && (src->OperIs(GT_LCL_VAR, GT_LCL_FLD) || - (srcAddrOrFill != nullptr) && srcAddrOrFill->isContained())) - { - // Both srcAddr and dstAddr are contained - the addresses will be computed in CodeGen. - // This might take up to two integer registers to store both values. - // The following allocates an additional integer register to store an address. - buildInternalIntRegisterDefForNode(blkNode); - } - - // The following allocates temporary register(s) for copying from source to destination. - - if (willUseSimdInstrs) - { - // Note that the following allocates a pair of SIMD registers. - // CodeGen might end up needing both when two integer registers (allocated above) - // are occupied with source and destination addresses. - buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - } - else + else if (srcAddrMayNeedReg || dstAddrMayNeedReg) { - buildInternalIntRegisterDefForNode(blkNode); - - // CodeGen will use load-pair/store-pair instructions to reduce code size - // and improve performance. LSRA needs to reserve an extra internal register for these. 
if (size >= 2 * REGSIZE_BYTES) + { + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + } + else { buildInternalIntRegisterDefForNode(blkNode); } } + else if (size >= 2 * REGSIZE_BYTES) + { + buildInternalIntRegisterDefForNode(blkNode); + } #endif } break;