From 9b1d149075406b5400bb39d119ed9fd3bf2fc590 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Aug 2020 11:36:22 +0300 Subject: [PATCH] Optimize "X / C" via cmovns --- src/coreclr/src/jit/codegenxarch.cpp | 52 +++++++++++++++++++++++++++- src/coreclr/src/jit/emit.cpp | 13 ++++--- src/coreclr/src/jit/lower.cpp | 8 +++++ 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp index 88c021ac73d01..a12946fbc13ec 100644 --- a/src/coreclr/src/jit/codegenxarch.cpp +++ b/src/coreclr/src/jit/codegenxarch.cpp @@ -773,6 +773,7 @@ void CodeGen::genCodeForDivMod(GenTreeOp* treeNode) GenTree* divisor = treeNode->gtOp2; genTreeOps oper = treeNode->OperGet(); emitAttr size = emitTypeSize(treeNode); + regNumber operandReg = dividend->GetRegNum(); regNumber targetReg = treeNode->GetRegNum(); var_types targetType = treeNode->TypeGet(); emitter* emit = GetEmitter(); @@ -783,7 +784,56 @@ void CodeGen::genCodeForDivMod(GenTreeOp* treeNode) // dividend is in a register. 
assert(dividend->isUsedFromReg()); - genConsumeOperands(treeNode->AsOp()); + genConsumeReg(dividend); + + if (treeNode->OperIs(GT_DIV) && treeNode->TypeIs(TYP_INT, TYP_LONG) && + divisor->IsIntegralConst()) + { + if (operandReg == targetReg) + { + // the optimization won't work if target reg == dividend reg + // so we need to move the dividend to a temp reg + inst_RV_RV(INS_mov, REG_RDX, operandReg, targetType); + operandReg = REG_RDX; + } + + const ssize_t cnsDivisor = divisor->AsIntConCommon()->IconValue(); + const size_t absCnsDivisor = abs(cnsDivisor); + if (absCnsDivisor >= 4 && isPow2(absCnsDivisor)) + { + if (absCnsDivisor <= (1UL << 30)) + { + // lea rax, [rdx + (cnsDivisor-1)] + emit->emitIns_R_AR(INS_lea, size, targetReg, operandReg, static_cast<int>(absCnsDivisor - 1)); + } + else + { + // mov + add + assert(false); // TODO + } + + // test rdx, rdx + emit->emitIns_R_R(INS_test, size, operandReg, operandReg); + + // cmovns rax, rdx + emit->emitIns_R_R(INS_cmovns, size, targetReg, operandReg); + + // sar rax, ctz(cnsDivisor) + emit->emitIns_R_I(INS_sar_N, size, targetReg, genLog2(static_cast<size_t>(absCnsDivisor))); + + if (cnsDivisor < 0) + { + // neg rax + emit->emitIns_R(INS_neg, size, targetReg); + } + + genProduceReg(treeNode); + return; + } + } + + genConsumeRegs(divisor); + // dividend must be in RAX genCopyRegIfNeeded(dividend, REG_RAX); diff --git a/src/coreclr/src/jit/emit.cpp b/src/coreclr/src/jit/emit.cpp index 7a957b17f91ed..39d1fe02a5062 100644 --- a/src/coreclr/src/jit/emit.cpp +++ b/src/coreclr/src/jit/emit.cpp @@ -1138,11 +1138,14 @@ float emitter::insEvaluateExecutionCost(instrDesc* id) // void emitter::perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult) { -#ifdef DEBUG - printf("PerfScore: unhandled instruction: %s, format %s", codeGen->genInsName(id->idIns()), - emitIfName(id->idInsFmt())); - assert(!"PerfScore: unhandled instruction"); -#endif +//#ifdef DEBUG +// printf("PerfScore: unhandled instruction: %s, format 
%s", codeGen->genInsName(id->idIns()), +// emitIfName(id->idInsFmt())); +// assert(!"PerfScore: unhandled instruction"); +//#endif +// +// TODO: update perfscore for CMOV* instructions +// pResult->insThroughput = PERFSCORE_THROUGHPUT_1C; pResult->insLatency = PERFSCORE_LATENCY_1C; } diff --git a/src/coreclr/src/jit/lower.cpp b/src/coreclr/src/jit/lower.cpp index 2867a7bacbb46..207fab15d5814 100644 --- a/src/coreclr/src/jit/lower.cpp +++ b/src/coreclr/src/jit/lower.cpp @@ -5404,6 +5404,14 @@ GenTree* Lowering::LowerConstIntDivOrMod(GenTree* node) #endif } + if (isDiv && absDivisorValue >= 4 && isPow2(absDivisorValue) && comp->opts.compUseCMOV) + { + divisor->SetContained(); + // don't expand "X s/ C" to RSH+AND+ADD if C is a power of two (>= 4) + // and CMOV instruction is available + return nullptr; + } + // We're committed to the conversion now. Go find the use if any. LIR::Use use; if (!BlockRange().TryGetUse(node, &use))