Skip to content

Commit

Permalink
Merge pull request #19815 from a7ehuo/system-arraycopy-perf-16-reference
Browse files (browse the repository at this point in the history)
x86: Inline reference system array copy for smaller size
  • Loading branch information
0xdaryl authored Jul 13, 2024
2 parents d2b02cf + e721562 commit 011aecc
Showing 1 changed file with 74 additions and 1 deletion.
75 changes: 74 additions & 1 deletion runtime/compiler/x/codegen/J9TreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1887,11 +1887,42 @@ TR::Register *J9::X86::TreeEvaluator::arraycopyEvaluator(TR::Node *node, TR::Cod
auto RDI = cg->allocateRegister();
auto RCX = cg->allocateRegister();

TR::Register* tmpReg1 = NULL;
TR::Register* tmpReg2 = NULL;
TR::Register* tmpXmmYmmReg1 = NULL;
TR::Register* tmpXmmYmmReg2 = NULL;

static bool disableReferenceArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_DisableReferenceArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL;

bool enableInlineForSmallSize = !disableReferenceArrayCopyInlineSmallSizeWithoutREPMOVS &&
!comp->getOption(TR_DisableReferenceArrayCopyInlineSmallSizeWithoutREPMOVS) &&
comp->target().cpu.supportsAVX() &&
comp->target().is64Bit();

int32_t repMovsThresholdBytes = 32;
int32_t newThreshold = comp->getOptions()->getArraycopyRepMovsReferenceArrayThreshold();

if ((repMovsThresholdBytes < newThreshold) && ((newThreshold == 64) || (newThreshold == 128)))
{
// If the CPU doesn't support AVX512, reduce the threshold to 64 bytes
repMovsThresholdBytes = ((newThreshold == 128) && !comp->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX512F)) ? 64 : newThreshold;
}

if (enableInlineForSmallSize)
{
tmpReg1 = cg->allocateRegister(TR_GPR);
tmpReg2 = cg->allocateRegister(TR_GPR);
tmpXmmYmmReg1 = cg->allocateRegister(TR_VRF);
tmpXmmYmmReg2 = cg->allocateRegister(TR_VRF);
}

generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, RSI, srcReg, cg);
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, RDI, dstReg, cg);
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, RCX, sizeReg, cg);

auto deps = generateRegisterDependencyConditions((uint8_t)5, 5, cg);
int8_t numDeps = enableInlineForSmallSize ? 9 : 5;
TR::RegisterDependencyConditions* deps = generateRegisterDependencyConditions(numDeps, numDeps, cg);

deps->addPreCondition(RSI, TR::RealRegister::esi, cg);
deps->addPreCondition(RDI, TR::RealRegister::edi, cg);
deps->addPreCondition(RCX, TR::RealRegister::ecx, cg);
Expand All @@ -1903,6 +1934,19 @@ TR::Register *J9::X86::TreeEvaluator::arraycopyEvaluator(TR::Node *node, TR::Cod
deps->addPostCondition(srcObjReg, TR::RealRegister::NoReg, cg);
deps->addPostCondition(dstObjReg, TR::RealRegister::NoReg, cg);

if (enableInlineForSmallSize)
{
deps->addPreCondition(tmpReg1, TR::RealRegister::NoReg, cg);
deps->addPreCondition(tmpReg2, TR::RealRegister::NoReg, cg);
deps->addPreCondition(tmpXmmYmmReg1, TR::RealRegister::NoReg, cg);
deps->addPreCondition(tmpXmmYmmReg2, TR::RealRegister::NoReg, cg);

deps->addPostCondition(tmpReg1, TR::RealRegister::NoReg, cg);
deps->addPostCondition(tmpReg2, TR::RealRegister::NoReg, cg);
deps->addPostCondition(tmpXmmYmmReg1, TR::RealRegister::NoReg, cg);
deps->addPostCondition(tmpXmmYmmReg2, TR::RealRegister::NoReg, cg);
}

auto begLabel = generateLabelSymbol(cg);
auto endLabel = generateLabelSymbol(cg);
begLabel->setStartInternalControlFlow();
Expand All @@ -1926,6 +1970,25 @@ TR::Register *J9::X86::TreeEvaluator::arraycopyEvaluator(TR::Node *node, TR::Cod
generateLabelInstruction(TR::InstOpCode::JMP4, node, endLabel, cg);
og.endOutlinedInstructionSequence();
}

if (enableInlineForSmallSize)
{
TR::LabelSymbol* repMovsLabel = generateLabelSymbol(cg);

if (use64BitClasses)
{
OMR::TreeEvaluatorConnector::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, RDI /* dstReg */, RSI /* srcReg */, RCX /* sizeReg */, tmpReg1, tmpReg2,
tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsThresholdBytes, repMovsLabel, endLabel);
}
else
{
OMR::TreeEvaluatorConnector::arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, RDI /* dstReg */, RSI /* srcReg */, RCX /* sizeReg */, tmpReg1, tmpReg2,
tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsThresholdBytes, repMovsLabel, endLabel);
}

generateLabelInstruction(TR::InstOpCode::label, node, repMovsLabel, cg);
}

if (!node->isForwardArrayCopy())
{
TR::LabelSymbol* backwardLabel = generateLabelSymbol(cg);
Expand All @@ -1945,14 +2008,24 @@ TR::Register *J9::X86::TreeEvaluator::arraycopyEvaluator(TR::Node *node, TR::Cod
generateLabelInstruction(TR::InstOpCode::JMP4, node, endLabel, cg);
og.endOutlinedInstructionSequence();
}

generateRegImmInstruction(TR::InstOpCode::SHRRegImm1(), node, RCX, use64BitClasses ? 3 : 2, cg);
generateInstruction(use64BitClasses ? TR::InstOpCode::REPMOVSQ : TR::InstOpCode::REPMOVSD, node, cg);

generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);

cg->stopUsingRegister(RSI);
cg->stopUsingRegister(RDI);
cg->stopUsingRegister(RCX);

if (enableInlineForSmallSize)
{
cg->stopUsingRegister(tmpReg1);
cg->stopUsingRegister(tmpReg2);
cg->stopUsingRegister(tmpXmmYmmReg1);
cg->stopUsingRegister(tmpXmmYmmReg2);
}

TR::TreeEvaluator::VMwrtbarWithoutStoreEvaluator(node, node->getChild(1), NULL, NULL, cg->generateScratchRegisterManager(), cg);
}

Expand Down

0 comments on commit 011aecc

Please sign in to comment.