Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Optimize redundant memory barriers on arm/arm64 #60219

Merged
merged 6 commits into from
Oct 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions src/coreclr/jit/codegenarm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1883,4 +1883,36 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
#endif // USING_SCOPE_INFO
}

//-----------------------------------------------------------------------------------
// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
//
// Arguments:
// barrierKind - kind of barrier to emit (ignored on arm32)
//
// Notes:
// All MemoryBarriers instructions can be removed by DOTNET_JitNoMemoryBarriers=1
// barrierKind argument is ignored on arm32 and a full memory barrier is emitted
//
void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
{
#ifdef DEBUG
if (JitConfig.JitNoMemoryBarriers() == 1)
{
return;
}
#endif // DEBUG

// Avoid emitting redundant memory barriers on arm32 if they belong to the same IG
// and there were no memory accesses in-between them
if ((GetEmitter()->emitLastMemBarrier != nullptr) && compiler->opts.OptimizationEnabled())
{
assert(GetEmitter()->emitLastMemBarrier->idSmallCns() == INS_BARRIER_SY);
}
else
{
// ARM has only full barriers, so all barriers need to be emitted as full.
GetEmitter()->emitIns_I(INS_dmb, EA_4BYTE, INS_BARRIER_SY);
}
}

#endif // TARGET_ARM
50 changes: 50 additions & 0 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9510,4 +9510,54 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
}
}

//-----------------------------------------------------------------------------------
// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
//
// Arguments:
// barrierKind - kind of barrier to emit (Full or Load-Only).
//
// Notes:
// All MemoryBarriers instructions can be removed by DOTNET_JitNoMemoryBarriers=1
//
void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
{
#ifdef DEBUG
if (JitConfig.JitNoMemoryBarriers() == 1)
{
return;
}
#endif // DEBUG

// Avoid emitting redundant memory barriers on arm64 if they belong to the same IG
// and there were no memory accesses in-between them
emitter::instrDesc* lastMemBarrier = GetEmitter()->emitLastMemBarrier;
if ((lastMemBarrier != nullptr) && compiler->opts.OptimizationEnabled())
{
BarrierKind prevBarrierKind = BARRIER_FULL;
if (lastMemBarrier->idSmallCns() == INS_BARRIER_ISHLD)
{
prevBarrierKind = BARRIER_LOAD_ONLY;
}
else
{
// Currently we only emit two kinds of barriers on arm64:
// ISH - Full (inner shareable domain)
// ISHLD - LoadOnly (inner shareable domain)
assert(lastMemBarrier->idSmallCns() == INS_BARRIER_ISH);
}

if ((prevBarrierKind == BARRIER_LOAD_ONLY) && (barrierKind == BARRIER_FULL))
{
// Previous memory barrier: load-only, current: full
// Upgrade the previous one to full
assert((prevBarrierKind == BARRIER_LOAD_ONLY) && (barrierKind == BARRIER_FULL));
lastMemBarrier->idSmallCns(INS_BARRIER_ISH);
}
}
else
{
GetEmitter()->emitIns_BARR(INS_dmb, barrierKind == BARRIER_LOAD_ONLY ? INS_BARRIER_ISHLD : INS_BARRIER_ISH);
}
}

#endif // TARGET_ARM64
26 changes: 26 additions & 0 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9135,4 +9135,30 @@ void CodeGen::genPushCalleeSavedRegisters()
}
}

//-----------------------------------------------------------------------------------
// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
//
// Arguments:
// barrierKind - kind of barrier to emit (Load-only is no-op on xarch)
//
// Notes:
// All MemoryBarriers instructions can be removed by DOTNET_JitNoMemoryBarriers=1
//
void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
{
#ifdef DEBUG
if (JitConfig.JitNoMemoryBarriers() == 1)
{
return;
}
#endif // DEBUG

// only full barrier needs to be emitted on Xarch
if (barrierKind == BARRIER_FULL)
{
instGen(INS_lock);
GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0);
}
}

#endif // TARGET_XARCH
20 changes: 20 additions & 0 deletions src/coreclr/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,11 @@ insGroup* emitter::emitSavIG(bool emitAdd)
ig = emitCurIG;
assert(ig);

#ifdef TARGET_ARMARCH
// Reset emitLastMemBarrier for new IG
emitLastMemBarrier = nullptr;
#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should really be:

if (!emitAdd) {
  emitLastMemBarrier = nullptr;
}

because if we're saving this IG for the purpose of creating a new, "overflow" IG, we're not creating a label, so it should be treated as contiguous.

It's unfortunate that this "peephole optimization" is spread around the emitter instead of being handled entirely within codegen. E.g., instead of emitLastMemBarrier, couldn't there be a codegen variable that keeps track of the last memory barrier, and clear that at BasicBlock boundaries (not IG boundaries)? You're leveraging a convenient chokepoint in the emitter, appendToCurIG, to clear this if a load/store is found, but maybe there is something not much more difficult in codegen only (and not touching the emitter).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to do that in codegen first but it didn't look safe to me - I wasn't sure if I remove "GTF_VOLATILE" flag from a load it won't be then optimized further in emitter or in other place of codegen. I chose emitter because on arm the only instructions we need memorybarrier for is ldr and str (and their variations) unlike x86 where it's way more complicated. And it's where other last-chance peepholes live.

because if we're saving this IG for the purpose of creating a new, "overflow" IG, we're not creating a label, so it should be treated as contiguous.

Yeah, I kept that in mind, but it didn't produce any diffs so I decided to leave it as is. I was mostly interested in optimizing things like volatileFiled++ or volatileFiled*=10 etc that we have in some places in Task. Fortunately, we don't emit many memory barriers on arm - for performance-critical code we mostly use Volatile.Read/Write that doesn't produce them.


// Compute how much code we've generated

sz = emitCurIGfreeNext - emitCurIGfreeBase;
Expand Down Expand Up @@ -1140,6 +1145,10 @@ void emitter::emitBegFN(bool hasFramePtr

emitLastIns = nullptr;

#ifdef TARGET_ARMARCH
emitLastMemBarrier = nullptr;
#endif

ig->igNext = nullptr;

#ifdef DEBUG
Expand Down Expand Up @@ -1303,6 +1312,17 @@ void emitter::dispIns(instrDesc* id)

void emitter::appendToCurIG(instrDesc* id)
{
#ifdef TARGET_ARMARCH
if (id->idIns() == INS_dmb)
{
emitLastMemBarrier = id;
}
else if (emitInsIsLoadOrStore(id->idIns()))
{
// A memory access - reset saved memory barrier
emitLastMemBarrier = nullptr;
}
#endif
emitCurIGsize += id->idCodeSize();
}

Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1884,6 +1884,10 @@ class emitter

instrDesc* emitLastIns;

#ifdef TARGET_ARMARCH
instrDesc* emitLastMemBarrier;
#endif

#ifdef DEBUG
void emitCheckIGoffsets();
#endif
Expand Down
35 changes: 0 additions & 35 deletions src/coreclr/jit/instr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2386,41 +2386,6 @@ void CodeGen::instGen_Return(unsigned stkArgSize)
#endif
}

/*****************************************************************************
*
* Emit a MemoryBarrier instruction
*
* Note: all MemoryBarriers instructions can be removed by
* SET COMPlus_JitNoMemoryBarriers=1
*/
void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
{
#ifdef DEBUG
if (JitConfig.JitNoMemoryBarriers() == 1)
{
return;
}
#endif // DEBUG

#if defined(TARGET_XARCH)
// only full barrier needs to be emitted on Xarch
if (barrierKind != BARRIER_FULL)
{
return;
}

instGen(INS_lock);
GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0);
#elif defined(TARGET_ARM)
// ARM has only full barriers, so all barriers need to be emitted as full.
GetEmitter()->emitIns_I(INS_dmb, EA_4BYTE, 0xf);
#elif defined(TARGET_ARM64)
GetEmitter()->emitIns_BARR(INS_dmb, barrierKind == BARRIER_LOAD_ONLY ? INS_BARRIER_ISHLD : INS_BARRIER_ISH);
#else
#error "Unknown TARGET"
#endif
}

/*****************************************************************************
*
* Machine independent way to move a Zero value into a register
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/instr.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@ enum insOpts: unsigned
INS_OPTS_ASR,
INS_OPTS_ROR
};
enum insBarrier : unsigned
{
INS_BARRIER_SY = 15
};
#elif defined(TARGET_ARM64)
enum insOpts : unsigned
{
Expand Down