Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization to remove redundant zero initializations. #36918

Merged
merged 2 commits into from
May 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 9 additions & 36 deletions src/coreclr/src/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4596,8 +4596,9 @@ void CodeGen::genCheckUseBlockInit()
// double-counting the initialization impact of any locals.
bool counted = false;

if (varDsc->lvIsParam)
if (!varDsc->lvIsInReg() && !varDsc->lvOnFrame)
{
noway_assert(varDsc->lvRefCnt() == 0);
continue;
}

Expand All @@ -4608,44 +4609,10 @@ void CodeGen::genCheckUseBlockInit()
continue;
}

// Likewise, initialization of the GS cookie is handled specially for OSR.
// Could do this for non-OSR too.. (likewise for the dummy)
if (compiler->opts.IsOSR() && varNum == compiler->lvaGSSecurityCookie)
{
continue;
}

if (!varDsc->lvIsInReg() && !varDsc->lvOnFrame)
{
noway_assert(varDsc->lvRefCnt() == 0);
continue;
}

if (varNum == compiler->lvaInlinedPInvokeFrameVar || varNum == compiler->lvaStubArgumentVar ||
varNum == compiler->lvaRetAddrVar)
{
continue;
}

#if FEATURE_FIXED_OUT_ARGS
if (varNum == compiler->lvaPInvokeFrameRegSaveVar)
{
continue;
}
if (varNum == compiler->lvaOutgoingArgSpaceVar)
{
continue;
}
#endif

#if defined(FEATURE_EH_FUNCLETS)
// There's no need to force 0-initialization of the PSPSym, it will be
// initialized with a real value in the prolog
if (varNum == compiler->lvaPSPSym)
if (compiler->fgVarIsNeverZeroInitializedInProlog(varNum))
{
continue;
}
#endif

if (compiler->lvaIsFieldOfDependentlyPromotedStruct(varDsc))
{
Expand All @@ -4655,6 +4622,12 @@ void CodeGen::genCheckUseBlockInit()
continue;
}

if (varDsc->lvHasExplicitInit)
{
varDsc->lvMustInit = 0;
continue;
}

if (compiler->info.compInitMem || varDsc->HasGCPtr() || varDsc->lvMustInit)
{
if (varDsc->lvTracked)
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/src/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4690,6 +4690,8 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags
DoPhase(this, PHASE_BUILD_SSA, &Compiler::fgSsaBuild);
}

DoPhase(this, PHASE_ZERO_INITS, &Compiler::optRemoveRedundantZeroInits);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like it would be less confusing to run this before we build SSA, since it doesn't really leverage or involve SSA (other than skipping phi defs).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This optimization needs to run after liveness so that it can use lvTracked and lvLiveInOutOfHndlr. We run liveness when we build SSA.


if (doEarlyProp)
{
// Propagate array length and rewrite getType() method call
Expand Down
11 changes: 10 additions & 1 deletion src/coreclr/src/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,10 @@ class LclVarDsc

unsigned char lvSuppressedZeroInit : 1; // local needs zero init if we transform tail call to loop

unsigned char lvHasExplicitInit : 1; // The local is explicitly initialized and doesn't need zero initialization in
// the prolog. If the local has gc pointers, there are no gc-safe points
// between the prolog and the explicit initialization.

union {
unsigned lvFieldLclStart; // The index of the local var representing the first field in the promoted struct
// local. For implicit byref parameters, this gets hijacked between
Expand Down Expand Up @@ -4653,8 +4657,11 @@ class Compiler

unsigned fgSsaPassesCompleted; // Number of times fgSsaBuild has been run.

// Returns "true" if this is a special variable that is never zero initialized in the prolog.
inline bool fgVarIsNeverZeroInitializedInProlog(unsigned varNum);

// Returns "true" if the variable needs explicit zero initialization.
inline bool fgVarNeedsExplicitZeroInit(LclVarDsc* varDsc, bool bbInALoop, bool bbIsReturn);
inline bool fgVarNeedsExplicitZeroInit(unsigned varNum, bool bbInALoop, bool bbIsReturn);

// The value numbers for this compilation.
ValueNumStore* vnStore;
Expand Down Expand Up @@ -5906,6 +5913,8 @@ class Compiler

void optUnrollLoops(); // Unrolls loops (needs to have cost info)

void optRemoveRedundantZeroInits();

protected:
// This enumeration describes what is killed by a call.

Expand Down
39 changes: 37 additions & 2 deletions src/coreclr/src/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4145,11 +4145,39 @@ inline void Compiler::CLR_API_Leave(API_ICorJitInfo_Names ename)

#endif // MEASURE_CLRAPI_CALLS

//------------------------------------------------------------------------------
// fgVarIsNeverZeroInitializedInProlog : Check whether the variable is never zero initialized in the prolog.
//
// Arguments:
// varNum - local variable number
//
// Returns:
// true if this is a special variable that is never zero initialized in the prolog;
// false otherwise
//

bool Compiler::fgVarIsNeverZeroInitializedInProlog(unsigned varNum)
{
LclVarDsc* varDsc = lvaGetDesc(varNum);
bool result = varDsc->lvIsParam || lvaIsOSRLocal(varNum) || (opts.IsOSR() && (varNum == lvaGSSecurityCookie)) ||
(varNum == lvaInlinedPInvokeFrameVar) || (varNum == lvaStubArgumentVar) || (varNum == lvaRetAddrVar);

#if FEATURE_FIXED_OUT_ARGS
result = result || (varNum == lvaPInvokeFrameRegSaveVar) || (varNum == lvaOutgoingArgSpaceVar);
#endif

#if defined(FEATURE_EH_FUNCLETS)
result = result || (varNum == lvaPSPSym);
#endif

return result;
}

//------------------------------------------------------------------------------
// fgVarNeedsExplicitZeroInit : Check whether the variable needs an explicit zero initialization.
//
// Arguments:
// varDsc - local var description
// varNum - local var number
// bbInALoop - true if the basic block may be in a loop
// bbIsReturn - true if the basic block always returns
//
Expand All @@ -4165,13 +4193,20 @@ inline void Compiler::CLR_API_Leave(API_ICorJitInfo_Names ename)
// - compInitMem is set and the variable has a long lifetime or has gc fields.
// In these cases we will insert zero-initialization in the prolog if necessary.

bool Compiler::fgVarNeedsExplicitZeroInit(LclVarDsc* varDsc, bool bbInALoop, bool bbIsReturn)
bool Compiler::fgVarNeedsExplicitZeroInit(unsigned varNum, bool bbInALoop, bool bbIsReturn)
{
LclVarDsc* varDsc = lvaGetDesc(varNum);

if (bbInALoop && !bbIsReturn)
{
return true;
}

if (fgVarIsNeverZeroInitializedInProlog(varNum))
{
return true;
}

if (varTypeIsGC(varDsc->lvType))
{
return false;
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/src/jit/compmemkind.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ CompMemKindMacro(VariableLiveRanges)
CompMemKindMacro(ClassLayout)
CompMemKindMacro(TailMergeThrows)
CompMemKindMacro(EarlyProp)
CompMemKindMacro(ZeroInit)
//clang-format on

#undef CompMemKindMacro
1 change: 1 addition & 0 deletions src/coreclr/src/jit/compphases.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ CompPhaseNameMacro(PHASE_CREATE_FUNCLETS, "Create EH funclets",
CompPhaseNameMacro(PHASE_MERGE_THROWS, "Merge throw blocks", "MRGTHROW", false, -1, false)
CompPhaseNameMacro(PHASE_OPTIMIZE_LAYOUT, "Optimize layout", "LAYOUT", false, -1, false)
CompPhaseNameMacro(PHASE_COMPUTE_REACHABILITY, "Compute blocks reachability", "BL_REACH", false, -1, false)
CompPhaseNameMacro(PHASE_ZERO_INITS, "Redundant zero Inits", "ZERO-INIT", false, -1, false)
CompPhaseNameMacro(PHASE_OPTIMIZE_LOOPS, "Optimize loops", "LOOP-OPT", false, -1, false)
CompPhaseNameMacro(PHASE_CLONE_LOOPS, "Clone loops", "LP-CLONE", false, -1, false)
CompPhaseNameMacro(PHASE_UNROLL_LOOPS, "Unroll loops", "UNROLL", false, -1, false)
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/src/jit/flowgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23787,7 +23787,7 @@ Statement* Compiler::fgInlinePrependStatements(InlineInfo* inlineInfo)
if (tmpNum != BAD_VAR_NUM)
{
LclVarDsc* const tmpDsc = lvaGetDesc(tmpNum);
if (!fgVarNeedsExplicitZeroInit(tmpDsc, bbInALoop, bbIsReturn))
if (!fgVarNeedsExplicitZeroInit(tmpNum, bbInALoop, bbIsReturn))
{
JITDUMP("\nSuppressing zero-init for V%02u -- expect to zero in prolog\n", tmpNum);
tmpDsc->lvSuppressedZeroInit = 1;
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/src/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13883,7 +13883,7 @@ void Compiler::impImportBlockCode(BasicBlock* block)
bool bbIsReturn = (block->bbJumpKind == BBJ_RETURN) &&
(!compIsForInlining() || (impInlineInfo->iciBlock->bbJumpKind == BBJ_RETURN));
LclVarDsc* const lclDsc = lvaGetDesc(lclNum);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to this PR, but this code inexplicably usees both lclDsc and lvaTable[lclNum]

if (fgVarNeedsExplicitZeroInit(lclDsc, bbInALoop, bbIsReturn))
if (fgVarNeedsExplicitZeroInit(lclNum, bbInALoop, bbIsReturn))
{
// Append a tree to zero-out the temp
newObjThisPtr = gtNewLclvNode(lclNum, lvaTable[lclNum].TypeGet());
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/src/jit/objectalloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ unsigned int ObjectAllocator::MorphAllocObjNodeIntoStackAlloc(GenTreeAllocObj* a
bool bbInALoop = (block->bbFlags & BBF_BACKWARD_JUMP) != 0;
bool bbIsReturn = block->bbJumpKind == BBJ_RETURN;
LclVarDsc* const lclDsc = comp->lvaGetDesc(lclNum);
if (comp->fgVarNeedsExplicitZeroInit(lclDsc, bbInALoop, bbIsReturn))
if (comp->fgVarNeedsExplicitZeroInit(lclNum, bbInALoop, bbIsReturn))
{
//------------------------------------------------------------------------
// STMTx (IL 0x... ???)
Expand Down
144 changes: 144 additions & 0 deletions src/coreclr/src/jit/optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9189,3 +9189,147 @@ void Compiler::optOptimizeBools()
fgDebugCheckBBlist();
#endif
}

typedef JitHashTable<unsigned, JitSmallPrimitiveKeyFuncs<unsigned>, unsigned> LclVarRefCounts;

//------------------------------------------------------------------------------------------
// optRemoveRedundantZeroInits: Remove redundant zero intializations.
//
// Notes:
// This phase iterates over basic blocks starting with the first basic block until there is no unique
// basic block successor or until it detects a loop. It keeps track of local nodes it encounters.
// When it gets to an assignment to a local variable or a local field, it checks whether the assignment
// is the first reference to the local (or to the parent of the local field), and, if so,
// it may do one of two optimizations:
// 1. If the following conditions are true:
// the local is untracked,
// the rhs of the assignment is 0,
// the local is guaranteed to be fully initialized in the prolog,
// then the explicit zero initialization is removed.
// 2. If the following conditions are true:
// the assignment is to a local (and not a field),
// the local is not lvLiveInOutOfHndlr or no exceptions can be thrown between the prolog and the assignment,
// either the local has no gc pointers or there are no gc-safe points between the prolog and the assignment,
// then the local with lvHasExplicitInit which tells the codegen not to insert zero initialization for this
// local in the prolog.

void Compiler::optRemoveRedundantZeroInits()
{
#ifdef DEBUG
if (verbose)
{
printf("*************** In optRemoveRedundantZeroInits()\n");
}
#endif // DEBUG

CompAllocator allocator(getAllocator(CMK_ZeroInit));
LclVarRefCounts refCounts(allocator);
bool hasGCSafePoint = false;
bool canThrow = false;

assert(fgStmtListThreaded);

for (BasicBlock* block = fgFirstBB; (block != nullptr) && ((block->bbFlags & BBF_MARKED) == 0);
block = block->GetUniqueSucc())
{
block->bbFlags |= BBF_MARKED;
for (Statement* stmt = block->FirstNonPhiDef(); stmt != nullptr;)
{
Statement* next = stmt->GetNextStmt();
for (GenTree* tree = stmt->GetTreeList(); tree != nullptr; tree = tree->gtNext)
{
if (((tree->gtFlags & GTF_CALL) != 0) && (!tree->IsCall() || !tree->AsCall()->IsSuppressGCTransition()))
{
hasGCSafePoint = true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we have some calls (GTF_CALL_M_SUPPRESS_GC_TRANSITION) that are not safe points? I suppose it is conservatively correct to assume all calls are GC safepoints though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I added a check for calls with GTF_CALL_M_SUPPRESS_GC_TRANSITION. No new diffs in framework and benchmarks.

}

if ((tree->gtFlags & GTF_EXCEPT) != 0)
{
canThrow = true;
}

switch (tree->gtOper)
{
case GT_LCL_VAR:
case GT_LCL_FLD:
case GT_LCL_VAR_ADDR:
case GT_LCL_FLD_ADDR:
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For non-address exposed locals we only care about seeing defs, not uses?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We care about seeing defs when removing explicit zero initializations and we care about seeing uses when marking locals with lvHasExplicitInit. I tried a change where I tracked defs and uses separately and saw no new diffs in framework or benchmarks so will keep it simple.

unsigned lclNum = tree->AsLclVarCommon()->GetLclNum();
unsigned* pRefCount = refCounts.LookupPointer(lclNum);
if (pRefCount != nullptr)
{
*pRefCount = (*pRefCount) + 1;
}
else
{
refCounts.Set(lclNum, 1);
}

break;
}
case GT_ASG:
{
GenTreeOp* treeOp = tree->AsOp();
if (treeOp->gtOp1->OperIs(GT_LCL_VAR, GT_LCL_FLD))
{
unsigned lclNum = treeOp->gtOp1->AsLclVarCommon()->GetLclNum();
LclVarDsc* const lclDsc = lvaGetDesc(lclNum);
unsigned* pRefCount = refCounts.LookupPointer(lclNum);
assert(pRefCount != nullptr);
if (*pRefCount == 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor suggestion - it might be worth a comment (or perhaps add to the comment below) that *pRefCount can never be null, because the local node on the rhs of the assignment must have already been seend.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add a comment with my next PR.

{
// The local hasn't been referenced before this assignment.
bool removedExplicitZeroInit = false;
if (!lclDsc->lvTracked && treeOp->gtOp2->IsIntegralConst(0))
{
bool bbInALoop = (block->bbFlags & BBF_BACKWARD_JUMP) != 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BBF_BACKWARD_JUMP may be too conservative? Seems like we should have enough graph analysis state lying around to detect loops more accurately. Might not be worth the trouble.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked around and found that we call optMarkLoopBlocks but that just sets the weights for the blocks and it doesn't feel right to use that here. I'll leave it for later to try to add a basic block flag and use it in this optimization. I doubt it will make much of a difference.

bool bbIsReturn = block->bbJumpKind == BBJ_RETURN;

if (!fgVarNeedsExplicitZeroInit(lclNum, bbInALoop, bbIsReturn))
{
// We are guaranteed to have a zero initialization in the prolog and
// the local hasn't been redefined between the prolog and this explicit
// zero initialization so the assignment can be safely removed.
if (tree == stmt->GetRootNode())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For non-root assignments can't we just bash the asg tree to a nop? Or if bashing during walking is too painful, keep track of these trees and bash them later?

Or are these rare enough that it's not worth trying to handle them?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that and saw no new diffs in framework and benchmarks.

{
fgRemoveStmt(block, stmt);
removedExplicitZeroInit = true;
*pRefCount = 0;
lclDsc->lvSuppressedZeroInit = 1;
}
}
}

if (!removedExplicitZeroInit && treeOp->gtOp1->OperIs(GT_LCL_VAR) &&
(!canThrow || !lclDsc->lvLiveInOutOfHndlr))
{
// If compMethodRequiresPInvokeFrame() returns true, lower may later
// insert a call to CORINFO_HELP_INIT_PINVOKE_FRAME which is a gc-safe point.
if (!lclDsc->HasGCPtr() ||
(!GetInterruptible() && !hasGCSafePoint && !compMethodRequiresPInvokeFrame()))
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add to your comment above or below and explain why we're checking compMethodRequiresPInvokeFrame here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

// The local hasn't been used and won't be reported to the gc between
// the prolog and this explicit intialization. Therefore, it doesn't
// require zero initialization in the prolog.
lclDsc->lvHasExplicitInit = 1;
}
}
}
}
break;
}
default:
break;
}
}
stmt = next;
}
}

for (BasicBlock* block = fgFirstBB; (block != nullptr) && ((block->bbFlags & BBF_MARKED) != 0);
block = block->GetUniqueSucc())
{
block->bbFlags &= ~BBF_MARKED;
}
}