Chained comparison of two integers against a constant is not coalesced #102103

Open · wants to merge 21 commits into main
280 changes: 219 additions & 61 deletions src/coreclr/jit/optimizebools.cpp
@@ -96,6 +96,7 @@ class OptBoolsDsc
bool optOptimizeCompareChainCondBlock();
bool optOptimizeRangeTests();
bool optOptimizeBoolsReturnBlock(BasicBlock* b3);
bool optOptimizeAndConditionWithEqualityOperator(BasicBlock* b3);
#ifdef DEBUG
void optOptimizeBoolsGcStress();
#endif
@@ -106,6 +107,7 @@ class OptBoolsDsc
bool optOptimizeBoolsChkTypeCostCond();
void optOptimizeBoolsUpdateTrees();
bool FindCompareChain(GenTree* condition, bool* isTestCondition);
void optOptimizeBoolsKeepFirstBlockAndFreeTheRest(bool optReturnBlock);
};

//-----------------------------------------------------------------------------
@@ -1235,68 +1237,14 @@ bool OptBoolsDsc::optOptimizeBoolsChkTypeCostCond()
}

//-----------------------------------------------------------------------------
// optOptimizeBoolsUpdateTrees: Fold the trees based on fold type and comparison type,
// update the edges, and unlink removed blocks
// optOptimizeBoolsKeepFirstBlockAndFreeTheRest: Update the edges and unlink the removed blocks
//
void OptBoolsDsc::optOptimizeBoolsUpdateTrees()
// Arguments:
// optReturnBlock - true if the first block becomes a return block, false if it remains a condition block
//
void OptBoolsDsc::optOptimizeBoolsKeepFirstBlockAndFreeTheRest(bool optReturnBlock)
{
assert(m_b1 != nullptr && m_b2 != nullptr);

bool optReturnBlock = false;
if (m_b3 != nullptr)
{
optReturnBlock = true;
}

assert(m_cmpOp != GT_NONE && m_c1 != nullptr && m_c2 != nullptr);

GenTree* cmpOp1 = m_foldOp == GT_NONE ? m_c1 : m_comp->gtNewOperNode(m_foldOp, m_foldType, m_c1, m_c2);
if (m_testInfo1.isBool && m_testInfo2.isBool)
{
// When we 'OR'/'AND' two booleans, the result is boolean as well
cmpOp1->gtFlags |= GTF_BOOLEAN;
}

GenTree* t1Comp = m_testInfo1.compTree;
t1Comp->SetOper(m_cmpOp);
t1Comp->AsOp()->gtOp1 = cmpOp1;
t1Comp->AsOp()->gtOp2->gtType = m_foldType; // Could have been varTypeIsGC()
if (optReturnBlock)
{
// Update tree when m_b1 is BBJ_COND and m_b2 and m_b3 are GT_RETURN/GT_SWIFT_ERROR_RET (BBJ_RETURN)
t1Comp->AsOp()->gtOp2->AsIntCon()->gtIconVal = 0;
m_testInfo1.testTree->gtOper = m_testInfo2.testTree->OperGet();
m_testInfo1.testTree->gtType = m_testInfo2.testTree->TypeGet();

// Update the return count of flow graph
assert(m_comp->fgReturnCount >= 2);
--m_comp->fgReturnCount;
}

#if FEATURE_SET_FLAGS
// For comparisons against zero we will have the GTF_SET_FLAGS set
// and this can cause an assert to fire in fgMoveOpsLeft(GenTree* tree)
// during the CSE phase.
//
// So make sure to clear any GTF_SET_FLAGS bit on these operations
// as they are no longer feeding directly into a comparisons against zero

// Make sure that the GTF_SET_FLAGS bit is cleared.
// Fix 388436 ARM JitStress WP7
m_c1->gtFlags &= ~GTF_SET_FLAGS;
m_c2->gtFlags &= ~GTF_SET_FLAGS;

// The new top level node that we just created does feed directly into
// a comparison against zero, so set the GTF_SET_FLAGS bit so that
// we generate an instruction that sets the flags, which allows us
// to omit the cmp with zero instruction.

// Request that the codegen for cmpOp1 sets the condition flags
// when it generates the code for cmpOp1.
//
cmpOp1->gtRequestSetFlags();
#endif

// Recost/rethread the tree if necessary
//
if (m_comp->fgNodeThreading != NodeThreading::None)
@@ -1390,6 +1338,73 @@ void OptBoolsDsc::optOptimizeBoolsUpdateTrees()
m_b1->bbCodeOffsEnd = optReturnBlock ? m_b3->bbCodeOffsEnd : m_b2->bbCodeOffsEnd;
}

//-----------------------------------------------------------------------------
// optOptimizeBoolsUpdateTrees: Fold the trees based on fold type and comparison type,
// update the edges, and unlink removed blocks
//
void OptBoolsDsc::optOptimizeBoolsUpdateTrees()
{
assert(m_b1 != nullptr && m_b2 != nullptr);

bool optReturnBlock = false;
if (m_b3 != nullptr)
{
optReturnBlock = true;
}

assert(m_cmpOp != GT_NONE && m_c1 != nullptr && m_c2 != nullptr);

GenTree* cmpOp1 = m_foldOp == GT_NONE ? m_c1 : m_comp->gtNewOperNode(m_foldOp, m_foldType, m_c1, m_c2);
if (m_testInfo1.isBool && m_testInfo2.isBool)
{
// When we 'OR'/'AND' two booleans, the result is boolean as well
cmpOp1->gtFlags |= GTF_BOOLEAN;
}

GenTree* t1Comp = m_testInfo1.compTree;
t1Comp->SetOper(m_cmpOp);
t1Comp->AsOp()->gtOp1 = cmpOp1;
t1Comp->AsOp()->gtOp2->gtType = m_foldType; // Could have been varTypeIsGC()
if (optReturnBlock)
{
// Update tree when m_b1 is BBJ_COND and m_b2 and m_b3 are GT_RETURN/GT_SWIFT_ERROR_RET (BBJ_RETURN)
t1Comp->AsOp()->gtOp2->AsIntCon()->gtIconVal = 0;
m_testInfo1.testTree->gtOper = m_testInfo2.testTree->OperGet();
m_testInfo1.testTree->gtType = m_testInfo2.testTree->TypeGet();

// Update the return count of flow graph
assert(m_comp->fgReturnCount >= 2);
--m_comp->fgReturnCount;
}

#if FEATURE_SET_FLAGS
// For comparisons against zero we will have the GTF_SET_FLAGS set
// and this can cause an assert to fire in fgMoveOpsLeft(GenTree* tree)
// during the CSE phase.
//
// So make sure to clear any GTF_SET_FLAGS bit on these operations
// as they are no longer feeding directly into a comparison against zero

// Make sure that the GTF_SET_FLAGS bit is cleared.
// Fix 388436 ARM JitStress WP7

m_c1->gtFlags &= ~GTF_SET_FLAGS;
m_c2->gtFlags &= ~GTF_SET_FLAGS;

// The new top level node that we just created does feed directly into
// a comparison against zero, so set the GTF_SET_FLAGS bit so that
// we generate an instruction that sets the flags, which allows us
// to omit the cmp with zero instruction.

// Request that the codegen for cmpOp1 sets the condition flags
// when it generates the code for cmpOp1.
//
cmpOp1->gtRequestSetFlags();
#endif

optOptimizeBoolsKeepFirstBlockAndFreeTheRest(optReturnBlock);
}

//-----------------------------------------------------------------------------
// optOptimizeBoolsReturnBlock: Optimize boolean when m_b1 is BBJ_COND and m_b2 and m_b3 are BBJ_RETURN
//
@@ -1771,6 +1786,100 @@ GenTree* OptBoolsDsc::optIsBoolComp(OptTestInfo* pOptTest)
return opr1;
}

//-----------------------------------------------------------------------------
// optOptimizeAndConditionWithEqualityOperator: Optimize boolean when m_b1 is BBJ_COND with EQ on integers, m_b2 is
// BBJ_RETURN with EQ on integers and m_b3 is BBJ_RETURN returning false
//
// Arguments:
// b3: Pointer to basic block b3
//
// Returns:
// true if boolean optimization is done and m_b1, m_b2 and m_b3 are folded into m_b1, else false.
//
// Notes:
// m_b1, m_b2 and m_b3 of OptBoolsDsc are set on entry.
//
// if B1->TargetIs(b3), it transforms
// B1 : brtrue(t1, B3)
// B2 : ret(t2)
// B3 : ret(0)
// to
// B1 : ret((!t1) && t2)
//
bool OptBoolsDsc::optOptimizeAndConditionWithEqualityOperator(BasicBlock* b3)
{
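// On arm/arm64 the compare-chain path (optOptimizeCompareChainCondBlock) already folds these shapes and
// lowers them to conditional compares, which are cheaper than the xor/or pattern built below, so skip
// this transformation there.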
#if defined(TARGET_ARM) || defined(TARGET_ARM64)
return false;
#endif
Member:

Why this check? There should be a comment why the transformation isn't profitable on these platforms.

Member:

@pedrobsaila as well..

The JIT already supports creating conditional chains. It was implemented by @a74nh in #79283. It is only enabled on arm64 because only arm64 has conditional compare.

Conditional compare is going to be better than the xor pattern from the example, that's probably why this is a regression on arm64. Also, x64/x86 is getting conditional compare as part of Intel APX, so it is expected that we will enable the same logic for x86/x64 in the future. Given this and the small diffs of this PR I'm not sure the complexity is warranted.

If we want to have this transformation then it should be done by enabling optOptimizeCompareChainCondBlock for x86/x64. Then the backend should be taught how to translate this pattern to something more profitable on x86/x64. Currently it only knows how to do that for arm64. The transformation is done by TryLowerAndOrToCCMP. x86/x64 should have a similar version that is translated by using xor instead. Most likely optOptimizeCompareChainCondBlock will have to be restricted on the patterns it allows to combine on x86/x64 since there is no true conditional compare yet.

Contributor @neon-sunset (Jul 1, 2024):

(I removed my comment because I misread this feedback as asking the author rather than a nudge to provide context on disabling on ARM64, though I was hoping to see a much larger amount of diffs - this pattern is pretty common after all)

Indeed ccmp is emitted by both Clang and GCC: https://godbolt.org/z/srhqPrK4v

Contributor Author @pedrobsaila (Jul 10, 2024):

Why this check?

Yes, the ARM support for branchless conditional select is what motivated the if; the xor pattern ends up being more expensive in this case.

If we want to have this transformation then it should be done by enabling optOptimizeCompareChainCondBlock for x86/x64

If I understand the code correctly, this would optimize comparison checks for conditional blocks but not for return ones. Do I need to delete the optimization for the latter?

Member:

I think it's ok to add some handling for return blocks if the existing handling doesn't cover it. But I think you should transform it into bitwise ops (i.e. return (x == 0x80) && (y == 0x80) => return (x == 0x80) & (y == 0x80)) so that the remaining handling falls out naturally from TryLowerAndOrToCCMP (and the x64 specific variant being introduced).
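For readers following the thread: the transformation under discussion rests on a plain bit identity, independent of the JIT internals. The sketch below is an editorial illustration, not part of the diff; the function names are made up, and it only checks that the xor/or form agrees with the short-circuit form for side-effect-free integer operands.

```cpp
// (x == a && y == b)  <=>  ((x ^ a) | (y ^ b)) == 0, because x ^ a is zero exactly
// when x == a, and an OR of two values is zero exactly when both inputs are zero.
// Negating both sides gives (x != a || y != b)  <=>  ((x ^ a) | (y ^ b)) != 0.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static bool chainedCompare(int32_t x, int32_t y)
{
    return x == 128 && y == 89; // short-circuit form: two compares, an extra branch before folding
}

static bool xorOrFold(int32_t x, int32_t y)
{
    return ((x ^ 128) | (y ^ 89)) == 0; // folded form: a single compare against zero
}

int main()
{
    for (int32_t x : {0, 89, 128, -1, INT32_MIN})
        for (int32_t y : {0, 89, 128, -1, INT32_MIN})
            assert(chainedCompare(x, y) == xorOrFold(x, y));
    return 0;
}
```

On arm64 the same chain already lowers to conditional compares (ccmp) through TryLowerAndOrToCCMP, which is the motivation for the arm/arm64 early return in the new function.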


Statement* const s1 = optOptimizeBoolsChkBlkCond();
if (s1 == nullptr)
{
return false;
}

GenTree* cond1 = m_testInfo1.GetTestOp();
GenTree* cond2 = m_testInfo2.GetTestOp();

ssize_t block3RetVal = m_b3->firstStmt()->GetRootNode()->AsOp()->GetReturnValue()->AsIntCon()->IconValue();

if (!cond1->OperIs(GT_NE) || !cond2->OperIs(GT_EQ, GT_NE) || (block3RetVal != 0 && cond2->OperIs(GT_EQ)) ||
(block3RetVal != 1 && cond2->OperIs(GT_NE)))
{
return false;
}

GenTree* op11 = cond1->AsOp()->gtOp1;
GenTree* op12 = cond1->AsOp()->gtOp2;
GenTree* op21 = cond2->AsOp()->gtOp1;
GenTree* op22 = cond2->AsOp()->gtOp2;

if (!op11->OperIs(GT_LCL_VAR, GT_CNS_INT) || !op12->OperIs(GT_LCL_VAR, GT_CNS_INT) ||
!op21->OperIs(GT_LCL_VAR, GT_CNS_INT) || !op22->OperIs(GT_LCL_VAR, GT_CNS_INT) || !op11->TypeIs(TYP_INT) ||
!op12->TypeIs(TYP_INT) || !op21->TypeIs(TYP_INT) || !op22->TypeIs(TYP_INT))
{
return false;
}

GenTreeOp* xorOp1 = m_comp->gtNewOperNode(GT_XOR, TYP_INT, op11, op12);
xorOp1->gtFlags |= GTF_BOOLEAN;
xorOp1->gtPrev = op12;
op12->gtNext = xorOp1;
op12->gtPrev = op11;
op11->gtNext = op12;
op11->gtPrev = nullptr;
GenTreeOp* xorOp2 = m_comp->gtNewOperNode(GT_XOR, TYP_INT, op21, op22);
xorOp2->gtFlags |= GTF_BOOLEAN;
xorOp2->gtPrev = op22;
op22->gtNext = xorOp2;
op22->gtPrev = op21;
op21->gtNext = op22;

GenTreeOp* branchlessOrEqOp = m_comp->gtNewOperNode(GT_OR, TYP_INT, xorOp1, xorOp2);
branchlessOrEqOp->gtFlags |= GTF_BOOLEAN;
branchlessOrEqOp->gtPrev = xorOp2;
xorOp2->gtNext = branchlessOrEqOp;
op21->gtPrev = xorOp1;
xorOp1->gtNext = op21;

GenTreeIntCon* trueOp = m_comp->gtNewFalse();
m_testInfo1.compTree->SetOper(cond2->OperIs(GT_EQ) ? GT_EQ : GT_NE);
m_testInfo1.compTree->gtFlags |= GTF_BOOLEAN;
m_testInfo1.compTree->gtType = TYP_INT;
m_testInfo1.compTree->AsOp()->gtOp1 = branchlessOrEqOp;
m_testInfo1.compTree->AsOp()->gtOp2 = trueOp;
m_testInfo1.compTree->gtPrev = trueOp;
trueOp->gtNext = m_testInfo1.compTree;
trueOp->gtPrev = branchlessOrEqOp;
branchlessOrEqOp->gtNext = trueOp;

m_testInfo1.testTree->gtOper = m_testInfo2.testTree->OperGet();
m_testInfo1.testTree->gtType = m_testInfo2.testTree->TypeGet();

optOptimizeBoolsKeepFirstBlockAndFreeTheRest(true);
return true;
}

//-----------------------------------------------------------------------------
// optOptimizeBools: Folds boolean conditionals for GT_JTRUE/GT_RETURN/GT_SWIFT_ERROR_RET nodes
//
@@ -1894,6 +2003,54 @@ GenTree* OptBoolsDsc::optIsBoolComp(OptTestInfo* pOptTest)
// +--* LCL_VAR int V00 arg0
// \--* CNS_INT int 0
//
// Case 16: (x == 128 && y == 89) => ((x^128) | (y^89)) == 0
// * RETURN int $VN.Void
// \--* EQ int
// +--* OR int
// | +--* XOR int
// | | +--* LCL_VAR int V00 arg0
// | | \--* CNS_INT int 128 $43
// | \--* XOR int
// | +--* LCL_VAR int V01 arg1
// | \--* CNS_INT int 89 $44
// \--* CNS_INT int 0
//
// Case 17: (x == y && v == z) => ((x^y) | (v^z)) == 0
// * RETURN int $VN.Void
// \--* EQ int
// +--* OR int
// | +--* XOR int
// | | +--* LCL_VAR int V00 arg0
// | | \--* LCL_VAR int V01 arg1
// | \--* XOR int
// | +--* LCL_VAR int V02 arg2
// | \--* LCL_VAR int V03 arg3
// \--* CNS_INT int 0
//
// Case 18: (x != 128 || y != 89) => ((x^128) | (y^89)) != 0
// * RETURN int $VN.Void
// \--* NE int
// +--* OR int
// | +--* XOR int
// | | +--* LCL_VAR int V00 arg0
// | | \--* CNS_INT int 128 $43
// | \--* XOR int
// | +--* LCL_VAR int V01 arg1
// | \--* CNS_INT int 89 $44
// \--* CNS_INT int 0
//
// Case 19: (x != y || v != z) => ((x^y) | (v^z)) != 0
// * RETURN int $VN.Void
// \--* NE int
// +--* OR int
// | +--* XOR int
// | | +--* LCL_VAR int V00 arg0
// | | \--* LCL_VAR int V01 arg1
// | \--* XOR int
// | +--* LCL_VAR int V02 arg2
// | \--* LCL_VAR int V03 arg3
// \--* CNS_INT int 0
//
// Patterns that are not optimized include (x == 1 && y == 1), (x == 1 || y == 1),
// (x == 0 || y == 0) because currently their compTree is not marked as a boolean expression.
// When m_foldOp == GT_AND or m_cmpOp == GT_NE, both compTrees must be boolean expression
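For orientation, the new cases correspond to the following source-level shapes; this is an editorial summary with made-up function names, not the PR's actual test cases.

```cpp
// Shapes matched by Cases 16-19, with the folded forms the JIT builds for them:
bool case16(int x, int y)               { return x == 128 && y == 89; } // ((x^128) | (y^89)) == 0
bool case17(int x, int y, int v, int z) { return x == y && v == z; }    // ((x^y) | (v^z)) == 0
bool case18(int x, int y)               { return x != 128 || y != 89; } // ((x^128) | (y^89)) != 0
bool case19(int x, int y, int v, int z) { return x != y || v != z; }    // ((x^y) | (v^z)) != 0
// Per the note above, shapes such as (x == 1 && y == 1) are left alone because their
// compTree is not marked as a boolean expression.
```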
@@ -2001,7 +2158,8 @@ PhaseStatus Compiler::optOptimizeBools()
continue;
}

if (optBoolsDsc.optOptimizeBoolsReturnBlock(b3))
if (optBoolsDsc.optOptimizeBoolsReturnBlock(b3) ||
optBoolsDsc.optOptimizeAndConditionWithEqualityOperator(b3))
{
change = true;
numReturn++;