From f3d84a2ab79937df6a68d580226ef0c204aa0f56 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Wed, 12 May 2021 19:53:06 -0700 Subject: [PATCH 1/4] JIT: peel off dominant switch case under PGO If we have PGO data and the dominant non-default switch case has more than 30% of the profile, add an explicit test for that case upstream of the switch. We don't see switches all that often anymore as CSC is quite aggressive about turning them into if-then-else trees, but they still show up in the async methods. --- src/coreclr/jit/block.h | 80 +++++++++--------- src/coreclr/jit/fgopt.cpp | 143 +++++++++++++++++++++++++------ src/coreclr/jit/fgprofile.cpp | 154 ++++++++++++++++++++++++++++++++++ 3 files changed, 312 insertions(+), 65 deletions(-) diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index dfc3d53fc25c1..352a2dea3291d 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -85,44 +85,7 @@ class typeInfo; struct BasicBlockList; struct flowList; struct EHblkDsc; - -/***************************************************************************** - * - * The following describes a switch block. - * - * Things to know: - * 1. If bbsHasDefault is true, the default case is the last one in the array of basic block addresses - * namely bbsDstTab[bbsCount - 1]. - * 2. bbsCount must be at least 1, for the default case. bbsCount cannot be zero. It appears that the ECMA spec - * allows for a degenerate switch with zero cases. Normally, the optimizer will optimize degenerate - * switches with just a default case to a BBJ_ALWAYS branch, and a switch with just two cases to a BBJ_COND. - * However, in debuggable code, we might not do that, so bbsCount might be 1. - */ -struct BBswtDesc -{ - BasicBlock** bbsDstTab; // case label table address - unsigned bbsCount; // count of cases (includes 'default' if bbsHasDefault) - bool bbsHasDefault; - - BBswtDesc() : bbsHasDefault(true) - { - } - - void removeDefault() - { - assert(bbsHasDefault); - assert(bbsCount > 0); - bbsHasDefault = false; - bbsCount--; - } - - BasicBlock* getDefault() - { - assert(bbsHasDefault); - assert(bbsCount > 0); - return bbsDstTab[bbsCount - 1]; - } -}; +struct BBswtDesc; struct StackEntry { @@ -1291,6 +1254,47 @@ struct BasicBlockList } }; +// BBswtDesc -- descriptor for a switch block +// +// Things to know: +// 1. If bbsHasDefault is true, the default case is the last one in the array of basic block addresses +// namely bbsDstTab[bbsCount - 1]. +// 2. bbsCount must be at least 1, for the default case. bbsCount cannot be zero. It appears that the ECMA spec +// allows for a degenerate switch with zero cases. Normally, the optimizer will optimize degenerate +// switches with just a default case to a BBJ_ALWAYS branch, and a switch with just two cases to a BBJ_COND. +// However, in debuggable code, we might not do that, so bbsCount might be 1. +// +struct BBswtDesc +{ + BasicBlock** bbsDstTab; // case label table address + unsigned bbsCount; // count of cases (includes 'default' if bbsHasDefault) + unsigned bbsDominantCase; + BasicBlock::weight_t bbsDominantFraction; + bool bbsHasDefault; + bool bbsHasDominantCase; + + BBswtDesc() : bbsHasDefault(true), bbsHasDominantCase(false) + { + } + + void removeDefault() + { + assert(bbsHasDefault); + assert(bbsCount > 0); + bbsHasDefault = false; + bbsCount--; + } + + BasicBlock* getDefault() + { + assert(bbsHasDefault); + assert(bbsCount > 0); + return bbsDstTab[bbsCount - 1]; + } +}; + +// flowList -- control flow edge +// struct flowList { public: diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index dc4774286456d..651d1bc87954e 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -3726,47 +3726,136 @@ bool Compiler::fgOptimizeBranch(BasicBlock* bJump) return true; } -/***************************************************************************** - * - * Function called to optimize switch statements - */ - +//----------------------------------------------------------------------------- +// fgOptimizeSwitchJump: see if a switch has a dominant case, and modify to +// check for that case up front (aka switch peeling). +// +// Returns: +// True if the switch now has an upstream check for the dominant case. +// bool Compiler::fgOptimizeSwitchJumps() { - bool result = false; // Our return value - -#if 0 - // TODO-CQ: Add switch jump optimizations? if (!fgHasSwitch) + { return false; + } - if (!fgHaveValidEdgeWeights) - return false; + bool modified = false; - for (BasicBlock* bSrc = fgFirstBB; bSrc != NULL; bSrc = bSrc->bbNext) + for (BasicBlock* block = fgFirstBB; block != NULL; block = block->bbNext) { - if (bSrc->bbJumpKind == BBJ_SWITCH) + if (block->IsLIR()) { - unsigned jumpCnt; jumpCnt = bSrc->bbJumpSwt->bbsCount; - BasicBlock** jumpTab; jumpTab = bSrc->bbJumpSwt->bbsDstTab; + break; + } - do - { - BasicBlock* bDst = *jumpTab; - flowList* edgeToDst = fgGetPredForBlock(bDst, bSrc); - double outRatio = (double) edgeToDst->edgeWeightMin() / (double) bSrc->bbWeight; + if (block->isRunRarely()) + { + continue; + } - if (outRatio >= 0.60) - { - // straighten switch here... - } + if (block->bbJumpKind != BBJ_SWITCH) + { + continue; + } + + if (!block->bbJumpSwt->bbsHasDominantCase) + { + continue; + } + + // We currently will only see dominant cases with PGO. + // + assert(block->hasProfileWeight()); + + const unsigned dominantCase = block->bbJumpSwt->bbsDominantCase; + + JITDUMP(FMT_BB " has switch with dominant case %u, considering peeling\n", block->bbNum, dominantCase); + + // The dominant case should not be the default case, as we already peel that one. + // + assert(dominantCase < (block->bbJumpSwt->bbsCount - 1)); + BasicBlock* const dominantTarget = block->bbJumpSwt->bbsDstTab[dominantCase]; + Statement* const switchStmt = block->lastStmt(); + GenTree* const switchTree = switchStmt->GetRootNode(); + assert(switchTree->OperIs(GT_SWITCH)); + GenTree* const switchValue = switchTree->AsOp()->gtGetOp1(); + + // Split the switch block just before at the switch. + // + // After this, newBlock is the switch block, and + // block is the upstream block. + // + BasicBlock* newBlock = nullptr; + + if (block->firstStmt() == switchStmt) + { + newBlock = fgSplitBlockAtBeginning(block); + } + else + { + newBlock = fgSplitBlockAfterStatement(block, switchStmt->GetPrevStmt()); + } + + // Set up a compare in the upstream block, "stealing" the switch value tree. + // + GenTree* const dominantCaseCompare = gtNewOperNode(GT_EQ, TYP_INT, switchValue, gtNewIconNode(dominantCase)); + GenTree* const jmpTree = gtNewOperNode(GT_JTRUE, TYP_VOID, dominantCaseCompare); + Statement* const jmpStmt = fgNewStmtFromTree(jmpTree, switchStmt->GetILOffsetX()); + fgInsertStmtAtEnd(block, jmpStmt); + dominantCaseCompare->gtFlags |= GTF_RELOP_JMP_USED; + + // Reattach switch value to the switch. This may introduce a comma + // in the upstream compare tree, if the switch value expression is complex. + // + switchTree->AsOp()->gtOp1 = fgMakeMultiUse(&dominantCaseCompare->AsOp()->gtOp1); + + // Update flags + // + dominantCaseCompare->gtFlags |= dominantCaseCompare->AsOp()->gtOp1->gtFlags; + jmpTree->gtFlags |= dominantCaseCompare->gtFlags; + dominantCaseCompare->gtFlags |= GTF_RELOP_JMP_USED; + + // Wire up the new control flow. + // + block->bbJumpKind = BBJ_COND; + block->bbJumpDest = dominantTarget; + flowList* const blockToTargetEdge = fgAddRefPred(dominantTarget, block); + flowList* const blockToNewBlockEdge = newBlock->bbPreds; + assert(blockToNewBlockEdge->getBlock() == block); + assert(blockToTargetEdge->getBlock() == block); + + // Update profile data + // + const BasicBlock::weight_t fraction = newBlock->bbJumpSwt->bbsDominantFraction; + const BasicBlock::weight_t blockToTargetWeight = block->bbWeight * fraction; + const BasicBlock::weight_t blockToNewBlockWeight = block->bbWeight - blockToTargetWeight; + + newBlock->setBBProfileWeight(blockToNewBlockWeight); + + blockToTargetEdge->setEdgeWeights(blockToTargetWeight, blockToTargetWeight, dominantTarget); + blockToNewBlockEdge->setEdgeWeights(blockToNewBlockWeight, blockToNewBlockWeight, block); + + for (flowList* pred = dominantTarget->bbPreds; pred != nullptr; pred = pred->flNext) + { + if (pred->getBlock() == newBlock) + { + assert(pred->flDupCount == 1); + pred->setEdgeWeights(BB_ZERO_WEIGHT, BB_ZERO_WEIGHT, block); } - while (++jumpTab, --jumpCnt); } + + // For now we leave the switch as is, since there's no way + // to indicate that one of the cases is now unreachable. + // + // But it no longer has a dominant case. + // + newBlock->bbJumpSwt->bbsHasDominantCase = false; + + modified = true; } -#endif - return result; + return modified; } //----------------------------------------------------------------------------- diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index 3fb8254baf572..5841ecf98c9f3 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -2031,6 +2031,9 @@ class EfficientEdgeCountReconstructor : public SpanningTreeVisitor block->bbSparseCountInfo = info; } + void MarkInterestingBlocks(BasicBlock* block, BlockInfo* info); + void MarkInterestingSwitches(BasicBlock* block, BlockInfo* info); + // Flags for noting and handling various error cases. // bool m_badcode; @@ -2531,7 +2534,158 @@ void EfficientEdgeCountReconstructor::Propagate() assert(info->m_weightKnown); m_comp->fgSetProfileWeight(block, info->m_weight); + + // Mark blocks that might be worth optimizing further, given + // what we know about the PGO data. + // + MarkInterestingBlocks(block, info); + } +} + +//------------------------------------------------------------------------ +// EfficientEdgeCountReconstructor::MarkInterestingBlocks: look for blocks +// that are worth specially optimizing, given the block and edge profile data +// +// Arguments: +// block - block of interest +// info - associated block info +// +// Notes: +// We do this during reconstructdion because we have a clean look at the edge +// weights. If we defer until we recompute edge weights later we may fail to solve +// for them. +// +// Someday we'll keep the edge profile info viable all throughout compilation and +// we can defer this screening until later. Doing so will catch more cases as +// optimizations can sharpen the profile data. +// +void EfficientEdgeCountReconstructor::MarkInterestingBlocks(BasicBlock* block, BlockInfo* info) +{ + switch (block->bbJumpKind) + { + case BBJ_SWITCH: + MarkInterestingSwitches(block, info); + break; + + default: + break; + } +} + +//------------------------------------------------------------------------ +// EfficientEdgeCountReconstructor::MarkInterestingSwitches: look for switch blocks +// that are worth specially optimizing, given the block and edge profile data +// +// Arguments: +// block - block of interest +// info - associated block info +// +// Notes: +// See if one of the non-default switch cases dominates and should be peeled +// from the switch during flow opts. +// +// If so, information is added to the bbJmpSwt for the block for use later. +// +void EfficientEdgeCountReconstructor::MarkInterestingSwitches(BasicBlock* block, BlockInfo* info) +{ + assert(block->bbJumpKind == BBJ_SWITCH); + + const BasicBlock::weight_t sufficientSamples = 100.0f; + const BasicBlock::weight_t sufficientFraction = 0.3f; + + if (info->m_weight < sufficientSamples) + { + return; + } + + JITDUMP("Switch in " FMT_BB " was hit " FMT_WT " >= " FMT_WT " times, checking for dominant edge\n", block->bbNum, + info->m_weight, sufficientSamples); + Edge* dominantEdge = info->m_outgoingEdges; + + for (Edge* edge = dominantEdge->m_nextOutgoingEdge; edge != nullptr; edge = edge->m_nextOutgoingEdge) + { + if (edge->m_weightKnown) + { + if (!dominantEdge->m_weightKnown || (edge->m_weight > dominantEdge->m_weight)) + { + dominantEdge = edge; + } + } + } + + if (!dominantEdge->m_weightKnown) + { + JITDUMP("No edges with known counts, sorry\n"); + return; + } + + BasicBlock::weight_t fraction = dominantEdge->m_weight / info->m_weight; + + // Because of count inconsistency we can see nonsensical ratios. Cap these. + // + if (fraction > 1.0) + { + fraction = 1.0; + } + + if (fraction < sufficientFraction) + { + JITDUMP("Maximum edge likelihood is " FMT_WT " < " FMT_WT "; not sufficient to trigger peeling)\n", fraction, + sufficientFraction); + return; } + + // Despite doing "edge" instrumentation, we only use a single edge probe for a given successor block. + // Multiple switch cases may lead to this block. So we also need to show that there's just one switch + // case that can lead to the dominant edge's target block. + // + // If it turns out often we fail at this stage, we might consider building a histogram of switch case + // values at runtime, similar to what we do for classes at virtual call sites. + // + const unsigned caseCount = block->bbJumpSwt->bbsCount; + BasicBlock** const jumpTab = block->bbJumpSwt->bbsDstTab; + unsigned dominantCase = caseCount; + + for (unsigned i = 0; i < caseCount; i++) + { + if (jumpTab[i] == dominantEdge->m_targetBlock) + { + if (dominantCase != caseCount) + { + JITDUMP("Both case %u and %u lead to " FMT_BB "-- can't optimize\n", i, dominantCase, + jumpTab[i]->bbNum); + dominantCase = caseCount; + break; + } + + dominantCase = i; + } + } + + if (dominantCase == caseCount) + { + // Multiple (or no) cases lead to the dominant case target. + // + return; + } + + if (block->bbJumpSwt->bbsHasDefault && (dominantCase == caseCount - 1)) + { + // Dominant case is the default case. + // This effectively gets peeled already, so defer. + // + JITDUMP("Default case %u uniquely leads to target " FMT_BB " of dominant edge, so will be peeled already\n", + dominantCase, dominantEdge->m_targetBlock->bbNum); + return; + } + + JITDUMP("Non-default case %u uniquely leads to target " FMT_BB " of dominant edge with likelihood " FMT_WT + "; marking for peeling\n", + dominantCase, dominantEdge->m_targetBlock->bbNum, fraction); + + block->bbJumpSwt->bbsHasDominantCase = true; + block->bbJumpSwt->bbsDominantCase = dominantCase; + block->bbJumpSwt->bbsDominantFraction = fraction; } //------------------------------------------------------------------------ From d85b84f19e1d744b6ed81bfcca5f64804299eb81 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 17 May 2021 12:44:06 -0700 Subject: [PATCH 2/4] fix edge weight update; review feedback --- src/coreclr/jit/fgopt.cpp | 31 +++++++++++++++++++++++++++++-- src/coreclr/jit/fgprofile.cpp | 4 ++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 651d1bc87954e..b31d6744bfc65 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -3836,12 +3836,39 @@ bool Compiler::fgOptimizeSwitchJumps() blockToTargetEdge->setEdgeWeights(blockToTargetWeight, blockToTargetWeight, dominantTarget); blockToNewBlockEdge->setEdgeWeights(blockToNewBlockWeight, blockToNewBlockWeight, block); + // There may be other switch cases that lead to this same block, but there's just + // one edge in the flowgraph. So we need to subtract off the profile data that now flows + // along the peeled edge. + // for (flowList* pred = dominantTarget->bbPreds; pred != nullptr; pred = pred->flNext) { if (pred->getBlock() == newBlock) { - assert(pred->flDupCount == 1); - pred->setEdgeWeights(BB_ZERO_WEIGHT, BB_ZERO_WEIGHT, block); + if (pred->flDupCount == 1) + { + // The only switch case leading to the dominant target was the one we peeled. + // So the edge from the switch now has zero weight. + // + pred->setEdgeWeights(BB_ZERO_WEIGHT, BB_ZERO_WEIGHT, dominantTarget); + } + else + { + // Other switch cases also lead to the dominant target. + // Subtract off the weight we transferred to the peel. + // + BasicBlock::weight_t newMinWeight = pred->edgeWeightMin() - blockToTargetWeight; + BasicBlock::weight_t newMaxWeight = pred->edgeWeightMax() - blockToTargetWeight; + + if (newMinWeight < BB_ZERO_WEIGHT) + { + newMinWeight = BB_ZERO_WEIGHT; + } + if (newMaxWeight < BB_ZERO_WEIGHT) + { + newMaxWeight = BB_ZERO_WEIGHT; + } + pred->setEdgeWeights(newMinWeight, newMaxWeight, dominantTarget); + } } } diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index 5841ecf98c9f3..ab14388965a02 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -2551,7 +2551,7 @@ void EfficientEdgeCountReconstructor::Propagate() // info - associated block info // // Notes: -// We do this during reconstructdion because we have a clean look at the edge +// We do this during reconstruction because we have a clean look at the edge // weights. If we defer until we recompute edge weights later we may fail to solve // for them. // @@ -2590,7 +2590,7 @@ void EfficientEdgeCountReconstructor::MarkInterestingSwitches(BasicBlock* block, { assert(block->bbJumpKind == BBJ_SWITCH); - const BasicBlock::weight_t sufficientSamples = 100.0f; + const BasicBlock::weight_t sufficientSamples = 30.0f; const BasicBlock::weight_t sufficientFraction = 0.3f; if (info->m_weight < sufficientSamples) From 2e11c198e2301f59d7b2dbca2c5d4b168169e57a Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 17 May 2021 14:55:03 -0700 Subject: [PATCH 3/4] fix profile update when coalescing blocks --- src/coreclr/jit/fgopt.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index b31d6744bfc65..0f93857136637 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1803,25 +1803,26 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) } } - // Note we could update the local variable weights here by - // calling lvaMarkLocalVars, with the block and weight adjustment. - // If either block or bNext has a profile weight // or if both block and bNext have non-zero weights - // then we select the highest weight block. + // then we wil use the max weight for the block. + // + const bool hasProfileWeight = block->hasProfileWeight() || bNext->hasProfileWeight(); + const bool hasNonZeroWeight = (block->bbWeight > BB_ZERO_WEIGHT) || (bNext->bbWeight > BB_ZERO_WEIGHT); - if (block->hasProfileWeight() || bNext->hasProfileWeight() || (block->bbWeight && bNext->bbWeight)) + if (hasProfileWeight || hasNonZeroWeight) { - // We are keeping block so update its fields - // when bNext has a greater weight + BasicBlock::weight_t const newWeight = max(block->bbWeight, bNext->bbWeight); - if (block->bbWeight < bNext->bbWeight) + if (hasProfileWeight) { - block->bbWeight = bNext->bbWeight; - - block->bbFlags |= (bNext->bbFlags & BBF_PROF_WEIGHT); // Set the profile weight flag (if necessary) - assert(block->bbWeight != BB_ZERO_WEIGHT); - block->bbFlags &= ~BBF_RUN_RARELY; // Clear any RarelyRun flag + block->setBBProfileWeight(newWeight); + } + else + { + assert(newWeight != BB_ZERO_WEIGHT); + block->bbWeight = newWeight; + block->bbFlags &= ~BBF_RUN_RARELY; } } // otherwise if either block has a zero weight we select the zero weight From 8f3e5fc23ea0ee357f789dc12569982217d23d94 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 18 May 2021 12:18:54 -0700 Subject: [PATCH 4/4] Review feeback. Also, up the minimal fraction to 0.55 based on some simple local benchmarking. --- src/coreclr/jit/block.cpp | 53 +++++++++++++++++--- src/coreclr/jit/block.h | 84 +++++++++++++++++--------------- src/coreclr/jit/fgdiagnostic.cpp | 37 +++++++++----- src/coreclr/jit/fgopt.cpp | 14 +++--- src/coreclr/jit/fgprofile.cpp | 39 ++++++++++----- src/coreclr/jit/optimizer.cpp | 9 +--- 6 files changed, 150 insertions(+), 86 deletions(-) diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index 5266bed750735..8778245a932ea 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -646,19 +646,32 @@ void BasicBlock::dspJumpKind() break; case BBJ_SWITCH: + { printf(" ->"); - unsigned jumpCnt; - jumpCnt = bbJumpSwt->bbsCount; - BasicBlock** jumpTab; - jumpTab = bbJumpSwt->bbsDstTab; - do + const unsigned jumpCnt = bbJumpSwt->bbsCount; + BasicBlock** const jumpTab = bbJumpSwt->bbsDstTab; + + for (unsigned i = 0; i < jumpCnt; i++) { - printf("%c" FMT_BB, (jumpTab == bbJumpSwt->bbsDstTab) ? ' ' : ',', (*jumpTab)->bbNum); - } while (++jumpTab, --jumpCnt); + printf("%c" FMT_BB, (i == 0) ? ' ' : ',', jumpTab[i]->bbNum); + + const bool isDefault = bbJumpSwt->bbsHasDefault && (i == jumpCnt - 1); + if (isDefault) + { + printf("[def]"); + } + + const bool isDominant = bbJumpSwt->bbsHasDominantCase && (i == bbJumpSwt->bbsDominantCase); + if (isDominant) + { + printf("[dom(" FMT_WT ")]", bbJumpSwt->bbsDominantFraction); + } + } printf(" (switch)"); - break; + } + break; default: unreached(); @@ -1630,3 +1643,27 @@ bool BasicBlock::hasEHBoundaryOut() return returnVal; } + +//------------------------------------------------------------------------ +// BBswtDesc copy ctor: copy a switch descriptor +// +// Arguments: +// comp - compiler instance +// other - existing switch descriptor to copy +// +BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other) + : bbsDstTab(nullptr) + , bbsCount(other->bbsCount) + , bbsDominantCase(other->bbsDominantCase) + , bbsDominantFraction(other->bbsDominantFraction) + , bbsHasDefault(other->bbsHasDefault) + , bbsHasDominantCase(other->bbsHasDominantCase) +{ + // Allocate and fill in a new dst tab + // + bbsDstTab = new (comp, CMK_BasicBlock) BasicBlock*[bbsCount]; + for (unsigned i = 0; i < bbsCount; i++) + { + bbsDstTab[i] = other->bbsDstTab[i]; + } +} diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index 352a2dea3291d..ab05f3de3a7bb 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -1198,6 +1198,51 @@ typedef JitHashTable, BlkVector> BlkToBl // Map from Block to Block. Used for a variety of purposes. typedef JitHashTable, BasicBlock*> BlockToBlockMap; +// BBswtDesc -- descriptor for a switch block +// +// Things to know: +// 1. If bbsHasDefault is true, the default case is the last one in the array of basic block addresses +// namely bbsDstTab[bbsCount - 1]. +// 2. bbsCount must be at least 1, for the default case. bbsCount cannot be zero. It appears that the ECMA spec +// allows for a degenerate switch with zero cases. Normally, the optimizer will optimize degenerate +// switches with just a default case to a BBJ_ALWAYS branch, and a switch with just two cases to a BBJ_COND. +// However, in debuggable code, we might not do that, so bbsCount might be 1. +// +struct BBswtDesc +{ + BasicBlock** bbsDstTab; // case label table address + unsigned bbsCount; // count of cases (includes 'default' if bbsHasDefault) + + // Case number and likelihood of most likely case + // (only known with PGO, only valid if bbsHasDominantCase is true) + unsigned bbsDominantCase; + BasicBlock::weight_t bbsDominantFraction; + + bool bbsHasDefault; // true if last switch case is a default case + bool bbsHasDominantCase; // true if switch has a dominant case + + BBswtDesc() : bbsHasDefault(true), bbsHasDominantCase(false) + { + } + + BBswtDesc(Compiler* comp, const BBswtDesc* other); + + void removeDefault() + { + assert(bbsHasDefault); + assert(bbsCount > 0); + bbsHasDefault = false; + bbsCount--; + } + + BasicBlock* getDefault() + { + assert(bbsHasDefault); + assert(bbsCount > 0); + return bbsDstTab[bbsCount - 1]; + } +}; + // In compiler terminology the control flow between two BasicBlocks // is typically referred to as an "edge". Most well known are the // backward branches for loops, which are often called "back-edges". @@ -1254,45 +1299,6 @@ struct BasicBlockList } }; -// BBswtDesc -- descriptor for a switch block -// -// Things to know: -// 1. If bbsHasDefault is true, the default case is the last one in the array of basic block addresses -// namely bbsDstTab[bbsCount - 1]. -// 2. bbsCount must be at least 1, for the default case. bbsCount cannot be zero. It appears that the ECMA spec -// allows for a degenerate switch with zero cases. Normally, the optimizer will optimize degenerate -// switches with just a default case to a BBJ_ALWAYS branch, and a switch with just two cases to a BBJ_COND. -// However, in debuggable code, we might not do that, so bbsCount might be 1. -// -struct BBswtDesc -{ - BasicBlock** bbsDstTab; // case label table address - unsigned bbsCount; // count of cases (includes 'default' if bbsHasDefault) - unsigned bbsDominantCase; - BasicBlock::weight_t bbsDominantFraction; - bool bbsHasDefault; - bool bbsHasDominantCase; - - BBswtDesc() : bbsHasDefault(true), bbsHasDominantCase(false) - { - } - - void removeDefault() - { - assert(bbsHasDefault); - assert(bbsCount > 0); - bbsHasDefault = false; - bbsCount--; - } - - BasicBlock* getDefault() - { - assert(bbsHasDefault); - assert(bbsCount > 0); - return bbsDstTab[bbsCount - 1]; - } -}; - // flowList -- control flow edge // struct flowList diff --git a/src/coreclr/jit/fgdiagnostic.cpp b/src/coreclr/jit/fgdiagnostic.cpp index 507f92536c287..f9f9d3c86b6d0 100644 --- a/src/coreclr/jit/fgdiagnostic.cpp +++ b/src/coreclr/jit/fgdiagnostic.cpp @@ -1933,19 +1933,33 @@ void Compiler::fgTableDispBasicBlock(BasicBlock* block, int ibcColWidth /* = 0 * break; case BBJ_SWITCH: + { printf("->"); - unsigned jumpCnt; - jumpCnt = block->bbJumpSwt->bbsCount; - BasicBlock** jumpTab; - jumpTab = block->bbJumpSwt->bbsDstTab; - int switchWidth; - switchWidth = 0; - do + const BBswtDesc* const bbJumpSwt = block->bbJumpSwt; + const unsigned jumpCnt = bbJumpSwt->bbsCount; + BasicBlock** const jumpTab = bbJumpSwt->bbsDstTab; + int switchWidth = 0; + + for (unsigned i = 0; i < jumpCnt; i++) { - printf("%c" FMT_BB, (jumpTab == block->bbJumpSwt->bbsDstTab) ? ' ' : ',', (*jumpTab)->bbNum); - switchWidth += 1 /* space/comma */ + 2 /* BB */ + max(CountDigits((*jumpTab)->bbNum), 2); - } while (++jumpTab, --jumpCnt); + printf("%c" FMT_BB, (i == 0) ? ' ' : ',', jumpTab[i]->bbNum); + switchWidth += 1 /* space/comma */ + 2 /* BB */ + max(CountDigits(jumpTab[i]->bbNum), 2); + + const bool isDefault = bbJumpSwt->bbsHasDefault && (i == jumpCnt - 1); + if (isDefault) + { + printf("[def]"); + switchWidth += 5; + } + + const bool isDominant = bbJumpSwt->bbsHasDominantCase && (i == bbJumpSwt->bbsDominantCase); + if (isDominant) + { + printf("[dom(" FMT_WT ")]", bbJumpSwt->bbsDominantFraction); + switchWidth += 10; + } + } if (switchWidth < 7) { @@ -1953,7 +1967,8 @@ void Compiler::fgTableDispBasicBlock(BasicBlock* block, int ibcColWidth /* = 0 * } printf(" (switch)"); - break; + } + break; } } diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 0f93857136637..12220bb2ef898 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1805,7 +1805,7 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) // If either block or bNext has a profile weight // or if both block and bNext have non-zero weights - // then we wil use the max weight for the block. + // then we will use the max weight for the block. // const bool hasProfileWeight = block->hasProfileWeight() || bNext->hasProfileWeight(); const bool hasNonZeroWeight = (block->bbWeight > BB_ZERO_WEIGHT) || (bNext->bbWeight > BB_ZERO_WEIGHT); @@ -3745,17 +3745,17 @@ bool Compiler::fgOptimizeSwitchJumps() for (BasicBlock* block = fgFirstBB; block != NULL; block = block->bbNext) { - if (block->IsLIR()) - { - break; - } + // Lowering expands switches, so calling this method on lowered IR + // does not make sense. + // + assert(!block->IsLIR()); - if (block->isRunRarely()) + if (block->bbJumpKind != BBJ_SWITCH) { continue; } - if (block->bbJumpKind != BBJ_SWITCH) + if (block->isRunRarely()) { continue; } diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index ab14388965a02..e5bbe639c120c 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -2590,35 +2590,48 @@ void EfficientEdgeCountReconstructor::MarkInterestingSwitches(BasicBlock* block, { assert(block->bbJumpKind == BBJ_SWITCH); + // Thresholds for detecting a dominant switch case. + // + // We need to see enough hits on the switch to have a plausible sense of the distribution of cases. + // We also want to enable peeling for switches that are executed at least once per call. + // By default, we're guaranteed to see at least 30 calls to instrumented method, for dynamic PGO. + // Hence we require at least 30 observed switch executions. + // + // The profitabilty of peeling is related to the dominant fraction. The cost has a constant portion + // (at a minimum the cost of a not-taken branch) and a variable portion, plus increased code size. + // So we don't want to peel in cases where the dominant fraction is too small. + // const BasicBlock::weight_t sufficientSamples = 30.0f; - const BasicBlock::weight_t sufficientFraction = 0.3f; + const BasicBlock::weight_t sufficientFraction = 0.55f; if (info->m_weight < sufficientSamples) { + JITDUMP("Switch in " FMT_BB " was hit " FMT_WT " < " FMT_WT " times, NOT checking for dominant edge\n", + block->bbNum, info->m_weight, sufficientSamples); return; } JITDUMP("Switch in " FMT_BB " was hit " FMT_WT " >= " FMT_WT " times, checking for dominant edge\n", block->bbNum, info->m_weight, sufficientSamples); - Edge* dominantEdge = info->m_outgoingEdges; + Edge* dominantEdge = nullptr; - for (Edge* edge = dominantEdge->m_nextOutgoingEdge; edge != nullptr; edge = edge->m_nextOutgoingEdge) + // We don't expect to see any unknown edge weights; if we do, just bail out. + // + for (Edge* edge = info->m_outgoingEdges; edge != nullptr; edge = edge->m_nextOutgoingEdge) { - if (edge->m_weightKnown) + if (!edge->m_weightKnown) { - if (!dominantEdge->m_weightKnown || (edge->m_weight > dominantEdge->m_weight)) - { - dominantEdge = edge; - } + JITDUMP("Found edge with unknown weight.\n"); + return; } - } - if (!dominantEdge->m_weightKnown) - { - JITDUMP("No edges with known counts, sorry\n"); - return; + if ((dominantEdge == nullptr) || (edge->m_weight > dominantEdge->m_weight)) + { + dominantEdge = edge; + } } + assert(dominantEdge != nullptr); BasicBlock::weight_t fraction = dominantEdge->m_weight / info->m_weight; // Because of count inconsistency we can see nonsensical ratios. Cap these. diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index c0f3d7b48fcec..0dae3cabed81a 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2679,14 +2679,7 @@ void Compiler::optCopyBlkDest(BasicBlock* from, BasicBlock* to) case BBJ_SWITCH: { - to->bbJumpSwt = new (this, CMK_BasicBlock) BBswtDesc(); - to->bbJumpSwt->bbsCount = from->bbJumpSwt->bbsCount; - to->bbJumpSwt->bbsDstTab = new (this, CMK_BasicBlock) BasicBlock*[from->bbJumpSwt->bbsCount]; - - for (unsigned i = 0; i < from->bbJumpSwt->bbsCount; i++) - { - to->bbJumpSwt->bbsDstTab[i] = from->bbJumpSwt->bbsDstTab[i]; - } + to->bbJumpSwt = new (this, CMK_BasicBlock) BBswtDesc(this, from->bbJumpSwt); } break;