diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index 5266bed750735..8778245a932ea 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -646,19 +646,32 @@ void BasicBlock::dspJumpKind() break; case BBJ_SWITCH: + { printf(" ->"); - unsigned jumpCnt; - jumpCnt = bbJumpSwt->bbsCount; - BasicBlock** jumpTab; - jumpTab = bbJumpSwt->bbsDstTab; - do + const unsigned jumpCnt = bbJumpSwt->bbsCount; + BasicBlock** const jumpTab = bbJumpSwt->bbsDstTab; + + for (unsigned i = 0; i < jumpCnt; i++) { - printf("%c" FMT_BB, (jumpTab == bbJumpSwt->bbsDstTab) ? ' ' : ',', (*jumpTab)->bbNum); - } while (++jumpTab, --jumpCnt); + printf("%c" FMT_BB, (i == 0) ? ' ' : ',', jumpTab[i]->bbNum); + + const bool isDefault = bbJumpSwt->bbsHasDefault && (i == jumpCnt - 1); + if (isDefault) + { + printf("[def]"); + } + + const bool isDominant = bbJumpSwt->bbsHasDominantCase && (i == bbJumpSwt->bbsDominantCase); + if (isDominant) + { + printf("[dom(" FMT_WT ")]", bbJumpSwt->bbsDominantFraction); + } + } printf(" (switch)"); - break; + } + break; default: unreached(); @@ -1630,3 +1643,27 @@ bool BasicBlock::hasEHBoundaryOut() return returnVal; } + +//------------------------------------------------------------------------ +// BBswtDesc copy ctor: copy a switch descriptor +// +// Arguments: +// comp - compiler instance +// other - existing switch descriptor to copy +// +BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other) + : bbsDstTab(nullptr) + , bbsCount(other->bbsCount) + , bbsDominantCase(other->bbsDominantCase) + , bbsDominantFraction(other->bbsDominantFraction) + , bbsHasDefault(other->bbsHasDefault) + , bbsHasDominantCase(other->bbsHasDominantCase) +{ + // Allocate and fill in a new dst tab + // + bbsDstTab = new (comp, CMK_BasicBlock) BasicBlock*[bbsCount]; + for (unsigned i = 0; i < bbsCount; i++) + { + bbsDstTab[i] = other->bbsDstTab[i]; + } +} diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index dfc3d53fc25c1..ab05f3de3a7bb 
100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -85,44 +85,7 @@ class typeInfo; struct BasicBlockList; struct flowList; struct EHblkDsc; - -/***************************************************************************** - * - * The following describes a switch block. - * - * Things to know: - * 1. If bbsHasDefault is true, the default case is the last one in the array of basic block addresses - * namely bbsDstTab[bbsCount - 1]. - * 2. bbsCount must be at least 1, for the default case. bbsCount cannot be zero. It appears that the ECMA spec - * allows for a degenerate switch with zero cases. Normally, the optimizer will optimize degenerate - * switches with just a default case to a BBJ_ALWAYS branch, and a switch with just two cases to a BBJ_COND. - * However, in debuggable code, we might not do that, so bbsCount might be 1. - */ -struct BBswtDesc -{ - BasicBlock** bbsDstTab; // case label table address - unsigned bbsCount; // count of cases (includes 'default' if bbsHasDefault) - bool bbsHasDefault; - - BBswtDesc() : bbsHasDefault(true) - { - } - - void removeDefault() - { - assert(bbsHasDefault); - assert(bbsCount > 0); - bbsHasDefault = false; - bbsCount--; - } - - BasicBlock* getDefault() - { - assert(bbsHasDefault); - assert(bbsCount > 0); - return bbsDstTab[bbsCount - 1]; - } -}; +struct BBswtDesc; struct StackEntry { @@ -1235,6 +1198,51 @@ typedef JitHashTable, BlkVector> BlkToBl // Map from Block to Block. Used for a variety of purposes. typedef JitHashTable, BasicBlock*> BlockToBlockMap; +// BBswtDesc -- descriptor for a switch block +// +// Things to know: +// 1. If bbsHasDefault is true, the default case is the last one in the array of basic block addresses +// namely bbsDstTab[bbsCount - 1]. +// 2. bbsCount must be at least 1, for the default case. bbsCount cannot be zero. It appears that the ECMA spec +// allows for a degenerate switch with zero cases. 
Normally, the optimizer will optimize degenerate +// switches with just a default case to a BBJ_ALWAYS branch, and a switch with just two cases to a BBJ_COND. +// However, in debuggable code, we might not do that, so bbsCount might be 1. +// +struct BBswtDesc +{ + BasicBlock** bbsDstTab; // case label table address + unsigned bbsCount; // count of cases (includes 'default' if bbsHasDefault) + + // Case number and likelihood of most likely case + // (only known with PGO, only valid if bbsHasDominantCase is true) + unsigned bbsDominantCase; + BasicBlock::weight_t bbsDominantFraction; + + bool bbsHasDefault; // true if last switch case is a default case + bool bbsHasDominantCase; // true if switch has a dominant case + + BBswtDesc() : bbsHasDefault(true), bbsHasDominantCase(false) + { + } + + BBswtDesc(Compiler* comp, const BBswtDesc* other); + + void removeDefault() + { + assert(bbsHasDefault); + assert(bbsCount > 0); + bbsHasDefault = false; + bbsCount--; + } + + BasicBlock* getDefault() + { + assert(bbsHasDefault); + assert(bbsCount > 0); + return bbsDstTab[bbsCount - 1]; + } +}; + // In compiler terminology the control flow between two BasicBlocks // is typically referred to as an "edge". Most well known are the // backward branches for loops, which are often called "back-edges". 
@@ -1291,6 +1299,8 @@ struct BasicBlockList } }; +// flowList -- control flow edge +// struct flowList { public: diff --git a/src/coreclr/jit/fgdiagnostic.cpp b/src/coreclr/jit/fgdiagnostic.cpp index 507f92536c287..f9f9d3c86b6d0 100644 --- a/src/coreclr/jit/fgdiagnostic.cpp +++ b/src/coreclr/jit/fgdiagnostic.cpp @@ -1933,19 +1933,33 @@ void Compiler::fgTableDispBasicBlock(BasicBlock* block, int ibcColWidth /* = 0 * break; case BBJ_SWITCH: + { printf("->"); - unsigned jumpCnt; - jumpCnt = block->bbJumpSwt->bbsCount; - BasicBlock** jumpTab; - jumpTab = block->bbJumpSwt->bbsDstTab; - int switchWidth; - switchWidth = 0; - do + const BBswtDesc* const bbJumpSwt = block->bbJumpSwt; + const unsigned jumpCnt = bbJumpSwt->bbsCount; + BasicBlock** const jumpTab = bbJumpSwt->bbsDstTab; + int switchWidth = 0; + + for (unsigned i = 0; i < jumpCnt; i++) { - printf("%c" FMT_BB, (jumpTab == block->bbJumpSwt->bbsDstTab) ? ' ' : ',', (*jumpTab)->bbNum); - switchWidth += 1 /* space/comma */ + 2 /* BB */ + max(CountDigits((*jumpTab)->bbNum), 2); - } while (++jumpTab, --jumpCnt); + printf("%c" FMT_BB, (i == 0) ? 
' ' : ',', jumpTab[i]->bbNum); + switchWidth += 1 /* space/comma */ + 2 /* BB */ + max(CountDigits(jumpTab[i]->bbNum), 2); + + const bool isDefault = bbJumpSwt->bbsHasDefault && (i == jumpCnt - 1); + if (isDefault) + { + printf("[def]"); + switchWidth += 5; + } + + const bool isDominant = bbJumpSwt->bbsHasDominantCase && (i == bbJumpSwt->bbsDominantCase); + if (isDominant) + { + printf("[dom(" FMT_WT ")]", bbJumpSwt->bbsDominantFraction); + switchWidth += 10; + } + } if (switchWidth < 7) { @@ -1953,7 +1967,8 @@ void Compiler::fgTableDispBasicBlock(BasicBlock* block, int ibcColWidth /* = 0 * } printf(" (switch)"); - break; + } + break; } } diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index dc4774286456d..12220bb2ef898 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1803,25 +1803,26 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) } } - // Note we could update the local variable weights here by - // calling lvaMarkLocalVars, with the block and weight adjustment. - // If either block or bNext has a profile weight // or if both block and bNext have non-zero weights - // then we select the highest weight block. + // then we will use the max weight for the block. 
+ // + const bool hasProfileWeight = block->hasProfileWeight() || bNext->hasProfileWeight(); + const bool hasNonZeroWeight = (block->bbWeight > BB_ZERO_WEIGHT) || (bNext->bbWeight > BB_ZERO_WEIGHT); - if (block->hasProfileWeight() || bNext->hasProfileWeight() || (block->bbWeight && bNext->bbWeight)) + if (hasProfileWeight || hasNonZeroWeight) { - // We are keeping block so update its fields - // when bNext has a greater weight + BasicBlock::weight_t const newWeight = max(block->bbWeight, bNext->bbWeight); - if (block->bbWeight < bNext->bbWeight) + if (hasProfileWeight) { - block->bbWeight = bNext->bbWeight; - - block->bbFlags |= (bNext->bbFlags & BBF_PROF_WEIGHT); // Set the profile weight flag (if necessary) - assert(block->bbWeight != BB_ZERO_WEIGHT); - block->bbFlags &= ~BBF_RUN_RARELY; // Clear any RarelyRun flag + block->setBBProfileWeight(newWeight); + } + else + { + assert(newWeight != BB_ZERO_WEIGHT); + block->bbWeight = newWeight; + block->bbFlags &= ~BBF_RUN_RARELY; } } // otherwise if either block has a zero weight we select the zero weight @@ -3726,47 +3727,163 @@ bool Compiler::fgOptimizeBranch(BasicBlock* bJump) return true; } -/***************************************************************************** - * - * Function called to optimize switch statements - */ - +//----------------------------------------------------------------------------- +// fgOptimizeSwitchJump: see if a switch has a dominant case, and modify to +// check for that case up front (aka switch peeling). +// +// Returns: +// True if the switch now has an upstream check for the dominant case. +// bool Compiler::fgOptimizeSwitchJumps() { - bool result = false; // Our return value - -#if 0 - // TODO-CQ: Add switch jump optimizations? 
if (!fgHasSwitch) + { return false; + } - if (!fgHaveValidEdgeWeights) - return false; + bool modified = false; - for (BasicBlock* bSrc = fgFirstBB; bSrc != NULL; bSrc = bSrc->bbNext) + for (BasicBlock* block = fgFirstBB; block != NULL; block = block->bbNext) { - if (bSrc->bbJumpKind == BBJ_SWITCH) + // Lowering expands switches, so calling this method on lowered IR + // does not make sense. + // + assert(!block->IsLIR()); + + if (block->bbJumpKind != BBJ_SWITCH) { - unsigned jumpCnt; jumpCnt = bSrc->bbJumpSwt->bbsCount; - BasicBlock** jumpTab; jumpTab = bSrc->bbJumpSwt->bbsDstTab; + continue; + } - do - { - BasicBlock* bDst = *jumpTab; - flowList* edgeToDst = fgGetPredForBlock(bDst, bSrc); - double outRatio = (double) edgeToDst->edgeWeightMin() / (double) bSrc->bbWeight; + if (block->isRunRarely()) + { + continue; + } + + if (!block->bbJumpSwt->bbsHasDominantCase) + { + continue; + } + + // We currently will only see dominant cases with PGO. + // + assert(block->hasProfileWeight()); + + const unsigned dominantCase = block->bbJumpSwt->bbsDominantCase; + + JITDUMP(FMT_BB " has switch with dominant case %u, considering peeling\n", block->bbNum, dominantCase); + + // The dominant case should not be the default case, as we already peel that one. + // + assert(dominantCase < (block->bbJumpSwt->bbsCount - 1)); + BasicBlock* const dominantTarget = block->bbJumpSwt->bbsDstTab[dominantCase]; + Statement* const switchStmt = block->lastStmt(); + GenTree* const switchTree = switchStmt->GetRootNode(); + assert(switchTree->OperIs(GT_SWITCH)); + GenTree* const switchValue = switchTree->AsOp()->gtGetOp1(); + + // Split the switch block just before at the switch. + // + // After this, newBlock is the switch block, and + // block is the upstream block. 
+ // + BasicBlock* newBlock = nullptr; - if (outRatio >= 0.60) + if (block->firstStmt() == switchStmt) + { + newBlock = fgSplitBlockAtBeginning(block); + } + else + { + newBlock = fgSplitBlockAfterStatement(block, switchStmt->GetPrevStmt()); + } + + // Set up a compare in the upstream block, "stealing" the switch value tree. + // + GenTree* const dominantCaseCompare = gtNewOperNode(GT_EQ, TYP_INT, switchValue, gtNewIconNode(dominantCase)); + GenTree* const jmpTree = gtNewOperNode(GT_JTRUE, TYP_VOID, dominantCaseCompare); + Statement* const jmpStmt = fgNewStmtFromTree(jmpTree, switchStmt->GetILOffsetX()); + fgInsertStmtAtEnd(block, jmpStmt); + dominantCaseCompare->gtFlags |= GTF_RELOP_JMP_USED; + + // Reattach switch value to the switch. This may introduce a comma + // in the upstream compare tree, if the switch value expression is complex. + // + switchTree->AsOp()->gtOp1 = fgMakeMultiUse(&dominantCaseCompare->AsOp()->gtOp1); + + // Update flags + // + dominantCaseCompare->gtFlags |= dominantCaseCompare->AsOp()->gtOp1->gtFlags; + jmpTree->gtFlags |= dominantCaseCompare->gtFlags; + dominantCaseCompare->gtFlags |= GTF_RELOP_JMP_USED; + + // Wire up the new control flow. 
+ // + block->bbJumpKind = BBJ_COND; + block->bbJumpDest = dominantTarget; + flowList* const blockToTargetEdge = fgAddRefPred(dominantTarget, block); + flowList* const blockToNewBlockEdge = newBlock->bbPreds; + assert(blockToNewBlockEdge->getBlock() == block); + assert(blockToTargetEdge->getBlock() == block); + + // Update profile data + // + const BasicBlock::weight_t fraction = newBlock->bbJumpSwt->bbsDominantFraction; + const BasicBlock::weight_t blockToTargetWeight = block->bbWeight * fraction; + const BasicBlock::weight_t blockToNewBlockWeight = block->bbWeight - blockToTargetWeight; + + newBlock->setBBProfileWeight(blockToNewBlockWeight); + + blockToTargetEdge->setEdgeWeights(blockToTargetWeight, blockToTargetWeight, dominantTarget); + blockToNewBlockEdge->setEdgeWeights(blockToNewBlockWeight, blockToNewBlockWeight, block); + + // There may be other switch cases that lead to this same block, but there's just + // one edge in the flowgraph. So we need to subtract off the profile data that now flows + // along the peeled edge. + // + for (flowList* pred = dominantTarget->bbPreds; pred != nullptr; pred = pred->flNext) + { + if (pred->getBlock() == newBlock) + { + if (pred->flDupCount == 1) + { + // The only switch case leading to the dominant target was the one we peeled. + // So the edge from the switch now has zero weight. + // + pred->setEdgeWeights(BB_ZERO_WEIGHT, BB_ZERO_WEIGHT, dominantTarget); + } + else { - // straighten switch here... + // Other switch cases also lead to the dominant target. + // Subtract off the weight we transferred to the peel. 
+ // + BasicBlock::weight_t newMinWeight = pred->edgeWeightMin() - blockToTargetWeight; + BasicBlock::weight_t newMaxWeight = pred->edgeWeightMax() - blockToTargetWeight; + + if (newMinWeight < BB_ZERO_WEIGHT) + { + newMinWeight = BB_ZERO_WEIGHT; + } + if (newMaxWeight < BB_ZERO_WEIGHT) + { + newMaxWeight = BB_ZERO_WEIGHT; + } + pred->setEdgeWeights(newMinWeight, newMaxWeight, dominantTarget); } } - while (++jumpTab, --jumpCnt); } + + // For now we leave the switch as is, since there's no way + // to indicate that one of the cases is now unreachable. + // + // But it no longer has a dominant case. + // + newBlock->bbJumpSwt->bbsHasDominantCase = false; + + modified = true; } -#endif - return result; + return modified; } //----------------------------------------------------------------------------- diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index 142a2c7ac6656..ccd12f4fcc9da 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -2031,6 +2031,9 @@ class EfficientEdgeCountReconstructor : public SpanningTreeVisitor block->bbSparseCountInfo = info; } + void MarkInterestingBlocks(BasicBlock* block, BlockInfo* info); + void MarkInterestingSwitches(BasicBlock* block, BlockInfo* info); + // Flags for noting and handling various error cases. // bool m_badcode; @@ -2531,7 +2534,171 @@ void EfficientEdgeCountReconstructor::Propagate() assert(info->m_weightKnown); m_comp->fgSetProfileWeight(block, info->m_weight); + + // Mark blocks that might be worth optimizing further, given + // what we know about the PGO data. 
+ // + MarkInterestingBlocks(block, info); + } +} + +//------------------------------------------------------------------------ +// EfficientEdgeCountReconstructor::MarkInterestingBlocks: look for blocks +// that are worth specially optimizing, given the block and edge profile data +// +// Arguments: +// block - block of interest +// info - associated block info +// +// Notes: +// We do this during reconstruction because we have a clean look at the edge +// weights. If we defer until we recompute edge weights later we may fail to solve +// for them. +// +// Someday we'll keep the edge profile info viable all throughout compilation and +// we can defer this screening until later. Doing so will catch more cases as +// optimizations can sharpen the profile data. +// +void EfficientEdgeCountReconstructor::MarkInterestingBlocks(BasicBlock* block, BlockInfo* info) +{ + switch (block->bbJumpKind) + { + case BBJ_SWITCH: + MarkInterestingSwitches(block, info); + break; + + default: + break; + } +} + +//------------------------------------------------------------------------ +// EfficientEdgeCountReconstructor::MarkInterestingSwitches: look for switch blocks +// that are worth specially optimizing, given the block and edge profile data +// +// Arguments: +// block - block of interest +// info - associated block info +// +// Notes: +// See if one of the non-default switch cases dominates and should be peeled +// from the switch during flow opts. +// +// If so, information is added to the bbJmpSwt for the block for use later. +// +void EfficientEdgeCountReconstructor::MarkInterestingSwitches(BasicBlock* block, BlockInfo* info) +{ + assert(block->bbJumpKind == BBJ_SWITCH); + + // Thresholds for detecting a dominant switch case. + // + // We need to see enough hits on the switch to have a plausible sense of the distribution of cases. + // We also want to enable peeling for switches that are executed at least once per call. 
+ // By default, we're guaranteed to see at least 30 calls to instrumented method, for dynamic PGO. + // Hence we require at least 30 observed switch executions. + // + // The profitability of peeling is related to the dominant fraction. The cost has a constant portion + // (at a minimum the cost of a not-taken branch) and a variable portion, plus increased code size. + // So we don't want to peel in cases where the dominant fraction is too small. + // + const BasicBlock::weight_t sufficientSamples = 30.0f; + const BasicBlock::weight_t sufficientFraction = 0.55f; + + if (info->m_weight < sufficientSamples) + { + JITDUMP("Switch in " FMT_BB " was hit " FMT_WT " < " FMT_WT " times, NOT checking for dominant edge\n", + block->bbNum, info->m_weight, sufficientSamples); + return; + } + + JITDUMP("Switch in " FMT_BB " was hit " FMT_WT " >= " FMT_WT " times, checking for dominant edge\n", block->bbNum, + info->m_weight, sufficientSamples); + Edge* dominantEdge = nullptr; + + // We don't expect to see any unknown edge weights; if we do, just bail out. + // + for (Edge* edge = info->m_outgoingEdges; edge != nullptr; edge = edge->m_nextOutgoingEdge) + { + if (!edge->m_weightKnown) + { + JITDUMP("Found edge with unknown weight.\n"); + return; + } + + if ((dominantEdge == nullptr) || (edge->m_weight > dominantEdge->m_weight)) + { + dominantEdge = edge; + } + } + + assert(dominantEdge != nullptr); + BasicBlock::weight_t fraction = dominantEdge->m_weight / info->m_weight; + + // Because of count inconsistency we can see nonsensical ratios. Cap these. + // + if (fraction > 1.0) + { + fraction = 1.0; + } + + if (fraction < sufficientFraction) + { + JITDUMP("Maximum edge likelihood is " FMT_WT " < " FMT_WT "; not sufficient to trigger peeling)\n", fraction, + sufficientFraction); + return; + } + + // Despite doing "edge" instrumentation, we only use a single edge probe for a given successor block. + // Multiple switch cases may lead to this block. 
So we also need to show that there's just one switch + // case that can lead to the dominant edge's target block. + // + // If it turns out often we fail at this stage, we might consider building a histogram of switch case + // values at runtime, similar to what we do for classes at virtual call sites. + // + const unsigned caseCount = block->bbJumpSwt->bbsCount; + BasicBlock** const jumpTab = block->bbJumpSwt->bbsDstTab; + unsigned dominantCase = caseCount; + + for (unsigned i = 0; i < caseCount; i++) + { + if (jumpTab[i] == dominantEdge->m_targetBlock) + { + if (dominantCase != caseCount) + { + JITDUMP("Both case %u and %u lead to " FMT_BB "-- can't optimize\n", i, dominantCase, + jumpTab[i]->bbNum); + dominantCase = caseCount; + break; + } + + dominantCase = i; + } + } + + if (dominantCase == caseCount) + { + // Multiple (or no) cases lead to the dominant case target. + // + return; + } + + if (block->bbJumpSwt->bbsHasDefault && (dominantCase == caseCount - 1)) + { + // Dominant case is the default case. + // This effectively gets peeled already, so defer. 
+ // + JITDUMP("Default case %u uniquely leads to target " FMT_BB " of dominant edge, so will be peeled already\n", + dominantCase, dominantEdge->m_targetBlock->bbNum); + return; } + + JITDUMP("Non-default case %u uniquely leads to target " FMT_BB " of dominant edge with likelihood " FMT_WT + "; marking for peeling\n", + dominantCase, dominantEdge->m_targetBlock->bbNum, fraction); + + block->bbJumpSwt->bbsHasDominantCase = true; + block->bbJumpSwt->bbsDominantCase = dominantCase; + block->bbJumpSwt->bbsDominantFraction = fraction; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index c0f3d7b48fcec..0dae3cabed81a 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2679,14 +2679,7 @@ void Compiler::optCopyBlkDest(BasicBlock* from, BasicBlock* to) case BBJ_SWITCH: { - to->bbJumpSwt = new (this, CMK_BasicBlock) BBswtDesc(); - to->bbJumpSwt->bbsCount = from->bbJumpSwt->bbsCount; - to->bbJumpSwt->bbsDstTab = new (this, CMK_BasicBlock) BasicBlock*[from->bbJumpSwt->bbsCount]; - - for (unsigned i = 0; i < from->bbJumpSwt->bbsCount; i++) - { - to->bbJumpSwt->bbsDstTab[i] = from->bbJumpSwt->bbsDstTab[i]; - } + to->bbJumpSwt = new (this, CMK_BasicBlock) BBswtDesc(this, from->bbJumpSwt); } break;