Update how OSR and PGO interact #61453

Merged Nov 13, 2021 (8 commits)
3 changes: 3 additions & 0 deletions eng/pipelines/common/templates/runtimes/run-test-job.yml
@@ -554,7 +554,10 @@ jobs:
scenarios:
- jitosr
- jitosr_stress
- jitosr_pgo
- jitpartialcompilation
- jitpartialcompilation_osr
- jitpartialcompilation_osr_pgo
- jitobjectstackallocation
${{ if in(parameters.testGroup, 'ilasm') }}:
scenarios:
29 changes: 29 additions & 0 deletions src/coreclr/inc/pgo_formatprocessing.h
@@ -361,6 +361,35 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax,
});
}


// Return true if schemaTable entries are a subset of the schema described by pByte, with matching entries in the same order.
// Also updates the offsets of the matching entries in schemaTable to those of the pByte schema.
//
inline bool CheckIfPgoSchemaIsCompatibleAndSetOffsets(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas)
{
size_t nMatched = 0;
size_t initialOffset = cbDataMax;

auto handler = [schemaTable, cSchemas, &nMatched](const ICorJitInfo::PgoInstrumentationSchema& schema)
{
if ((nMatched < cSchemas)
&& (schema.InstrumentationKind == schemaTable[nMatched].InstrumentationKind)
&& (schema.ILOffset == schemaTable[nMatched].ILOffset)
&& (schema.Count == schemaTable[nMatched].Count)
&& (schema.Other == schemaTable[nMatched].Other))
{
schemaTable[nMatched].Offset = schema.Offset;
nMatched++;
}

return true;
};

ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, handler);

return (nMatched == cSchemas);
}

inline bool ReadInstrumentationSchemaWithLayoutIntoSArray(const uint8_t *pByte, size_t cbDataMax, size_t initialOffset, SArray<ICorJitInfo::PgoInstrumentationSchema>* pSchemas)
{
auto lambda = [pSchemas](const ICorJitInfo::PgoInstrumentationSchema &schema)
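
A minimal caller sketch, not part of this PR, of how the new helper could be used when an OSR compile asks for profile data the Tier0 compile already allocated. GetTier0SchemaData and GetTier0ProfileData are hypothetical stand-ins for the runtime's actual bookkeeping.

// Sketch: reuse the Tier0 slab when the OSR schema is an ordered subset of it.
inline uint8_t* TryReuseTier0PgoData(ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas)
{
    size_t         cbData       = 0;
    const uint8_t* pTier0Schema = GetTier0SchemaData(&cbData); // hypothetical accessor

    // On success the helper patches schemaTable[i].Offset so the Tier0 and
    // OSR method bodies update the same counter slots.
    if (CheckIfPgoSchemaIsCompatibleAndSetOffsets(pTier0Schema, cbData, schemaTable, cSchemas))
    {
        return GetTier0ProfileData(); // hypothetical accessor
    }

    return nullptr; // schemas diverged; caller must allocate fresh data or skip
}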
34 changes: 29 additions & 5 deletions src/coreclr/jit/compiler.cpp
@@ -2899,6 +2899,18 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
verboseDump = (JitConfig.JitDumpTier0() > 0);
}

// Optionally suppress dumping some OSR jit requests.
//
if (verboseDump && jitFlags->IsSet(JitFlags::JIT_FLAG_OSR))
{
const int desiredOffset = JitConfig.JitDumpAtOSROffset();

if (desiredOffset != -1)
{
verboseDump = (((IL_OFFSET)desiredOffset) == info.compILEntry);
}
}

if (verboseDump)
{
verbose = true;
@@ -4571,6 +4583,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
//
DoPhase(this, PHASE_IMPORTATION, &Compiler::fgImport);

// Expand any patchpoints
//
DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints);

// If instrumenting, add block and class probes.
//
if (compileFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR))
@@ -4582,10 +4598,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
//
DoPhase(this, PHASE_INDXCALL, &Compiler::fgTransformIndirectCalls);

// Expand any patchpoints
//
DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints);

// PostImportPhase: cleanup inlinees
//
auto postImportPhase = [this]() {
@@ -6500,9 +6512,21 @@ int Compiler::compCompileHelper(CORINFO_MODULE_HANDLE classPtr,
#ifdef DEBUG
if ((JitConfig.DumpJittedMethods() == 1) && !compIsForInlining())
{
enum
{
BUFSIZE = 20
};
char osrBuffer[BUFSIZE] = {0};
if (opts.IsOSR())
{
// Tiering name already includes "OSR"; we just want the IL offset
//
sprintf_s(osrBuffer, BUFSIZE, " @0x%x", info.compILEntry);
}

printf("Compiling %4d %s::%s, IL size = %u, hash=0x%08x %s%s%s\n", Compiler::jitTotalMethodCompiled,
info.compClassName, info.compMethodName, info.compILCodeSize, info.compMethodHash(),
compGetTieringName(), opts.IsOSR() ? " OSR" : "", compGetStressMessage());
compGetTieringName(), osrBuffer, compGetStressMessage());
}
if (compIsForInlining())
{
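
As a self-contained illustration, separate from the PR's code, of what the new dump banner produces: an OSR compile now gets an " @0x<ilOffset>" suffix rather than the old bare " OSR" tag, since the tiering name already says OSR.

#include <cstdio>

int main()
{
    const bool     isOsr         = true;
    const unsigned ilEntry       = 0x2a; // stands in for info.compILEntry
    char           osrBuffer[20] = {};

    if (isOsr)
    {
        // Only the IL offset is appended; the tiering name supplies "OSR".
        snprintf(osrBuffer, sizeof(osrBuffer), " @0x%x", ilEntry);
    }

    // Prints: Compiling Foo::Bar, OSR @0x2a
    printf("Compiling Foo::Bar, OSR%s\n", osrBuffer);
    return 0;
}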
14 changes: 13 additions & 1 deletion src/coreclr/jit/fgopt.cpp
@@ -1553,7 +1553,18 @@ void Compiler::fgPostImportationCleanup()
//
auto addConditionalFlow = [this, entryStateVar, &entryJumpTarget](BasicBlock* fromBlock,
BasicBlock* toBlock) {
fgSplitBlockAtBeginning(fromBlock);

// We may have previously thought this try entry was unreachable, but now we're going to
// step through it on the way to the OSR entry. So ensure it has plausible profile weight.
//
if (fgHaveProfileData() && !fromBlock->hasProfileWeight())
{
JITDUMP("Updating block weight for now-reachable try entry " FMT_BB " via " FMT_BB "\n",
fromBlock->bbNum, fgFirstBB->bbNum);
fromBlock->inheritWeight(fgFirstBB);
}

BasicBlock* const newBlock = fgSplitBlockAtBeginning(fromBlock);
fromBlock->bbFlags |= BBF_INTERNAL;

GenTree* const entryStateLcl = gtNewLclvNode(entryStateVar, TYP_INT);
@@ -1565,6 +1576,7 @@ void Compiler::fgPostImportationCleanup()
fromBlock->bbJumpKind = BBJ_COND;
fromBlock->bbJumpDest = toBlock;
fgAddRefPred(toBlock, fromBlock);
newBlock->inheritWeight(fromBlock);

entryJumpTarget = fromBlock;
};
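
For readers outside the JIT, a rough model of the inheritWeight semantics relied on above; this is an assumption-level sketch, and the real BasicBlock implementation in block.h differs in detail.

struct BlockSketch
{
    double weight           = 0.0;
    bool   hasProfileWeight = false;

    // Assumed behavior: copy the source block's weight and its
    // "weight came from profile data" status.
    void inheritWeight(const BlockSketch* from)
    {
        weight           = from->weight;
        hasProfileWeight = from->hasProfileWeight;
    }
};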
113 changes: 83 additions & 30 deletions src/coreclr/jit/fgprofile.cpp
@@ -300,10 +300,10 @@ class Instrumentor
virtual void BuildSchemaElements(BasicBlock* block, Schema& schema)
{
}
virtual void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
virtual void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
}
virtual void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
virtual void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
{
}
virtual void SuppressProbes()
@@ -349,8 +349,8 @@ class BlockCountInstrumentor : public Instrumentor
}
void Prepare(bool isPreImport) override;
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory) override;
};

//------------------------------------------------------------------------
@@ -428,7 +428,7 @@ void BlockCountInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
const ICorJitInfo::PgoInstrumentationSchema& entry = schema[block->bbCountSchemaIndex];

@@ -464,7 +464,7 @@
// Notes:
// When prejitting, add the method entry callback node
//
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
{
Compiler::Options& opts = m_comp->opts;
Compiler::Info& info = m_comp->info;
@@ -982,7 +982,7 @@ class EfficientEdgeCountInstrumentor : public Instrumentor, public SpanningTreeV
return ((block->bbFlags & BBF_IMPORTED) == BBF_IMPORTED);
}
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;

void Badcode() override
{
@@ -1132,7 +1132,7 @@ void EfficientEdgeCountInstrumentor::BuildSchemaElements(BasicBlock* block, Sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
// Inlinee compilers build their blocks in the root compiler's
// graph. So for NumSucc, we use the root compiler instance.
@@ -1307,12 +1307,12 @@ class BuildClassProbeSchemaGen
class ClassProbeInserter
{
Schema& m_schema;
BYTE* m_profileMemory;
uint8_t* m_profileMemory;
int* m_currentSchemaIndex;
unsigned& m_instrCount;

public:
ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
ClassProbeInserter(Schema& schema, uint8_t* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
: m_schema(schema)
, m_profileMemory(profileMemory)
, m_currentSchemaIndex(pCurrentSchemaIndex)
@@ -1349,7 +1349,7 @@ class ClassProbeInserter

// Figure out where the table is located.
//
BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
uint8_t* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
*m_currentSchemaIndex += 2; // There are 2 schema entries per class probe

// Grab a temp to hold the 'this' object as it will be used three times
@@ -1426,7 +1426,7 @@ class ClassProbeInstrumentor : public Instrumentor
}
void Prepare(bool isPreImport) override;
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
void SuppressProbes() override;
};

@@ -1490,7 +1490,7 @@ void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
{
@@ -1563,21 +1563,43 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
// Choose instrumentation technology.
//
// We enable edge profiling by default, except when:
//
// * disabled by option
// * we are prejitting
// * we are jitting osr methods
// * we are jitting tier0 methods with patchpoints
// * we are jitting an OSR method
//
// Currently, OSR is incompatible with edge profiling. So if OSR is enabled,
// always do block profiling.
// OSR is incompatible with edge profiling. Only portions of the Tier0
// method will be executed, and the bail-outs at patchpoints won't be obvious
// exit points from the method. So for OSR we always do block profiling.
//
// Note this incompatibility only exists for methods that actually have
// patchpoints, but we won't know that until we import.
// patchpoints. Currently we will only place patchpoints in methods with
// backwards jumps.
//
// And because we want the Tier1 method to see the full set of profile data,
// when OSR is enabled, both Tier0 and any OSR methods need to contribute to
// the same profile data set. Since Tier0 has laid down a dense block-based
// schema, the OSR methods must use this schema as well.
//
// Note that OSR methods may also inline. We currently won't instrument
// any inlinee contributions (which would also need to carefully "share"
// the profile data segment with any Tier0 version and/or any other equivalent
// inlinee), so we'll lose a bit of their profile data. We can support this
// eventually if it turns out to matter.
//
// Similar issues arise with partially jitted methods. Because we currently
// only defer jitting for throw blocks, we ignore the impact of partial
// jitting on PGO. If we ever implement a broader pattern of deferral -- say deferring
// based on static PGO -- we will need to reconsider.
//
CLANG_FORMAT_COMMENT_ANCHOR;

const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool osr = (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && (JitConfig.TC_OnStackReplacement() > 0));
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !osr;
const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool tier0WithPatchpoints = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) &&
(JitConfig.TC_OnStackReplacement() > 0) && compHasBackwardJump;
const bool osrMethod = opts.IsOSR();
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !osrMethod;

if (useEdgeProfiles)
{
@@ -1586,7 +1608,9 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
else
{
JITDUMP("Using block profiling, because %s\n",
(JitConfig.JitEdgeProfiling() > 0) ? "edge profiles disabled" : prejit ? "prejitting" : "OSR");
(JitConfig.JitEdgeProfiling() > 0)
? "edge profiles disabled"
: prejit ? "prejitting" : osrMethod ? "OSR" : "tier0 with patchpoints");

fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this);
}
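
The selection logic above condenses to a single predicate; this restatement is a sketch, not the shipped code.

// Edge profiling is the default. Block profiling is forced when prejitting,
// when a Tier0 method will get patchpoints, and for OSR methods, which must
// reuse the dense block-based schema the Tier0 method laid down.
static bool UseEdgeProfiles(bool edgeProfilingEnabled, bool prejit, bool tier0WithPatchpoints, bool osrMethod)
{
    return edgeProfilingEnabled && !prejit && !tier0WithPatchpoints && !osrMethod;
}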
@@ -1636,7 +1660,7 @@ PhaseStatus Compiler::fgInstrumentMethod()
{
noway_assert(!compIsForInlining());

// Make post-importpreparations.
// Make post-import preparations.
//
const bool isPreImport = false;
fgCountInstrumentor->Prepare(isPreImport);
@@ -1661,7 +1685,17 @@
// Verify we created schema for the calls needing class probes.
// (we counted those when importing)
//
assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount);
// This is not true when we do partial compilation; it can/will erase class probes,
// and there's no easy way to figure out how many should be left.
//
if (doesMethodHavePartialCompilationPatchpoints())
{
assert(fgClassInstrumentor->SchemaCount() <= info.compClassProbeCount);
}
else
{
assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount);
}

// Optionally, when jitting, if there were no class probes and only one count probe,
// suppress instrumentation.
@@ -1694,11 +1728,16 @@

assert(schema.size() > 0);

// Allocate the profile buffer
// Allocate/retrieve the profile buffer.
//
BYTE* profileMemory;

HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
// If this is an OSR method, we should use the same buffer that the Tier0 method used.
//
// This is supported by allocPgoInstrumentationBySchema, which will verify the schema
// we provide here matches the one from Tier0, and will fill in the data offsets in
// our schema properly.
//
uint8_t* profileMemory;
HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
(UINT32)schema.size(), &profileMemory);

// Deal with allocation failures.
@@ -1920,6 +1959,14 @@ void Compiler::fgIncorporateBlockCounts()
fgSetProfileWeight(block, profileWeight);
}
}

// For OSR, give the method entry (which will be a scratch BB)
// the same weight as the OSR Entry.
//
if (opts.IsOSR())
{
fgFirstBB->inheritWeight(fgOSREntryBB);
}
}

//------------------------------------------------------------------------
@@ -3285,11 +3332,17 @@ void Compiler::fgComputeCalledCount(weight_t returnWeight)

BasicBlock* firstILBlock = fgFirstBB; // The first block for IL code (i.e. for the IL code at offset 0)

// Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block.
// OSR methods can have complex entry flow, and so
// for OSR we ensure fgFirstBB has plausible profile data.
//
while (firstILBlock->bbFlags & BBF_INTERNAL)
if (!opts.IsOSR())
{
firstILBlock = firstILBlock->bbNext;
// Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block.
//
while (firstILBlock->bbFlags & BBF_INTERNAL)
{
firstILBlock = firstILBlock->bbNext;
}
}

// The 'firstILBlock' is now expected to have a profile-derived weight
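
Tying the fgprofile.cpp changes together, a hedged sketch of the allocation step: allocPgoInstrumentationBySchema is the real JIT-EE interface method, while the driver function around it is hypothetical.

void InstrumentWithSharedProfileData(ICorJitInfo* jitInfo, CORINFO_METHOD_HANDLE methodHnd,
                                     ICorJitInfo::PgoInstrumentationSchema* schema, UINT32 schemaCount)
{
    // For an OSR method the runtime can hand back the Tier0 method's existing
    // slab, with each schema entry's Offset filled in, provided the schema
    // matches the one the Tier0 compile registered.
    uint8_t* profileMemory = nullptr;
    HRESULT  res = jitInfo->allocPgoInstrumentationBySchema(methodHnd, schema, schemaCount, &profileMemory);

    if (FAILED(res))
    {
        return; // schema mismatch or allocation failure: skip instrumentation
    }

    // ... emit counter updates against profileMemory + schema[i].Offset ...
}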