From 2623a10c18646e054cdeb47b4a0b138654097f07 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 19 Nov 2022 12:16:04 +0100 Subject: [PATCH] Improve performance of global code by emitting fewer atomic barriers. --- src/codegen.cpp | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index cdc5e00a26281..f000eb9d57516 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -7791,13 +7791,6 @@ static jl_llvm_functions_t ctx.builder.SetInsertPoint(tryblk); } else { - if (!jl_is_method(ctx.linfo->def.method) && !ctx.is_opaque_closure) { - // TODO: inference is invalid if this has any effect (which it often does) - LoadInst *world = ctx.builder.CreateAlignedLoad(getSizeTy(ctx.builder.getContext()), - prepare_global_in(jl_Module, jlgetworld_global), Align(sizeof(size_t))); - world->setOrdering(AtomicOrdering::Acquire); - ctx.builder.CreateAlignedStore(world, world_age_field, Align(sizeof(size_t))); - } emit_stmtpos(ctx, stmt, cursor); mallocVisitStmt(debuginfoloc, nullptr); } @@ -8023,12 +8016,12 @@ static jl_llvm_functions_t } // step 12. Perform any delayed instantiations - if (ctx.debug_enabled) { - bool in_prologue = true; - for (auto &BB : *ctx.f) { - for (auto &I : BB) { - CallBase *call = dyn_cast(&I); - if (call && !I.getDebugLoc()) { + bool in_prologue = true; + for (auto &BB : *ctx.f) { + for (auto &I : BB) { + CallBase *call = dyn_cast(&I); + if (call) { + if (ctx.debug_enabled && !I.getDebugLoc()) { // LLVM Verifier: inlinable function call in a function with debug info must have a !dbg location // make sure that anything we attempt to call has some inlining info, just in case optimization messed up // (except if we know that it is an intrinsic used in our prologue, which should never have its own debug subprogram) @@ -8037,12 +8030,24 @@ static jl_llvm_functions_t I.setDebugLoc(topdebugloc); } } - if (&I == &prologue_end) - in_prologue = false; + if (toplevel && !ctx.is_opaque_closure && !in_prologue) { + // we're at toplevel; insert an atomic barrier between every instruction + // TODO: inference is invalid if this has any effect (which it often does) + LoadInst *load_world = new LoadInst(getSizeTy(ctx.builder.getContext()), + prepare_global_in(jl_Module, jlgetworld_global), Twine(), + /*isVolatile*/false, Align(sizeof(size_t)), /*insertBefore*/&I); + load_world->setOrdering(AtomicOrdering::Monotonic); + StoreInst *store_world = new StoreInst(load_world, world_age_field, + /*isVolatile*/false, Align(sizeof(size_t)), /*insertBefore*/&I); + (void)store_world; + } } + if (&I == &prologue_end) + in_prologue = false; } - dbuilder.finalize(); } + if (ctx.debug_enabled) + dbuilder.finalize(); if (ctx.vaSlot > 0) { // remove VA allocation if we never referenced it