From ac41e0ef860af53ab2519bc4935c51ea09c98d9b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 19 Nov 2022 12:16:04 +0100 Subject: [PATCH] Improve performance of global code by emitting fewer atomic barriers. --- src/codegen.cpp | 51 +++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 71a704910b70b..ea4a0fb766635 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -5959,7 +5959,7 @@ static Function* gen_cfun_wrapper( Value *world_v = ctx.builder.CreateAlignedLoad(ctx.types().T_size, prepare_global_in(jl_Module, jlgetworld_global), ctx.types().alignof_ptr); - cast(world_v)->setOrdering(AtomicOrdering::Acquire); + cast(world_v)->setOrdering(AtomicOrdering::Monotonic); Value *age_ok = NULL; if (calltype) { @@ -7760,11 +7760,19 @@ static jl_llvm_functions_t Instruction &prologue_end = ctx.builder.GetInsertBlock()->back(); - // step 11a. Emit the entry safepoint + // step 11a. For top-level code, load the world age + if (toplevel && !ctx.is_opaque_closure) { + LoadInst *world = ctx.builder.CreateAlignedLoad(ctx.types().T_size, + prepare_global_in(jl_Module, jlgetworld_global), ctx.types().alignof_ptr); + world->setOrdering(AtomicOrdering::Monotonic); + ctx.builder.CreateAlignedStore(world, world_age_field, ctx.types().alignof_ptr); + } + + // step 11b. Emit the entry safepoint if (JL_FEAT_TEST(ctx, safepoint_on_entry)) emit_gc_safepoint(ctx.builder, ctx.types().T_size, get_current_ptls(ctx), ctx.tbaa().tbaa_const); - // step 11b. Do codegen in control flow order + // step 11c. 
Do codegen in control flow order std::vector workstack; std::map BB; std::map come_from_bb; @@ -8087,13 +8095,6 @@ static jl_llvm_functions_t ctx.builder.SetInsertPoint(tryblk); } else { - if (!jl_is_method(ctx.linfo->def.method) && !ctx.is_opaque_closure) { - // TODO: inference is invalid if this has any effect (which it often does) - LoadInst *world = ctx.builder.CreateAlignedLoad(ctx.types().T_size, - prepare_global_in(jl_Module, jlgetworld_global), ctx.types().alignof_ptr); - world->setOrdering(AtomicOrdering::Acquire); - ctx.builder.CreateAlignedStore(world, world_age_field, ctx.types().alignof_ptr); - } emit_stmtpos(ctx, stmt, cursor); mallocVisitStmt(debuginfoloc, nullptr); } @@ -8319,12 +8320,12 @@ static jl_llvm_functions_t } // step 12. Perform any delayed instantiations - if (ctx.debug_enabled) { - bool in_prologue = true; - for (auto &BB : *ctx.f) { - for (auto &I : BB) { - CallBase *call = dyn_cast(&I); - if (call && !I.getDebugLoc()) { + bool in_prologue = true; + for (auto &BB : *ctx.f) { + for (auto &I : BB) { + CallBase *call = dyn_cast(&I); + if (call) { + if (ctx.debug_enabled && !I.getDebugLoc()) { // LLVM Verifier: inlinable function call in a function with debug info must have a !dbg location // make sure that anything we attempt to call has some inlining info, just in case optimization messed up // (except if we know that it is an intrinsic used in our prologue, which should never have its own debug subprogram) @@ -8333,12 +8334,24 @@ static jl_llvm_functions_t I.setDebugLoc(topdebugloc); } } - if (&I == &prologue_end) - in_prologue = false; + if (toplevel && !ctx.is_opaque_closure && !in_prologue) { + // we're at toplevel; insert an atomic barrier between every instruction + // TODO: inference is invalid if this has any effect (which it often does) + LoadInst *world = new LoadInst(ctx.types().T_size, + prepare_global_in(jl_Module, jlgetworld_global), Twine(), + /*isVolatile*/false, ctx.types().alignof_ptr, /*insertBefore*/&I); + 
world->setOrdering(AtomicOrdering::Monotonic); + StoreInst *store_world = new StoreInst(world, world_age_field, + /*isVolatile*/false, ctx.types().alignof_ptr, /*insertBefore*/&I); + (void)store_world; + } } + if (&I == &prologue_end) + in_prologue = false; } - dbuilder.finalize(); } + if (ctx.debug_enabled) + dbuilder.finalize(); if (ctx.vaSlot > 0) { // remove VA allocation if we never referenced it