From 3dc95e3316f608792e4fef82ab23b567783765c1 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 28 Jun 2023 11:39:35 -0600 Subject: [PATCH 01/13] Allocation Profiler: Types for all allocations Before this PR, we were missing the types for allocations in two cases: 1. allocations from codegen 2. allocations in gc_managed_realloc_ The second one is easy: those are always used for `buffer`s, right? For the first one: we pass the type through to the allocation function, so that we can log the allocation with the correct type. --- src/gc.c | 24 +++++++++++++++++--- src/jl_exported_funcs.inc | 2 ++ src/llvm-final-gc-lowering.cpp | 9 ++++---- src/llvm-late-gc-lowering.cpp | 40 +++++++++++++++------------------- src/llvm-pass-helpers.cpp | 12 +++++----- stdlib/Profile/test/allocs.jl | 31 ++++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 34 deletions(-) diff --git a/src/gc.c b/src/gc.c index d766f5c39fbf5..b573c08b3de35 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1040,13 +1040,21 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } -// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +// Deprecated version, supported for legacy code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); return val; } +// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc_typed(jl_ptls_t ptls, size_t sz, jl_value_t *type) +{ + jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); + jl_set_typeof(val, type); + maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); + return val; +} // This wrapper exists only to prevent `jl_gc_big_alloc_inner` from being inlined into // its callers. 
We provide an external-facing interface for callers, and inline `jl_gc_big_alloc_inner` @@ -1357,7 +1365,7 @@ STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset return jl_valueof(v); } -// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. +// Deprecated version, supported for legacy code. JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize) { @@ -1365,6 +1373,15 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); return val; } +// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_typed(jl_ptls_t ptls, int pool_offset, + int osize, jl_value_t* type) +{ + jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); + jl_set_typeof(val, type); + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); + return val; +} // This wrapper exists only to prevent `jl_gc_pool_alloc_inner` from being inlined into // its callers. We provide an external-facing interface for callers, and inline `jl_gc_pool_alloc_inner` @@ -3836,7 +3853,8 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds SetLastError(last_error); #endif errno = last_errno; - maybe_record_alloc_to_profile((jl_value_t*)b, sz, jl_gc_unknown_type_tag); + // gc_managed_realloc_ is currently used exclusively for resizing array buffers. 
+ maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); return b; } diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index a7ffedd5cba10..91ace3bb80ad9 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -158,6 +158,7 @@ XX(jl_gc_alloc_3w) \ XX(jl_gc_alloc_typed) \ XX(jl_gc_big_alloc) \ + XX(jl_gc_big_alloc_typed) \ XX(jl_gc_collect) \ XX(jl_gc_conservative_gc_support_enabled) \ XX(jl_gc_counted_calloc) \ @@ -185,6 +186,7 @@ XX(jl_gc_new_weakref_th) \ XX(jl_gc_num) \ XX(jl_gc_pool_alloc) \ + XX(jl_gc_pool_alloc_typed) \ XX(jl_gc_queue_multiroot) \ XX(jl_gc_queue_root) \ XX(jl_gc_safepoint) \ diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index ca24cda542019..950329769bc4b 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -187,12 +187,13 @@ Value *FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) { ++GCAllocBytesCount; - assert(target->arg_size() == 2); + assert(target->arg_size() == 3); CallInst *newI; IRBuilder<> builder(target); builder.SetCurrentDebugLocation(target->getDebugLoc()); auto ptls = target->getArgOperand(0); + auto type = target->getArgOperand(2); Attribute derefAttr; if (auto CI = dyn_cast(target->getArgOperand(1))) { @@ -203,19 +204,19 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) if (offset < 0) { newI = builder.CreateCall( bigAllocFunc, - { ptls, ConstantInt::get(T_size, sz + sizeof(void*)) }); + { ptls, ConstantInt::get(T_size, sz + sizeof(void*)), type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sz + sizeof(void*)); } else { auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); - newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize }); + newI = 
builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize, type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), osize); } } else { auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size); size = builder.CreateAdd(size, ConstantInt::get(T_size, sizeof(void*))); - newI = builder.CreateCall(allocTypedFunc, { ptls, size, ConstantPointerNull::get(Type::getInt8PtrTy(F.getContext())) }); + newI = builder.CreateCall(allocTypedFunc, { ptls, size, type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sizeof(void*)); } newI->setAttributes(newI->getCalledFunction()->getAttributes()); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 409d3a09c0337..10aa15eab76e0 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2349,22 +2349,6 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { IRBuilder<> builder(CI); builder.SetCurrentDebugLocation(CI->getDebugLoc()); - // Create a call to the `julia.gc_alloc_bytes` intrinsic, which is like - // `julia.gc_alloc_obj` except it doesn't set the tag. - auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); - auto ptlsLoad = get_current_ptls_from_task(builder, T_size, CI->getArgOperand(0), tbaa_gcframe); - auto ptls = builder.CreateBitCast(ptlsLoad, Type::getInt8PtrTy(builder.getContext())); - auto newI = builder.CreateCall( - allocBytesIntrinsic, - { - ptls, - builder.CreateIntCast( - CI->getArgOperand(1), - allocBytesIntrinsic->getFunctionType()->getParamType(1), - false) - }); - newI->takeName(CI); - // LLVM alignment/bit check is not happy about addrspacecast and refuse // to remove write barrier because of it. // We pretty much only load using `T_size` so try our best to strip @@ -2403,12 +2387,24 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { builder.CreateAlignmentAssumption(DL, tag, 16); } } - // Set the tag. 
- auto &M = *builder.GetInsertBlock()->getModule(); - StoreInst *store = builder.CreateAlignedStore( - tag, EmitTagPtr(builder, tag_type, T_size, newI), M.getDataLayout().getPointerABIAlignment(0)); - store->setOrdering(AtomicOrdering::Unordered); - store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + + // Create a call to the `julia.gc_alloc_bytes` intrinsic, which is like + // `julia.gc_alloc_obj` except it specializes the call based on the constant + // size of the object to allocate, to save one indirection. + auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); + auto ptlsLoad = get_current_ptls_from_task(builder, T_size, CI->getArgOperand(0), tbaa_gcframe); + auto ptls = builder.CreateBitCast(ptlsLoad, Type::getInt8PtrTy(builder.getContext())); + auto newI = builder.CreateCall( + allocBytesIntrinsic, + { + ptls, + builder.CreateIntCast( + CI->getArgOperand(1), + allocBytesIntrinsic->getFunctionType()->getParamType(1), + false), + builder.CreatePtrToInt(tag, T_size), + }); + newI->takeName(CI); // Replace uses of the call to `julia.gc_alloc_obj` with the call to // `julia.gc_alloc_bytes`. 
diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index b006f191937f5..2d96f3880ca34 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -151,7 +151,9 @@ namespace jl_intrinsics { auto intrinsic = Function::Create( FunctionType::get( T_prjlvalue, - { Type::getInt8PtrTy(ctx), T_size }, + { Type::getInt8PtrTy(ctx), + T_size, + T_size }, // type false), Function::ExternalLinkage, GC_ALLOC_BYTES_NAME); @@ -236,8 +238,8 @@ namespace jl_intrinsics { } namespace jl_well_known { - static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc); - static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc); + static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc_typed); + static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc_typed); static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); static const char *GC_ALLOC_TYPED_NAME = XSTR(jl_gc_alloc_typed); @@ -251,7 +253,7 @@ namespace jl_well_known { auto bigAllocFunc = Function::Create( FunctionType::get( T_prjlvalue, - { Type::getInt8PtrTy(ctx), T_size }, + { Type::getInt8PtrTy(ctx), T_size , T_size}, false), Function::ExternalLinkage, GC_BIG_ALLOC_NAME); @@ -267,7 +269,7 @@ namespace jl_well_known { auto poolAllocFunc = Function::Create( FunctionType::get( T_prjlvalue, - { Type::getInt8PtrTy(ctx), Type::getInt32Ty(ctx), Type::getInt32Ty(ctx) }, + { Type::getInt8PtrTy(ctx), Type::getInt32Ty(ctx), Type::getInt32Ty(ctx), T_size }, false), Function::ExternalLinkage, GC_POOL_ALLOC_NAME); diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl index c2ec7d2f6cb54..ae0cbab945f01 100644 --- a/stdlib/Profile/test/allocs.jl +++ b/stdlib/Profile/test/allocs.jl @@ -121,3 +121,34 @@ end @test length(prof.allocs) >= 1 @test length([a for a in prof.allocs if a.type == String]) >= 1 end + +@testset "alloc profiler catches allocs from codegen" begin + @eval begin + struct MyType x::Int; y::Int end + Base.:(+)(n::Number, x::MyType) = n + x.x + x.y + foo(a, x) = 
a[1] + x + wrapper(a) = foo(a, MyType(0,1)) + end + a = Any[1,2,3] + # warmup + wrapper(a) + + @eval Allocs.@profile sample_rate=1 wrapper($a) + + prof = Allocs.fetch() + Allocs.clear() + + @test length(prof.allocs) >= 1 + @test length([a for a in prof.allocs if a.type == MyType]) >= 1 +end + +@testset "alloc profiler catches allocs from buffer resize" begin + a = Int[] + Allocs.@profile sample_rate=1 for _ in 1:100; push!(a, 1); end + + prof = Allocs.fetch() + Allocs.clear() + + @test length(prof.allocs) >= 1 + @test length([a for a in prof.allocs if a.type == Profile.Allocs.BufferType]) >= 1 +end From 4c47804347903da3628a8ee0c2f2f43a44687813 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Fri, 30 Jun 2023 09:48:11 -0600 Subject: [PATCH 02/13] Switch back to setting the type tag in LLVM IR, rather than in C. The tradeoff here is: - the compiler gets to see the set instruction and it can participate in optimization - but the code size and compilation time of every allocation increases by 1 instruction. --- src/gc.c | 6 ++---- src/jl_exported_funcs.inc | 4 ++-- src/llvm-late-gc-lowering.cpp | 18 +++++++++++++++++- src/llvm-pass-helpers.cpp | 4 ++-- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/gc.c b/src/gc.c index b573c08b3de35..1978e215e04cb 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1048,10 +1048,9 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) return val; } // Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. 
-JL_DLLEXPORT jl_value_t *jl_gc_big_alloc_typed(jl_ptls_t ptls, size_t sz, jl_value_t *type) +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc_instrumented(jl_ptls_t ptls, size_t sz, jl_value_t *type) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - jl_set_typeof(val, type); maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); return val; } @@ -1374,11 +1373,10 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, return val; } // Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. -JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_typed(jl_ptls_t ptls, int pool_offset, +JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_instrumented(jl_ptls_t ptls, int pool_offset, int osize, jl_value_t* type) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - jl_set_typeof(val, type); maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); return val; } diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 91ace3bb80ad9..57a2e0cfc58ec 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -158,7 +158,7 @@ XX(jl_gc_alloc_3w) \ XX(jl_gc_alloc_typed) \ XX(jl_gc_big_alloc) \ - XX(jl_gc_big_alloc_typed) \ + XX(jl_gc_big_alloc_instrumented) \ XX(jl_gc_collect) \ XX(jl_gc_conservative_gc_support_enabled) \ XX(jl_gc_counted_calloc) \ @@ -186,7 +186,7 @@ XX(jl_gc_new_weakref_th) \ XX(jl_gc_num) \ XX(jl_gc_pool_alloc) \ - XX(jl_gc_pool_alloc_typed) \ + XX(jl_gc_pool_alloc_instrumented) \ XX(jl_gc_queue_multiroot) \ XX(jl_gc_queue_root) \ XX(jl_gc_safepoint) \ diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 10aa15eab76e0..9545912695b20 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2390,7 +2390,9 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { // Create a call to the `julia.gc_alloc_bytes` intrinsic, which is like // `julia.gc_alloc_obj` except it specializes the call based on 
the constant - // size of the object to allocate, to save one indirection. + // size of the object to allocate, to save one indirection, and doesn't set + // the type tag. (Note that if the size is not a constant, it will call + // gc_alloc_obj, and will redundantly set the tag.) auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); auto ptlsLoad = get_current_ptls_from_task(builder, T_size, CI->getArgOperand(0), tbaa_gcframe); auto ptls = builder.CreateBitCast(ptlsLoad, Type::getInt8PtrTy(builder.getContext())); @@ -2406,6 +2408,20 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { }); newI->takeName(CI); + // Now, finally, set the tag. We do this in IR instead of in the C alloc + // function, to provide possible optimization opportunities. (I think? TBH + // the most recent editor of this code is not entirely clear on why we + // prefer to set the tag in the generated code. Providing optimization + // opportunities is the most likely reason; the tradeoff is slightly + // larger code size and increased compilation time, compiling this + // instruction at every allocation site, rather than once in the C alloc + // function.) + auto &M = *builder.GetInsertBlock()->getModule(); + StoreInst *store = builder.CreateAlignedStore( + tag, EmitTagPtr(builder, tag_type, T_size, newI), M.getDataLayout().getPointerABIAlignment(0)); + store->setOrdering(AtomicOrdering::Unordered); + store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + // Replace uses of the call to `julia.gc_alloc_obj` with the call to // `julia.gc_alloc_bytes`. 
CI->replaceAllUsesWith(newI); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 2d96f3880ca34..5ade0f9822fa8 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -238,8 +238,8 @@ namespace jl_intrinsics { } namespace jl_well_known { - static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc_typed); - static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc_typed); + static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc_instrumented); + static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc_instrumented); static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); static const char *GC_ALLOC_TYPED_NAME = XSTR(jl_gc_alloc_typed); From 8cd9a06d0400b224448c0e48ebbc6a89c349f6eb Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Fri, 7 Jul 2023 11:33:21 -0600 Subject: [PATCH 03/13] Review suggestion: report (positive) delta in realloc --- src/gc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gc.c b/src/gc.c index 1978e215e04cb..5390b08cc9e8c 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3852,7 +3852,9 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds #endif errno = last_errno; // gc_managed_realloc_ is currently used exclusively for resizing array buffers. 
- maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); + if (allocsz > oldsz) { + maybe_record_alloc_to_profile((jl_value_t*)b, allocsz - oldsz, (jl_datatype_t*)jl_buff_tag); + } return b; } From 736da48a6e924ac4bc1422738e98f58264df0647 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Fri, 7 Jul 2023 11:55:33 -0600 Subject: [PATCH 04/13] Update documentation for the Allocation Profiler; now we have all types Add docs section on external profile visualizer tools --- doc/src/manual/profile.md | 76 +++++++++++++++++++++++++++++++++++---- src/gc-alloc-profiler.h | 1 + 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index e5f1d6c417fa6..4d1147b155292 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -338,15 +338,77 @@ argument can be passed to speed it up by making it skip some allocations. Passing `sample_rate=1.0` will make it record everything (which is slow); `sample_rate=0.1` will record only 10% of the allocations (faster), etc. -!!! note +!!! compat "Julia 1.11" + + Older versions of Julia could not capture types in all cases. In older versions of + julia, if you see an allocation of type `Profile.Allocs.UnknownType`, it means that + the profiler doesn't know what type of object was allocated. This mainly happened when + the allocation was coming from generated code produced by the compiler. See + [issue #43688](https://github.com/JuliaLang/julia/issues/43688) for more info. + + In Julia 1.11+, all allocations should have a type reported. + +For more details on how to use this tool, please see the following talk from JuliaCon 2022: +https://www.youtube.com/watch?v=BFvpwC8hEWQ + +##### Allocation Profiler Example + +Here is an example of how to invoke the Allocation profiler. A good number of samples to aim +for is around 1 - 10 thousand. Too many, and the profile visualizer can get overwhelmed, and +profiling will be slow. 
Too few, and you don't have a representative sample. +```julia-repl +julia> import Profile + +julia> @time my_function() # Estimate allocations from a (second-run) of the function + 0.110018 seconds (1.50 M allocations: 58.725 MiB, 17.17% gc time) +500000 + +julia> Profile.Allocs.clear() + +julia> Profile.Allocs.@profile sample_rate=0.001 begin # 1.5 M * 0.001 = ~1.5K allocs. + my_function() + end +500000 + +julia> prof = Profile.Allocs.fetch(); + +julia> length(prof.allocs) # Confirm we have expected number of allocations. +1410 + +julia> using PProf # Now, visualize with an external tool, like PProf or ProfileCanvas. + +julia> PProf.Allocs.pprof(prof; from_c=false) +Analyzing 1410 allocation samples... 100%|████████████████████████████████| Time: 0:00:16 +"alloc-profile.pb.gz" + +Serving web UI on http://localhost:62261 +``` + +##### Allocation Profiling Tips + +As stated above, aim for around 1-10 thousand samples in your profile. + +Note that we are uniformly sampling in the space of _all allocations_, and are not weighting +our samples by the size of the allocation. So a given allocation profile may not give a +representative profile of where most bytes are allocated in your program, unless you had set +`sample_rate=1`. + +Allocations can come from users directly constructing objects, but can also come from inside +the runtime or be inserted into compiled code to handle type instability. Looking at the +"source code" view can be helpful to isolate them, and then other external tools such as +[`Cthulhu.jl`](https://github.com/JuliaDebug/Cthulhu.jl) can be useful for identifying the +cause of the allocation. - The current implementation of the Allocations Profiler _does not - capture types for all allocations._ Allocations for which the profiler - could not capture the type are represented as having type - `Profile.Allocs.UnknownType`. 
+##### Allocation Profile Visualization Tools - You can read more about the missing types and the plan to improve this, here: - [issue #43688](https://github.com/JuliaLang/julia/issues/43688). +There are several profiling visualization tools now that can all display Allocation +Profiles. Here is a small list of some of the main ones we know about: +- [PProf.jl](https://github.com/JuliaPerf/PProf.jl) +- [ProfileCanvas.jl](https://github.com/pfitzseb/ProfileCanvas.jl) +- VSCode's built-in profile visualizer (`@profview_allocs`) [docs needed] +- Viewing the results directly in the REPL + - You can inspect the results in the REPL via [`Profile.Allocs.fetch()`](@ref), to view + the stacktrace and type of each allocation. #### Line-by-Line Allocation Tracking diff --git a/src/gc-alloc-profiler.h b/src/gc-alloc-profiler.h index 3fd8bf4388a0a..fcd8e45caa2d8 100644 --- a/src/gc-alloc-profiler.h +++ b/src/gc-alloc-profiler.h @@ -35,6 +35,7 @@ void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t extern int g_alloc_profile_enabled; +// This should only be used from _deprecated_ code paths. We shouldn't see UNKNOWN anymore. #define jl_gc_unknown_type_tag ((jl_datatype_t*)0xdeadaa03) static inline void maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT { From 0de76b90f96f44c239ef01b1208172ed64131d79 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Fri, 7 Jul 2023 12:22:50 -0600 Subject: [PATCH 05/13] First attempt at fixing llvm tests... I frankly don't know enough to do this yet, I think. 
--- test/llvmpasses/alloc-opt-gcframe.ll | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/test/llvmpasses/alloc-opt-gcframe.ll b/test/llvmpasses/alloc-opt-gcframe.ll index a04d6566cec0a..b68ed11e96387 100644 --- a/test/llvmpasses/alloc-opt-gcframe.ll +++ b/test/llvmpasses/alloc-opt-gcframe.ll @@ -14,17 +14,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; CHECK-NOT: @julia.gc_alloc_obj ; TYPED: %current_task = getelementptr inbounds {}*, {}** %gcstack, i64 -12 +; TYPED-NEXT: call void @llvm.assume(i1 true) [ "align"({} addrspace(10)* @tag, i64 16) ] ; TYPED-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 ; TYPED-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; TYPED-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; TYPED-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; TYPED-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16) +; TYPED-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 ptrtoint [[.*]]) ; TYPED: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %gcstack, i64 -12 ; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc(ptr [[ptls_load]], i32 [[SIZE_T:[0-9]+]], i32 16) +; OPAQUE-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc_instrumented(ptr [[ptls_load]], i32 [[SIZE_T:[0-9]+]], i32 16, ptr) ; OPAQUE: store atomic ptr 
addrspace(10) @tag, ptr addrspace(10) {{.*}} unordered, align 8, !tbaa !4 define {} addrspace(10)* @return_obj() { @@ -39,7 +40,7 @@ define {} addrspace(10)* @return_obj() { ; CHECK-LABEL: @return_load ; CHECK: alloca i64 ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc +; CHECK-NOT: @ijl_gc_pool_alloc_instrumented ; TYPED: call void @llvm.lifetime.start{{.*}}(i64 8, i8* ; OPAQUE: call void @llvm.lifetime.start{{.*}}(i64 8, ptr ; CHECK-NOT: @tag @@ -62,7 +63,7 @@ define i64 @return_load(i64 %i) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK: @ijl_gc_pool_alloc +; CHECK: @ijl_gc_pool_alloc_instrumented ; TYPED: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 ; OPAQUE: store atomic ptr addrspace(10) @tag, ptr addrspace(10) {{.*}} unordered, align 8, !tbaa !4 define void @ccall_obj(i8* %fptr) { @@ -81,7 +82,7 @@ define void @ccall_obj(i8* %fptr) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc +; CHECK-NOT: @jl_gc_pool_alloc_instrumented ; TYPED: call void @llvm.lifetime.start{{.*}}(i64 8, i8* ; TYPED: %f = bitcast i8* %fptr to void (i8*)* @@ -109,7 +110,7 @@ define void @ccall_ptr(i8* %fptr) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK: @ijl_gc_pool_alloc +; CHECK: @ijl_gc_pool_alloc_instrumented ; TYPED: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 ; OPAQUE: store atomic ptr addrspace(10) @tag, ptr addrspace(10) {{.*}} unordered, align 8, !tbaa !4 define void @ccall_unknown_bundle(i8* %fptr) { @@ -179,7 +180,7 @@ L3: ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc +; CHECK-NOT: 
@jl_gc_pool_alloc_instrumented ; CHECK-NOT: store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}}, align 8, !tbaa !4 define void @object_field({} addrspace(10)* %field) { %pgcstack = call {}*** @julia.get_pgcstack() @@ -198,7 +199,7 @@ define void @object_field({} addrspace(10)* %field) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc +; CHECK-NOT: @jl_gc_pool_alloc_instrumented ; TYPED: call void @llvm.memcpy.p0i8.p0i8.i64 ; OPAQUE: call void @llvm.memcpy.p0.p0.i64 define void @memcpy_opt(i8* %v22) { @@ -218,7 +219,7 @@ top: ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc +; CHECK-NOT: @jl_gc_pool_alloc_instrumented ; CHECK-NOT: @llvm.lifetime.end ; CHECK: @external_function define void @preserve_opt(i8* %v22) { @@ -270,11 +271,11 @@ L3: } ; CHECK-LABEL: }{{$}} -; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, -; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, +; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8*, +; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc_instrumented(i8*, -; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_pool_alloc(ptr, -; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_big_alloc(ptr, +; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_pool_alloc_instrumented(ptr, +; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_big_alloc_instrumented(ptr, declare void @external_function() declare {}*** @julia.get_pgcstack() declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_obj({}**, i64, {} addrspace(10)*) From f7e39c4cd877f537f8a6632acc02b2576dd0c82c Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Fri, 7 Jul 2023 14:17:19 -0600 Subject: [PATCH 06/13] Docs suggestions Co-authored-by: 
Valentin Churavy --- doc/src/manual/profile.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index 4d1147b155292..a37719c9b782a 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -341,12 +341,12 @@ Passing `sample_rate=1.0` will make it record everything (which is slow); !!! compat "Julia 1.11" Older versions of Julia could not capture types in all cases. In older versions of - julia, if you see an allocation of type `Profile.Allocs.UnknownType`, it means that + Julia, if you see an allocation of type `Profile.Allocs.UnknownType`, it means that the profiler doesn't know what type of object was allocated. This mainly happened when the allocation was coming from generated code produced by the compiler. See [issue #43688](https://github.com/JuliaLang/julia/issues/43688) for more info. - In Julia 1.11+, all allocations should have a type reported. + Since Julia 1.11, all allocations should have a type reported. For more details on how to use this tool, please see the following talk from JuliaCon 2022: https://www.youtube.com/watch?v=BFvpwC8hEWQ @@ -356,6 +356,7 @@ https://www.youtube.com/watch?v=BFvpwC8hEWQ Here is an example of how to invoke the Allocation profiler. A good number of samples to aim for is around 1 - 10 thousand. Too many, and the profile visualizer can get overwhelmed, and profiling will be slow. Too few, and you don't have a representative sample. + ```julia-repl julia> import Profile From c28f0ef010d5cb9e8748c3f2d5fca413b23b8af5 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sat, 29 Jul 2023 13:02:36 -0400 Subject: [PATCH 07/13] Now i know about the LLVM tests! 
:tada: --- test/llvmpasses/alloc-opt-gcframe.ll | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/test/llvmpasses/alloc-opt-gcframe.ll b/test/llvmpasses/alloc-opt-gcframe.ll index b68ed11e96387..f600399ac2a7a 100644 --- a/test/llvmpasses/alloc-opt-gcframe.ll +++ b/test/llvmpasses/alloc-opt-gcframe.ll @@ -14,18 +14,17 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; CHECK-NOT: @julia.gc_alloc_obj ; TYPED: %current_task = getelementptr inbounds {}*, {}** %gcstack, i64 -12 -; TYPED-NEXT: call void @llvm.assume(i1 true) [ "align"({} addrspace(10)* @tag, i64 16) ] -; TYPED-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 +; TYPED: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 ; TYPED-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; TYPED-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; TYPED-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; TYPED-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 ptrtoint [[.*]]) +; TYPED-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 {{.*}} @tag {{.*}}) ; TYPED: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %gcstack, i64 -12 -; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 +; OPAQUE: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc_instrumented(ptr [[ptls_load]], i32 
[[SIZE_T:[0-9]+]], i32 16, ptr) +; OPAQUE-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc_instrumented(ptr [[ptls_load]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 {{.*}} @tag {{.*}}) ; OPAQUE: store atomic ptr addrspace(10) @tag, ptr addrspace(10) {{.*}} unordered, align 8, !tbaa !4 define {} addrspace(10)* @return_obj() { @@ -40,7 +39,7 @@ define {} addrspace(10)* @return_obj() { ; CHECK-LABEL: @return_load ; CHECK: alloca i64 ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @ijl_gc_pool_alloc_instrumented +; CHECK-NOT: @jl_gc_pool_alloc ; TYPED: call void @llvm.lifetime.start{{.*}}(i64 8, i8* ; OPAQUE: call void @llvm.lifetime.start{{.*}}(i64 8, ptr ; CHECK-NOT: @tag @@ -63,7 +62,7 @@ define i64 @return_load(i64 %i) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK: @ijl_gc_pool_alloc_instrumented +; CHECK: @ijl_gc_pool_alloc ; TYPED: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 ; OPAQUE: store atomic ptr addrspace(10) @tag, ptr addrspace(10) {{.*}} unordered, align 8, !tbaa !4 define void @ccall_obj(i8* %fptr) { @@ -82,7 +81,7 @@ define void @ccall_obj(i8* %fptr) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc_instrumented +; CHECK-NOT: @jl_gc_pool_alloc ; TYPED: call void @llvm.lifetime.start{{.*}}(i64 8, i8* ; TYPED: %f = bitcast i8* %fptr to void (i8*)* @@ -110,7 +109,7 @@ define void @ccall_ptr(i8* %fptr) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK: @ijl_gc_pool_alloc_instrumented +; CHECK: @ijl_gc_pool_alloc ; TYPED: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 ; OPAQUE: store atomic ptr addrspace(10) @tag, ptr addrspace(10) {{.*}} 
unordered, align 8, !tbaa !4 define void @ccall_unknown_bundle(i8* %fptr) { @@ -180,7 +179,7 @@ L3: ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc_instrumented +; CHECK-NOT: @jl_gc_pool_alloc ; CHECK-NOT: store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}}, align 8, !tbaa !4 define void @object_field({} addrspace(10)* %field) { %pgcstack = call {}*** @julia.get_pgcstack() @@ -199,7 +198,7 @@ define void @object_field({} addrspace(10)* %field) { ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc_instrumented +; CHECK-NOT: @jl_gc_pool_alloc ; TYPED: call void @llvm.memcpy.p0i8.p0i8.i64 ; OPAQUE: call void @llvm.memcpy.p0.p0.i64 define void @memcpy_opt(i8* %v22) { @@ -219,7 +218,7 @@ top: ; TYPED: call {}*** @julia.get_pgcstack() ; OPAQUE: call ptr @julia.get_pgcstack() ; CHECK-NOT: @julia.gc_alloc_obj -; CHECK-NOT: @jl_gc_pool_alloc_instrumented +; CHECK-NOT: @jl_gc_pool_alloc ; CHECK-NOT: @llvm.lifetime.end ; CHECK: @external_function define void @preserve_opt(i8* %v22) { From 4a135a0e63faf921484fd6d5499d97e3f6aeab5d Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sat, 29 Jul 2023 14:26:44 -0400 Subject: [PATCH 08/13] Fix rest of the llvmpasses tests --- test/llvmpasses/final-lower-gc.ll | 4 ++-- test/llvmpasses/late-lower-gc-addrspaces.ll | 8 ++++---- test/llvmpasses/late-lower-gc.ll | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/llvmpasses/final-lower-gc.ll b/test/llvmpasses/final-lower-gc.ll index 5bbaa2f4d81ea..ccb2571f0080f 100644 --- a/test/llvmpasses/final-lower-gc.ll +++ b/test/llvmpasses/final-lower-gc.ll @@ -80,8 +80,8 @@ top: %pgcstack = call {}*** @julia.get_pgcstack() %ptls = call {}*** @julia.ptls_states() %ptls_i8 = bitcast {}*** %ptls to i8* -; TYPED: %v = call noalias nonnull 
dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc -; OPAQUE: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc +; TYPED: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc_instrumented +; OPAQUE: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc_instrumented %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 diff --git a/test/llvmpasses/late-lower-gc-addrspaces.ll b/test/llvmpasses/late-lower-gc-addrspaces.ll index 9849f432fb9a7..8021aab6c99a3 100644 --- a/test/llvmpasses/late-lower-gc-addrspaces.ll +++ b/test/llvmpasses/late-lower-gc-addrspaces.ll @@ -69,7 +69,7 @@ top: ; TYPED-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; TYPED-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; TYPED-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; TYPED-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; TYPED-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; TYPED-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -77,7 +77,7 @@ top: ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 ; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call ptr addrspace(10) 
@julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8) +; OPAQUE-NEXT: %v = call ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 ; OPAQUE-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 %v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task, i64 8, {} addrspace(10)* @tag) @@ -102,7 +102,7 @@ top: ; TYPED-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; TYPED-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; TYPED-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; TYPED-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; TYPED-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; TYPED-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -110,7 +110,7 @@ top: ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 ; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8) +; OPAQUE-NEXT: %v = call ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 ; OPAQUE-NEXT: store atomic ptr 
addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 %v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task, i64 8, {} addrspace(10)* @tag) diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll index ee13c5e9207fb..fd881a0d89066 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -66,7 +66,7 @@ top: ; TYPED-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; TYPED-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; TYPED-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; TYPED-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; TYPED-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; TYPED-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -74,7 +74,7 @@ top: ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 ; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8) +; OPAQUE-NEXT: %v = call ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 ; OPAQUE-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 %v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** 
%current_task, i64 8, {} addrspace(10)* @tag) @@ -99,7 +99,7 @@ top: ; TYPED-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; TYPED-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; TYPED-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}}}) ; TYPED-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; TYPED-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; TYPED-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -107,7 +107,7 @@ top: ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 ; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8) +; OPAQUE-NEXT: %v = call ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 ; OPAQUE-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 %v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task, i64 8, {} addrspace(10)* @tag) From 0c069d6783a8644282bee49c588b4766a1aa0c58 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sat, 29 Jul 2023 20:05:36 -0400 Subject: [PATCH 09/13] typo --- test/llvmpasses/late-lower-gc.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/llvmpasses/late-lower-gc.ll 
b/test/llvmpasses/late-lower-gc.ll index fd881a0d89066..054747486cab0 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -99,7 +99,7 @@ top: ; TYPED-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; TYPED-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; TYPED-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}}}) +; TYPED-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; TYPED-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; TYPED-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; TYPED-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 From a8691b1e86ca9fb71bf954b3e9065fe5f1902eb2 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sat, 29 Jul 2023 20:22:47 -0400 Subject: [PATCH 10/13] Fix jl_gc_alloc_typed from codegen. This must never happen in any of the tests, other than this one llvm unit test! 
:o --- src/llvm-pass-helpers.cpp | 2 +- test/llvmpasses/final-lower-gc.ll | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 5ade0f9822fa8..39d37cee3928b 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -303,7 +303,7 @@ namespace jl_well_known { T_prjlvalue, { Type::getInt8PtrTy(ctx), T_size, - Type::getInt8PtrTy(ctx) }, + T_size }, // type false), Function::ExternalLinkage, GC_ALLOC_TYPED_NAME); diff --git a/test/llvmpasses/final-lower-gc.ll b/test/llvmpasses/final-lower-gc.ll index ccb2571f0080f..64d2c06e534b2 100644 --- a/test/llvmpasses/final-lower-gc.ll +++ b/test/llvmpasses/final-lower-gc.ll @@ -18,7 +18,7 @@ declare noalias nonnull {} addrspace(10)** @julia.new_gc_frame(i32) declare void @julia.push_gc_frame({} addrspace(10)**, i32) declare {} addrspace(10)** @julia.get_gc_frame_slot({} addrspace(10)**, i32) declare void @julia.pop_gc_frame({} addrspace(10)**) -declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_bytes(i8*, i64) #0 +declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_bytes(i8*, i64, i64) #0 attributes #0 = { allocsize(1) } @@ -82,7 +82,7 @@ top: %ptls_i8 = bitcast {}*** %ptls to i8* ; TYPED: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc_instrumented ; OPAQUE: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc_instrumented - %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8) + %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8, i64 12341234) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* %1, align 8, !tbaa !0 @@ -96,9 +96,9 @@ top: %ptls = call {}*** @julia.ptls_states() %ptls_i8 = bitcast {}*** %ptls to i8* ; CHECK: %0 = add i64 
%size, 8 -; TYPED: %v = call noalias nonnull dereferenceable(8) {} addrspace(10)* @ijl_gc_alloc_typed(i8* %ptls_i8, i64 %0, i8* null) -; OPAQUE: %v = call noalias nonnull dereferenceable(8) ptr addrspace(10) @ijl_gc_alloc_typed(ptr %ptls_i8, i64 %0, ptr null) - %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 %size) +; TYPED: %v = call noalias nonnull dereferenceable(8) {} addrspace(10)* @ijl_gc_alloc_typed(i8* %ptls_i8, i64 %0, i64 12341234) +; OPAQUE: %v = call noalias nonnull dereferenceable(8) ptr addrspace(10) @ijl_gc_alloc_typed(ptr %ptls_i8, i64 %0, i64 12341234) + %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 %size, i64 12341234) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* %1, align 8, !tbaa !0 From 54a948d3c12d0ecac4938049b0a8b82ae2b5599f Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sat, 29 Jul 2023 20:42:35 -0400 Subject: [PATCH 11/13] PR Feedback: Add simpler example, and explain the in-depth example more. --- doc/src/manual/profile.md | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index a37719c9b782a..c898d48b4c85d 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -353,9 +353,20 @@ https://www.youtube.com/watch?v=BFvpwC8hEWQ ##### Allocation Profiler Example -Here is an example of how to invoke the Allocation profiler. A good number of samples to aim -for is around 1 - 10 thousand. Too many, and the profile visualizer can get overwhelmed, and -profiling will be slow. Too few, and you don't have a representative sample. +In this simple example, we use PProf to visualize the alloc profile. You could use another +visualization tool instead. We collect the profile (specifying a sample rate), then we visualize it. 
+```julia +using Profile, PProf +Profile.Allocs.clear() +Profile.Allocs.@profile sample_rate=0.0001 my_function() +PProf.Allocs.pprof() +``` + +Here is a more in-depth example, showing how we can tune the sample rate. A +good number of samples to aim for is around 1 - 10 thousand. Too many, and the +profile visualizer can get overwhelmed, and profiling will be slow. Too few, +and you don't have a representative sample. + ```julia-repl julia> import Profile @@ -371,17 +382,18 @@ julia> Profile.Allocs.@profile sample_rate=0.001 begin # 1.5 M * 0.001 = ~1.5K end 500000 -julia> prof = Profile.Allocs.fetch(); +julia> prof = Profile.Allocs.fetch(); # If you want, you can also manually inspect the results. julia> length(prof.allocs) # Confirm we have expected number of allocations. -1410 +1515 julia> using PProf # Now, visualize with an external tool, like PProf or ProfileCanvas. -julia> PProf.Allocs.pprof(prof; from_c=false) -Analyzing 1410 allocation samples... 100%|████████████████████████████████| Time: 0:00:16 +julia> PProf.Allocs.pprof(prof; from_c=false) # You can optionally pass in a previously fetched profile result. +Analyzing 1515 allocation samples... 100%|████████████████████████████████| Time: 0:00:00 "alloc-profile.pb.gz" +julia> Main binary filename not available. Serving web UI on http://localhost:62261 ``` From 53be440d074957a624113b0b38b1471b27d905d5 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sat, 29 Jul 2023 23:01:21 -0400 Subject: [PATCH 12/13] Update profile.md --- doc/src/manual/profile.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index c898d48b4c85d..68ff555b75ff3 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -391,11 +391,11 @@ julia> using PProf # Now, visualize with an external tool, like PProf or Profil julia> PProf.Allocs.pprof(prof; from_c=false) # You can optionally pass in a previously fetched profile result. 
Analyzing 1515 allocation samples... 100%|████████████████████████████████| Time: 0:00:00 -"alloc-profile.pb.gz" - -julia> Main binary filename not available. +Main binary filename not available. Serving web UI on http://localhost:62261 +"alloc-profile.pb.gz" ``` +Then you can view the profile by navigating to http://localhost:62261. See PProf package for more options. ##### Allocation Profiling Tips From 13d08c7aa9946f507c89efd460cb9c69985ca508 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sat, 29 Jul 2023 23:02:13 -0400 Subject: [PATCH 13/13] Update profile.md --- doc/src/manual/profile.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index 68ff555b75ff3..6e9f98d3f1780 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -395,7 +395,8 @@ Main binary filename not available. Serving web UI on http://localhost:62261 "alloc-profile.pb.gz" ``` -Then you can view the profile by navigating to http://localhost:62261. See PProf package for more options. +Then you can view the profile by navigating to http://localhost:62261, and the profile is also saved to disk as `alloc-profile.pb.gz`. +See the PProf package for more options. ##### Allocation Profiling Tips