Memory error when transforming model to gpu #9

Closed
dongqiu93 opened this issue Apr 18, 2024 · 3 comments
@dongqiu93

The script breaks when trying to convert the model to the GPU. I was using the DataFrame interface to run fit!, and it breaks at this step:

m = m |> gpu
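
For context, here's a minimal sketch of what I'm running (illustrative column names and random data, not my actual script; the device and loss option names are assumed from the package docs):

using NeuroTreeModels, DataFrames

# Illustrative data; the real DataFrame is larger.
dtrain = DataFrame(x1=randn(Float32, 1_000), x2=randn(Float32, 1_000), y=randn(Float32, 1_000))
deval = DataFrame(x1=randn(Float32, 200), x2=randn(Float32, 200), y=randn(Float32, 200))

# device=:gpu and loss=:gaussian_mle match my config (kwarg names assumed from the docs).
config = NeuroTreeRegressor(device=:gpu, loss=:gaussian_mle)

# fit converts the model internally (the `m = m |> gpu` step at src/fit.jl:32),
# which is where it fails.
m = NeuroTreeModels.fit(config, dtrain; feature_names=["x1", "x2"], target_name="y", deval)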

The full error log:

CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA ~/.julia/packages/CUDA/fGE8R/lib/cudadrv/libcuda.jl:30
  [2] check
    @ CUDA ~/.julia/packages/CUDA/fGE8R/lib/cudadrv/libcuda.jl:37 [inlined]
  [3] cuMemAllocFromPoolAsync
    @ CUDA ~/.julia/packages/CUDA/fGE8R/lib/utils/call.jl:30 [inlined]
  [4] #alloc#1
    @ CUDA.Mem ~/.julia/packages/CUDA/fGE8R/lib/cudadrv/memory.jl:81 [inlined]
  [5] alloc
    @ CUDA.Mem ~/.julia/packages/CUDA/fGE8R/lib/cudadrv/memory.jl:71 [inlined]
  [6] actual_alloc(bytes::Int64; async::Bool, stream::CuStream, pool::CuMemoryPool)
    @ CUDA ~/.julia/packages/CUDA/fGE8R/src/pool.jl:66
  [7] actual_alloc
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:59 [inlined]
  [8] #1060
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:466 [inlined]
  [9] retry_reclaim
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:383 [inlined]
 [10] macro expansion
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:465 [inlined]
 [11] macro expansion
    @ ./timing.jl:395 [inlined]
 [12] #_alloc#1059
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:461 [inlined]
 [13] _alloc
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:457 [inlined]
 [14] #alloc#1058
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:447 [inlined]
 [15] alloc
    @ ~/.julia/packages/CUDA/fGE8R/src/pool.jl:441 [inlined]
 [16] CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}(::UndefInitializer, dims::Tuple{Int64})
    @ CUDA ~/.julia/packages/CUDA/fGE8R/src/array.jl:74
 [17] CuArray{T, N, B}(::UndefInitializer, dims::Tuple{Vararg{Int64, N}}) where {T, N, B}
    @ CUDA ~/.julia/packages/CUDA/fGE8R/src/array.jl:418 [inlined]
 [18] adapt_storage(::CUDA.CuArrayKernelAdaptor{CUDA.Mem.DeviceBuffer}, xs::Array{Float32, 3})
    @ CUDA ~/.julia/packages/CUDA/fGE8R/src/array.jl:740 [inlined]
 [19] adapt_structure
    @ ~/.julia/packages/Adapt/7T9au/src/Adapt.jl:57 [inlined]
 [20] adapt
    @ ~/.julia/packages/Adapt/7T9au/src/Adapt.jl:40 [inlined]
 [21] #cu#1106
    @ ~/.julia/packages/CUDA/fGE8R/src/array.jl:805 [inlined]
 [22] cu
    @ ~/.julia/packages/CUDA/fGE8R/src/array.jl:792 [inlined]
 [23] adapt_storage(to::Flux.FluxCUDAAdaptor, x::Vector{Float32})
    @ FluxCUDAExt ~/.julia/packages/Flux/Wz6D4/ext/FluxCUDAExt/functor.jl:4
 [24] adapt_structure
    @ ~/.julia/packages/Adapt/7T9au/src/Adapt.jl:57 [inlined]
 [25] adapt
    @ ~/.julia/packages/Adapt/7T9au/src/Adapt.jl:40 [inlined]
 [26] #11
    @ ~/.julia/packages/Flux/Wz6D4/ext/FluxCUDAExt/functor.jl:56 [inlined]
 [27] ExcludeWalk
    @ ~/.julia/packages/Functors/6fQUb/src/walks.jl:144 [inlined]
 [28] (::Functors.CachedWalk{Functors.ExcludeWalk{Functors.DefaultWalk, FluxCUDAExt.var"#11#12"{Nothing}, typeof(Flux._isleaf)}, Functors.NoKeyword})(::Function, ::Vector{Float32})
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:195
 [29] recurse
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:52 [inlined]
 [30] map
    @ Base ./tuple.jl:294 [inlined]
 [31] map(f::Functors.var"#recurse#26"{Functors.CachedWalk{Functors.ExcludeWalk{Functors.DefaultWalk, FluxCUDAExt.var"#11#12"{Nothing}, typeof(Flux._isleaf)}, Functors.NoKeyword}}, t::Tuple{typeof(identity), Vector{Float32}, Vector{Float32}, Vector{Float32}, Vector{Float32}, Float32, Float32, Bool, Bool, Nothing, Int64})
    @ Base ./tuple.jl:294
 [32] map(::Function, ::@NamedTuple{λ::typeof(identity), β::Vector{Float32}, γ::Vector{Float32}, μ::Vector{Float32}, σ²::Vector{Float32}, ϵ::Float32, momentum::Float32, affine::Bool, track_stats::Bool, active::Nothing, chs::Int64})
    @ Base ./namedtuple.jl:269
 [33] _map(::Function, ::@NamedTuple{λ::typeof(identity), β::Vector{Float32}, γ::Vector{Float32}, μ::Vector{Float32}, σ²::Vector{Float32}, ϵ::Float32, momentum::Float32, affine::Bool, track_stats::Bool, active::Nothing, chs::Int64})
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:3
 [34] (::Functors.DefaultWalk)(::Function, ::BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}})
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:91
 [35] ExcludeWalk
    @ ~/.julia/packages/Functors/6fQUb/src/walks.jl:144 [inlined]
 [36] (::Functors.CachedWalk{Functors.ExcludeWalk{Functors.DefaultWalk, FluxCUDAExt.var"#11#12"{Nothing}, typeof(Flux._isleaf)}, Functors.NoKeyword})(::Function, ::BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}})
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:195
 [37] recurse
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:52 [inlined]
 [38] map
    @ Base ./tuple.jl:292 [inlined]
 [39] _map
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:3 [inlined]
 [40] DefaultWalk
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:91 [inlined]
 [41] ExcludeWalk
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:144 [inlined]
 [42] CachedWalk
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:195 [inlined]
 [43] recurse
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:52 [inlined]
 [44] map(f::Functors.var"#recurse#26"{Functors.CachedWalk{Functors.ExcludeWalk{Functors.DefaultWalk, FluxCUDAExt.var"#11#12"{Nothing}, typeof(Flux._isleaf)}, Functors.NoKeyword}}, t::Tuple{Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree}})
    @ Base ./tuple.jl:291
 [45] map
    @ Base ./namedtuple.jl:269 [inlined]
 [46] _map
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:3 [inlined]
 [47] DefaultWalk
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:91 [inlined]
 [48] ExcludeWalk
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:144 [inlined]
 [49] CachedWalk
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:195 [inlined]
 [50] recurse
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:52 [inlined]
 [51] map(f::Functors.var"#recurse#26"{Functors.CachedWalk{Functors.ExcludeWalk{Functors.DefaultWalk, FluxCUDAExt.var"#11#12"{Nothing}, typeof(Flux._isleaf)}, Functors.NoKeyword}}, t::Tuple{DataType, Chain{Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree}}, Dict{Symbol, Any}})
    @ Base ./tuple.jl:293
 [52] map(::Function, ::@NamedTuple{_loss_type::DataType, chain::Chain{Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree}}, info::Dict{Symbol, Any}})
    @ Base ./namedtuple.jl:269
 [53] _map(::Function, ::Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree})
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:3 [inlined]
 [54] (::Functors.DefaultWalk)(::Function, ::NeuroTreeModels.NeuroTreeModel{NeuroTreeModels.GaussianMLE, Chain{Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree}}})
    @ Functors ~/.julia/packages/Functors/6fQUb/src/walks.jl:91
 [55] ExcludeWalk
    @ ~/.julia/packages/Functors/6fQUb/src/walks.jl:144 [inlined]
 [56] CachedWalk
    @ ~/.julia/packages/Functors/6fQUb/src/walks.jl:195 [inlined]
 [57] execute
    @ ~/.julia/packages/Functors/6fQUb/src/walks.jl:53 [inlined]
 [58] #fmap#40
    @ ~/.julia/packages/Functors/6fQUb/src/maps.jl:11 [inlined]
 [59] fmap
    @ ~/.julia/packages/Functors/6fQUb/src/maps.jl:3 [inlined]
 [60] _cuda
    @ ~/.julia/packages/Flux/Wz6D4/ext/FluxCUDAExt/functor.jl:56 [inlined]
 [61] gpu(to::Flux.FluxCUDAAdaptor, x::NeuroTreeModels.NeuroTreeModel{NeuroTreeModels.GaussianMLE, Chain{Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree}}})
    @ Flux ~/.julia/packages/Flux/Wz6D4/src/functor.jl:347
 [62] gpu
    @ Flux ~/.julia/packages/Flux/Wz6D4/src/functor.jl:250 [inlined]
 [63] |>(x::NeuroTreeModels.NeuroTreeModel{NeuroTreeModels.GaussianMLE, Chain{Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree}}}, f::typeof(gpu))
    @ Base ./operators.jl:917
 [64] init(config::NeuroTreeModels.NeuroTreeRegressor, df::DataFrame; feature_names::Vector{Symbol}, target_name::String, weight_name::Nothing, offset_name::Nothing)
    @ NeuroTreeModels ~/.julia/packages/NeuroTreeModels/D6Ycs/src/fit.jl:32
 [65] init
    @ ~/.julia/packages/NeuroTreeModels/D6Ycs/src/fit.jl:1 [inlined]
 [66] fit(config::NeuroTreeModels.NeuroTreeRegressor, dtrain::DataFrame; feature_names::Vector{String}, target_name::String, weight_name::Nothing, offset_name::Nothing, deval::DataFrame, metric::String, print_every_n::Int64, early_stopping_rounds::Int64, verbosity::Int64, return_logger::Bool)
    @ NeuroTreeModels ~/.julia/packages/NeuroTreeModels/D6Ycs/src/fit.jl:115
 [67] fit!(m::Modeler.NeuroTreeModel, dtrain::DataFrame; deval::DataFrame)
    @ Modeler ~/Modeler.jl/src/model-struct.jl:165
 [68] top-level scope
    @ REPL[34]:1
 [69] eval
    @ ./boot.jl:385 [inlined]
 [70] eval
    @ ./Base.jl:88 [inlined]
 [71] repleval(m::Module, code::Expr, ::String)
    @ VSCodeServer ~/.vscode-server/extensions/julialang.language-julia-1.76.2/scripts/packages/VSCodeServer/src/repl.jl:229
 [72] (::VSCodeServer.var"#112#114"{Module, Expr, REPL.LineEditREPL, REPL.LineEdit.Prompt})()
    @ VSCodeServer ~/.vscode-server/extensions/julialang.language-julia-1.76.2/scripts/packages/VSCodeServer/src/repl.jl:192
 [73] with_logstate(f::Function, logstate::Any)
    @ Base.CoreLogging ./logging.jl:515
 [74] with_logger
    @ ./logging.jl:627 [inlined]
 [75] (::VSCodeServer.var"#111#113"{Module, Expr, REPL.LineEditREPL, REPL.LineEdit.Prompt})()
    @ VSCodeServer ~/.vscode-server/extensions/julialang.language-julia-1.76.2/scripts/packages/VSCodeServer/src/repl.jl:193
 [76] #invokelatest#2
    @ Base ./essentials.jl:887 [inlined]
 [77] invokelatest(::Any)
    @ Base ./essentials.jl:884
@jeremiedb
Member

There's no need to perform any device conversion: the device is specified as part of the regressor, and the rest is handled under the hood by the fit function. See the NeuroTreeRegressor docs for how device is used: https://evovest.github.io/NeuroTreeModels.jl/dev/models#NeuroTreeModels.NeuroTreeRegressor

I'd expect though to eventually have device move to a kwarg of fit rather than be part of the regressor options. TBD!
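
In other words, something like this is all that's needed (a sketch; exact option names per the NeuroTreeRegressor docs above):

config = NeuroTreeRegressor(device=:gpu)  # no manual `|> gpu` call anywhere
m = NeuroTreeModels.fit(config, dtrain; feature_names=["x1", "x2"], target_name="y")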

@dongqiu93
Author

It is indeed handled under the hood by the fit function; however, that's exactly where the error comes from. Note frames [62]-[66] of the stacktrace above:

 [62] gpu
    @ Flux ~/.julia/packages/Flux/Wz6D4/src/functor.jl:250 [inlined]
 [63] |>(x::NeuroTreeModels.NeuroTreeModel{NeuroTreeModels.GaussianMLE, Chain{Tuple{BatchNorm{typeof(identity), Vector{Float32}, Float32, Vector{Float32}}, NeuroTreeModels.StackTree}}}, f::typeof(gpu))
    @ Base ./operators.jl:917
 [64] init(config::NeuroTreeModels.NeuroTreeRegressor, df::DataFrame; feature_names::Vector{Symbol}, target_name::String, weight_name::Nothing, offset_name::Nothing)
    @ NeuroTreeModels ~/.julia/packages/NeuroTreeModels/D6Ycs/src/fit.jl:32
 [65] init
    @ ~/.julia/packages/NeuroTreeModels/D6Ycs/src/fit.jl:1 [inlined]
 [66] fit(config::NeuroTreeModels.NeuroTreeRegressor, dtrain::DataFrame; feature_names::Vector{String}, target_name::String, weight_name::Nothing, offset_name::Nothing, deval::DataFrame, metric::String, print_every_n::Int64, early_stopping_rounds::Int64, verbosity::Int64, return_logger::Bool)
    @ NeuroTreeModels ~/.julia/packages/NeuroTreeModels/D6Ycs/src/fit.jl:115

@jeremiedb
Member

The issue appears specific to CUDA v5.3.0. FWIW, it's been fixed on master by JuliaGPU/CUDA.jl#2327.
The test runs fine with both CUDA v5.2.0 and CUDA#master, so closing.
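
For anyone hitting this in the meantime, pinning CUDA below v5.3.0 works as a workaround (sketch):

using Pkg
Pkg.add(name="CUDA", version="5.2.0")  # install the last known-good release
Pkg.pin("CUDA")                        # keep the resolver from upgrading it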
