This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Commit 65a35b1

Support for Julia's multitasking.

maleadt committed Mar 25, 2020
1 parent 138ece7
Showing 16 changed files with 171 additions and 157 deletions.
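In brief: each CUDA library submodule (CUBLAS, CUDNN, CURAND) replaces its lock-guarded, per-(context, thread) handle dictionaries with Julia's task-local storage, keeps a per-thread slot as a fast-path cache, and registers the new CUDAnative.attaskswitch() hook alongside atcontextswitch() to invalidate that cache. The error-handling shims switch from CUDAnative.initialize_context() to CUDAnative.prepare_cuda_call(), and the CUDAnative dependency is bumped to 3.0. A condensed sketch of the caching scheme follows the src/blas/CUBLAS.jl diff below.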
43 changes: 38 additions & 5 deletions Manifest.toml
@@ -39,12 +39,28 @@ uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
 version = "6.2.0"
 
 [[CUDAnative]]
-deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
-git-tree-sha1 = "e6742ce88d11f1fdf6a9357ba738735f86ce67b5"
-repo-rev = "58c6755445c05ff26f1bdc5c12c7ae0aa6c39bc2"
-repo-url = "https://github.com/JuliaGPU/CUDAnative.jl.git"
+deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
+git-tree-sha1 = "1ee71ece4332185ad49b93f7b6cf9d51017e40ef"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-version = "2.10.2"
+version = "3.0.0"
+
+[[CodeTracking]]
+deps = ["InteractiveUtils", "UUIDs"]
+git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3"
+uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
+version = "0.5.8"
+
+[[Compat]]
+deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
+git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb"
+uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
+version = "2.2.0"
+
+[[Cthulhu]]
+deps = ["CodeTracking", "InteractiveUtils", "TerminalMenus", "Unicode"]
+git-tree-sha1 = "5e0f928ccaab1fa2911fc4e204e8a6f5b0213eaf"
+uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+version = "1.0.0"
 
 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections"]
@@ -56,6 +72,10 @@ version = "0.17.10"
 deps = ["Printf"]
 uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
 
+[[DelimitedFiles]]
+deps = ["Mmap"]
+uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+
 [[Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
@@ -100,6 +120,9 @@ version = "0.5.4"
 deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 
+[[Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
 [[NNlib]]
 deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
 git-tree-sha1 = "d9f196d911f55aeaff11b11f681b135980783824"
@@ -146,6 +169,10 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
+[[SharedArrays]]
+deps = ["Distributed", "Mmap", "Random", "Serialization"]
+uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
+
 [[Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
@@ -157,6 +184,12 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
+[[TerminalMenus]]
+deps = ["Compat", "REPL", "Test"]
+git-tree-sha1 = "9ae6ed0c94eee4d898e049820942af21daf15efc"
+uuid = "dc548174-15c3-5faf-af27-7997cfbde655"
+version = "0.1.0"
+
 [[Test]]
 deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
 uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
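All of the new Manifest entries (CodeTracking, Compat, Cthulhu, TerminalMenus, plus the stdlibs DelimitedFiles, Mmap, and SharedArrays) enter the dependency closure through CUDAnative 3.0, which now lists Cthulhu among its deps. A sketch of the Pkg workflow that produces such a Manifest update (assumed; these commands are not part of the commit):

    using Pkg
    Pkg.free("CUDAnative")   # drop the repo-rev/repo-url pin on the development branch
    Pkg.add(Pkg.PackageSpec(name="CUDAnative", version="3.0.0"))   # registered release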
2 changes: 1 addition & 1 deletion Project.toml
@@ -30,7 +30,7 @@ Adapt = "1.0"
 CEnum = "0.2"
 CUDAapi = "3.0, 4.0"
 CUDAdrv = "6.0.1"
-CUDAnative = "2.10"
+CUDAnative = "3.0"
 DataStructures = "0.17"
 GPUArrays = "3.1"
 MacroTools = "0.5"
80 changes: 38 additions & 42 deletions src/blas/CUBLAS.jl
@@ -25,68 +25,64 @@ include("wrappers.jl")
 # high-level integrations
 include("linalg.jl")
 
-const handles_lock = ReentrantLock()
-const created_handles = Dict{Tuple{UInt,Int},cublasHandle_t}()
-const created_xt_handles = Dict{Tuple{UInt,Int},cublasXtHandle_t}()
-const active_handles = Vector{Union{Nothing,cublasHandle_t}}()
-const active_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}()
+# thread cache for task-local library handles
+const thread_handles = Vector{Union{Nothing,cublasHandle_t}}()
+const thread_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}()
 
 function handle()
     tid = Threads.threadid()
-    if @inbounds active_handles[tid] === nothing
+    if @inbounds thread_handles[tid] === nothing
         ctx = context()
-        key = (objectid(ctx), tid)
-        lock(handles_lock) do
-            active_handles[tid] = get!(created_handles, key) do
-                handle = cublasCreate_v2()
-                atexit(()->CUDAdrv.isvalid(ctx) && cublasDestroy_v2(handle))
-
-                # enable tensor math mode if our device supports it, and fast math is enabled
-                dev = CUDAdrv.device()
-                if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0" && version() >= v"9"
-                    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)
-                end
-
-                handle
-            end
-        end
+        thread_handles[tid] = get!(task_local_storage(), (:CUBLAS, ctx)) do
+            handle = cublasCreate_v2()
+            atexit(()->CUDAdrv.isvalid(ctx) && cublasDestroy_v2(handle))
+
+            # enable tensor math mode if our device supports it, and fast math is enabled
+            dev = CUDAdrv.device()
+            if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0" && version() >= v"9"
+                cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)
+            end
+
+            handle
+        end
     end
-    @inbounds active_handles[tid]
+    @inbounds thread_handles[tid]
 end
 
 function xt_handle()
     tid = Threads.threadid()
-    if @inbounds active_xt_handles[tid] === nothing
+    if @inbounds thread_xt_handles[tid] === nothing
         ctx = context()
-        key = (objectid(ctx), tid)
-        lock(handles_lock) do
-            active_xt_handles[tid] = get!(created_xt_handles, key) do
-                handle = cublasXtCreate()
-                atexit(()->CUDAdrv.isvalid(ctx) && cublasXtDestroy(handle))
-
-                # select the devices
-                # TODO: this is weird, since we typically use a single device per thread/context
-                devs = convert.(Cint, CUDAdrv.devices())
-                cublasXtDeviceSelect(handle, length(devs), devs)
-
-                handle
-            end
-        end
+        thread_xt_handles[tid] = get!(task_local_storage(), (:CUBLASxt, ctx)) do
+            handle = cublasXtCreate()
+            atexit(()->CUDAdrv.isvalid(ctx) && cublasXtDestroy(handle))
+
+            # select the devices
+            # TODO: this is weird, since we typically use a single device per thread/context
+            devs = convert.(Cint, CUDAdrv.devices())
+            cublasXtDeviceSelect(handle, length(devs), devs)
+
+            handle
+        end
     end
-    @inbounds active_xt_handles[tid]
+    @inbounds thread_xt_handles[tid]
 end
 
 function __init__()
-    resize!(active_handles, Threads.nthreads())
-    fill!(active_handles, nothing)
+    resize!(thread_handles, Threads.nthreads())
+    fill!(thread_handles, nothing)
 
-    resize!(active_xt_handles, Threads.nthreads())
-    fill!(active_xt_handles, nothing)
+    resize!(thread_xt_handles, Threads.nthreads())
+    fill!(thread_xt_handles, nothing)
 
     CUDAnative.atcontextswitch() do tid, ctx
-        # we don't eagerly initialize handles, but do so lazily when requested
-        active_handles[tid] = nothing
-        active_xt_handles[tid] = nothing
+        thread_handles[tid] = nothing
+        thread_xt_handles[tid] = nothing
+    end
+
+    CUDAnative.attaskswitch() do tid, task
+        thread_handles[tid] = nothing
+        thread_xt_handles[tid] = nothing
     end
 end
 
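The same rewrite recurs in CUDNN and CURAND below, so here is a condensed sketch of the scheme (the names slot, cached_handle, create, and invalidate! are illustrative, not from the commit): each OS thread keeps a fast-path slot that merely caches a per-task, per-context lookup, and the __init__ hooks reset that slot on every context or task switch so the next call re-resolves against the current task's storage.

    # Illustrative reduction of the pattern above; the real code stores
    # cublasHandle_t etc. and lives in each library submodule.
    const slot = Vector{Union{Nothing,Ptr{Cvoid}}}()   # one cache entry per OS thread
    # __init__ would do: resize!(slot, Threads.nthreads()); fill!(slot, nothing)

    function cached_handle(create, tag::Symbol, ctx)
        tid = Threads.threadid()
        if @inbounds slot[tid] === nothing
            # task_local_storage() belongs to the current task, so every task
            # gets its own handle for a given (library, context) pair
            slot[tid] = get!(create, task_local_storage(), (tag, ctx))
        end
        @inbounds slot[tid]
    end

    function invalidate!()   # what the atcontextswitch/attaskswitch hooks do
        slot[Threads.threadid()] = nothing
    end

Note that handles_lock and the global created_handles dictionaries disappear entirely: task-local storage is task-private by construction, and each per-thread slot is written only by its own thread, so no synchronization is needed.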
3 changes: 1 addition & 2 deletions src/blas/error.jl
@@ -48,8 +48,7 @@ end
 end
 
 function initialize_api()
-    # make sure the calling thread has an active context
-    CUDAnative.initialize_context()
+    CUDAnative.prepare_cuda_call()
 end
 
 macro check(ex)
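CUDAnative.prepare_cuda_call() is the 3.0 replacement for initialize_context(): besides ensuring the calling thread has an active context, it presumably also performs the per-task bookkeeping behind the new attaskswitch hook, which would explain the dropped comment. The same one-line substitution repeats in src/dnn/error.jl, src/fft/error.jl, and src/rand/error.jl below.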
31 changes: 15 additions & 16 deletions src/dnn/CUDNN.jl
@@ -39,33 +39,32 @@ include("nnlib.jl")
 
 include("compat.jl")
 
-const handles_lock = ReentrantLock()
-const created_handles = Dict{Tuple{UInt,Int},cudnnHandle_t}()
-const active_handles = Vector{Union{Nothing,cudnnHandle_t}}()
+# thread cache for task-local library handles
+const thread_handles = Vector{Union{Nothing,cudnnHandle_t}}()
 
 function handle()
     tid = Threads.threadid()
-    if @inbounds active_handles[tid] === nothing
+    if @inbounds thread_handles[tid] === nothing
         ctx = context()
-        key = (objectid(ctx), tid)
-        lock(handles_lock) do
-            active_handles[tid] = get!(created_handles, key) do
-                handle = cudnnCreate()
-                atexit(()->CUDAdrv.isvalid(ctx) && cudnnDestroy(handle))
-                handle
-            end
-        end
+        thread_handles[tid] = get!(task_local_storage(), (:CUDNN, ctx)) do
+            handle = cudnnCreate()
+            atexit(()->CUDAdrv.isvalid(ctx) && cudnnDestroy(handle))
+            handle
+        end
     end
-    @inbounds active_handles[tid]
+    @inbounds thread_handles[tid]
 end
 
 function __init__()
-    resize!(active_handles, Threads.nthreads())
-    fill!(active_handles, nothing)
+    resize!(thread_handles, Threads.nthreads())
+    fill!(thread_handles, nothing)
 
     CUDAnative.atcontextswitch() do tid, ctx
-        # we don't eagerly initialize handles, but do so lazily when requested
-        active_handles[tid] = nothing
+        thread_handles[tid] = nothing
+    end
+
+    CUDAnative.attaskswitch() do tid, task
+        thread_handles[tid] = nothing
     end
 end
 
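Why task-local handles matter: two tasks multiplexed onto one OS thread must not share a cuDNN handle whose configuration (bound stream, workspace) either of them mutates. A usage sketch under that assumption, calling the handle() accessor defined above as CuArrays.CUDNN.handle():

    using CuArrays

    @sync begin
        @async begin
            h1 = CuArrays.CUDNN.handle()   # created in this task's local storage
            # ... cuDNN calls for task 1 ...
        end
        @async begin
            h2 = CuArrays.CUDNN.handle()   # a distinct handle, even on the same thread
            # ... cuDNN calls for task 2 ...
        end
    end
    # The two tasks get distinct handles because task_local_storage() is per-task,
    # and the attaskswitch hook clears the per-thread cache whenever tasks swap.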
3 changes: 1 addition & 2 deletions src/dnn/error.jl
@@ -20,8 +20,7 @@ name(err::CUDNNError) = unsafe_string(cudnnGetErrorString(err))
 end
 
 function initialize_api()
-    # make sure the calling thread has an active context
-    CUDAnative.initialize_context()
+    CUDAnative.prepare_cuda_call()
 end
 
 macro check(ex)
3 changes: 1 addition & 2 deletions src/fft/error.jl
@@ -62,8 +62,7 @@ end
 end
 
 function initialize_api()
-    # make sure the calling thread has an active context
-    CUDAnative.initialize_context()
+    CUDAnative.prepare_cuda_call()
 end
 
 macro check(ex)
2 changes: 1 addition & 1 deletion src/memory.jl
@@ -299,7 +299,7 @@ synchronized right before and after executing `ex` to exclude any external effects.
 macro time(ex)
     quote
         # @time might surround an application, so be sure to initialize CUDA before that
-        CUDAnative.initialize_context()
+        CUDAnative.prepare_cuda_call()
 
         # coarse synchronization to exclude effects from previously-executed code
         CUDAdrv.synchronize()
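For context, CuArrays.@time wraps an expression with device synchronization on both sides (per the docstring fragment in the hunk header); initializing CUDA up front keeps one-time setup cost out of the measurement, whichever task triggers it. Typical usage:

    using CuArrays

    a = CuArrays.rand(1024, 1024)
    b = CuArrays.rand(1024, 1024)
    CuArrays.@time a * b   # synchronized before and after, excluding setup cost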
31 changes: 15 additions & 16 deletions src/rand/CURAND.jl
@@ -23,33 +23,32 @@ include("wrappers.jl")
 
 # high-level integrations
 include("random.jl")
 
-const handles_lock = ReentrantLock()
-const created_generators = Dict{Tuple{UInt,Int},RNG}()
-const active_generators = Vector{Union{Nothing,RNG}}()
+# thread cache for task-local library handles
+const thread_generators = Vector{Union{Nothing,RNG}}()
 
 function generator()
     tid = Threads.threadid()
-    if @inbounds active_generators[tid] === nothing
+    if @inbounds thread_generators[tid] === nothing
         ctx = context()
-        key = (objectid(ctx), tid)
-        lock(handles_lock) do
-            active_generators[tid] = get!(created_generators, key) do
-                rng = RNG()
-                Random.seed!(rng)
-                rng
-            end
-        end
+        thread_generators[tid] = get!(task_local_storage(), (:CURAND, ctx)) do
+            rng = RNG()
+            Random.seed!(rng)
+            rng
+        end
     end
-    @inbounds active_generators[tid]
+    @inbounds thread_generators[tid]
 end
 
 function __init__()
-    resize!(active_generators, Threads.nthreads())
-    fill!(active_generators, nothing)
+    resize!(thread_generators, Threads.nthreads())
+    fill!(thread_generators, nothing)
 
     CUDAnative.atcontextswitch() do tid, ctx
-        # we don't eagerly initialize handles, but do so lazily when requested
-        active_generators[tid] = nothing
+        thread_generators[tid] = nothing
+    end
+
+    CUDAnative.attaskswitch() do tid, task
+        thread_generators[tid] = nothing
    end
 end
 
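With generators in task-local storage, each task lazily creates and seeds its own CURAND RNG on first use, so random streams from concurrent tasks no longer interleave on a shared generator. To reseed the current task's generator explicitly (a sketch using only calls visible in this diff):

    using CuArrays, Random

    t = @async begin
        Random.seed!(CuArrays.CURAND.generator())   # reseeds this task's RNG only
        CuArrays.rand(Float32, 4)                   # draws from the task's generator
    end
    fetch(t)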
3 changes: 1 addition & 2 deletions src/rand/error.jl
@@ -54,8 +54,7 @@ end
 end
 
 function initialize_api()
-    # make sure the calling thread has an active context
-    CUDAnative.initialize_context()
+    CUDAnative.prepare_cuda_call()
 end
 
 macro check(ex)
[Diff truncated: 6 more changed files not shown.]