diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 471a8954..d74f42e0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -11,6 +11,6 @@ steps: cuda: "*" if: build.message !~ /\[skip tests\]/ env: - JULIA_OPENCL_BACKEND: "CUDA" + JULIA_OPENCL_BACKEND: "NVIDIA" OCL_ICD_FILENAMES: "libnvidia-opencl.so.1" timeout_in_minutes: 60 diff --git a/codecov.yml b/codecov.yml index 7687fd58..8f3b3ed4 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,6 +1,6 @@ coverage: ignore: - - "lib" + - "lib/lib*.jl" - "src/kernels" status: patch: false diff --git a/lib/CL.jl b/lib/CL.jl new file mode 100644 index 00000000..50cff22d --- /dev/null +++ b/lib/CL.jl @@ -0,0 +1,31 @@ +module cl + +export CLObject, CLString + +abstract type CLObject end + +Base.hash(x::CLObject) = hash(pointer(x)) +Base.isequal(x::T, y::T) where {T <: CLObject} = Base.hash(x) == Base.hash(y) +Base.:(==)(x::T, y::T) where {T <: CLObject} = Base.hash(x) == Base.hash(y) + +# The arrays contain a nullbyte that we pop first +function CLString(v::Array{Cchar}) + pop!(v) + String(reinterpret(UInt8, v)) +end + +include("api.jl") + +# API wrappers +include("error.jl") +include("platform.jl") +include("device.jl") +include("context.jl") +include("queue.jl") +include("event.jl") +include("memory.jl") +include("buffer.jl") +include("program.jl") +include("kernel.jl") + +end diff --git a/src/api.jl b/lib/api.jl similarity index 89% rename from src/api.jl rename to lib/api.jl index f045422c..9d047982 100644 --- a/src/api.jl +++ b/lib/api.jl @@ -90,11 +90,11 @@ const initialized = Ref{Bool}(false) end end -function parse_version(version_string) - mg = match(r"^OpenCL ([0-9]+)\.([0-9]+) .*$", version_string) - if mg === nothing - error("Non conforming version string: $(ver)") +function __init__() + if !OpenCL_jll.is_available() + @error "OpenCL_jll is not available for your platform, OpenCL.jl. will not work." end - return VersionNumber(parse(Int, mg.captures[1]), - parse(Int, mg.captures[2])) end + +const _versionDict = Dict{Ptr, VersionNumber}() +_deletecached!(obj::CLObject) = delete!(_versionDict, pointer(obj)) diff --git a/src/buffer.jl b/lib/buffer.jl similarity index 90% rename from src/buffer.jl rename to lib/buffer.jl index 810aa98d..1f506571 100644 --- a/src/buffer.jl +++ b/lib/buffer.jl @@ -316,43 +316,40 @@ function enqueue_map_mem(q::CmdQueue, return (mapped_arr, Event(ret_evt[])) end -@ocl_v1_2_only begin - - # low level enqueue fill operation, return event - function enqueue_fill_buffer(q::CmdQueue, buf::Buffer{T}, - pattern::T, offset::Csize_t, - nbytes::Csize_t, - wait_for::Union{Vector{Event},Nothing}) where T - if wait_for === nothing - evt_ids = C_NULL - n_evts = cl_uint(0) - else - evt_ids = [evt.id for evt in wait_for] - n_evts = cl_uint(length(evt_ids)) - end - ret_evt = Ref{cl_event}() - nbytes_pattern = sizeof(pattern) - @assert nbytes_pattern > 0 - clEnqueueFillBuffer(q.id, buf.id, [pattern], - unsigned(nbytes_pattern), offset, nbytes, - n_evts, evt_ids, ret_evt) - @return_event ret_evt[] +# low level enqueue fill operation, return event +function enqueue_fill_buffer(q::CmdQueue, buf::Buffer{T}, + pattern::T, offset::Csize_t, + nbytes::Csize_t, + wait_for::Union{Vector{Event},Nothing}) where T + if wait_for === nothing + evt_ids = C_NULL + n_evts = cl_uint(0) + else + evt_ids = [evt.id for evt in wait_for] + n_evts = cl_uint(length(evt_ids)) end + ret_evt = Ref{cl_event}() + nbytes_pattern = sizeof(pattern) + @assert nbytes_pattern > 0 + clEnqueueFillBuffer(q.id, buf.id, [pattern], + unsigned(nbytes_pattern), offset, nbytes, + n_evts, evt_ids, ret_evt) + @return_event ret_evt[] +end - # enqueue a fill operation, return an event - function enqueue_fill(q::CmdQueue, buf::Buffer{T}, x::T) where T - nbytes = sizeof(buf) - evt = enqueue_fill_buffer(q, buf, x, unsigned(0), - unsigned(nbytes), nothing) - return evt - end +# enqueue a fill operation, return an event +function enqueue_fill(q::CmdQueue, buf::Buffer{T}, x::T) where T + nbytes = sizeof(buf) + evt = enqueue_fill_buffer(q, buf, x, unsigned(0), + unsigned(nbytes), nothing) + return evt +end - # (blocking) fill the contents of a buffer with with a given value - function fill!(q::CmdQueue, buf::Buffer{T}, x::T) where T - evt = enqueue_fill(q, buf, x) - wait(evt) - return evt - end +# (blocking) fill the contents of a buffer with with a given value +function fill!(q::CmdQueue, buf::Buffer{T}, x::T) where T + evt = enqueue_fill(q, buf, x) + wait(evt) + return evt end # copy the contents of a buffer into an array diff --git a/src/context.jl b/lib/context.jl similarity index 100% rename from src/context.jl rename to lib/context.jl diff --git a/src/device.jl b/lib/device.jl similarity index 100% rename from src/device.jl rename to lib/device.jl diff --git a/src/error.jl b/lib/error.jl similarity index 100% rename from src/error.jl rename to lib/error.jl diff --git a/src/event.jl b/lib/event.jl similarity index 77% rename from src/event.jl rename to lib/event.jl index e1c175fc..1052186f 100644 --- a/src/event.jl +++ b/lib/event.jl @@ -43,6 +43,30 @@ end NannyEvent(evt::Event, obj; retain=false) = NannyEvent(evt.id, obj, retain=retain) +macro return_event(evt) + quote + evt = $(esc(evt)) + try + return Event(evt, retain=false) + catch err + clReleaseEvent(evt) + throw(err) + end + end +end + +macro return_nanny_event(evt, obj) + quote + evt = $(esc(evt)) + try + return NannyEvent(evt, $(esc(obj))) + catch err + clReleaseEvent(evt) + throw(err) + end + end +end + Base.pointer(evt::CLEvent) = evt.id function Base.show(io::IO, evt::Event) @@ -53,45 +77,42 @@ end Base.getindex(evt::CLEvent, evt_info::Symbol) = info(evt, evt_info) -@ocl_v1_1_only begin - - mutable struct UserEvent <: CLEvent - id::cl_event +mutable struct UserEvent <: CLEvent + id::cl_event - function UserEvent(evt_id::cl_event, retain=false) - if retain - clRetainEvent(evt_id) - end - evt = new(evt_id) - finalizer(_finalize, evt) - return evt + function UserEvent(evt_id::cl_event, retain=false) + if retain + clRetainEvent(evt_id) end + evt = new(evt_id) + finalizer(_finalize, evt) + return evt end +end - function UserEvent(ctx::Context; retain=false) - status = Ref{Cint}() - evt_id = clCreateUserEvent(ctx.id, status) - if status[] != CL_SUCCESS - throw(CLError(status[])) - end - try - return UserEvent(evt_id, retain) - catch err - clReleaseEvent(evt_id) - throw(err) - end +function UserEvent(ctx::Context; retain=false) + status = Ref{Cint}() + evt_id = clCreateUserEvent(ctx.id, status) + if status[] != CL_SUCCESS + throw(CLError(status[])) end - - function Base.show(io::IO, evt::UserEvent) - ptr_val = convert(UInt, Base.pointer(evt)) - ptr_address = "0x$(string(ptr_val, base = 16, pad = Sys.WORD_SIZE>>2))" - print(io, "OpenCL.UserEvent(@$ptr_address)") + try + return UserEvent(evt_id, retain) + catch err + clReleaseEvent(evt_id) + throw(err) end +end - function complete(evt::UserEvent) - clSetUserEventStatus(evt.id, CL_COMPLETE) - return evt - end +function Base.show(io::IO, evt::UserEvent) + ptr_val = convert(UInt, Base.pointer(evt)) + ptr_address = "0x$(string(ptr_val, base = 16, pad = Sys.WORD_SIZE>>2))" + print(io, "OpenCL.UserEvent(@$ptr_address)") +end + +function complete(evt::UserEvent) + clSetUserEventStatus(evt.id, CL_COMPLETE) + return evt end struct _EventCB @@ -156,28 +177,26 @@ function wait(evts::Vector{CLEvent}) return evts end -@ocl_v1_2_only begin - function enqueue_marker_with_wait_list(q::CmdQueue, - wait_for::Vector{CLEvent}) - n_wait_events = cl_uint(length(wait_for)) - wait_evt_ids = [evt.id for evt in wait_for] - ret_evt = Ref{cl_event}() - clEnqueueMarkerWithWaitList(q.id, n_wait_events, - isempty(wait_evt_ids) ? C_NULL : wait_evt_ids, - ret_evt) - @return_event ret_evt[] - end +function enqueue_marker_with_wait_list(q::CmdQueue, + wait_for::Vector{CLEvent}) + n_wait_events = cl_uint(length(wait_for)) + wait_evt_ids = [evt.id for evt in wait_for] + ret_evt = Ref{cl_event}() + clEnqueueMarkerWithWaitList(q.id, n_wait_events, + isempty(wait_evt_ids) ? C_NULL : wait_evt_ids, + ret_evt) + @return_event ret_evt[] +end - function enqueue_barrier_with_wait_list(q::CmdQueue, - wait_for::Vector{CLEvent}) - n_wait_events = cl_uint(length(wait_for)) - wait_evt_ids = [evt.id for evt in wait_for] - ret_evt = Ref{cl_event}() - clEnqueueBarrierWithWaitList(q.id, n_wait_events, - isempty(wait_evt_ids) ? C_NULL : wait_evt_ids, - ret_evt) - @return_event ret_evt[] - end +function enqueue_barrier_with_wait_list(q::CmdQueue, + wait_for::Vector{CLEvent}) + n_wait_events = cl_uint(length(wait_for)) + wait_evt_ids = [evt.id for evt in wait_for] + ret_evt = Ref{cl_event}() + clEnqueueBarrierWithWaitList(q.id, n_wait_events, + isempty(wait_evt_ids) ? C_NULL : wait_evt_ids, + ret_evt) + @return_event ret_evt[] end function enqueue_marker(q::CmdQueue) diff --git a/src/kernel.jl b/lib/kernel.jl similarity index 100% rename from src/kernel.jl rename to lib/kernel.jl diff --git a/src/memory.jl b/lib/memory.jl similarity index 100% rename from src/memory.jl rename to lib/memory.jl diff --git a/src/platform.jl b/lib/platform.jl similarity index 83% rename from src/platform.jl rename to lib/platform.jl index 435dfb5a..d3ba9137 100644 --- a/src/platform.jl +++ b/lib/platform.jl @@ -61,22 +61,16 @@ function info(p::Platform, pinfo) end function devices(p::Platform, dtype) - try - ndevices = Ref{Cuint}() - clGetDeviceIDs(p.id, dtype, 0, C_NULL, ndevices) - if ndevices[] == 0 - return Device[] - end - result = Vector{cl_device_id}(undef, ndevices[]) - clGetDeviceIDs(p.id, dtype, ndevices[], result, C_NULL) - return Device[Device(id) for id in result] - catch err - if err.desc == :CL_DEVICE_NOT_FOUND || err.code == -1 - return Device[] - else - throw(err) - end + ndevices = Ref{Cuint}() + ret = unchecked_clGetDeviceIDs(p.id, dtype, 0, C_NULL, ndevices) + if ret == CL_DEVICE_NOT_FOUND || ndevices[] == 0 + return Device[] + elseif ret != CL_SUCCESS + throw(CLError(ret)) end + result = Vector{cl_device_id}(undef, ndevices[]) + clGetDeviceIDs(p.id, dtype, ndevices[], result, C_NULL) + return Device[Device(id) for id in result] end devices(p::Platform) = devices(p, CL_DEVICE_TYPE_ALL) diff --git a/src/program.jl b/lib/program.jl similarity index 100% rename from src/program.jl rename to lib/program.jl diff --git a/src/queue.jl b/lib/queue.jl similarity index 100% rename from src/queue.jl rename to lib/queue.jl diff --git a/src/OpenCL.jl b/src/OpenCL.jl index a6c1e838..3bde7659 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -1,69 +1,12 @@ module OpenCL +# library wrappers +include("../lib/CL.jl") +using .cl export cl -module cl -abstract type CLObject end - -Base.hash(x::CLObject) = hash(pointer(x)) -Base.isequal(x::T, y::T) where {T <: CLObject} = Base.hash(x) == Base.hash(y) -Base.:(==)(x::T, y::T) where {T <: CLObject} = Base.hash(x) == Base.hash(y) - -# The arrays contain a nullbyte that we pop first -function CLString(v::Array{Cchar}) - pop!(v) - String(reinterpret(UInt8, v)) -end - -# OpenCL low level api -include("api.jl") - -# Errors -include("error.jl") - -# Macros -include("macros.jl") - -# OpenCL Platform -include("platform.jl") - -# OpenCL Device -include("device.jl") - -# OpenCL Context -include("context.jl") - -# OpenCL Queue -include("queue.jl") - -# OpenCL Event -include("event.jl") - -# OpenCL MemObject -include("memory.jl") - -# OpenCL Buffer -include("buffer.jl") - -# OpenCL Program -include("program.jl") - -# OpenCL Kernel -include("kernel.jl") - -# Util functions +# high-level functionality include("util.jl") - -# Multidimensional array include("array.jl") -@deprecate release! finalize -end # cl - -function __init__() - if cl.libopencl == "" - @warn "Could not locate an OpenCL library\nOpenCL API calls will be unavailable" - end end - -end # module diff --git a/src/array.jl b/src/array.jl index c2e1d102..a4209e8f 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,9 +1,11 @@ import LinearAlgebra +export CLArray, CLMatrix, CLVector, to_host + mutable struct CLArray{T, N} <: CLObject - ctx::Context - queue::CmdQueue - buffer::Buffer{T} + ctx::cl.Context + queue::cl.CmdQueue + buffer::cl.Buffer{T} size::NTuple{N, Int} end @@ -12,28 +14,28 @@ const CLVector{T} = CLArray{T,1} ## constructors -function CLArray(buf::Buffer{T}, queue::CmdQueue, sz::Tuple{Vararg{Int}}) where T - ctx = context(buf) +function CLArray(buf::cl.Buffer{T}, queue::cl.CmdQueue, sz::Tuple{Vararg{Int}}) where T + ctx = cl.context(buf) CLArray(ctx, queue, buf, sz) end -function CLArray(queue::CmdQueue, +function CLArray(queue::cl.CmdQueue, flags::Tuple{Vararg{Symbol}}, hostarray::AbstractArray{T,N}) where {T, N} - ctx = context(queue) - buf = Buffer(T, ctx, length(hostarray), flags, hostbuf=hostarray) + ctx = cl.context(queue) + buf = cl.Buffer(T, ctx, length(hostarray), flags, hostbuf=hostarray) sz = size(hostarray) CLArray(ctx, queue, buf, sz) end -CLArray(queue::CmdQueue, hostarray::AbstractArray{T,N}; +CLArray(queue::cl.CmdQueue, hostarray::AbstractArray{T,N}; flags=(:rw, :copy)) where {T, N} = CLArray(queue, (:rw, :copy), hostarray) Base.copy(A::CLArray; ctx=A.ctx, queue=A.queue, buffer=A.buffer, size=A.size) = CLArray(ctx, queue, buffer, size) function Base.deepcopy(A::CLArray{T,N}) where {T, N} - new_buf = Buffer(T, A.ctx, prod(A.size)) + new_buf = cl.Buffer(T, A.ctx, prod(A.size)) copy!(A.queue, new_buf, A.buffer) return CLArray(A.ctx, A.queue, new_buf, A.size) end @@ -41,29 +43,29 @@ end """ Create in device memory array of type `t` and size `dims` filled by value `x`. """ -function Base.fill(::Type{T}, q::CmdQueue, x::T, dims...) where T - ctx = info(q, :context) +function Base.fill(::Type{T}, q::cl.CmdQueue, x::T, dims...) where T + ctx = cl.info(q, :context) v = opencl_version(ctx) if v.major == 1 && v.minor >= 2 - buf = Buffer(T, ctx, prod(dims)) + buf = cl.Buffer(T, ctx, prod(dims)) fill!(q, buf, x) else - buf = Buffer(T, ctx, prod(dims), (:rw, :copy), hostbuf=fill(x, dims)) + buf = cl.Buffer(T, ctx, prod(dims), (:rw, :copy), hostbuf=fill(x, dims)) end return CLArray(buf, q, dims) end -Base.zeros(::Type{T}, q::CmdQueue, dims...) where {T} = fill(T, q, T(0), dims...) -Base.zeros(q::CmdQueue, dims...) = fill(Float64, q, Float64(0), dims...) -Base.ones(::Type{T}, q::CmdQueue, dims...) where {T} = fill(T, q, T(1), dims...) -Base.ones(q::CmdQueue, dims...) = fill(Float64, q, Float64(1), dims...) +Base.zeros(::Type{T}, q::cl.CmdQueue, dims...) where {T} = fill(T, q, T(0), dims...) +Base.zeros(q::cl.CmdQueue, dims...) = fill(Float64, q, Float64(0), dims...) +Base.ones(::Type{T}, q::cl.CmdQueue, dims...) where {T} = fill(T, q, T(1), dims...) +Base.ones(q::cl.CmdQueue, dims...) = fill(Float64, q, Float64(1), dims...) ## core functions buffer(A::CLArray) = A.buffer Base.pointer(A::CLArray) = A.buffer.id -context(A::CLArray) = context(A.buffer) +context(A::CLArray) = cl.context(A.buffer) queue(A::CLArray) = A.queue Base.eltype(A::CLArray{T, N}) where {T, N} = T Base.size(A::CLArray) = A.size @@ -72,7 +74,7 @@ Base.ndims(A::CLArray) = length(size(A)) Base.length(A::CLArray) = prod(size(A)) Base.:(==)(A:: CLArray, B:: CLArray) = buffer(A) == buffer(B) && size(A) == size(B) -Base.reshape(A::CLArray, dims...) = begin +function Base.reshape(A::CLArray, dims...) @assert prod(dims) == prod(size(A)) return copy(A, size=dims) end @@ -92,12 +94,13 @@ end ## other array operations -const TRANSPOSE_PROGRAM_PATH = joinpath(dirname(@__FILE__), "kernels/transpose.cl") +const TRANSPOSE_FLOAT_PROGRAM_PATH = joinpath(@__DIR__, "kernels", "transpose_float.cl") +const TRANSPOSE_DOUBLE_PROGRAM_PATH = joinpath(@__DIR__, "kernels", "transpose_double.cl") -function max_block_size(queue::CmdQueue, h::Int, w::Int) - dev = info(queue, :device) - dim1, dim2 = info(dev, :max_work_item_size)[1:2] - wgsize = info(dev, :max_work_group_size) +function max_block_size(queue::cl.CmdQueue, h::Int, w::Int) + dev = cl.info(queue, :device) + dim1, dim2 = cl.info(dev, :max_work_item_size)[1:2] + wgsize = cl.info(dev, :max_work_group_size) wglimit = floor(Int, sqrt(wgsize)) return gcd(dim1, dim2, h, w, wglimit) end @@ -109,12 +112,12 @@ function LinearAlgebra.transpose!(B::CLMatrix{Float32}, A::CLMatrix{Float32}; queue=A.queue) block_size = max_block_size(queue, size(A, 1), size(A, 2)) ctx = context(A) - kernel = get_kernel(ctx, TRANSPOSE_PROGRAM_PATH, "transpose", - block_size=block_size) + kernel = get_kernel(ctx, TRANSPOSE_FLOAT_PROGRAM_PATH, "transpose", + block_size=block_size) h, w = size(A) - lmem = LocalMem(Float32, block_size * (block_size + 1)) - set_args!(kernel, buffer(B), buffer(A), UInt32(h), UInt32(w), lmem) - return enqueue_kernel(queue, kernel, (h, w), (block_size, block_size)) + lmem = cl.LocalMem(Float32, block_size * (block_size + 1)) + cl.set_args!(kernel, buffer(B), buffer(A), UInt32(h), UInt32(w), lmem) + return cl.enqueue_kernel(queue, kernel, (h, w), (block_size, block_size)) end """Transpose CLMatrix A""" @@ -129,22 +132,25 @@ end """Transpose CLMatrix A, write result to a preallicated CLMatrix B""" function LinearAlgebra.transpose!(B::CLMatrix{Float64}, A::CLMatrix{Float64}; queue=A.queue) + dev = cl.info(queue, :device) + if !in("cl_khr_fp64", cl.info(dev, :extensions)) + throw(ArgumentError("Double precision not supported by device")) + end block_size = max_block_size(queue, size(A, 1), size(A, 2)) ctx = context(A) - kernel = get_kernel(ctx, TRANSPOSE_PROGRAM_PATH, "transpose_double", + kernel = get_kernel(ctx, TRANSPOSE_DOUBLE_PROGRAM_PATH, "transpose", block_size=block_size) h, w = size(A) - # lmem = LocalMem(Float64, block_size * (block_size + 1)) - lmem = LocalMem(Float64, block_size * block_size) - set_args!(kernel, buffer(B), buffer(A), UInt32(h), UInt32(w), lmem) - return enqueue_kernel(queue, kernel, (h, w), (block_size, block_size)) + # lmem = cl.LocalMem(Float64, block_size * (block_size + 1)) + lmem = cl.LocalMem(Float64, block_size * block_size) + cl.set_args!(kernel, buffer(B), buffer(A), UInt32(h), UInt32(w), lmem) + return cl.enqueue_kernel(queue, kernel, (h, w), (block_size, block_size)) end """Transpose CLMatrix A""" -function LinearAlgebra.transpose(A::CLMatrix{Float64}; - queue=A.queue) +function LinearAlgebra.transpose(A::CLMatrix{Float64}; queue=A.queue) B = zeros(Float64, queue, reverse(size(A))...) ev = LinearAlgebra.transpose!(B, A, queue=queue) - wait(ev) + cl.wait(ev) return B end diff --git a/src/kernels/transpose_double.cl b/src/kernels/transpose_double.cl new file mode 100644 index 00000000..65491675 --- /dev/null +++ b/src/kernels/transpose_double.cl @@ -0,0 +1,26 @@ + +#define BLOCK_SIZE %(block_size) +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1))) +void transpose(__global double *a_t, + __global double *a, + unsigned a_width, + unsigned a_height, + __local double *a_local) +{ + int base_idx_a = get_group_id(0) * BLOCK_SIZE + + get_group_id(1) * (BLOCK_SIZE * a_width); + int base_idx_a_t = get_group_id(1) * BLOCK_SIZE + + get_group_id(0) * (BLOCK_SIZE * a_height); + + int glob_idx_a = base_idx_a + get_local_id(0) + a_width * get_local_id(1); + int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1); + + a_local[get_local_id(1) * BLOCK_SIZE + get_local_id(0)] = a[glob_idx_a]; + + barrier(CLK_LOCAL_MEM_FENCE); + + a_t[glob_idx_a_t] = a_local[get_local_id(0) * BLOCK_SIZE + get_local_id(1)]; +} + diff --git a/src/kernels/transpose.cl b/src/kernels/transpose_float.cl similarity index 50% rename from src/kernels/transpose.cl rename to src/kernels/transpose_float.cl index 41fbc6bb..d1088ea9 100644 --- a/src/kernels/transpose.cl +++ b/src/kernels/transpose_float.cl @@ -1,7 +1,5 @@ #define BLOCK_SIZE %(block_size) -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1))) void transpose(__global float *a_t, @@ -25,27 +23,3 @@ void transpose(__global float *a_t, a_t[glob_idx_a_t] = a_local[get_local_id(0) * BLOCK_SIZE + get_local_id(1)]; } - - -__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1))) -void transpose_double(__global double *a_t, - __global double *a, - unsigned a_width, - unsigned a_height, - __local double *a_local) -{ - int base_idx_a = get_group_id(0) * BLOCK_SIZE + - get_group_id(1) * (BLOCK_SIZE * a_width); - int base_idx_a_t = get_group_id(1) * BLOCK_SIZE + - get_group_id(0) * (BLOCK_SIZE * a_height); - - int glob_idx_a = base_idx_a + get_local_id(0) + a_width * get_local_id(1); - int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1); - - a_local[get_local_id(1) * BLOCK_SIZE + get_local_id(0)] = a[glob_idx_a]; - - barrier(CLK_LOCAL_MEM_FENCE); - - a_t[glob_idx_a_t] = a_local[get_local_id(0) * BLOCK_SIZE + get_local_id(1)]; -} - diff --git a/src/macros.jl b/src/macros.jl deleted file mode 100644 index 01979eb2..00000000 --- a/src/macros.jl +++ /dev/null @@ -1,63 +0,0 @@ -#TODO: these are just stubs for future expanded versions -macro ocl_v1_1_only(ex) - quote - $(esc(ex)) - end -end - -macro ocl_v1_2_only(ex) - quote - $(esc(ex)) - end -end - -macro return_event(evt) - quote - evt = $(esc(evt)) - try - return Event(evt, retain=false) - catch err - clReleaseEvent(evt) - throw(err) - end - end -end - -macro return_nanny_event(evt, obj) - quote - evt = $(esc(evt)) - try - return NannyEvent(evt, $(esc(obj))) - catch err - clReleaseEvent(evt) - throw(err) - end - end -end - -function _version_test(qm, elem, ex::Expr, version::VersionNumber, name) - Base.depwarn("`@$name? elem ex1 : ex2` is deprecated, use `$name(elem) ? ex1 : ex2` instead", Symbol("@", name)) - @assert qm == :? - @assert ex.head == :(:) - @assert length(ex.args) == 2 - - quote - if cl.check_version($(esc(elem)), $version) - $(esc(ex.args[1])) - else - $(esc(ex.args[2])) - end - end -end - -macro min_v11(qm, elem, ex) - _version_test(qm, elem, ex, v"1.1", :min_v11) -end - -macro min_v12(qm, elem, ex) - _version_test(qm, elem, ex, v"1.2", :min_v12) -end - -macro min_v20(qm, elem, ex) - _version_test(qm, elem, ex, v"2.0", :min_v20) -end diff --git a/src/util.jl b/src/util.jl index 32453f72..4dfe3b9f 100644 --- a/src/util.jl +++ b/src/util.jl @@ -1,24 +1,25 @@ +export create_compute_context, opencl_version + function create_compute_context() ctx = create_some_context() device = first(devices(ctx)) - queue = CmdQueue(ctx) + queue = cl.CmdQueue(ctx) return (device, ctx, queue) end -opencl_version(obj::CLObject) = parse_version(obj[:version]) -opencl_version(c::Context) = opencl_version(first(devices(c))) -opencl_version(q::CmdQueue) = opencl_version(q[:device]) - -const _versionDict = Dict{Ptr, VersionNumber}() - -_deletecached!(obj::CLObject) = delete!(_versionDict, pointer(obj)) - -function check_version(obj::CLObject, version::VersionNumber) - version <= get!(_versionDict, pointer(obj)) do - opencl_version(obj) +function parse_version(version_string) + mg = match(r"^OpenCL ([0-9]+)\.([0-9]+) .*$", version_string) + if mg === nothing + error("Non conforming version string: $(ver)") end + return VersionNumber(parse(Int, mg.captures[1]), + parse(Int, mg.captures[2])) end +opencl_version(obj::CLObject) = parse_version(obj[:version]) +opencl_version(c::cl.Context) = opencl_version(first(cl.devices(c))) +opencl_version(q::cl.CmdQueue) = opencl_version(q[:device]) + """ Format string using dict-like variables, replacing all accurancies of `%(key)` with `value`. @@ -34,31 +35,25 @@ function format(s::String; vars...) s end -function build_kernel(ctx::Context, program::String, +function build_kernel(ctx::cl.Context, program::String, kernel_name::String; vars...) src = format(program; vars...) - p = Program(ctx, source=src) - build!(p) - return Kernel(p, kernel_name) + p = cl.Program(ctx, source=src) + cl.build!(p) + return cl.Kernel(p, kernel_name) end # cache for kernels; dict of form `(program_file, kernel_name, vars) -> kernel` -const CACHED_KERNELS = Dict{Tuple{String, String, Dict}, Kernel}() +const CACHED_KERNELS = Dict{Tuple{String, String, Dict}, cl.Kernel}() -function get_kernel(ctx::Context, program_file::String, +function get_kernel(ctx::cl.Context, program_file::String, kernel_name::String; vars...) key = (program_file, kernel_name, Dict(vars)) if in(key, keys(CACHED_KERNELS)) return CACHED_KERNELS[key] else - kernel = build_kernel(ctx, Base.read(program_file, String), kernel_name; vars...) + kernel = build_kernel(ctx, read(program_file, String), kernel_name; vars...) CACHED_KERNELS[key] = kernel return kernel end end - -min_v11(elem) = check_version(elem, v"1.1") -min_v12(elem) = check_version(elem, v"1.2") -min_v20(elem) = check_version(elem, v"2.0") -min_v21(elem) = check_version(elem, v"2.1") -min_v22(elem) = check_version(elem, v"2.2") diff --git a/test/array.jl b/test/array.jl index e3b9bf30..e879fa91 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,5 +1,3 @@ -using .cl: CLArray - using LinearAlgebra @testset "CLArray" begin @@ -26,10 +24,10 @@ using LinearAlgebra ctx = cl.Context(device) queue = cl.CmdQueue(ctx) - @test cl.to_host(cl.fill(Float32, queue, Float32(0.5), + @test to_host(fill(Float32, queue, Float32(0.5), 32, 64)) == fill(Float32(0.5), 32, 64) - @test cl.to_host(cl.zeros(Float32, queue, 64)) == zeros(Float32, 64) - @test cl.to_host(cl.ones(Float32, queue, 64)) == ones(Float32, 64) + @test to_host(zeros(Float32, queue, 64)) == zeros(Float32, 64) + @test to_host(ones(Float32, queue, 64)) == ones(Float32, 64) end @testset "core functions" begin @@ -44,9 +42,9 @@ using LinearAlgebra @test reshape(B, 128, 64) == A # transpose X = CLArray(queue, rand(Float32, 32, 32)) - B = cl.zeros(Float32, queue, 64, 128) + B = zeros(Float32, queue, 64, 128) ev = transpose!(B, A) cl.wait(ev) - #@test cl.to_host(copy(A')) == cl.to_host(B) + #@test to_host(copy(A')) == to_host(B) end end diff --git a/test/event.jl b/test/event.jl index c81546e0..ae829851 100644 --- a/test/event.jl +++ b/test/event.jl @@ -1,7 +1,7 @@ -if occursin("Portable", platform[:name]) || - occursin("Intel Gen OCL", platform[:name]) - msg = "$(platform[:name]) does not implement User Events or shows other problems" - @warn(msg) +if backend in ["POCL", "Intel"] + # unsupported by POCL + # hangs on Intel + @warn "Skipping event tests" else @testset "Event" begin @testset "status" begin diff --git a/test/minver.jl b/test/minver.jl deleted file mode 100644 index d598cd8b..00000000 --- a/test/minver.jl +++ /dev/null @@ -1,33 +0,0 @@ -@testset "Minver" begin - @testset "platform" begin - version = cl.opencl_version(platform) - - v11 = cl.min_v11(platform) - v12 = cl.min_v12(platform) - v20 = cl.min_v20(platform) - v21 = cl.min_v21(platform) - v22 = cl.min_v22(platform) - - @test v11 == (version >= v"1.1") - @test v12 == (version >= v"1.2") - @test v20 == (version >= v"2.0") - @test v21 == (version >= v"2.1") - @test v22 == (version >= v"2.2") - end - - @testset "device" begin - version = cl.opencl_version(device) - - v11 = cl.min_v11(device) - v12 = cl.min_v12(device) - v20 = cl.min_v20(device) - v21 = cl.min_v21(device) - v22 = cl.min_v22(device) - - @test v11 == (version >= v"1.1") - @test v12 == (version >= v"1.2") - @test v20 == (version >= v"2.0") - @test v21 == (version >= v"2.1") - @test v22 == (version >= v"2.2") - end -end diff --git a/test/platform.jl b/test/platform.jl index 4974d58c..7323046d 100644 --- a/test/platform.jl +++ b/test/platform.jl @@ -7,7 +7,7 @@ for k in [:profile, :version, :name, :vendor, :extensions] @test platform[k] == cl.info(platform, k) end - v = cl.opencl_version(platform) + v = opencl_version(platform) @test 1 <= v.major <= 3 @test 0 <= v.minor <= 2 end diff --git a/test/program.jl b/test/program.jl index 7d7d362b..e70fd021 100644 --- a/test/program.jl +++ b/test/program.jl @@ -51,8 +51,8 @@ @test prg[:source] == test_source end - if device[:platform][:name] == "Portable Computing Language" - @warn("Skipping unsupported binary build on POCL") + if backend == "POCL" + @warn "Skipping binary program tests" else @testset "binaries" begin ctx = cl.Context(device) diff --git a/test/runtests.jl b/test/runtests.jl index 7e63c9e0..087878d5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,22 +1,32 @@ -module TestOpenCL using Test using OpenCL -using Base.GC backend = get(ENV, "JULIA_OPENCL_BACKEND", "POCL") if backend == "POCL" - # Use POCL for the tests - # XXX: support testing with other OpenCL implementations using pocl_jll platform = filter(cl.platforms()) do platform cl.info(platform, :name) == "Portable Computing Language" end |> first device = first(cl.devices(platform, :cpu)) -else - platform = first(cl.platforms()) +elseif backend in ["NVIDIA", "Intel"] + platforms = filter(cl.platforms()) do platform + contains(cl.info(platform, :name), backend) + end + platform = first(platforms) device = first(cl.devices(platform)) +else + # we're strict about the possible values for 'backend' + # so that we can more easily match in the tests. + error("""Unknown OpenCL backend: $backend. + + Supported built-in backends: POCL. + Supported system back-ends: Intel, NVIDIA.""") end -@info "Testing using $backend back-end" platform device +@info """Testing using $backend back-end + - platform: $(cl.info(platform, :name)) + - device: $(cl.info(device, :name)) + + To test with a different back-end, define JULIA_OPENCL_BACKEND.""" @testset "OpenCL.jl" begin @@ -34,8 +44,7 @@ include("platform.jl") include("context.jl") include("device.jl") include("cmdqueue.jl") -include("minver.jl") -#include("event.jl") +include("event.jl") include("program.jl") include("kernel.jl") include("behaviour.jl") @@ -44,10 +53,8 @@ include("buffer.jl") include("array.jl") @testset "context jl reference counting" begin - Base.GC.gc() + GC.gc() @test isempty(cl._ctx_reference_count) end end - -end # module