This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Merge pull request #709 from JuliaGPU/tb/mapreduce_broadcast
Support and use broadcast with mapreduce.
maleadt authored May 8, 2020
2 parents 4fb2c85 + c7fa104 commit 7b1e0f0
Showing 2 changed files with 19 additions and 12 deletions.
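
For context, a minimal sketch of what this commit enables (illustrative names and sizes; it assumes a CUDA GPU and the pinned GPUArrays revision below, which routes broadcast reductions through the new method): a reduction can now consume a lazy `Broadcasted` object directly, so a fused expression is evaluated inside the reduction kernel instead of first being materialized into a temporary array.

```julia
using CuArrays, GPUArrays
using Base.Broadcast: broadcasted, instantiate

x = CuArray(rand(Float32, 1024, 16))
y = CuArray(rand(Float32, 1024, 16))

# A lazy broadcast expression; no kernel has run and no temporary exists yet.
bc = instantiate(broadcasted(*, x, y))

# The new signature accepts the Broadcasted directly: the elementwise
# product is computed on the fly while reducing over the first dimension.
R = CuArrays.zeros(Float32, 1, 16)
GPUArrays.mapreducedim!(identity, +, R, bc)

@assert Array(R) ≈ sum(Array(x) .* Array(y); dims=1)
```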
Manifest.toml (7 changes: 5 additions & 2 deletions)

```diff
@@ -77,7 +77,9 @@ version = "0.1.1"
 
 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "c63cb01e3b6f48ab39f1e35c31ba870650814a18"
+git-tree-sha1 = "bf9f724da10a403a9e85c394d5005789147e77b7"
+repo-rev = "6e7560a"
+repo-url = "https://github.com/JuliaGPU/GPUArrays.jl.git"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 version = "3.2.0"
 
@@ -92,6 +94,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
 version = "1.3.4"
 
 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 
 [[Libdl]]
@@ -127,7 +130,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 version = "1.1.0"
 
 [[Pkg]]
-deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
+deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 
 [[Printf]]
```
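
The GPUArrays entry is now pinned to an unreleased git revision rather than a registered release. For reference, a sketch of the Pkg invocation that records such `repo-rev`/`repo-url` entries in a Manifest (hypothetical session):

```julia
using Pkg

# Track GPUArrays at a specific commit; Pkg writes the repo-url and
# repo-rev fields seen above into Manifest.toml.
Pkg.add(PackageSpec(name="GPUArrays", rev="6e7560a"))
```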
src/mapreduce.jl (24 changes: 14 additions & 10 deletions)
```diff
@@ -133,13 +133,17 @@ end
 
 ## COV_EXCL_STOP
 
-NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractArray...; init=nothing) where T
-    # TODO: Broadcast-semantics after JuliaLang-julia#31020
-    A = first(As)
-    all(B -> size(A) == size(B), As) || throw(DimensionMismatch("dimensions of containers must be identical"))
+if VERSION < v"1.5.0-DEV.748"
+    Base.axes(bc::Base.Broadcast.Broadcasted{<:CuArrayStyle, <:NTuple{N}},
+              d::Integer) where N =
+        d <= N ? axes(bc)[d] : Base.OneTo(1)
+end
+
+NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T},
+                                             A::Union{AbstractArray,Broadcast.Broadcasted};
+                                             init=nothing) where T
     Base.check_reducedims(R, A)
-    isempty(A) && return R
+    length(A) == 0 && return R # isempty(::Broadcasted) iterates
 
     f = cufunc(f)
     op = cufunc(op)
```
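
The version gate backfills `axes(bc, d)` for a `Broadcasted` on Julia builds that predate 1.5.0-DEV.748, mirroring the trailing-singleton-dimension convention arrays already follow. A plain-Julia illustration of that convention, for intuition (the shim itself only targets `CuArrayStyle` broadcasts):

```julia
A = rand(4, 3)
axes(A, 3)  # Base.OneTo(1): arrays report singleton trailing dimensions

# On Julia >= 1.5.0-DEV.748 (or with the shim, for CuArray broadcasts),
# a Broadcasted answers the same way, so reduction code can compare its
# shape against the output dimension by dimension, just like an array.
bc = Base.Broadcast.instantiate(Base.Broadcast.broadcasted(+, A, 1))
axes(bc, 3)  # Base.OneTo(1)
```

The `length(A) == 0` test replaces `isempty(A)` because, as the inline comment notes, the generic `isempty` fallback iterates its argument, which a `Broadcasted` should not be made to do on the host.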
```diff
@@ -156,8 +160,8 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
 
     # iteration domain, split in two: one part covers the dimensions that should
    # be reduced, and the other covers the rest. combining both covers all values.
-    Rall = CartesianIndices(A)
-    Rother = CartesianIndices(R)
+    Rall = CartesianIndices(axes(A))
+    Rother = CartesianIndices(axes(R))
     Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
     # NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
```
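
The three index sets factor the iteration space: in every dimension, either the output keeps the full axis (so `Rreduce` contributes a singleton there) or the axis is reduced away (so `Rother` does). A CPU-side sketch of the decomposition for reducing a 4×3 array over its first dimension:

```julia
A = rand(4, 3)
R = zeros(1, 3)  # result of reducing dimension 1

Rall    = CartesianIndices(axes(A))  # 4×3: every element of A
Rother  = CartesianIndices(axes(R))  # 1×3: dimensions that are kept
Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R),
                                   Ref(Base.OneTo(1)), axes(A)))  # 4×1

# Combining one index from each set (elementwise max works, because one of
# the two indices is 1 in every dimension) visits each element exactly once.
@assert length(Rall) == length(Rother) * length(Rreduce)
@assert sort!(vec([max(I, J) for I in Rother, J in Rreduce])) ==
        sort!(vec(collect(Rall)))
```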
```diff
@@ -187,7 +191,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     kernel_args = cudaconvert.(args)
     kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
     kernel = cufunction(partial_mapreduce_grid, kernel_tt)
```
```diff
@@ -218,7 +222,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     else
         # we need multiple steps to cover all values to reduce
         partial = similar(R, (size(R)..., reduce_blocks))
```
```diff
@@ -232,7 +236,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
             end
         end
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), partial, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
 
         GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
     end
```
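
When one block per output slice is not enough, the reduction happens in two steps: each block writes its partial result into the trailing dimension of `partial`, and a second `mapreducedim!` call folds that dimension into the final output. A CPU analogue of that second step, with illustrative values:

```julia
op = +
reduce_blocks = 2                 # pretend two blocks produced partial sums
partial = zeros(1, 3, reduce_blocks)
partial[:, :, 1] = [1.0 2.0 3.0]  # partial sums from block 1
partial[:, :, 2] = [4.0 5.0 6.0]  # partial sums from block 2

# Fold the trailing (block) dimension, like the final mapreducedim! call.
R = dropdims(reduce(op, partial; dims=3); dims=3)
@assert R == [5.0 7.0 9.0]
```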