
Commit edaeb41

committed: dogfood [only tests] [only benchmarks]
1 parent 2d82b85 commit edaeb41

File tree: 5 files changed, +38 −39 lines

src/CUDA.jl

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ module CUDA
 using GPUCompiler

 using GPUArrays
+import KernelAbstractions: KernelIntrinsics as KI

 using GPUToolbox
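The remaining files apply this import mechanically: every CUDA-specific index intrinsic and barrier is swapped for its KernelIntrinsics counterpart. As a reading aid, here is the correspondence as used by this commit, plus a minimal kernel sketched both ways (the KI names are taken from the substitutions below, not from KernelAbstractions documentation; the increment! kernels are illustrative and not part of the commit):

# CUDA.jl intrinsic       KernelIntrinsics equivalent
#   threadIdx().x    ->   KI.get_local_id().x      (thread index within a workgroup/block)
#   blockDim().x     ->   KI.get_local_size().x    (workgroup/block size)
#   blockIdx().x     ->   KI.get_group_id().x      (workgroup/block index)
#   gridDim().x      ->   KI.get_num_groups().x    (number of workgroups/blocks)
#   sync_threads()   ->   KI.barrier()             (block-level synchronization)

function increment_cuda!(xs)
    i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
    i <= length(xs) && (@inbounds xs[i] += 1)
    return
end

function increment_ki!(xs)
    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
    i <= length(xs) && (@inbounds xs[i] += 1)
    return
end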

src/accumulate.jl

Lines changed: 13 additions & 13 deletions
@@ -15,16 +15,16 @@
 function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArray,
                       Rdim, Rpre, Rpost, Rother, neutral, init,
                       ::Val{inclusive}=Val(true)) where {T, inclusive}
-    threads = blockDim().x
-    thread = threadIdx().x
-    block = blockIdx().x
+    threads = KI.get_local_size().x
+    thread = KI.get_local_id().x
+    block = KI.get_group_id().x

     temp = CuDynamicSharedArray(T, (2*threads,))

     # iterate the main dimension using threads and the first block dimension
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y
+    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y

     if j > length(Rother)
         return
@@ -47,7 +47,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     offset = 1
     d = threads>>1
     while d > 0
-        sync_threads()
+        KI.barrier()
         @inbounds if thread <= d
             ai = offset * (2*thread-1)
             bi = offset * (2*thread)
@@ -66,7 +66,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     d = 1
     while d < threads
         offset >>= 1
-        sync_threads()
+        KI.barrier()
         @inbounds if thread <= d
             ai = offset * (2*thread-1)
             bi = offset * (2*thread)
@@ -78,7 +78,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
         d *= 2
     end

-    sync_threads()
+    KI.barrier()

     # write results to device memory
     @inbounds if i <= length(Rdim)
@@ -100,14 +100,14 @@ end
 function aggregate_partial_scan(op::Function, output::AbstractArray,
                                 aggregates::AbstractArray, Rdim, Rpre, Rpost, Rother,
                                 init)
-    threads = blockDim().x
-    thread = threadIdx().x
-    block = blockIdx().x
+    threads = KI.get_local_size().x
+    thread = KI.get_local_id().x
+    block = KI.get_group_id().x

     # iterate the main dimension using threads and the first block dimension
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y
+    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y

     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
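Both scan kernels flatten a 3D launch into two linear indices: i walks the dimension being scanned (x-axis groups × workgroup size), while j enumerates all remaining dimensions through the y and z group indices. A host-side sketch of the same arithmetic, with hypothetical launch sizes, makes the mapping concrete:

# Host-side illustration of the kernel index math above (sizes are hypothetical).
local_size = 256              # KI.get_local_size().x
num_groups = (x=4, y=3, z=2)  # KI.get_num_groups()

for gz in 1:num_groups.z, gy in 1:num_groups.y, gx in 1:num_groups.x, tid in 1:local_size
    i = (gx - 1) * local_size + tid   # position along the scanned dimension (Rdim)
    j = (gz - 1) * num_groups.y + gy  # linear index over the other dimensions (Rother)
    # the kernels bail out when j > length(Rother) or i > length(Rdim)
end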

src/device/random.jl

Lines changed: 3 additions & 5 deletions
@@ -63,18 +63,16 @@ end
 @inline Philox2x32() = Philox2x32{7}()

 @inline function Base.getproperty(rng::Philox2x32, field::Symbol)
-    threadId = threadIdx().x + (threadIdx().y - 1i32) * blockDim().x +
-               (threadIdx().z - 1i32) * blockDim().x * blockDim().y
     warpId = (threadId - 1i32) >> 0x5 + 1i32 # fld1

     if field === :key
         @inbounds global_random_keys()[warpId]
     elseif field === :ctr1
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
-        blockId = blockIdx().x + (blockIdx().y - 1i32) * gridDim().x +
-                  (blockIdx().z - 1i32) * gridDim().x * gridDim().y
-        globalId = threadId + (blockId - 1i32) * (blockDim().x * blockDim().y * blockDim().z)
+        globalId = KI.get_global_id().x +
+                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
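The two-step threadId/blockId linearization collapses into a single linearization of the global index space. Both formulas enumerate every thread in the grid exactly once, just in a different order, and uniqueness is all the RNG counter needs. This relies on the usual relationship between local and global indices, which is an assumption inferred from usage rather than taken from KI documentation:

# Assumed identities (1-based, matching how the intrinsics are used in this commit):
#   KI.get_global_id().x   == (KI.get_group_id().x - 1) * KI.get_local_size().x + KI.get_local_id().x
#   KI.get_global_size().x == KI.get_num_groups().x * KI.get_local_size().x
# Linearizing the (x, y, z) global IDs then yields a unique index per thread:
globalId = KI.get_global_id().x +
           (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
           (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y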

src/indexing.jl

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ function Base.findall(bools::AnyCuArray{Bool})
     if n > 0
         ## COV_EXCL_START
         function kernel(ys::CuDeviceArray, bools, indices)
-            i = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+            i = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x

             @inbounds if i <= length(bools) && bools[i]
                 i′ = CartesianIndices(bools)[i]

src/mapreduce.jl

Lines changed: 20 additions & 20 deletions
@@ -19,9 +19,9 @@ end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
     # shared mem for partial sums
     assume(warpsize() == 32)
-    shared = CuStaticSharedArray(T, 32)
+    shared = KI.localmemory(T, 32)

-    wid, lane = fldmod1(threadIdx().x, warpsize())
+    wid, lane = fldmod1(KI.get_local_id().x, warpsize())

     # each warp performs partial reduction
     val = reduce_warp(op, val)
@@ -32,10 +32,10 @@ end
     end

     # wait for all partial reductions
-    sync_threads()
+    KI.barrier()

     # read from shared memory only if that warp existed
-    val = if threadIdx().x <= fld1(blockDim().x, warpsize())
+    val = if KI.get_local_id().x <= fld1(KI.get_local_size().x, warpsize())
         @inbounds shared[lane]
     else
         neutral
@@ -49,8 +49,8 @@ end
     return val
 end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{false}) where T
-    threads = blockDim().x
-    thread = threadIdx().x
+    threads = KI.get_local_size().x
+    thread = KI.get_local_id().x

     # shared mem for a complete reduction
     shared = CuDynamicSharedArray(T, (threads,))
@@ -59,7 +59,7 @@ end
     # perform a reduction
     d = 1
     while d < threads
-        sync_threads()
+        KI.barrier()
         index = 2 * d * (thread-1) + 1
         @inbounds if index <= threads
             other_val = if index + d <= threads
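For reference, the non-shuffle path above amounts to a classic shared-memory tree reduction, now synchronized with KI.barrier(). A condensed, self-contained sketch, assuming KI.barrier() is a workgroup-wide barrier as its use in place of sync_threads() suggests:

# Condensed sketch of the Val{false} reduce_block above (illustrative only).
function reduce_block_sketch(op, val::T, neutral) where T
    threads = KI.get_local_size().x
    thread = KI.get_local_id().x
    shared = CuDynamicSharedArray(T, (threads,))
    @inbounds shared[thread] = val
    d = 1
    while d < threads
        KI.barrier()                      # make all writes visible before pairing
        index = 2 * d * (thread - 1) + 1  # left element of each pair at this level
        @inbounds if index <= threads
            other_val = index + d <= threads ? shared[index + d] : neutral
            shared[index] = op(shared[index], other_val)
        end
        d *= 2
    end
    KI.barrier()
    return @inbounds shared[1]
end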
@@ -92,10 +92,10 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs

     # decompose the 1D hardware indices into separate ones for reduction (across threads
     # and possibly blocks if it doesn't fit) and other elements (remaining blocks)
-    threadIdx_reduce = threadIdx().x
-    blockDim_reduce = blockDim().x
-    blockIdx_reduce, blockIdx_other = fldmod1(blockIdx().x, length(Rother))
-    gridDim_reduce = gridDim().x ÷ length(Rother)
+    threadIdx_reduce = KI.get_local_id().x
+    blockDim_reduce = KI.get_local_size().x
+    blockIdx_reduce, blockIdx_other = fldmod1(KI.get_group_id().x, length(Rother))
+    gridDim_reduce = KI.get_num_groups().x ÷ length(Rother)

     # block-based indexing into the values outside of the reduction dimension
     # (that means we can safely synchronize threads within this block)
@@ -134,7 +134,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
 end

 function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
-    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    grid_idx = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x
     @inbounds if grid_idx <= length(Rother)
         Iother = Rother[grid_idx]

@@ -160,14 +160,14 @@ end

 # factored out for use in tests
 function serial_mapreduce_threshold(dev)
-    max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
-                      attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
+    max_concurrency = KI.max_work_group_size(CUDABackend()) * KI.multiprocessor_count(CUDABackend())
     return max_concurrency
 end

 function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
                                  A::Union{AbstractArray,Broadcast.Broadcasted};
                                  init=nothing) where {F, OP, T}
+    backend = CUDABackend()
     if !isa(A, Broadcast.Broadcasted)
         # XXX: Base.axes isn't defined anymore for Broadcasted, breaking this check
         Base.check_reducedims(R, A)
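The concurrency threshold is now computed from backend-agnostic queries rather than raw CUDA device attributes. Presumably the two KI queries correspond to the attributes they replace; that mapping is inferred from the names alone:

# Assumed mapping (inferred, not verified against KernelIntrinsics docs):
#   KI.max_work_group_size(backend)  ≈ attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
#   KI.multiprocessor_count(backend) ≈ attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
max_concurrency = KI.max_work_group_size(CUDABackend()) * KI.multiprocessor_count(CUDABackend())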
@@ -201,11 +201,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
     if length(Rother) >= serial_mapreduce_threshold(dev)
         args = (f, op, init, Rreduce, Rother, R, A)
-        kernel = @cuda launch=false serial_mapreduce_kernel(args...)
+        kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
         kernel_config = launch_configuration(kernel.fun)
         threads = kernel_config.threads
         blocks = cld(length(Rother), threads)
-        kernel(args...; threads, blocks)
+        kernel(args...; workgroupsize=threads, numworkgroups=blocks)
         return R
     end
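The launch idiom changes accordingly: instead of @cuda launch=false followed by a threads/blocks call, a KI.KIKernel is constructed up front and launched with workgroupsize/numworkgroups keywords. A sketch of the pattern as this commit uses it (the field exposing the compiled function is an assumption; the diff accesses both kernel.fun and kernel.kern.fun):

# Construct-then-launch pattern used throughout this commit (sketch).
backend = CUDABackend()
kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)  # compile, don't launch
config = launch_configuration(kernel.kern.fun)                   # occupancy-based sizing
threads = config.threads
blocks = cld(length(Rother), threads)
kernel(args...; workgroupsize=threads, numworkgroups=blocks)     # launch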

@@ -228,9 +228,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
+    kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
-    kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
+    kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
     reduce_threads = compute_threads(kernel_config.threads)
     reduce_shmem = compute_shmem(reduce_threads)

@@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # perform the actual reduction
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
-        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
+        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem)
     else
         # TODO: provide a version that atomically reduces from different blocks

@@ -286,7 +286,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     end

     partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                   threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+                   workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)

     GPUArrays.mapreducedim!(identity, op, R, partial; init)
 end
