
Commit 6f4016a

[Temp] Try out KernelIntrinsics

1 parent c4f959e

4 files changed: +41 -32 lines


src/accumulate.jl
Lines changed: 13 additions & 13 deletions

@@ -15,16 +15,16 @@
 function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArray,
                       Rdim, Rpre, Rpost, Rother, neutral, init,
                       ::Val{inclusive}=Val(true)) where {T, inclusive}
-    threads = blockDim().x
-    thread = threadIdx().x
-    block = blockIdx().x
+    threads = KI.get_local_size().x
+    thread = KI.get_local_id().x
+    block = KI.get_group_id().x

     temp = CuDynamicSharedArray(T, (2*threads,))

     # iterate the main dimension using threads and the first block dimension
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y
+    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y

     if j > length(Rother)
         return
@@ -47,7 +47,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     offset = 1
     d = threads>>1
     while d > 0
-        sync_threads()
+        KI.barrier()
         @inbounds if thread <= d
             ai = offset * (2*thread-1)
             bi = offset * (2*thread)
@@ -66,7 +66,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     d = 1
     while d < threads
         offset >>= 1
-        sync_threads()
+        KI.barrier()
         @inbounds if thread <= d
             ai = offset * (2*thread-1)
             bi = offset * (2*thread)
@@ -78,7 +78,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
         d *= 2
     end

-    sync_threads()
+    KI.barrier()

     # write results to device memory
     @inbounds if i <= length(Rdim)
@@ -100,14 +100,14 @@ end
 function aggregate_partial_scan(op::Function, output::AbstractArray,
                                 aggregates::AbstractArray, Rdim, Rpre, Rpost, Rother,
                                 init)
-    threads = blockDim().x
-    thread = threadIdx().x
-    block = blockIdx().x
+    threads = KI.get_local_size().x
+    thread = KI.get_local_id().x
+    block = KI.get_group_id().x

     # iterate the main dimension using threads and the first block dimension
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y
+    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y

     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
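
The substitutions in this file are mechanical: threadIdx/blockDim/blockIdx/gridDim map to KI.get_local_id/KI.get_local_size/KI.get_group_id/KI.get_num_groups, and sync_threads() to KI.barrier(), all keeping CUDA.jl's 1-based indexing. As a minimal sketch of the resulting style (a hypothetical kernel, assuming the KI alias and accessors behave as used in this commit):

function copy_kernel(dst, src)
    # global linear index, same pattern as in `partial_scan` above
    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
    if i <= length(src)
        @inbounds dst[i] = src[i]
    end
    return
end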

src/device/random.jl
Lines changed: 3 additions & 3 deletions

@@ -72,9 +72,9 @@ end
     elseif field === :ctr1
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
-        blockId = blockIdx().x + (blockIdx().y - 1i32) * gridDim().x +
-                  (blockIdx().z - 1i32) * gridDim().x * gridDim().y
-        globalId = threadId + (blockId - 1i32) * (blockDim().x * blockDim().y * blockDim().z)
+        globalId = KI.get_global_id().x +
+                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
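
Note that the rewrite changes the linearization: the old code numbered threads as thread-within-block plus a block offset, while the new code linearizes the per-dimension global ids against the global sizes. Both enumerate every work-item exactly once, which is all the RNG counter needs before the final `globalId%UInt32`. A host-side sanity check of that claim (plain Julia, no GPU required; the grid shape here is arbitrary):

# the new code's linearization, with gid/gsize as 1-based (x, y, z) tuples
linearize(gid, gsize) = gid[1] + (gid[2] - 1) * gsize[1] + (gid[3] - 1) * gsize[1] * gsize[2]

gsize = (4, 3, 2)  # hypothetical global size per dimension
ids = [linearize((x, y, z), gsize) for x in 1:gsize[1], y in 1:gsize[2], z in 1:gsize[3]]
@assert sort(vec(ids)) == 1:prod(gsize)  # bijection onto 1:N, so every id is unique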

src/indexing.jl
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ function Base.findall(bools::AnyCuArray{Bool})
     if n > 0
         ## COV_EXCL_START
         function kernel(ys::CuDeviceArray, bools, indices)
-            i = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+            i = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x

             @inbounds if i <= length(bools) && bools[i]
                 i′ = CartesianIndices(bools)[i]
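
For context, this kernel backs `Base.findall` on device arrays, whose user-facing behavior is unchanged by the swap; for example (assuming a functional CUDA setup):

using CUDA
xs = CuArray([true, false, true, false, true])
findall(xs)  # device array with the indices of the `true` entries: 1, 3, 5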

src/mapreduce.jl
Lines changed: 24 additions & 15 deletions

@@ -19,9 +19,9 @@ end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
     # shared mem for partial sums
     assume(warpsize() == 32)
-    shared = CuStaticSharedArray(T, 32)
+    shared = KI.localmemory(T, 32)

-    wid, lane = fldmod1(threadIdx().x, warpsize())
+    wid, lane = fldmod1(KI.get_local_id().x, warpsize())

     # each warp performs partial reduction
     val = reduce_warp(op, val)
@@ -32,10 +32,10 @@ end
     end

     # wait for all partial reductions
-    sync_threads()
+    KI.barrier()

     # read from shared memory only if that warp existed
-    val = if threadIdx().x <= fld1(blockDim().x, warpsize())
+    val = if KI.get_local_id().x <= fld1(KI.get_local_size().x, warpsize())
         @inbounds shared[lane]
     else
         neutral
@@ -49,8 +49,8 @@ end
     return val
 end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{false}) where T
-    threads = blockDim().x
-    thread = threadIdx().x
+    threads = KI.get_local_size().x
+    thread = KI.get_local_id().x

     # shared mem for a complete reduction
     shared = CuDynamicSharedArray(T, (threads,))
@@ -59,7 +59,7 @@ end
     # perform a reduction
     d = 1
     while d < threads
-        sync_threads()
+        KI.barrier()
         index = 2 * d * (thread-1) + 1
         @inbounds if index <= threads
             other_val = if index + d <= threads
@@ -92,10 +92,10 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs

     # decompose the 1D hardware indices into separate ones for reduction (across threads
     # and possibly blocks if it doesn't fit) and other elements (remaining blocks)
-    threadIdx_reduce = threadIdx().x
-    blockDim_reduce = blockDim().x
-    blockIdx_reduce, blockIdx_other = fldmod1(blockIdx().x, length(Rother))
-    gridDim_reduce = gridDim().x ÷ length(Rother)
+    threadIdx_reduce = KI.get_local_id().x
+    blockDim_reduce = KI.get_local_size().x
+    blockIdx_reduce, blockIdx_other = fldmod1(KI.get_group_id().x, length(Rother))
+    gridDim_reduce = KI.get_num_groups().x ÷ length(Rother)

     # block-based indexing into the values outside of the reduction dimension
     # (that means we can safely synchronize threads within this block)
@@ -134,7 +134,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
 end

 function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
-    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    grid_idx = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x
     @inbounds if grid_idx <= length(Rother)
         Iother = Rother[grid_idx]

@@ -160,14 +160,14 @@ end

 # factored out for use in tests
 function serial_mapreduce_threshold(dev)
-    max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
-                      attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
+    max_concurrency = KI.max_work_group_size(CUDABackend()) * KI.multiprocessor_count(CUDABackend())
     return max_concurrency
 end

 function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
                                  A::Union{AbstractArray,Broadcast.Broadcasted};
                                  init=nothing) where {F, OP, T}
+    backend = CUDABackend()
     if !isa(A, Broadcast.Broadcasted)
         # XXX: Base.axes isn't defined anymore for Broadcasted, breaking this check
         Base.check_reducedims(R, A)
@@ -201,10 +201,13 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
     if length(Rother) >= serial_mapreduce_threshold(dev)
         args = (f, op, init, Rreduce, Rother, R, A)
+        # kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
         kernel = @cuda launch=false serial_mapreduce_kernel(args...)
+        # kernel_config = launch_configuration(kernel.kern.fun)
         kernel_config = launch_configuration(kernel.fun)
         threads = kernel_config.threads
         blocks = cld(length(Rother), threads)
+        # kernel(args...; workgroupsize=threads, numworkgroups=blocks)
         kernel(args...; threads, blocks)
         return R
     end
@@ -228,8 +231,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
+    # kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
+    # kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
     kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
     reduce_threads = compute_threads(kernel_config.threads)
     reduce_shmem = compute_shmem(reduce_threads)
@@ -255,6 +260,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # perform the actual reduction
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
+        # kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
         kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
     else
         # TODO: provide a version that atomically reduces from different blocks
@@ -265,7 +271,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
         # since the type of `partial` might not match the original output container
         # (e.g. if that was a view).
+        # partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
         partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        # partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
         partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
         partial_reduce_threads = compute_threads(partial_kernel_config.threads)
         partial_reduce_shmem = compute_shmem(partial_reduce_threads)
@@ -286,7 +294,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end

         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+                       # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)

         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
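
The commented-out lines above sketch where this is headed: building kernels with KI.KIKernel and launching with workgroupsize/numworkgroups keywords instead of @cuda with threads/blocks. Assembled from those comments into one piece (still disabled in this commit, so the exact API should be read as tentative):

# tentative KI launch path for the serial kernel, per the comments above;
# `KI.KIKernel`, `kernel.kern.fun`, and the launch keywords are not yet final
backend = CUDABackend()
args = (f, op, init, Rreduce, Rother, R, A)
kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
kernel_config = launch_configuration(kernel.kern.fun)
threads = kernel_config.threads
blocks = cld(length(Rother), threads)
kernel(args...; workgroupsize=threads, numworkgroups=blocks)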
