 @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
     # shared mem for partial sums
     assume(warpsize() == 32)
-    shared = CuStaticSharedArray(T, 32)
+    shared = KI.localmemory(T, 32)

-    wid, lane = fldmod1(threadIdx().x, warpsize())
+    wid, lane = fldmod1(KI.get_local_id().x, warpsize())

     # each warp performs partial reduction
     val = reduce_warp(op, val)
⋮
     end

     # wait for all partial reductions
-    sync_threads()
+    KI.barrier()

     # read from shared memory only if that warp existed
-    val = if threadIdx().x <= fld1(blockDim().x, warpsize())
+    val = if KI.get_local_id().x <= fld1(KI.get_local_size().x, warpsize())
         @inbounds shared[lane]
     else
         neutral
⋮
     return val
 end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{false}) where T
-    threads = blockDim().x
-    thread = threadIdx().x
+    threads = KI.get_local_size().x
+    thread = KI.get_local_id().x

     # shared mem for a complete reduction
     shared = CuDynamicSharedArray(T, (threads,))
⋮
     # perform a reduction
     d = 1
     while d < threads
-        sync_threads()
+        KI.barrier()
         index = 2 * d * (thread-1) + 1
         @inbounds if index <= threads
             other_val = if index + d <= threads
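For orientation, the intrinsic substitutions in this diff follow one pattern throughout; the right-hand names are taken verbatim from the added lines, and their semantics are assumed to match the CUDA.jl intrinsics they replace:

    threadIdx().x              ->  KI.get_local_id().x
    blockDim().x               ->  KI.get_local_size().x
    blockIdx().x               ->  KI.get_group_id().x
    gridDim().x                ->  KI.get_num_groups().x
    sync_threads()             ->  KI.barrier()
    CuStaticSharedArray(T, n)  ->  KI.localmemory(T, n)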
@@ -92,10 +92,10 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs

     # decompose the 1D hardware indices into separate ones for reduction (across threads
     # and possibly blocks if it doesn't fit) and other elements (remaining blocks)
-    threadIdx_reduce = threadIdx().x
-    blockDim_reduce = blockDim().x
-    blockIdx_reduce, blockIdx_other = fldmod1(blockIdx().x, length(Rother))
-    gridDim_reduce = gridDim().x ÷ length(Rother)
+    threadIdx_reduce = KI.get_local_id().x
+    blockDim_reduce = KI.get_local_size().x
+    blockIdx_reduce, blockIdx_other = fldmod1(KI.get_group_id().x, length(Rother))
+    gridDim_reduce = KI.get_num_groups().x ÷ length(Rother)

     # block-based indexing into the values outside of the reduction dimension
     # (that means we can safely synchronize threads within this block)
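To make the fldmod1 decomposition above concrete, a small worked example with made-up sizes (not taken from the PR): with length(Rother) == 4 and 8 groups in total, group 7 maps to

    blockIdx_reduce, blockIdx_other = fldmod1(7, 4)  # == (2, 3)
    gridDim_reduce = 8 ÷ 4                           # == 2

i.e. this group is the 2nd of 2 groups reducing the 3rd "other" slice.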
@@ -134,7 +134,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
 end

 function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
-    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    grid_idx = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x
     @inbounds if grid_idx <= length(Rother)
         Iother = Rother[grid_idx]

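The grid_idx computed above is the usual 1-based flat global index; as a quick sanity check with illustrative numbers only, a work-group size of 256, group 3, and local id 10 give

    grid_idx = 10 + (3 - 1) * 256  # == 522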
@@ -160,14 +160,14 @@ end

 # factored out for use in tests
 function serial_mapreduce_threshold(dev)
-    max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
-                      attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
+    max_concurrency = KI.max_work_group_size(CUDABackend()) * KI.multiprocessor_count(CUDABackend())
     return max_concurrency
 end

 function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
                                  A::Union{AbstractArray,Broadcast.Broadcasted};
                                  init=nothing) where {F, OP, T}
+    backend = CUDABackend()
     if !isa(A, Broadcast.Broadcasted)
         # XXX: Base.axes isn't defined anymore for Broadcasted, breaking this check
         Base.check_reducedims(R, A)
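serial_mapreduce_threshold now derives peak device concurrency from KI backend queries instead of raw CUDA device attributes. As a rough illustration with hypothetical device numbers (a 1024-thread work-group limit and 108 multiprocessors, not values from this PR):

    max_concurrency = 1024 * 108  # == 110_592; once length(Rother) reaches this,
                                  # the naive serial kernel is used instead of
                                  # partial reductions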
@@ -201,10 +201,13 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
     if length(Rother) >= serial_mapreduce_threshold(dev)
         args = (f, op, init, Rreduce, Rother, R, A)
+        # kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
         kernel = @cuda launch=false serial_mapreduce_kernel(args...)
+        # kernel_config = launch_configuration(kernel.kern.fun)
         kernel_config = launch_configuration(kernel.fun)
         threads = kernel_config.threads
         blocks = cld(length(Rother), threads)
+        # kernel(args...; workgroupsize=threads, numworkgroups=blocks)
         kernel(args...; threads, blocks)
         return R
     end
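In this serial path each thread handles exactly one element of Rother, so the launch is sized directly from its length; for example (illustrative sizes, not from the PR), with length(Rother) == 100_000 and an occupancy-suggested 256 threads per block:

    blocks = cld(100_000, 256)  # == 391 blocks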
@@ -228,8 +231,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
+    # kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
+    # kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
     kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
     reduce_threads = compute_threads(kernel_config.threads)
     reduce_shmem = compute_shmem(reduce_threads)
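compute_shmem requests dynamic shared memory only for the shuffle-free reduce_block, which stages one value per thread in shared memory. A small worked example assuming T == Float32 and 512 threads:

    compute_shmem(512)  # shuffle ? 0 : 512 * sizeof(Float32) == 2048 bytes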
@@ -255,6 +260,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # perform the actual reduction
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
+        # kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
         kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
     else
         # TODO: provide a version that atomically reduces from different blocks
@@ -265,7 +271,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
         # since the type of `partial` might not match the original output container
         # (e.g. if that was a view).
+        # partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
         partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        # partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
         partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
         partial_reduce_threads = compute_threads(partial_kernel_config.threads)
         partial_reduce_shmem = compute_shmem(partial_reduce_threads)
@@ -286,7 +294,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end

         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+                       # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)

         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
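Pieced together from the commented-out lines above, the eventual KI-based launch path for the serial branch would look roughly like the sketch below; the KIKernel constructor, the kern.fun field, and the workgroupsize/numworkgroups keywords appear only in comments in this diff, so treat the exact API as tentative:

    kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
    kernel_config = launch_configuration(kernel.kern.fun)
    threads = kernel_config.threads
    blocks = cld(length(Rother), threads)
    kernel(args...; workgroupsize=threads, numworkgroups=blocks)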