
Conversation

@christiangnrd (Member)

[only julia]
[only benchmarks]

@github-actions bot (Contributor) commented Aug 4, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.
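For reference, the Runic changes suggested below boil down to two rules visible in the diff: spaces around `=` in keyword arguments and an explicit `return` before a function's final expression. A minimal before/after sketch (the `launch` function is hypothetical, not part of this PR):

```julia
# Hypothetical illustration of the two Runic rules applied in the diff below:
# spaces around `=` in keyword arguments, and an explicit trailing `return`.

# Before:
# function launch(kern, args...; threads=1, blocks=1)
#     kern(args...; threads=threads, blocks=blocks)
# end

# After (Runic style):
function launch(kern, args...; threads = 1, blocks = 1)
    return kern(args...; threads = threads, blocks = blocks)
end
```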

Suggested changes:
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 995728201..abb789fa4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,29 +160,29 @@ end
 
 KI.kiconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kifunction(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kifunction(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
-function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups = nothing, workgroupsize = nothing, kwargs...)
     threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize
     threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups
 
-    obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid, kwargs...)
+    return obj.kern(args...; threads = threadsPerThreadgroup, blocks = threadgroupsPerGrid, kwargs...)
 end
 
 
-function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kikern.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -198,7 +198,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
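As an aside for readers unfamiliar with the indexing touched in the first hunk: the `KI.get_global_id()` override composes CUDA's 1-based block and thread indices into a global index. A minimal standalone sketch of the same arithmetic using plain CUDA.jl intrinsics (the kernel name and launch sizes are illustrative, not from this PR):

```julia
using CUDA

# Same arithmetic as the KI.get_global_id().x component in the diff:
# global index = (block index - 1) * block size + thread index.
function global_id_kernel(out)
    gx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if gx <= length(out)
        @inbounds out[gx] = gx
    end
    return
end

out = CUDA.zeros(Int, 64)
@cuda threads=32 blocks=2 global_id_kernel(out)
```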

@christiangnrd force-pushed the rev branch 3 times, most recently from b9d6112 to 8e44b5e, on August 5, 2025 at 02:06
@github-actions bot (Contributor) left a comment

CUDA.jl Benchmarks

| Benchmark suite | Current: 69871e5 | Previous: f4c05e0 | Ratio |
|---|---|---|---|
| array/reverse/1d | 22693 ns | 19857 ns | 1.14 |
| array/reverse/2dL_inplace | 71436 ns | 66720 ns | 1.07 |
| array/reverse/1dL | 73269 ns | 70068 ns | 1.05 |
| array/reverse/2d | 25715 ns | 21721 ns | 1.18 |
| array/reverse/1d_inplace | 11652 ns | 11535 ns | 1.01 |
| array/reverse/2d_inplace | 15940 ns | 13153 ns | 1.21 |
| array/reverse/2dL | 91082 ns | 73755 ns | 1.23 |
| array/reverse/1dL_inplace | 67942 ns | 66862 ns | 1.02 |
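The Ratio column is Current divided by Previous, so values above 1.00 mean this run was slower; for example, array/reverse/1d: 22693 / 19857 ≈ 1.14, i.e. roughly 14% slower.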

This comment was automatically generated by a workflow using github-action-benchmark.
