
Conversation

@christiangnrd (Member)

[only tests]
[only benchmarks]


github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic master`) to apply these changes.
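
For illustration only (not part of the suggested diff), a minimal sketch of the two Runic conventions applied below: spaces around `=` in keyword arguments and an explicit `return` for a function's final expression. The `scale` function is a made-up example.

```julia
# Before (flagged by Runic):
function scale(x; factor=2)
    x * factor
end

# After formatting (spaces around `=`, explicit `return`):
function scale(x; factor = 2)
    return x * factor
end
```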

Click here to view the suggested changes.
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 610c72c1c..7e727dbf4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,29 +160,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
-function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = nothing, workgroupsize = nothing, kwargs...)
     threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize
     threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups
 
-    obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid, kwargs...)
+    return obj.kern(args...; threads = threadsPerThreadgroup, blocks = threadgroupsPerGrid, kwargs...)
 end
 
 
-function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kikern.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -198,7 +198,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end


@github-actions github-actions bot left a comment


CUDA.jl Benchmarks

| Benchmark suite | Current: 2aa6e52 | Previous: 1e35ff7 | Ratio |
|---|---|---|---|
| latency/precompile | 64569076365 ns | 57073264350 ns | 1.13 |
| latency/ttfp | 8497494612.5 ns | 8361242115.5 ns | 1.02 |
| latency/import | 4685275562 ns | 4520871512 ns | 1.04 |
| integration/volumerhs | 9623614 ns | 9609145.5 ns | 1.00 |
| integration/byval/slices=1 | 146885.5 ns | 146784 ns | 1.00 |
| integration/byval/slices=3 | 425683 ns | 425930 ns | 1.00 |
| integration/byval/reference | 145012 ns | 144913 ns | 1.00 |
| integration/byval/slices=2 | 286231 ns | 286275 ns | 1.00 |
| integration/cudadevrt | 103543 ns | 103477 ns | 1.00 |
| kernel/indexing | 14190 ns | 14088 ns | 1.01 |
| kernel/indexing_checked | 14772 ns | 14920 ns | 0.99 |
| kernel/occupancy | 687.1118421052631 ns | 670.5283018867924 ns | 1.02 |
| kernel/launch | 2133.2 ns | 2192.1111111111113 ns | 0.97 |
| kernel/rand | 14942 ns | 18597.5 ns | 0.80 |
| array/reverse/1d | 19948 ns | 19990 ns | 1.00 |
| array/reverse/2dL_inplace | 66912 ns | 66851 ns | 1.00 |
| array/reverse/1dL | 70179 ns | 70214 ns | 1.00 |
| array/reverse/2d | 21699.5 ns | 21764 ns | 1.00 |
| array/reverse/1d_inplace | 9617 ns | 9644 ns | 1.00 |
| array/reverse/2d_inplace | 13395.5 ns | 11083 ns | 1.21 |
| array/reverse/2dL | 73740.5 ns | 73680.5 ns | 1.00 |
| array/reverse/1dL_inplace | 66745 ns | 66780 ns | 1.00 |
| array/copy | 20856 ns | 20656 ns | 1.01 |
| array/iteration/findall/int | 160265 ns | 157234 ns | 1.02 |
| array/iteration/findall/bool | 142631.5 ns | 139637.5 ns | 1.02 |
| array/iteration/findfirst/int | 161918 ns | 161491 ns | 1.00 |
| array/iteration/findfirst/bool | 162426 ns | 161981.5 ns | 1.00 |
| array/iteration/scalar | 72619 ns | 72914 ns | 1.00 |
| array/iteration/logical | 219741.5 ns | 215503 ns | 1.02 |
| array/iteration/findmin/1d | 50523 ns | 52893.5 ns | 0.96 |
| array/iteration/findmin/2d | 97246 ns | 96673.5 ns | 1.01 |
| array/reductions/reduce/Int64/1d | 48481 ns | 43374 ns | 1.12 |
| array/reductions/reduce/Int64/dims=1 | 50034 ns | 44924.5 ns | 1.11 |
| array/reductions/reduce/Int64/dims=2 | 69278 ns | 61289 ns | 1.13 |
| array/reductions/reduce/Int64/dims=1L | 89383 ns | 89013 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 90149 ns | 88275 ns | 1.02 |
| array/reductions/reduce/Float32/1d | 36725 ns | 37043 ns | 0.99 |
| array/reductions/reduce/Float32/dims=1 | 48039.5 ns | 43018 ns | 1.12 |
| array/reductions/reduce/Float32/dims=2 | 64599 ns | 59774 ns | 1.08 |
| array/reductions/reduce/Float32/dims=1L | 53711 ns | 52409 ns | 1.02 |
| array/reductions/reduce/Float32/dims=2L | 73778.5 ns | 72278 ns | 1.02 |
| array/reductions/mapreduce/Int64/1d | 48230.5 ns | 43540 ns | 1.11 |
| array/reductions/mapreduce/Int64/dims=1 | 49526.5 ns | 45057.5 ns | 1.10 |
| array/reductions/mapreduce/Int64/dims=2 | 69219 ns | 61470 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=1L | 89407 ns | 88923 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 90380 ns | 88349 ns | 1.02 |
| array/reductions/mapreduce/Float32/1d | 36707.5 ns | 36698 ns | 1.00 |
| array/reductions/mapreduce/Float32/dims=1 | 56322 ns | 41442 ns | 1.36 |
| array/reductions/mapreduce/Float32/dims=2 | 62875 ns | 59908 ns | 1.05 |
| array/reductions/mapreduce/Float32/dims=1L | 54066 ns | 52585 ns | 1.03 |
| array/reductions/mapreduce/Float32/dims=2L | 73313.5 ns | 72014 ns | 1.02 |
| array/broadcast | 19973 ns | 20078 ns | 0.99 |
| array/copyto!/gpu_to_gpu | 12932 ns | 12908 ns | 1.00 |
| array/copyto!/cpu_to_gpu | 214961 ns | 213437 ns | 1.01 |
| array/copyto!/gpu_to_cpu | 284935 ns | 283206 ns | 1.01 |
| array/accumulate/Int64/1d | 127610 ns | 124198 ns | 1.03 |
| array/accumulate/Int64/dims=1 | 85284 ns | 83165 ns | 1.03 |
| array/accumulate/Int64/dims=2 | 159083 ns | 157631 ns | 1.01 |
| array/accumulate/Int64/dims=1L | 1793435 ns | 1709733 ns | 1.05 |
| array/accumulate/Int64/dims=2L | 973355 ns | 966057.5 ns | 1.01 |
| array/accumulate/Float32/1d | 112339 ns | 108414 ns | 1.04 |
| array/accumulate/Float32/dims=1 | 81850 ns | 79731.5 ns | 1.03 |
| array/accumulate/Float32/dims=2 | 149302.5 ns | 146657 ns | 1.02 |
| array/accumulate/Float32/dims=1L | 1712861.5 ns | 1616606.5 ns | 1.06 |
| array/accumulate/Float32/dims=2L | 714168 ns | 697417 ns | 1.02 |
| array/construct | 1281.8 ns | 1271.5 ns | 1.01 |
| array/random/randn/Float32 | 44835.5 ns | 45612 ns | 0.98 |
| array/random/randn!/Float32 | 25080 ns | 24822 ns | 1.01 |
| array/random/rand!/Int64 | 27406 ns | 27264 ns | 1.01 |
| array/random/rand!/Float32 | 8721.333333333334 ns | 8854 ns | 0.99 |
| array/random/rand/Int64 | 30089 ns | 29823 ns | 1.01 |
| array/random/rand/Float32 | 13225 ns | 13073 ns | 1.01 |
| array/permutedims/4d | 59220.5 ns | 59525 ns | 0.99 |
| array/permutedims/2d | 54019 ns | 53919 ns | 1.00 |
| array/permutedims/3d | 54573.5 ns | 54583 ns | 1.00 |
| array/sorting/1d | 2766346 ns | 2757051 ns | 1.00 |
| array/sorting/by | 3346673 ns | 3344047 ns | 1.00 |
| array/sorting/2d | 1081366 ns | 1080794 ns | 1.00 |
| cuda/synchronization/stream/auto | 1026.8 ns | 1034 ns | 0.99 |
| cuda/synchronization/stream/nonblocking | 7370.8 ns | 8105 ns | 0.91 |
| cuda/synchronization/stream/blocking | 828.09375 ns | 796.4842105263158 ns | 1.04 |
| cuda/synchronization/context/auto | 1188.2 ns | 1198.2 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 7132.4 ns | 8018.6 ns | 0.89 |
| cuda/synchronization/context/blocking | 910.7560975609756 ns | 918.6428571428571 ns | 0.99 |

This comment was automatically generated by workflow using github-action-benchmark.
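
For clarity, the Ratio column is the current time divided by the previous time (values above 1.00 mean the current commit is slower). A minimal Julia sketch using the latency/precompile row from the table:

```julia
# Ratio = current / previous, rounded to two decimals;
# timings (in ns) taken from the latency/precompile row above.
current  = 64_569_076_365
previous = 57_073_264_350
round(current / previous; digits = 2)  # 1.13
```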
