
Conversation

@christiangnrd (Member)

[only tests]
[only benchmarks]


github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic master`) to apply these changes.
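
For illustration only (not part of the suggested diff), a minimal sketch of the two Runic conventions applied below: spaces around `=` in keyword arguments and an explicit `return` for a function's final expression. The `scale` function is a made-up example.

```julia
# Before (flagged by Runic):
function scale(x; factor=2)
    x * factor
end

# After formatting (spaces around `=`, explicit `return`):
function scale(x; factor = 2)
    return x * factor
end
```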

Click here to view the suggested changes.
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 610c72c1c..7e727dbf4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,29 +160,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
-function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = nothing, workgroupsize = nothing, kwargs...)
     threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize
     threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups
 
-    obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid, kwargs...)
+    return obj.kern(args...; threads = threadsPerThreadgroup, blocks = threadgroupsPerGrid, kwargs...)
 end
 
 
-function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kikern.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -198,7 +198,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end


@github-actions github-actions bot left a comment


CUDA.jl Benchmarks

| Benchmark suite | Current: 2aa6e52 | Previous: 1e35ff7 | Ratio |
|---|---|---|---|
| latency/precompile | 64569076365 ns | 57073264350 ns | 1.13 |
| latency/ttfp | 8497494612.5 ns | 8361242115.5 ns | 1.02 |
| latency/import | 4685275562 ns | 4520871512 ns | 1.04 |
| integration/volumerhs | 9623614 ns | 9609145.5 ns | 1.00 |
| integration/byval/slices=1 | 146885.5 ns | 146784 ns | 1.00 |
| integration/byval/slices=3 | 425683 ns | 425930 ns | 1.00 |
| integration/byval/reference | 145012 ns | 144913 ns | 1.00 |
| integration/byval/slices=2 | 286231 ns | 286275 ns | 1.00 |
| integration/cudadevrt | 103543 ns | 103477 ns | 1.00 |
| kernel/indexing | 14190 ns | 14088 ns | 1.01 |
| kernel/indexing_checked | 14772 ns | 14920 ns | 0.99 |
| kernel/occupancy | 687.1118421052631 ns | 670.5283018867924 ns | 1.02 |
| kernel/launch | 2133.2 ns | 2192.1111111111113 ns | 0.97 |
| kernel/rand | 14942 ns | 18597.5 ns | 0.80 |
| array/reverse/1d | 19948 ns | 19990 ns | 1.00 |
| array/reverse/2dL_inplace | 66912 ns | 66851 ns | 1.00 |
| array/reverse/1dL | 70179 ns | 70214 ns | 1.00 |
| array/reverse/2d | 21699.5 ns | 21764 ns | 1.00 |
| array/reverse/1d_inplace | 9617 ns | 9644 ns | 1.00 |
| array/reverse/2d_inplace | 13395.5 ns | 11083 ns | 1.21 |
| array/reverse/2dL | 73740.5 ns | 73680.5 ns | 1.00 |
| array/reverse/1dL_inplace | 66745 ns | 66780 ns | 1.00 |
| array/copy | 20856 ns | 20656 ns | 1.01 |
| array/iteration/findall/int | 160265 ns | 157234 ns | 1.02 |
| array/iteration/findall/bool | 142631.5 ns | 139637.5 ns | 1.02 |
| array/iteration/findfirst/int | 161918 ns | 161491 ns | 1.00 |
| array/iteration/findfirst/bool | 162426 ns | 161981.5 ns | 1.00 |
| array/iteration/scalar | 72619 ns | 72914 ns | 1.00 |
| array/iteration/logical | 219741.5 ns | 215503 ns | 1.02 |
| array/iteration/findmin/1d | 50523 ns | 52893.5 ns | 0.96 |
| array/iteration/findmin/2d | 97246 ns | 96673.5 ns | 1.01 |
| array/reductions/reduce/Int64/1d | 48481 ns | 43374 ns | 1.12 |
| array/reductions/reduce/Int64/dims=1 | 50034 ns | 44924.5 ns | 1.11 |
| array/reductions/reduce/Int64/dims=2 | 69278 ns | 61289 ns | 1.13 |
| array/reductions/reduce/Int64/dims=1L | 89383 ns | 89013 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 90149 ns | 88275 ns | 1.02 |
| array/reductions/reduce/Float32/1d | 36725 ns | 37043 ns | 0.99 |
| array/reductions/reduce/Float32/dims=1 | 48039.5 ns | 43018 ns | 1.12 |
| array/reductions/reduce/Float32/dims=2 | 64599 ns | 59774 ns | 1.08 |
| array/reductions/reduce/Float32/dims=1L | 53711 ns | 52409 ns | 1.02 |
| array/reductions/reduce/Float32/dims=2L | 73778.5 ns | 72278 ns | 1.02 |
| array/reductions/mapreduce/Int64/1d | 48230.5 ns | 43540 ns | 1.11 |
| array/reductions/mapreduce/Int64/dims=1 | 49526.5 ns | 45057.5 ns | 1.10 |
| array/reductions/mapreduce/Int64/dims=2 | 69219 ns | 61470 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=1L | 89407 ns | 88923 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 90380 ns | 88349 ns | 1.02 |
| array/reductions/mapreduce/Float32/1d | 36707.5 ns | 36698 ns | 1.00 |
| array/reductions/mapreduce/Float32/dims=1 | 56322 ns | 41442 ns | 1.36 |
| array/reductions/mapreduce/Float32/dims=2 | 62875 ns | 59908 ns | 1.05 |
| array/reductions/mapreduce/Float32/dims=1L | 54066 ns | 52585 ns | 1.03 |
| array/reductions/mapreduce/Float32/dims=2L | 73313.5 ns | 72014 ns | 1.02 |
| array/broadcast | 19973 ns | 20078 ns | 0.99 |
| array/copyto!/gpu_to_gpu | 12932 ns | 12908 ns | 1.00 |
| array/copyto!/cpu_to_gpu | 214961 ns | 213437 ns | 1.01 |
| array/copyto!/gpu_to_cpu | 284935 ns | 283206 ns | 1.01 |
| array/accumulate/Int64/1d | 127610 ns | 124198 ns | 1.03 |
| array/accumulate/Int64/dims=1 | 85284 ns | 83165 ns | 1.03 |
| array/accumulate/Int64/dims=2 | 159083 ns | 157631 ns | 1.01 |
| array/accumulate/Int64/dims=1L | 1793435 ns | 1709733 ns | 1.05 |
| array/accumulate/Int64/dims=2L | 973355 ns | 966057.5 ns | 1.01 |
| array/accumulate/Float32/1d | 112339 ns | 108414 ns | 1.04 |
| array/accumulate/Float32/dims=1 | 81850 ns | 79731.5 ns | 1.03 |
| array/accumulate/Float32/dims=2 | 149302.5 ns | 146657 ns | 1.02 |
| array/accumulate/Float32/dims=1L | 1712861.5 ns | 1616606.5 ns | 1.06 |
| array/accumulate/Float32/dims=2L | 714168 ns | 697417 ns | 1.02 |
| array/construct | 1281.8 ns | 1271.5 ns | 1.01 |
| array/random/randn/Float32 | 44835.5 ns | 45612 ns | 0.98 |
| array/random/randn!/Float32 | 25080 ns | 24822 ns | 1.01 |
| array/random/rand!/Int64 | 27406 ns | 27264 ns | 1.01 |
| array/random/rand!/Float32 | 8721.333333333334 ns | 8854 ns | 0.99 |
| array/random/rand/Int64 | 30089 ns | 29823 ns | 1.01 |
| array/random/rand/Float32 | 13225 ns | 13073 ns | 1.01 |
| array/permutedims/4d | 59220.5 ns | 59525 ns | 0.99 |
| array/permutedims/2d | 54019 ns | 53919 ns | 1.00 |
| array/permutedims/3d | 54573.5 ns | 54583 ns | 1.00 |
| array/sorting/1d | 2766346 ns | 2757051 ns | 1.00 |
| array/sorting/by | 3346673 ns | 3344047 ns | 1.00 |
| array/sorting/2d | 1081366 ns | 1080794 ns | 1.00 |
| cuda/synchronization/stream/auto | 1026.8 ns | 1034 ns | 0.99 |
| cuda/synchronization/stream/nonblocking | 7370.8 ns | 8105 ns | 0.91 |
| cuda/synchronization/stream/blocking | 828.09375 ns | 796.4842105263158 ns | 1.04 |
| cuda/synchronization/context/auto | 1188.2 ns | 1198.2 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 7132.4 ns | 8018.6 ns | 0.89 |
| cuda/synchronization/context/blocking | 910.7560975609756 ns | 918.6428571428571 ns | 0.99 |

This comment was automatically generated by workflow using github-action-benchmark.
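
For clarity, the Ratio column is the current time divided by the previous time (values above 1.00 mean the current commit is slower). A minimal Julia sketch using the latency/precompile row from the table:

```julia
# Ratio = current / previous, rounded to two decimals;
# timings (in ns) taken from the latency/precompile row above.
current  = 64_569_076_365
previous = 57_073_264_350
round(current / previous; digits = 2)  # 1.13
```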
