Attempt to fix up KA transition #517

Closed · wants to merge 5 commits
2 changes: 0 additions & 2 deletions docs/src/interface.md
@@ -13,8 +13,6 @@ all, you need to provide a type that represents your execution back-end and a wa
kernels:

```@docs
GPUArrays.AbstractGPUBackend
GPUArrays.AbstractKernelContext
GPUArrays.gpu_call
GPUArrays.thread_block_heuristic
```
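As the execution interface moves to KernelAbstractions, a back-end's main obligation becomes exposing a KA backend object for its array type rather than implementing the old `gpu_call` machinery. A minimal sketch of that hook, assuming hypothetical `MyBackend`/`MyGPUArray` types (neither appears in this PR) and that back-ends subtype `KernelAbstractions.Backend` as in current KA releases:

```julia
using KernelAbstractions

# Hypothetical names, for illustration only; not part of this PR.
struct MyBackend <: KernelAbstractions.Backend end

struct MyGPUArray{T, N} <: AbstractArray{T, N}
    data::Array{T, N}   # stand-in for real device memory
end
Base.size(A::MyGPUArray) = size(A.data)

# The central hook: map the array type to its KernelAbstractions backend,
# so host code can instantiate kernels via `some_kernel(get_backend(A))`.
KernelAbstractions.get_backend(::MyGPUArray) = MyBackend()
```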
6 changes: 3 additions & 3 deletions src/GPUArrays.jl
@@ -18,17 +18,17 @@ using Reexport
include("device/execution.jl")
## executed on-device
include("device/abstractarray.jl")
include("device/indexing.jl")
#include("device/indexing.jl")
include("device/memory.jl")
include("device/synchronization.jl")
#include("device/synchronization.jl")

using KernelAbstractions
# host abstractions
include("host/abstractarray.jl")
include("host/construction.jl")
## integrations and specialized methods
include("host/base.jl")
include("host/indexing.jl")
#include("host/indexing.jl")
include("host/broadcast.jl")
include("host/mapreduce.jl")
include("host/linalg.jl")
83 changes: 0 additions & 83 deletions src/device/indexing.jl

This file was deleted.

13 changes: 0 additions & 13 deletions src/device/synchronization.jl

This file was deleted.

2 changes: 1 addition & 1 deletion src/host/abstractarray.jl
@@ -147,7 +147,7 @@ end

## generalized blocks of heterogeneous memory

@kernel function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets)
@kernel function cartesian_copy_kernel!(dest, dest_offsets, src, src_offsets)
I = @index(Global, Cartesian)
@inbounds dest[I + dest_offsets] = src[I + src_offsets]
end
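With the `ctx` argument gone, this kernel is no longer launched through `gpu_call`; the usual KernelAbstractions pattern is to instantiate it on the destination's backend and pass the copied region as `ndrange`. A hedged sketch of such a driver (the helper `launch_cartesian_copy!` is made up for illustration and is not the PR's caller):

```julia
using KernelAbstractions

@kernel function cartesian_copy_kernel!(dest, dest_offsets, src, src_offsets)
    I = @index(Global, Cartesian)
    @inbounds dest[I + dest_offsets] = src[I + src_offsets]
end

# Illustrative driver, not part of the PR.
function launch_cartesian_copy!(dest, dest_offsets, src, src_offsets, shape)
    backend = get_backend(dest)
    kernel! = cartesian_copy_kernel!(backend)
    kernel!(dest, dest_offsets, src, src_offsets; ndrange = shape)
    KernelAbstractions.synchronize(backend)
    return dest
end

# CPU example: copy the top-left 2x2 block of `src` into the bottom-right of `dest`.
src  = reshape(collect(1.0:16.0), 4, 4)
dest = zeros(4, 4)
launch_cartesian_copy!(dest, CartesianIndex(2, 2), src, CartesianIndex(0, 0), (2, 2))
```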
8 changes: 2 additions & 6 deletions src/host/base.jl
@@ -4,22 +4,20 @@ import Base: _RepeatInnerOuter
# Handle `out = repeat(x; inner)` by parallelizing over `out` array This can benchmark
# faster if repeating elements along the first axis (i.e. `inner=(n, ones...)`), as data
# access can be contiguous on write.
function repeat_inner_dst_kernel!(
ctx::AbstractKernelContext,
@kernel function repeat_inner_dst_kernel!(
xs::AbstractArray{<:Any, N},
inner::NTuple{N, Int},
out::AbstractArray{<:Any, N}
) where {N}
# Get the "stride" index in each dimension, where the size
# of the stride is given by `inner`. The stride-index (sdx) then
# corresponds to the index of the repeated value in `xs`.
odx = @cartesianidx out
odx = @index(Global, Cartesian)
dest_inds = odx.I
sdx = ntuple(N) do i
@inbounds (dest_inds[i] - 1) ÷ inner[i] + 1
end
@inbounds out[odx] = xs[CartesianIndex(sdx)]
return nothing
end

# Handle `out = repeat(x; inner)` by parallelizing over the `xs` array This tends to
@@ -90,8 +88,6 @@ end
end
@inbounds out[CartesianIndex(odx)] = val
end

return nothing
end

function repeat_outer(xs::AnyGPUArray, outer)
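The pattern in this hunk is the same as above: `@cartesianidx out` becomes `@index(Global, Cartesian)`, so the caller now has to describe the iteration space via `ndrange` rather than the old `elements` keyword of `gpu_call`. A minimal sketch of launching the rewritten inner-repeat kernel, assuming one work item per element of `out` (the PR's actual caller is not shown in this hunk):

```julia
using KernelAbstractions

@kernel function repeat_inner_dst_kernel!(
    xs::AbstractArray{<:Any, N},
    inner::NTuple{N, Int},
    out::AbstractArray{<:Any, N}
) where {N}
    odx = @index(Global, Cartesian)
    dest_inds = odx.I
    sdx = ntuple(N) do i
        @inbounds (dest_inds[i] - 1) ÷ inner[i] + 1
    end
    @inbounds out[odx] = xs[CartesianIndex(sdx)]
end

# Illustrative caller, not part of the PR: one work item per element of `out`.
xs    = [1 2; 3 4]
inner = (2, 2)
out   = similar(xs, size(xs) .* inner)
kernel! = repeat_inner_dst_kernel!(get_backend(out))
kernel!(xs, inner, out; ndrange = size(out))
KernelAbstractions.synchronize(get_backend(out))
# `out` now matches `repeat(xs; inner = (2, 2))`.
```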
19 changes: 9 additions & 10 deletions src/host/broadcast.jl
@@ -51,23 +51,22 @@ end
bc′ = Broadcast.preprocess(dest, bc)

# grid-stride kernel
function broadcast_kernel(ctx, dest, bc′, nelem)
@kernel function broadcast_kernel(dest, bc′, nelem)
i = 0
I = @index(Global, Linear)
while i < nelem
i += 1
I = @cartesianidx(dest, i)
@inbounds dest[I] = bc′[I]
idx = CartesianIndices(dest)[(I-1)*nelem + i]
@inbounds dest[idx] = bc′[idx]
end
return
end
elements = length(dest)
elements_per_thread = typemax(Int)
heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
elements, elements_per_thread)
config = launch_configuration(backend(dest), heuristic;
elements, elements_per_thread)
gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
threads=config.threads, blocks=config.blocks)
# TODO: figure out actual arguments, 3 should be workgroupsize
config = KernelAbstractions.launch_config(broadcast_kernel, elements,
elements/elements_per_thread)
kernel! = broadcast_kernel(get_backend(dest), config.threads)
kernel!(dest, bc′, config.elements_per_thread, ndrange = config.ndrange)

return dest
end
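The remaining TODO is the launch configuration: `KernelAbstractions.launch_config` does not appear to be an existing KA entry point, and the grid-stride arithmetic above can index past the end of `dest` for the last work items. A simpler interim sketch that stays on the documented KA launch path, assuming one element per work item instead of a grid-stride loop (so `nelem` disappears):

```julia
using KernelAbstractions

# One element per work item; no grid-stride loop.
@kernel function broadcast_kernel_simple(dest, bc′)
    I = @index(Global, Cartesian)
    @inbounds dest[I] = bc′[I]
end

# Illustrative copyto!-style driver, not the PR's final form.
function simple_broadcast!(dest, bc′)
    backend = get_backend(dest)
    kernel! = broadcast_kernel_simple(backend)
    kernel!(dest, bc′; ndrange = size(dest))
    KernelAbstractions.synchronize(backend)
    return dest
end

# CPU example:
dest = zeros(4)
bc = Broadcast.instantiate(Broadcast.broadcasted(+, [1.0, 2.0, 3.0, 4.0], 10.0))
simple_broadcast!(dest, bc)   # dest == [11.0, 12.0, 13.0, 14.0]
```

Dropping the grid-stride loop gives up the per-thread element batching that the deleted `launch_heuristic`/`launch_configuration` pair was tuning; restoring that would need an equivalent occupancy query on the KA side.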
11 changes: 6 additions & 5 deletions src/host/construction.jl
@@ -11,8 +11,8 @@ Base.convert(::Type{T}, a::AbstractArray) where {T<:AbstractGPUArray} = a isa T

function Base.fill!(A::AnyGPUArray{T}, x) where T
length(A) == 0 && return A
@kernel fill!(a, val)
idx = @index(Linear, Global)
@kernel function fill!(a, val)
idx = @index(Global, Linear)
@inbounds a[idx] = val
end
kernel = fill!(backend(A))
@@ -23,11 +23,12 @@ end

## identity matrices

@kernel function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T
@kernel function identity_kernel(res::AbstractArray{T}, stride, val) where T
i = @index(Global, Linear)
ilin = (stride * (i - 1)) + i
ilin > length(res) && return
@inbounds res[ilin] = val
if ilin <= length(res)
@inbounds res[ilin] = val
end
end

function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U}
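For completeness, a small CPU check of the rewritten identity kernel, assuming it is launched with one work item per diagonal entry; the real caller is collapsed in this diff, so the exact `ndrange` is an assumption:

```julia
using KernelAbstractions

@kernel function identity_kernel(res::AbstractArray{T}, stride, val) where T
    i = @index(Global, Linear)
    ilin = (stride * (i - 1)) + i
    if ilin <= length(res)
        @inbounds res[ilin] = val
    end
end

# Illustrative check, not part of the PR: write 1.0 on the diagonal of a 3x3 matrix.
res = zeros(3, 3)
identity_kernel(CPU())(res, size(res, 1), 1.0; ndrange = min(size(res)...))
KernelAbstractions.synchronize(CPU())
# res ≈ [1 0 0; 0 1 0; 0 0 1]
```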