Attempt to fix up KA transition #517

Closed · wants to merge 5 commits
2 changes: 0 additions & 2 deletions docs/src/interface.md
@@ -13,8 +13,6 @@ all, you need to provide a type that represents your execution back-end and a wa
kernels:

```@docs
GPUArrays.AbstractGPUBackend
GPUArrays.AbstractKernelContext
GPUArrays.gpu_call
GPUArrays.thread_block_heuristic
```
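As the execution interface moves to KernelAbstractions, a back-end's main obligation becomes exposing a KA backend object for its array type rather than implementing the old `gpu_call` machinery. A minimal sketch of that hook, assuming hypothetical `MyBackend`/`MyGPUArray` types (neither appears in this PR) and that back-ends subtype `KernelAbstractions.Backend` as in current KA releases:

```julia
using KernelAbstractions

# Hypothetical names, for illustration only; not part of this PR.
struct MyBackend <: KernelAbstractions.Backend end

struct MyGPUArray{T, N} <: AbstractArray{T, N}
    data::Array{T, N}   # stand-in for real device memory
end
Base.size(A::MyGPUArray) = size(A.data)

# The central hook: map the array type to its KernelAbstractions backend,
# so host code can instantiate kernels via `some_kernel(get_backend(A))`.
KernelAbstractions.get_backend(::MyGPUArray) = MyBackend()
```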
6 changes: 3 additions & 3 deletions src/GPUArrays.jl
@@ -18,17 +18,17 @@ using Reexport
include("device/execution.jl")
## executed on-device
include("device/abstractarray.jl")
include("device/indexing.jl")
#include("device/indexing.jl")
include("device/memory.jl")
include("device/synchronization.jl")
#include("device/synchronization.jl")

using KernelAbstractions
# host abstractions
include("host/abstractarray.jl")
include("host/construction.jl")
## integrations and specialized methods
include("host/base.jl")
include("host/indexing.jl")
#include("host/indexing.jl")
include("host/broadcast.jl")
include("host/mapreduce.jl")
include("host/linalg.jl")
83 changes: 0 additions & 83 deletions src/device/indexing.jl

This file was deleted.

13 changes: 0 additions & 13 deletions src/device/synchronization.jl

This file was deleted.

2 changes: 1 addition & 1 deletion src/host/abstractarray.jl
@@ -147,7 +147,7 @@ end

## generalized blocks of heterogeneous memory

@kernel function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets)
@kernel function cartesian_copy_kernel!(dest, dest_offsets, src, src_offsets)
I = @index(Global, Cartesian)
@inbounds dest[I + dest_offsets] = src[I + src_offsets]
end
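With the `ctx` argument gone, this kernel is no longer launched through `gpu_call`; the usual KernelAbstractions pattern is to instantiate it on the destination's backend and pass the copied region as `ndrange`. A hedged sketch of such a driver (the helper `launch_cartesian_copy!` is made up for illustration and is not the PR's caller):

```julia
using KernelAbstractions

@kernel function cartesian_copy_kernel!(dest, dest_offsets, src, src_offsets)
    I = @index(Global, Cartesian)
    @inbounds dest[I + dest_offsets] = src[I + src_offsets]
end

# Illustrative driver, not part of the PR.
function launch_cartesian_copy!(dest, dest_offsets, src, src_offsets, shape)
    backend = get_backend(dest)
    kernel! = cartesian_copy_kernel!(backend)
    kernel!(dest, dest_offsets, src, src_offsets; ndrange = shape)
    KernelAbstractions.synchronize(backend)
    return dest
end

# CPU example: copy the top-left 2x2 block of `src` into the bottom-right of `dest`.
src  = reshape(collect(1.0:16.0), 4, 4)
dest = zeros(4, 4)
launch_cartesian_copy!(dest, CartesianIndex(2, 2), src, CartesianIndex(0, 0), (2, 2))
```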
8 changes: 2 additions & 6 deletions src/host/base.jl
@@ -4,22 +4,20 @@ import Base: _RepeatInnerOuter
# Handle `out = repeat(x; inner)` by parallelizing over `out` array This can benchmark
# faster if repeating elements along the first axis (i.e. `inner=(n, ones...)`), as data
# access can be contiguous on write.
function repeat_inner_dst_kernel!(
ctx::AbstractKernelContext,
@kernel function repeat_inner_dst_kernel!(
xs::AbstractArray{<:Any, N},
inner::NTuple{N, Int},
out::AbstractArray{<:Any, N}
) where {N}
# Get the "stride" index in each dimension, where the size
# of the stride is given by `inner`. The stride-index (sdx) then
# corresponds to the index of the repeated value in `xs`.
odx = @cartesianidx out
odx = @index(Global, Cartesian)
dest_inds = odx.I
sdx = ntuple(N) do i
@inbounds (dest_inds[i] - 1) ÷ inner[i] + 1
end
@inbounds out[odx] = xs[CartesianIndex(sdx)]
return nothing
end

# Handle `out = repeat(x; inner)` by parallelizing over the `xs` array This tends to
@@ -90,8 +88,6 @@ end
end
@inbounds out[CartesianIndex(odx)] = val
end

return nothing
end

function repeat_outer(xs::AnyGPUArray, outer)
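The pattern in this hunk is the same as above: `@cartesianidx out` becomes `@index(Global, Cartesian)`, so the caller now has to describe the iteration space via `ndrange` rather than the old `elements` keyword of `gpu_call`. A minimal sketch of launching the rewritten inner-repeat kernel, assuming one work item per element of `out` (the PR's actual caller is not shown in this hunk):

```julia
using KernelAbstractions

@kernel function repeat_inner_dst_kernel!(
    xs::AbstractArray{<:Any, N},
    inner::NTuple{N, Int},
    out::AbstractArray{<:Any, N}
) where {N}
    odx = @index(Global, Cartesian)
    dest_inds = odx.I
    sdx = ntuple(N) do i
        @inbounds (dest_inds[i] - 1) ÷ inner[i] + 1
    end
    @inbounds out[odx] = xs[CartesianIndex(sdx)]
end

# Illustrative caller, not part of the PR: one work item per element of `out`.
xs    = [1 2; 3 4]
inner = (2, 2)
out   = similar(xs, size(xs) .* inner)
kernel! = repeat_inner_dst_kernel!(get_backend(out))
kernel!(xs, inner, out; ndrange = size(out))
KernelAbstractions.synchronize(get_backend(out))
# `out` now matches `repeat(xs; inner = (2, 2))`.
```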
19 changes: 9 additions & 10 deletions src/host/broadcast.jl
@@ -51,23 +51,22 @@ end
bc′ = Broadcast.preprocess(dest, bc)

# grid-stride kernel
function broadcast_kernel(ctx, dest, bc′, nelem)
@kernel function broadcast_kernel(dest, bc′, nelem)
i = 0
I = @index(Global, Linear)
while i < nelem
i += 1
I = @cartesianidx(dest, i)
@inbounds dest[I] = bc′[I]
idx = CartesianIndices(dest)[(I-1)*nelem + i]
@inbounds dest[idx] = bc′[idx]
end
return
end
elements = length(dest)
elements_per_thread = typemax(Int)
heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
elements, elements_per_thread)
config = launch_configuration(backend(dest), heuristic;
elements, elements_per_thread)
gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
threads=config.threads, blocks=config.blocks)
# TODO: figure out actual arguments, 3 should be workgroupsize
config = KernelAbstractions.launch_config(broadcast_kernel, elements,
elements/elements_per_thread)
kernel! = broadcast_kernel(get_backend(dest), config.threads)
kernel!(dest, bc′, config.elements_per_thread, ndrange = config.ndrange)

return dest
end
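The remaining TODO is the launch configuration: `KernelAbstractions.launch_config` does not appear to be an existing KA entry point, and the grid-stride arithmetic above can index past the end of `dest` for the last work items. A simpler interim sketch that stays on the documented KA launch path, assuming one element per work item instead of a grid-stride loop (so `nelem` disappears):

```julia
using KernelAbstractions

# One element per work item; no grid-stride loop.
@kernel function broadcast_kernel_simple(dest, bc′)
    I = @index(Global, Cartesian)
    @inbounds dest[I] = bc′[I]
end

# Illustrative copyto!-style driver, not the PR's final form.
function simple_broadcast!(dest, bc′)
    backend = get_backend(dest)
    kernel! = broadcast_kernel_simple(backend)
    kernel!(dest, bc′; ndrange = size(dest))
    KernelAbstractions.synchronize(backend)
    return dest
end

# CPU example:
dest = zeros(4)
bc = Broadcast.instantiate(Broadcast.broadcasted(+, [1.0, 2.0, 3.0, 4.0], 10.0))
simple_broadcast!(dest, bc)   # dest == [11.0, 12.0, 13.0, 14.0]
```

Dropping the grid-stride loop gives up the per-thread element batching that the deleted `launch_heuristic`/`launch_configuration` pair was tuning; restoring that would need an equivalent occupancy query on the KA side.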
11 changes: 6 additions & 5 deletions src/host/construction.jl
@@ -11,8 +11,8 @@ Base.convert(::Type{T}, a::AbstractArray) where {T<:AbstractGPUArray} = a isa T

function Base.fill!(A::AnyGPUArray{T}, x) where T
length(A) == 0 && return A
@kernel fill!(a, val)
idx = @index(Linear, Global)
@kernel function fill!(a, val)
idx = @index(Global, Linear)
@inbounds a[idx] = val
end
kernel = fill!(backend(A))
@@ -23,11 +23,12 @@ end

## identity matrices

@kernel function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T
@kernel function identity_kernel(res::AbstractArray{T}, stride, val) where T
i = @index(Global, Linear)
ilin = (stride * (i - 1)) + i
ilin > length(res) && return
@inbounds res[ilin] = val
if ilin <= length(res)
@inbounds res[ilin] = val
end
end

function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U}
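For completeness, a small CPU check of the rewritten identity kernel, assuming it is launched with one work item per diagonal entry; the real caller is collapsed in this diff, so the exact `ndrange` is an assumption:

```julia
using KernelAbstractions

@kernel function identity_kernel(res::AbstractArray{T}, stride, val) where T
    i = @index(Global, Linear)
    ilin = (stride * (i - 1)) + i
    if ilin <= length(res)
        @inbounds res[ilin] = val
    end
end

# Illustrative check, not part of the PR: write 1.0 on the diagonal of a 3x3 matrix.
res = zeros(3, 3)
identity_kernel(CPU())(res, size(res, 1), 1.0; ndrange = min(size(res)...))
KernelAbstractions.synchronize(CPU())
# res ≈ [1 0 0; 0 1 0; 0 0 1]
```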