diff --git a/Project.toml b/Project.toml index dddeefb9..ac4f4c82 100644 --- a/Project.toml +++ b/Project.toml @@ -32,7 +32,7 @@ StaticArrays = "0.12, 1.0" UUIDs = "<0.0.1, 1.6" UnsafeAtomics = "0.2.1" UnsafeAtomicsLLVM = "0.1, 0.2" -julia = "1.6" +julia = "1.10" [extensions] EnzymeExt = "EnzymeCore" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index b153f60e..e9f0e80a 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -8,11 +8,11 @@ using KernelAbstractions using Random if !haskey(ENV, "KA_BACKEND") - const BACKEND = CPU() + const BACKEND = OpenCLBackend() else backend = ENV["KA_BACKEND"] if backend == "CPU" - const BACKEND = CPU() + const BACKEND = OpenCLBackend() elseif backend == "CUDA" using CUDA const BACKEND = CUDABackend() diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md index 02c49cdc..4a75a806 100644 --- a/docs/src/quickstart.md +++ b/docs/src/quickstart.md @@ -27,13 +27,13 @@ end ## Launching kernel on the host You can construct a kernel for a specific backend by calling the kernel with -`mul2_kernel(CPU(), 16)`. The first argument is a backend of type `KA.Backend`, +`mul2_kernel(OpenCLBackend(), 16)`. The first argument is a backend of type `KA.Backend`, the second argument being the workgroup size. This returns a generated kernel executable that is then executed with the input argument `A` and the additional argument being a static `ndrange`. ```julia -dev = CPU() +dev = OpenCLBackend() A = ones(1024, 1024) ev = mul2_kernel(dev, 64)(A, ndrange=size(A)) synchronize(dev) diff --git a/examples/histogram.jl b/examples/histogram.jl index 311cff76..5ff839ea 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -94,7 +94,7 @@ end histogram!(rand_histogram, rand_input) histogram!(linear_histogram, linear_input) histogram!(two_histogram, all_two) - KernelAbstractions.synchronize(CPU()) + KernelAbstractions.synchronize(backend) @test isapprox(Array(rand_histogram), histogram_rand_baseline) @test isapprox(Array(linear_histogram), histogram_linear_baseline) diff --git a/examples/numa_aware.jl b/examples/numa_aware.jl index f970d558..09ffadf5 100644 --- a/examples/numa_aware.jl +++ b/examples/numa_aware.jl @@ -19,7 +19,7 @@ Estimate the memory bandwidth (GB/s) by performing a time measurement of a SAXPY kernel. Returns the memory bandwidth (GB/s) and the compute (GFLOP/s). """ function measure_membw( - backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32, + backend = OpenCLBackend(); verbose = true, N = 1024 * 500_000, dtype = Float32, init = :parallel, ) bytes = 3 * sizeof(dtype) * N # num bytes transferred in SAXPY @@ -52,8 +52,8 @@ function measure_membw( end # Static should be much better (on a system with multiple NUMA domains) -measure_membw(CPU()); -measure_membw(CPU(; static = true)); +measure_membw(OpenCLBackend()); +# measure_membw(OpenCLBackend(; static = true)); # The following has significantly worse performance (even on systems with a single memory domain)! # measure_membw(CPU(); init=:serial); diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 741ac12f..618f2b4f 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -35,6 +35,7 @@ and then invoked on the arguments. 
- [`@uniform`](@ref) - [`@synchronize`](@ref) - [`@print`](@ref) +- [`@context`](@ref) # Example: @@ -51,45 +52,33 @@ synchronize(backend) ``` """ macro kernel(expr) - __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false) + __kernel(expr, #=force_inbounds=# false) end """ - @kernel config function f(args) end - -This allows for two different configurations: - -1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions. -2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in the case the user is using too many `@inbounds` already in their kernel. Note that this can lead to incorrect results, crashes, etc and is fundamentally unsafe. Be careful! - -- [`@context`](@ref) + @kernel inbounds={false, true} function f(args) end !!! warn This is an experimental feature. """ macro kernel(ex...) if length(ex) == 1 - __kernel(ex[1], true, false) + __kernel(ex[1], false) else - generate_cpu = true force_inbounds = false for i in 1:(length(ex) - 1) if ex[i] isa Expr && ex[i].head == :(=) && - ex[i].args[1] == :cpu && ex[i].args[2] isa Bool - generate_cpu = ex[i].args[2] - elseif ex[i] isa Expr && ex[i].head == :(=) && ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool force_inbounds = ex[i].args[2] else error( "Configuration should be of form:\n" * - "* `cpu=true`\n" * "* `inbounds=false`\n" * "got `", ex[i], "`", ) end end - __kernel(ex[end], generate_cpu, force_inbounds) + __kernel(ex[end], force_inbounds) end end @@ -198,47 +187,6 @@ macro localmem(T, dims) end end -""" - @private T dims - -Declare storage that is local to each item in the workgroup. This can be safely used -across [`@synchronize`](@ref) statements. On a CPU, this will allocate additional implicit -dimensions to ensure correct localization. - -For storage that only persists between `@synchronize` statements, an `MArray` can be used -instead. - -See also [`@uniform`](@ref). -""" -macro private(T, dims) - if dims isa Integer - dims = (dims,) - end - quote - $Scratchpad($(esc(:__ctx__)), $(esc(T)), Val($(esc(dims)))) - end -end - -""" - @private mem = 1 - -Creates a private local of `mem` per item in the workgroup. This can be safely used -across [`@synchronize`](@ref) statements. -""" -macro private(expr) - esc(expr) -end - -""" - @uniform expr - -`expr` is evaluated outside the workitem scope. This is useful for variable declarations -that span workitems, or are reused across `@synchronize` statements. -""" -macro uniform(value) - esc(value) -end - """ @synchronize() @@ -258,10 +206,6 @@ end After a `@synchronize` statement all read and writes to global and local memory from each thread in the workgroup are visible in from all other threads in the workgroup. `cond` is not allowed to have any visible sideffects. - -# Platform differences - - `GPU`: This synchronization will only occur if the `cond` evaluates. - - `CPU`: This synchronization will always occur. """ macro synchronize(cond) quote @@ -274,16 +218,13 @@ end Access the hidden context object used by KernelAbstractions. -!!! warn - Only valid to be used from a kernel with `cpu=false`. - ``` function f(@context, a) I = @index(Global, Linear) a[I] end -@kernel cpu=false function my_kernel(a) +@kernel function my_kernel(a) f(@context, a) end ``` @@ -296,10 +237,6 @@ end @print(items...) This is a unified print statement. 
- -# Platform differences - - `GPU`: This will reorganize the items to print via `@cuprintf` - - `CPU`: This will call `print(items...)` """ macro print(items...) @@ -420,37 +357,6 @@ Abstract type for all KernelAbstractions backends. """ abstract type Backend end -""" -Abstract type for all GPU based KernelAbstractions backends. - -!!! note - New backend implementations **must** sub-type this abstract type. -""" -abstract type GPU <: Backend end - -""" - CPU(; static=false) - -Instantiate a CPU (multi-threaded) backend. - -## Options: - - `static`: Uses a static thread assignment, this can be beneficial for NUMA aware code. - Defaults to false. -""" -struct CPU <: Backend - static::Bool - CPU(; static::Bool = false) = new(static) -end - -""" - isgpu(::Backend)::Bool - -Returns true for all [`GPU`](@ref) backends. -""" -isgpu(::GPU) = true -isgpu(::CPU) = false - - """ get_backend(A::AbstractArray)::Backend @@ -465,12 +371,9 @@ function get_backend end # Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.: get_backend(A::AbstractArray) = get_backend(parent(A)) -get_backend(::Array) = CPU() - # Define: # adapt_storage(::Backend, a::Array) = adapt(BackendArray, a) # adapt_storage(::Backend, a::BackendArray) = a -Adapt.adapt_storage(::CPU, a::Array) = a """ allocate(::Backend, Type, dims...)::AbstractArray @@ -658,7 +561,7 @@ Partition a kernel for the given ndrange and workgroupsize. return iterspace, dynamic end -function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName} +function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend, S <: _Size, NDRange <: _Size, XPUName} return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name) end diff --git a/src/cpu.jl b/src/cpu.jl deleted file mode 100644 index bae45a3a..00000000 --- a/src/cpu.jl +++ /dev/null @@ -1,225 +0,0 @@ -import UnsafeAtomicsLLVM - -unsafe_free!(::AbstractArray) = return -synchronize(::CPU) = nothing - -allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims) - -function zeros(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, zero, T, ndrange = length(arr)) - return arr -end -function ones(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, one, T; ndrange = length(arr)) - return arr -end - -function copyto!(backend::CPU, A, B) - if get_backend(A) == get_backend(B) && get_backend(A) isa CPU - if length(A) != length(B) - error("Arrays must match in length") - end - if Base.mightalias(A, B) - error("Arrays may not alias") - end - kernel = copy_kernel(backend) - kernel(A, B, ndrange = length(A)) - return A - else - return Base.copyto!(A, B) - end -end - -functional(::CPU) = true - -function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing) - ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) - - if length(blocks(iterspace)) == 0 - return nothing - end - - __run(obj, ndrange, iterspace, args, dynamic, obj.backend.static) -end - -const CPU_GRAINSIZE = 1024 # Vectorization, 4x unrolling, minimal grain size -function default_cpu_workgroupsize(ndrange) - # if the total kernel is small, don't launch multiple tasks - n = prod(ndrange) - if iszero(n) - # If the ndrange is zero return a workgroupsize of (1, 1,...) 
-        return map(one, ndrange)
-    elseif n <= CPU_GRAINSIZE
-        return ndrange
-    else
-        available = Ref(CPU_GRAINSIZE)
-        return ntuple(length(ndrange)) do i
-            dim = ndrange[i]
-            remaining = available[]
-            if remaining == 0
-                return 1
-            elseif remaining <= dim
-                available[] = 0
-                return remaining
-            else
-                available[] = remaining ÷ dim
-                return dim
-            end
-        end
-    end
-end
-
-@inline function launch_config(kernel::Kernel{CPU}, ndrange, workgroupsize)
-    if ndrange isa Integer
-        ndrange = (ndrange,)
-    end
-    if workgroupsize isa Integer
-        workgroupsize = (workgroupsize,)
-    end
-
-    if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
-        workgroupsize = default_cpu_workgroupsize(ndrange)
-    end
-    iterspace, dynamic = partition(kernel, ndrange, workgroupsize)
-    # partition checked that the ndrange's agreed
-    if KernelAbstractions.ndrange(kernel) <: StaticSize
-        ndrange = nothing
-    end
-
-    return ndrange, workgroupsize, iterspace, dynamic
-end
-
-# Inference barriers
-function __run(obj, ndrange, iterspace, args, dynamic, static_threads)
-    N = length(iterspace)
-    Nthreads = Threads.nthreads()
-    if Nthreads == 1
-        len, rem = N, 0
-    else
-        len, rem = divrem(N, Nthreads)
-    end
-    # not enough iterations for all the threads?
-    if len == 0
-        Nthreads = N
-        len, rem = 1, 0
-    end
-    if Nthreads == 1
-        __thread_run(1, len, rem, obj, ndrange, iterspace, args, dynamic)
-    else
-        if static_threads
-            Threads.@threads :static for tid in 1:Nthreads
-                __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        else
-            @sync for tid in 1:Nthreads
-                Threads.@spawn __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        end
-    end
-    return nothing
-end
-
-function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-    # compute this thread's iterations
-    f = 1 + ((tid - 1) * len)
-    l = f + len - 1
-    # distribute remaining iterations evenly
-    if rem > 0
-        if tid <= rem
-            f = f + (tid - 1)
-            l = l + tid
-        else
-            f = f + rem
-            l = l + rem
-        end
-    end
-    # run this thread's iterations
-    for i in f:l
-        block = @inbounds blocks(iterspace)[i]
-        ctx = mkcontext(obj, block, ndrange, iterspace, dynamic)
-        obj.f(ctx, args...)
- end - return nothing -end - -function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where {Dynamic} - return CompilerMetadata{ndrange(kernel), Dynamic}(I, _ndrange, iterspace) -end - -@inline function __index_Local_Linear(ctx, idx::CartesianIndex) - indices = workitems(__iterspace(ctx)) - return @inbounds LinearIndices(indices)[idx] -end - -@inline function __index_Group_Linear(ctx, idx::CartesianIndex) - indices = blocks(__iterspace(ctx)) - return @inbounds LinearIndices(indices)[__groupindex(ctx)] -end - -@inline function __index_Global_Linear(ctx, idx::CartesianIndex) - I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx) - @inbounds LinearIndices(__ndrange(ctx))[I] -end - -@inline function __index_Local_Cartesian(_, idx::CartesianIndex) - return idx -end - -@inline function __index_Group_Cartesian(ctx, ::CartesianIndex) - __groupindex(ctx) -end - -@inline function __index_Global_Cartesian(ctx, idx::CartesianIndex) - return @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx) -end - -@inline function __validindex(ctx, idx::CartesianIndex) - # Turns this into a noop for code where we can turn of checkbounds of - if __dynamic_checkbounds(ctx) - I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx) - return I in __ndrange(ctx) - else - return true - end -end - -### -# CPU implementation of shared memory -### -@inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val) where {T, Dims} - MArray{__size(Dims), T}(undef) -end - -### -# CPU implementation of scratch memory -# - memory allocated as a MArray with size `Dims` -### - -struct ScratchArray{N, D} - data::D - ScratchArray{N}(data::D) where {N, D} = new{N, D}(data) -end - -@inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} - return ScratchArray{length(Dims)}(MArray{__size((Dims..., prod(__groupsize(ctx)))), T}(undef)) -end - -# Base.view creates a boundscheck which captures A -# https://github.com/JuliaLang/julia/issues/39308 -@inline function aview(A, I::Vararg{Any, N}) where {N} - J = Base.to_indices(A, I) - Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...) -end - -@inline function Base.getindex(A::ScratchArray{N}, idx) where {N} - return @inbounds aview(A.data, ntuple(_ -> :, Val(N))..., idx) -end - -# Argument conversion -argconvert(k::Kernel{CPU}, arg) = arg - -supports_enzyme(::CPU) = true diff --git a/src/macros.jl b/src/macros.jl index 55b4bab4..e501645a 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -10,7 +10,7 @@ function find_return(stmt) end # XXX: Proper errors -function __kernel(expr, generate_cpu = true, force_inbounds = false) +function __kernel(expr, force_inbounds = false) def = splitdef(expr) name = def[:name] args = def[:args] @@ -29,19 +29,6 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) constargs[i] = false end - # create two functions - # 1. GPU function - # 2. 
CPU function with work-group loops inserted - # - # Without the deepcopy we might accidentially modify expr shared between CPU and GPU - cpu_name = Symbol(:cpu_, name) - if generate_cpu - def_cpu = deepcopy(def) - def_cpu[:name] = cpu_name - transform_cpu!(def_cpu, constargs, force_inbounds) - cpu_function = combinedef(def_cpu) - end - def_gpu = deepcopy(def) def_gpu[:name] = gpu_name = Symbol(:gpu_, name) transform_gpu!(def_gpu, constargs, force_inbounds) @@ -54,24 +41,12 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) $name(dev, size) = $name(dev, $StaticSize(size), $DynamicSize()) $name(dev, size, range) = $name(dev, $StaticSize(size), $StaticSize(range)) function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S <: $_Size, NDRange <: $_Size} - if $isgpu(dev) - return $construct(dev, sz, range, $gpu_name) - else - if $generate_cpu - return $construct(dev, sz, range, $cpu_name) - else - error("This kernel is unavailable for backend CPU") - end - end + return $construct(dev, sz, range, $gpu_name) end end end - if generate_cpu - return Expr(:block, esc(cpu_function), esc(gpu_function), esc(constructors)) - else - return Expr(:block, esc(gpu_function), esc(constructors)) - end + return Expr(:block, esc(gpu_function), esc(constructors)) end # The easy case, transform the function for GPU execution @@ -101,198 +76,4 @@ function transform_gpu!(def, constargs, force_inbounds) Expr(:block, let_constargs...), body, ) -end - -# The hard case, transform the function for CPU execution -# - mark constant arguments by applying `constify`. -# - insert aliasscope markers -# - insert implied loop bodys -# - handle indicies -# - hoist workgroup definitions -# - hoist uniform variables -function transform_cpu!(def, constargs, force_inbounds) - let_constargs = Expr[] - for (i, arg) in enumerate(def[:args]) - if constargs[i] - push!(let_constargs, :($arg = $constify($arg))) - end - end - pushfirst!(def[:args], :__ctx__) - new_stmts = Expr[] - body = MacroTools.flatten(def[:body]) - push!(new_stmts, Expr(:aliasscope)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, true)) - end - append!(new_stmts, split(body.args)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, :pop)) - end - push!(new_stmts, Expr(:popaliasscope)) - push!(new_stmts, :(return nothing)) - def[:body] = Expr( - :let, - Expr(:block, let_constargs...), - Expr(:block, new_stmts...), - ) -end - -struct WorkgroupLoop - indicies::Vector{Any} - stmts::Vector{Any} - allocations::Vector{Any} - private_allocations::Vector{Any} - private::Set{Symbol} -end - -is_sync(expr) = @capture(expr, @synchronize() | @synchronize(a_)) - -function is_scope_construct(expr::Expr) - expr.head === :block # || - # expr.head === :let -end - -function find_sync(stmt) - result = false - postwalk(stmt) do expr - result |= is_sync(expr) - expr - end - result -end - -# TODO proper handling of LineInfo -function split( - stmts, - indicies = Any[], private = Set{Symbol}(), - ) - # 1. Split the code into blocks separated by `@synchronize` - # 2. Aggregate `@index` expressions - # 3. Hoist allocations - # 4. 
Hoist uniforms - - current = Any[] - allocations = Any[] - private_allocations = Any[] - new_stmts = Any[] - for stmt in stmts - has_sync = find_sync(stmt) - if has_sync - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - allocations = Any[] - private_allocations = Any[] - current = Any[] - is_sync(stmt) && continue - - # Recurse into scope constructs - # TODO: This currently implements hard scoping - # probably need to implemet soft scoping - # by not deepcopying the environment. - recurse(x) = x - function recurse(expr::Expr) - expr = unblock(expr) - if is_scope_construct(expr) && any(find_sync, expr.args) - new_args = unblock(split(expr.args, deepcopy(indicies), deepcopy(private))) - return Expr(expr.head, new_args...) - else - return Expr(expr.head, map(recurse, expr.args)...) - end - end - push!(new_stmts, recurse(stmt)) - continue - end - - if @capture(stmt, @uniform x_) - push!(allocations, stmt) - continue - elseif @capture(stmt, @private lhs_ = rhs_) - push!(private, lhs) - push!(private_allocations, :($lhs = $rhs)) - continue - elseif @capture(stmt, lhs_ = rhs_ | (vs__, lhs_ = rhs_)) - if @capture(rhs, @index(args__)) - push!(indicies, stmt) - continue - elseif @capture(rhs, @localmem(args__) | @uniform(args__)) - push!(allocations, stmt) - continue - elseif @capture(rhs, @private(T_, dims_)) - # Implement the legacy `mem = @private T dims` as - # mem = Scratchpad(T, Val(dims)) - - if dims isa Integer - dims = (dims,) - end - alloc = :($Scratchpad(__ctx__, $T, Val($dims))) - push!(allocations, :($lhs = $alloc)) - push!(private, lhs) - continue - end - end - - push!(current, stmt) - end - - # everything since the last `@synchronize` - if !isempty(current) - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - end - return new_stmts -end - -function emit(loop) - idx = gensym(:I) - for stmt in loop.indicies - # splice index into the i = @index(Cartesian, $idx) - @assert stmt.head === :(=) - rhs = stmt.args[2] - push!(rhs.args, idx) - end - stmts = Any[] - append!(stmts, loop.allocations) - - # private_allocations turn into lhs = ntuple(i->rhs, length(__workitems_iterspace())) - N = gensym(:N) - push!(stmts, :($N = length($__workitems_iterspace(__ctx__)))) - - for stmt in loop.private_allocations - if @capture(stmt, lhs_ = rhs_) - push!(stmts, :($lhs = ntuple(_ -> $rhs, $N))) - else - error("@private $stmt not an assignment") - end - end - - # don't emit empty loops - if !(isempty(loop.stmts) || all(s -> s isa LineNumberNode, loop.stmts)) - body = Expr(:block, loop.stmts...) - body = postwalk(body) do expr - if @capture(expr, lhs_ = rhs_) - if lhs in loop.private - error("Can't assign to variables marked private") - end - elseif @capture(expr, A_[i__]) - if A in loop.private - return :($A[$__index_Local_Linear(__ctx__, $(idx))][$(i...)]) - end - elseif expr isa Symbol - if expr in loop.private - return :($expr[$__index_Local_Linear(__ctx__, $(idx))]) - end - end - return expr - end - loopexpr = quote - for $idx in $__workitems_iterspace(__ctx__) - $__validindex(__ctx__, $idx) || continue - $(loop.indicies...) - $(unblock(body)) - end - end - push!(stmts, loopexpr) - end - - return unblock(Expr(:block, stmts...)) -end +end \ No newline at end of file
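
Usage after this change, as a minimal sketch: kernels no longer take a `cpu=` configuration and are constructed through the single `gpu_`-prefixed path, so a kernel is launched the same way on every `KA.Backend`. The example below combines the quickstart kernel with the `OpenCLBackend` substituted throughout this diff; it assumes a backend package exporting `OpenCLBackend` is installed and loaded.

```julia
using KernelAbstractions

@kernel function mul2_kernel(A)
    I = @index(Global)
    A[I] = 2 * A[I]
end

backend = OpenCLBackend()  # assumed: provided by an OpenCL backend package
# Allocate on the backend, since plain `Array`s no longer map to a built-in CPU backend
A = KernelAbstractions.ones(backend, Float32, 1024, 1024)
mul2_kernel(backend, 64)(A, ndrange = size(A))
KernelAbstractions.synchronize(backend)
```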