diff --git a/Project.toml b/Project.toml index dddeefb9..ac4f4c82 100644 --- a/Project.toml +++ b/Project.toml @@ -32,7 +32,7 @@ StaticArrays = "0.12, 1.0" UUIDs = "<0.0.1, 1.6" UnsafeAtomics = "0.2.1" UnsafeAtomicsLLVM = "0.1, 0.2" -julia = "1.6" +julia = "1.10" [extensions] EnzymeExt = "EnzymeCore" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index b153f60e..e9f0e80a 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -8,11 +8,11 @@ using KernelAbstractions using Random if !haskey(ENV, "KA_BACKEND") - const BACKEND = CPU() + const BACKEND = OpenCLBackend() else backend = ENV["KA_BACKEND"] if backend == "CPU" - const BACKEND = CPU() + const BACKEND = OpenCLBackend() elseif backend == "CUDA" using CUDA const BACKEND = CUDABackend() diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md index 02c49cdc..4a75a806 100644 --- a/docs/src/quickstart.md +++ b/docs/src/quickstart.md @@ -27,13 +27,13 @@ end ## Launching kernel on the host You can construct a kernel for a specific backend by calling the kernel with -`mul2_kernel(CPU(), 16)`. The first argument is a backend of type `KA.Backend`, +`mul2_kernel(OpenCLBackend(), 16)`. The first argument is a backend of type `KA.Backend`, the second argument being the workgroup size. This returns a generated kernel executable that is then executed with the input argument `A` and the additional argument being a static `ndrange`. ```julia -dev = CPU() +dev = OpenCLBackend() A = ones(1024, 1024) ev = mul2_kernel(dev, 64)(A, ndrange=size(A)) synchronize(dev) diff --git a/examples/histogram.jl b/examples/histogram.jl index 311cff76..5ff839ea 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -94,7 +94,7 @@ end histogram!(rand_histogram, rand_input) histogram!(linear_histogram, linear_input) histogram!(two_histogram, all_two) - KernelAbstractions.synchronize(CPU()) + KernelAbstractions.synchronize(backend) @test isapprox(Array(rand_histogram), histogram_rand_baseline) @test isapprox(Array(linear_histogram), histogram_linear_baseline) diff --git a/examples/numa_aware.jl b/examples/numa_aware.jl index f970d558..09ffadf5 100644 --- a/examples/numa_aware.jl +++ b/examples/numa_aware.jl @@ -19,7 +19,7 @@ Estimate the memory bandwidth (GB/s) by performing a time measurement of a SAXPY kernel. Returns the memory bandwidth (GB/s) and the compute (GFLOP/s). """ function measure_membw( - backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32, + backend = OpenCLBackend(); verbose = true, N = 1024 * 500_000, dtype = Float32, init = :parallel, ) bytes = 3 * sizeof(dtype) * N # num bytes transferred in SAXPY @@ -52,8 +52,8 @@ function measure_membw( end # Static should be much better (on a system with multiple NUMA domains) -measure_membw(CPU()); -measure_membw(CPU(; static = true)); +measure_membw(OpenCLBackend()); +# measure_membw(OpenCLBackend(; static = true)); # The following has significantly worse performance (even on systems with a single memory domain)! # measure_membw(CPU(); init=:serial); diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 741ac12f..618f2b4f 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -35,6 +35,7 @@ and then invoked on the arguments. 
- [`@uniform`](@ref) - [`@synchronize`](@ref) - [`@print`](@ref) +- [`@context`](@ref) # Example: @@ -51,45 +52,33 @@ synchronize(backend) ``` """ macro kernel(expr) - __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false) + __kernel(expr, #=force_inbounds=# false) end """ - @kernel config function f(args) end - -This allows for two different configurations: - -1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions. -2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in the case the user is using too many `@inbounds` already in their kernel. Note that this can lead to incorrect results, crashes, etc and is fundamentally unsafe. Be careful! - -- [`@context`](@ref) + @kernel inbounds={false, true} function f(args) end !!! warn This is an experimental feature. """ macro kernel(ex...) if length(ex) == 1 - __kernel(ex[1], true, false) + __kernel(ex[1], false) else - generate_cpu = true force_inbounds = false for i in 1:(length(ex) - 1) if ex[i] isa Expr && ex[i].head == :(=) && - ex[i].args[1] == :cpu && ex[i].args[2] isa Bool - generate_cpu = ex[i].args[2] - elseif ex[i] isa Expr && ex[i].head == :(=) && ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool force_inbounds = ex[i].args[2] else error( "Configuration should be of form:\n" * - "* `cpu=true`\n" * "* `inbounds=false`\n" * "got `", ex[i], "`", ) end end - __kernel(ex[end], generate_cpu, force_inbounds) + __kernel(ex[end], force_inbounds) end end @@ -198,47 +187,6 @@ macro localmem(T, dims) end end -""" - @private T dims - -Declare storage that is local to each item in the workgroup. This can be safely used -across [`@synchronize`](@ref) statements. On a CPU, this will allocate additional implicit -dimensions to ensure correct localization. - -For storage that only persists between `@synchronize` statements, an `MArray` can be used -instead. - -See also [`@uniform`](@ref). -""" -macro private(T, dims) - if dims isa Integer - dims = (dims,) - end - quote - $Scratchpad($(esc(:__ctx__)), $(esc(T)), Val($(esc(dims)))) - end -end - -""" - @private mem = 1 - -Creates a private local of `mem` per item in the workgroup. This can be safely used -across [`@synchronize`](@ref) statements. -""" -macro private(expr) - esc(expr) -end - -""" - @uniform expr - -`expr` is evaluated outside the workitem scope. This is useful for variable declarations -that span workitems, or are reused across `@synchronize` statements. -""" -macro uniform(value) - esc(value) -end - """ @synchronize() @@ -258,10 +206,6 @@ end After a `@synchronize` statement all read and writes to global and local memory from each thread in the workgroup are visible in from all other threads in the workgroup. `cond` is not allowed to have any visible sideffects. - -# Platform differences - - `GPU`: This synchronization will only occur if the `cond` evaluates. - - `CPU`: This synchronization will always occur. """ macro synchronize(cond) quote @@ -274,16 +218,13 @@ end Access the hidden context object used by KernelAbstractions. -!!! warn - Only valid to be used from a kernel with `cpu=false`. - ``` function f(@context, a) I = @index(Global, Linear) a[I] end -@kernel cpu=false function my_kernel(a) +@kernel function my_kernel(a) f(@context, a) end ``` @@ -296,10 +237,6 @@ end @print(items...) This is a unified print statement. 
- -# Platform differences - - `GPU`: This will reorganize the items to print via `@cuprintf` - - `CPU`: This will call `print(items...)` """ macro print(items...) @@ -420,37 +357,6 @@ Abstract type for all KernelAbstractions backends. """ abstract type Backend end -""" -Abstract type for all GPU based KernelAbstractions backends. - -!!! note - New backend implementations **must** sub-type this abstract type. -""" -abstract type GPU <: Backend end - -""" - CPU(; static=false) - -Instantiate a CPU (multi-threaded) backend. - -## Options: - - `static`: Uses a static thread assignment, this can be beneficial for NUMA aware code. - Defaults to false. -""" -struct CPU <: Backend - static::Bool - CPU(; static::Bool = false) = new(static) -end - -""" - isgpu(::Backend)::Bool - -Returns true for all [`GPU`](@ref) backends. -""" -isgpu(::GPU) = true -isgpu(::CPU) = false - - """ get_backend(A::AbstractArray)::Backend @@ -465,12 +371,9 @@ function get_backend end # Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.: get_backend(A::AbstractArray) = get_backend(parent(A)) -get_backend(::Array) = CPU() - # Define: # adapt_storage(::Backend, a::Array) = adapt(BackendArray, a) # adapt_storage(::Backend, a::BackendArray) = a -Adapt.adapt_storage(::CPU, a::Array) = a """ allocate(::Backend, Type, dims...)::AbstractArray @@ -658,7 +561,7 @@ Partition a kernel for the given ndrange and workgroupsize. return iterspace, dynamic end -function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName} +function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend, S <: _Size, NDRange <: _Size, XPUName} return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name) end diff --git a/src/cpu.jl b/src/cpu.jl deleted file mode 100644 index bae45a3a..00000000 --- a/src/cpu.jl +++ /dev/null @@ -1,225 +0,0 @@ -import UnsafeAtomicsLLVM - -unsafe_free!(::AbstractArray) = return -synchronize(::CPU) = nothing - -allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims) - -function zeros(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, zero, T, ndrange = length(arr)) - return arr -end -function ones(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, one, T; ndrange = length(arr)) - return arr -end - -function copyto!(backend::CPU, A, B) - if get_backend(A) == get_backend(B) && get_backend(A) isa CPU - if length(A) != length(B) - error("Arrays must match in length") - end - if Base.mightalias(A, B) - error("Arrays may not alias") - end - kernel = copy_kernel(backend) - kernel(A, B, ndrange = length(A)) - return A - else - return Base.copyto!(A, B) - end -end - -functional(::CPU) = true - -function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing) - ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) - - if length(blocks(iterspace)) == 0 - return nothing - end - - __run(obj, ndrange, iterspace, args, dynamic, obj.backend.static) -end - -const CPU_GRAINSIZE = 1024 # Vectorization, 4x unrolling, minimal grain size -function default_cpu_workgroupsize(ndrange) - # if the total kernel is small, don't launch multiple tasks - n = prod(ndrange) - if iszero(n) - # If the ndrange is zero return a workgroupsize of (1, 1,...) 
-        return map(one, ndrange)
-    elseif n <= CPU_GRAINSIZE
-        return ndrange
-    else
-        available = Ref(CPU_GRAINSIZE)
-        return ntuple(length(ndrange)) do i
-            dim = ndrange[i]
-            remaining = available[]
-            if remaining == 0
-                return 1
-            elseif remaining <= dim
-                available[] = 0
-                return remaining
-            else
-                available[] = remaining ÷ dim
-                return dim
-            end
-        end
-    end
-end
-
-@inline function launch_config(kernel::Kernel{CPU}, ndrange, workgroupsize)
-    if ndrange isa Integer
-        ndrange = (ndrange,)
-    end
-    if workgroupsize isa Integer
-        workgroupsize = (workgroupsize,)
-    end
-
-    if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
-        workgroupsize = default_cpu_workgroupsize(ndrange)
-    end
-    iterspace, dynamic = partition(kernel, ndrange, workgroupsize)
-    # partition checked that the ndrange's agreed
-    if KernelAbstractions.ndrange(kernel) <: StaticSize
-        ndrange = nothing
-    end
-
-    return ndrange, workgroupsize, iterspace, dynamic
-end
-
-# Inference barriers
-function __run(obj, ndrange, iterspace, args, dynamic, static_threads)
-    N = length(iterspace)
-    Nthreads = Threads.nthreads()
-    if Nthreads == 1
-        len, rem = N, 0
-    else
-        len, rem = divrem(N, Nthreads)
-    end
-    # not enough iterations for all the threads?
-    if len == 0
-        Nthreads = N
-        len, rem = 1, 0
-    end
-    if Nthreads == 1
-        __thread_run(1, len, rem, obj, ndrange, iterspace, args, dynamic)
-    else
-        if static_threads
-            Threads.@threads :static for tid in 1:Nthreads
-                __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        else
-            @sync for tid in 1:Nthreads
-                Threads.@spawn __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        end
-    end
-    return nothing
-end
-
-function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-    # compute this thread's iterations
-    f = 1 + ((tid - 1) * len)
-    l = f + len - 1
-    # distribute remaining iterations evenly
-    if rem > 0
-        if tid <= rem
-            f = f + (tid - 1)
-            l = l + tid
-        else
-            f = f + rem
-            l = l + rem
-        end
-    end
-    # run this thread's iterations
-    for i in f:l
-        block = @inbounds blocks(iterspace)[i]
-        ctx = mkcontext(obj, block, ndrange, iterspace, dynamic)
-        obj.f(ctx, args...)
- end - return nothing -end - -function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where {Dynamic} - return CompilerMetadata{ndrange(kernel), Dynamic}(I, _ndrange, iterspace) -end - -@inline function __index_Local_Linear(ctx, idx::CartesianIndex) - indices = workitems(__iterspace(ctx)) - return @inbounds LinearIndices(indices)[idx] -end - -@inline function __index_Group_Linear(ctx, idx::CartesianIndex) - indices = blocks(__iterspace(ctx)) - return @inbounds LinearIndices(indices)[__groupindex(ctx)] -end - -@inline function __index_Global_Linear(ctx, idx::CartesianIndex) - I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx) - @inbounds LinearIndices(__ndrange(ctx))[I] -end - -@inline function __index_Local_Cartesian(_, idx::CartesianIndex) - return idx -end - -@inline function __index_Group_Cartesian(ctx, ::CartesianIndex) - __groupindex(ctx) -end - -@inline function __index_Global_Cartesian(ctx, idx::CartesianIndex) - return @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx) -end - -@inline function __validindex(ctx, idx::CartesianIndex) - # Turns this into a noop for code where we can turn of checkbounds of - if __dynamic_checkbounds(ctx) - I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx) - return I in __ndrange(ctx) - else - return true - end -end - -### -# CPU implementation of shared memory -### -@inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val) where {T, Dims} - MArray{__size(Dims), T}(undef) -end - -### -# CPU implementation of scratch memory -# - memory allocated as a MArray with size `Dims` -### - -struct ScratchArray{N, D} - data::D - ScratchArray{N}(data::D) where {N, D} = new{N, D}(data) -end - -@inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} - return ScratchArray{length(Dims)}(MArray{__size((Dims..., prod(__groupsize(ctx)))), T}(undef)) -end - -# Base.view creates a boundscheck which captures A -# https://github.com/JuliaLang/julia/issues/39308 -@inline function aview(A, I::Vararg{Any, N}) where {N} - J = Base.to_indices(A, I) - Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...) -end - -@inline function Base.getindex(A::ScratchArray{N}, idx) where {N} - return @inbounds aview(A.data, ntuple(_ -> :, Val(N))..., idx) -end - -# Argument conversion -argconvert(k::Kernel{CPU}, arg) = arg - -supports_enzyme(::CPU) = true diff --git a/src/macros.jl b/src/macros.jl index 55b4bab4..e501645a 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -10,7 +10,7 @@ function find_return(stmt) end # XXX: Proper errors -function __kernel(expr, generate_cpu = true, force_inbounds = false) +function __kernel(expr, force_inbounds = false) def = splitdef(expr) name = def[:name] args = def[:args] @@ -29,19 +29,6 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) constargs[i] = false end - # create two functions - # 1. GPU function - # 2. 
CPU function with work-group loops inserted - # - # Without the deepcopy we might accidentially modify expr shared between CPU and GPU - cpu_name = Symbol(:cpu_, name) - if generate_cpu - def_cpu = deepcopy(def) - def_cpu[:name] = cpu_name - transform_cpu!(def_cpu, constargs, force_inbounds) - cpu_function = combinedef(def_cpu) - end - def_gpu = deepcopy(def) def_gpu[:name] = gpu_name = Symbol(:gpu_, name) transform_gpu!(def_gpu, constargs, force_inbounds) @@ -54,24 +41,12 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) $name(dev, size) = $name(dev, $StaticSize(size), $DynamicSize()) $name(dev, size, range) = $name(dev, $StaticSize(size), $StaticSize(range)) function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S <: $_Size, NDRange <: $_Size} - if $isgpu(dev) - return $construct(dev, sz, range, $gpu_name) - else - if $generate_cpu - return $construct(dev, sz, range, $cpu_name) - else - error("This kernel is unavailable for backend CPU") - end - end + return $construct(dev, sz, range, $gpu_name) end end end - if generate_cpu - return Expr(:block, esc(cpu_function), esc(gpu_function), esc(constructors)) - else - return Expr(:block, esc(gpu_function), esc(constructors)) - end + return Expr(:block, esc(gpu_function), esc(constructors)) end # The easy case, transform the function for GPU execution @@ -101,198 +76,4 @@ function transform_gpu!(def, constargs, force_inbounds) Expr(:block, let_constargs...), body, ) -end - -# The hard case, transform the function for CPU execution -# - mark constant arguments by applying `constify`. -# - insert aliasscope markers -# - insert implied loop bodys -# - handle indicies -# - hoist workgroup definitions -# - hoist uniform variables -function transform_cpu!(def, constargs, force_inbounds) - let_constargs = Expr[] - for (i, arg) in enumerate(def[:args]) - if constargs[i] - push!(let_constargs, :($arg = $constify($arg))) - end - end - pushfirst!(def[:args], :__ctx__) - new_stmts = Expr[] - body = MacroTools.flatten(def[:body]) - push!(new_stmts, Expr(:aliasscope)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, true)) - end - append!(new_stmts, split(body.args)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, :pop)) - end - push!(new_stmts, Expr(:popaliasscope)) - push!(new_stmts, :(return nothing)) - def[:body] = Expr( - :let, - Expr(:block, let_constargs...), - Expr(:block, new_stmts...), - ) -end - -struct WorkgroupLoop - indicies::Vector{Any} - stmts::Vector{Any} - allocations::Vector{Any} - private_allocations::Vector{Any} - private::Set{Symbol} -end - -is_sync(expr) = @capture(expr, @synchronize() | @synchronize(a_)) - -function is_scope_construct(expr::Expr) - expr.head === :block # || - # expr.head === :let -end - -function find_sync(stmt) - result = false - postwalk(stmt) do expr - result |= is_sync(expr) - expr - end - result -end - -# TODO proper handling of LineInfo -function split( - stmts, - indicies = Any[], private = Set{Symbol}(), - ) - # 1. Split the code into blocks separated by `@synchronize` - # 2. Aggregate `@index` expressions - # 3. Hoist allocations - # 4. 
Hoist uniforms - - current = Any[] - allocations = Any[] - private_allocations = Any[] - new_stmts = Any[] - for stmt in stmts - has_sync = find_sync(stmt) - if has_sync - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - allocations = Any[] - private_allocations = Any[] - current = Any[] - is_sync(stmt) && continue - - # Recurse into scope constructs - # TODO: This currently implements hard scoping - # probably need to implemet soft scoping - # by not deepcopying the environment. - recurse(x) = x - function recurse(expr::Expr) - expr = unblock(expr) - if is_scope_construct(expr) && any(find_sync, expr.args) - new_args = unblock(split(expr.args, deepcopy(indicies), deepcopy(private))) - return Expr(expr.head, new_args...) - else - return Expr(expr.head, map(recurse, expr.args)...) - end - end - push!(new_stmts, recurse(stmt)) - continue - end - - if @capture(stmt, @uniform x_) - push!(allocations, stmt) - continue - elseif @capture(stmt, @private lhs_ = rhs_) - push!(private, lhs) - push!(private_allocations, :($lhs = $rhs)) - continue - elseif @capture(stmt, lhs_ = rhs_ | (vs__, lhs_ = rhs_)) - if @capture(rhs, @index(args__)) - push!(indicies, stmt) - continue - elseif @capture(rhs, @localmem(args__) | @uniform(args__)) - push!(allocations, stmt) - continue - elseif @capture(rhs, @private(T_, dims_)) - # Implement the legacy `mem = @private T dims` as - # mem = Scratchpad(T, Val(dims)) - - if dims isa Integer - dims = (dims,) - end - alloc = :($Scratchpad(__ctx__, $T, Val($dims))) - push!(allocations, :($lhs = $alloc)) - push!(private, lhs) - continue - end - end - - push!(current, stmt) - end - - # everything since the last `@synchronize` - if !isempty(current) - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - end - return new_stmts -end - -function emit(loop) - idx = gensym(:I) - for stmt in loop.indicies - # splice index into the i = @index(Cartesian, $idx) - @assert stmt.head === :(=) - rhs = stmt.args[2] - push!(rhs.args, idx) - end - stmts = Any[] - append!(stmts, loop.allocations) - - # private_allocations turn into lhs = ntuple(i->rhs, length(__workitems_iterspace())) - N = gensym(:N) - push!(stmts, :($N = length($__workitems_iterspace(__ctx__)))) - - for stmt in loop.private_allocations - if @capture(stmt, lhs_ = rhs_) - push!(stmts, :($lhs = ntuple(_ -> $rhs, $N))) - else - error("@private $stmt not an assignment") - end - end - - # don't emit empty loops - if !(isempty(loop.stmts) || all(s -> s isa LineNumberNode, loop.stmts)) - body = Expr(:block, loop.stmts...) - body = postwalk(body) do expr - if @capture(expr, lhs_ = rhs_) - if lhs in loop.private - error("Can't assign to variables marked private") - end - elseif @capture(expr, A_[i__]) - if A in loop.private - return :($A[$__index_Local_Linear(__ctx__, $(idx))][$(i...)]) - end - elseif expr isa Symbol - if expr in loop.private - return :($expr[$__index_Local_Linear(__ctx__, $(idx))]) - end - end - return expr - end - loopexpr = quote - for $idx in $__workitems_iterspace(__ctx__) - $__validindex(__ctx__, $idx) || continue - $(loop.indicies...) - $(unblock(body)) - end - end - push!(stmts, loopexpr) - end - - return unblock(Expr(:block, stmts...)) -end +end \ No newline at end of file
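
Usage after this change, as a minimal sketch: kernels no longer take a `cpu=` configuration and are constructed through the single `gpu_`-prefixed path, so a kernel is launched the same way on every `KA.Backend`. The example below combines the quickstart kernel with the `OpenCLBackend` substituted throughout this diff; it assumes a backend package exporting `OpenCLBackend` is installed and loaded.

```julia
using KernelAbstractions

@kernel function mul2_kernel(A)
    I = @index(Global)
    A[I] = 2 * A[I]
end

backend = OpenCLBackend()  # assumed: provided by an OpenCL backend package
# Allocate on the backend, since plain `Array`s no longer map to a built-in CPU backend
A = KernelAbstractions.ones(backend, Float32, 1024, 1024)
mul2_kernel(backend, 64)(A, ndrange = size(A))
KernelAbstractions.synchronize(backend)
```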