Tentative CUDA/AMGX support #76

Status: Open. Wants to merge 30 commits into base branch dev.

Commits (30)
ef6c0a1  Add a basic reservoir version of CUDA linear solver (moyner, Oct 23, 2024)
f778b4f  Update utils.jl (moyner, Oct 24, 2024)
e2ff69f  Generalize CUDA a bit (moyner, Oct 24, 2024)
93bb1e1  Put in place some Schur stuff (moyner, Oct 24, 2024)
b55ba61  Put in place some more factorization updates (moyner, Oct 24, 2024)
e3c230e  Schur additions (moyner, Oct 24, 2024)
ca0572f  Update mul (moyner, Oct 24, 2024)
267a1ca  Rough draft of schur mostly on GPU (moyner, Oct 24, 2024)
391b16f  Make GPU and CPU operations independent in schur multiply (moyner, Oct 24, 2024)
a5d12e8  Add timing to CUDA calls (moyner, Oct 24, 2024)
281e652  Update interface (moyner, Oct 25, 2024)
e5e20ea  Draft extension to upcoming AMGX release (moyner, Oct 25, 2024)
5b3251b  Add some AMGX init code (moyner, Oct 25, 2024)
d914ceb  More CPR testing (moyner, Oct 25, 2024)
253ff5c  More work on GPU CPR (moyner, Oct 25, 2024)
7ee843d  CPR apply seems to run once... (moyner, Oct 25, 2024)
03a5655  Fixes to kernel launches (moyner, Oct 25, 2024)
eba4fb6  Sort-of-working GPU CPR with wells (moyner, Oct 25, 2024)
d0cffe6  Update format for AMGX config (moyner, Oct 25, 2024)
3833251  Update copy of buffer in CUDA (moyner, Oct 27, 2024)
b5ab2cc  Updates to CPR code (moyner, Oct 27, 2024)
e54d62b  Updates to CUDA (moyner, Oct 27, 2024)
f7f7d39  Fix for wells (moyner, Oct 28, 2024)
a2c05c7  Some CUDA code backup (moyner, Oct 28, 2024)
017c1ef  Use CUDA threads in CPR code (moyner, Oct 28, 2024)
c6f15b1  Use view instead of custom kernel for CPR dp (moyner, Oct 28, 2024)
b1a070c  Performance tweaks (moyner, Oct 28, 2024)
a58af9c  Remove some code duplication (moyner, Oct 28, 2024)
96c9495  Reorder destructors for AMGX (moyner, Oct 28, 2024)
0824cfc  Bump Jutul compat & version (moyner, Oct 28, 2024)
14 changes: 11 additions & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "JutulDarcy"
uuid = "82210473-ab04-4dce-b31b-11573c4f8e0a"
authors = ["Olav Møyner <[email protected]>"]
version = "0.2.35"
version = "0.2.36"

[deps]
AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
@@ -34,20 +34,26 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"

[weakdeps]
AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"

[extensions]
JutulDarcyAMGXExt = "AMGX"
JutulDarcyCUDAExt = "CUDA"
JutulDarcyGLMakieExt = "GLMakie"
JutulDarcyMakieExt = "Makie"
JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"]

[compat]
AlgebraicMultigrid = "0.5.1, 0.6.0"
Artifacts = "1"
AMGX = "0.2"
CUDA = "5"
DataStructures = "0.18.13"
Dates = "1"
DelimitedFiles = "1.6"
@@ -56,7 +62,7 @@ ForwardDiff = "0.10.30"
GLMakie = "0.10.13"
GeoEnergyIO = "1.1.12"
HYPRE = "1.6.0, 1.7"
Jutul = "0.2.40"
Jutul = "0.2.41"
Krylov = "0.9.1"
LazyArtifacts = "1"
LinearAlgebra = "1"
@@ -78,6 +84,8 @@ Tullio = "0.3.4"
julia = "1.7"

[extras]
AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
@@ -87,4 +95,4 @@ TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"

[targets]
test = ["Test", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
test = ["Test", "CUDA", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
14 changes: 14 additions & 0 deletions ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
@@ -0,0 +1,14 @@
module JutulDarcyAMGXExt
using JutulDarcy, Jutul, AMGX, AMGX.CUDA, LinearAlgebra, SparseArrays
import JutulDarcy: AMGXPreconditioner
import Jutul: @tic

timeit_debug_enabled() = Jutul.timeit_debug_enabled()

include("cpr.jl")

function __init__()
# TODO: Figure out a way to not always do this?
AMGX.initialize()
end
end
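One possible answer to the TODO above is to defer AMGX.initialize() until the preconditioner is first set up instead of calling it unconditionally in __init__. A rough sketch under that assumption; ensure_amgx_initialized! is a hypothetical helper that is not part of this PR:

```julia
# Hypothetical lazy-initialization guard (not in the PR): call
# ensure_amgx_initialized!() from the preconditioner setup path
# instead of initializing AMGX eagerly in __init__.
const AMGX_INITIALIZED = Ref(false)

function ensure_amgx_initialized!()
    if !AMGX_INITIALIZED[]
        AMGX.initialize()
        AMGX_INITIALIZED[] = true
    end
    return nothing
end
```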
112 changes: 112 additions & 0 deletions ext/JutulDarcyAMGXExt/cpr.jl
@@ -0,0 +1,112 @@
function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
data = amgx.data
update_pressure_system!(amgx, A_p, Tv)
return amgx
end

mutable struct AMGXStorage{C, R, V, M, S}
config::C
resources::R
r::V
x::V
matrix::M
solver::S
function AMGXStorage(config, amgx_mode = AMGX.dDDI)
resources = AMGX.Resources(config)
r = AMGX.AMGXVector(resources, amgx_mode)
x = AMGX.AMGXVector(resources, amgx_mode)
matrix = AMGX.AMGXMatrix(resources, amgx_mode)
solver = AMGX.Solver(resources, amgx_mode, config)
function finalize_storage!(amgx_s::AMGXStorage)
@async println("Finalizing AMGXStorage")
close(amgx_s.solver)
close(amgx_s.r)
close(amgx_s.x)
close(amgx_s.matrix)
close(amgx_s.resources)
close(amgx_s.config)
end
s = new{
typeof(config),
typeof(resources),
typeof(r),
typeof(matrix),
typeof(solver)
}(config, resources, r, x, matrix, solver)
return finalizer(finalize_storage!, s)
end
end

function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
# TODO: This sync call is maybe needed?
AMGX.CUDA.synchronize()
s = amgx.data[:storage]
n = length(r_p)
AMGX.upload!(s.r, r_p)
AMGX.set_zero!(s.x, n)
AMGX.solve!(s.x, s.solver, s.r)
AMGX.download!(r_p, s.x)
return r_p
end

function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
if !haskey(amgx.data, :storage)
if Tv == Float64
amgx_mode = AMGX.dDDI
else
amgx_mode = AMGX.dFFI
end
n, m = size(A)
@assert n == m
config = AMGX.Config(amgx.settings)
s = AMGXStorage(config, amgx_mode)
# Zero the RHS and solution vectors at the right size, just in case
AMGX.set_zero!(s.x, n)
AMGX.set_zero!(s.r, n)

row_ptr = Cint.(A.At.colptr .- 1)
colval = Cint.(A.At.rowval .- 1)
nzval = A.At.nzval
# TODO: Support for other types than Float64, should be done in setup of
# pressure system
AMGX.pin_memory(nzval)
AMGX.upload!(s.matrix,
row_ptr,
colval,
nzval
)
amgx.data[:storage] = s
amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n)
else
pb = amgx.data[:buffer_p]
s = amgx.data[:storage]
A_gpu = s.matrix
@assert nnz(A) == nnz(A_gpu)
AMGX.replace_coefficients!(A_gpu, A.At.nzval)
end
AMGX.setup!(s.solver, s.matrix)
return amgx
end

function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op)
# TODO: Rename this.
data = cpr.pressure_precond.data
cpr_s = cpr.storage
if !haskey(data, :w_p)
Tv = eltype(r_cu)
n = length(r_cu)
bz = cpr_s.block_size
cpr.pressure_precond.data[:buffer_full] = similar(r_cu)
bz_w, n_w = size(cpr_s.w_p)
@assert n == bz*n_w
data[:w_p] = AMGX.CUDA.CuMatrix{Tv}(undef, bz_w, n_w)
data[:main_system] = J_bsr
data[:operator] = op
end
# Put updated weights on GPU
w_p_cpu = cpr_s.w_p
w_p_gpu = data[:w_p]
@assert size(w_p_cpu) == size(w_p_gpu)
copyto!(w_p_gpu, w_p_cpu)
return cpr
end
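The row_ptr/colval construction in update_pressure_system! shifts Julia's 1-based CSR indices (stored in the transposed CSC fields of the StaticSparsityMatrixCSR) to the 0-based layout AMGX expects. A tiny standalone illustration, not taken from the PR:

```julia
# 2×2 CSR matrix with three stored entries: row 1 has columns 1 and 2,
# row 2 has column 2 (1-based, as stored in A.At.colptr / A.At.rowval).
colptr = [1, 3, 4]
rowval = [1, 2, 2]

row_ptr = Cint.(colptr .- 1)   # Cint[0, 2, 3], 0-based row pointers for AMGX
colval  = Cint.(rowval .- 1)   # Cint[0, 1, 1], 0-based column indices for AMGX
```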
11 changes: 11 additions & 0 deletions ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
@@ -0,0 +1,11 @@
module JutulDarcyCUDAExt
using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays
import Jutul: @tic

timeit_debug_enabled() = Jutul.timeit_debug_enabled()

include("ilu0.jl")
include("krylov.jl")
include("cuda_utils.jl")
include("cpr.jl")
end
65 changes: 65 additions & 0 deletions ext/JutulDarcyCUDAExt/cpr.jl
@@ -0,0 +1,65 @@
function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r)
@assert eltype(r_p) == eltype(w_p) == eltype(r)
@assert length(w_p) == length(r)
@assert size(w_p, 2) == length(r_p)
kernel = CUDA.@cuda launch = false reduce_residual_kernel!(r_p, w_p, r)
threads, blocks = kernel_helper(kernel, r_p)
CUDA.@sync begin
kernel(r_p, w_p, r; threads, blocks)
end
return r_p
end

function reduce_residual_kernel!(r_p, w_p, r_ps)
index = threadIdx().x
stride = blockDim().x
T = eltype(r_p)
ncomp = size(w_p, 1)
for cell in index:stride:length(r_p)
v = zero(T)
for comp in 1:ncomp
ix = (cell-1)*ncomp + comp
@inbounds wi = w_p[comp, cell]
@inbounds ri = r_ps[ix]
v += wi*ri
end
@inbounds r_p[cell] = v
end
return nothing
end

function JutulDarcy.gpu_increment_pressure!(x, dp)
n = length(x)
bz = div(n, length(dp))
@assert bz > 0
# Kernel-based increment kept for reference but disabled; the view-based update below is used instead
if false
kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
threads, blocks = kernel_helper(kernel, dp)
threads = blocks = 1

CUDA.@sync begin
kernel(x, dp, bz; threads, blocks)
end
else
x_view = view(x, 1:bz:(n+1-bz))
@. x_view += dp
end
return x
end

function gpu_increment_pressure_kernel!(x, dp, bz)
index = threadIdx().x
stride = blockDim().x
for cell in index:stride:length(dp)
x[(cell-1)*bz + 1] += dp[cell]
end
return nothing
end

function kernel_helper(kernel, V)
config = launch_configuration(kernel.fun)
threads = min(length(V), config.threads)
blocks = cld(length(V), threads)

return (threads, blocks)
end
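For reference, the weighted restriction performed by reduce_residual_kernel! matches the plain CPU loop below; reduce_residual_reference! is a hypothetical helper, useful only for checking the kernel on small cases:

```julia
# Hypothetical CPU reference for reduce_residual_kernel! (not part of the PR).
# w_p is ncomp × ncell and r holds ncomp entries per cell, stored contiguously.
function reduce_residual_reference!(r_p::AbstractVector, w_p::AbstractMatrix, r::AbstractVector)
    ncomp, ncell = size(w_p)
    @assert length(r_p) == ncell
    @assert length(r) == ncomp*ncell
    for cell in 1:ncell
        v = zero(eltype(r_p))
        for comp in 1:ncomp
            v += w_p[comp, cell]*r[(cell - 1)*ncomp + comp]
        end
        r_p[cell] = v
    end
    return r_p
end
```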
117 changes: 117 additions & 0 deletions ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -0,0 +1,117 @@
function JutulDarcy.build_gpu_block_system(Ti, Tv, sz::Tuple{Int, Int}, blockDim::Int, rowptr, colval, nzval, r0)
rowPtr = CuVector{Ti}(rowptr)
colVal = CuVector{Ti}(colval)
nzVal = CuVector{Tv}(nzval)
dims = blockDim.*sz
dir = 'C'
nnz = length(nzVal)÷(blockDim^2)
J_bsr = CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz)
r_cu = CuVector{Tv}(vec(r0))
return (J_bsr, r_cu)
end

function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12)
if ϵ > 0.0
for (i, v) in enumerate(nzval)
if abs(v) < ϵ
# Clamp tiny values away from zero while preserving their sign
if v > 0
v = ϵ
else
v = -ϵ
end
@inbounds nzval[i] = v
end
end
end
copyto!(J.nzVal, nzval)
end

function JutulDarcy.update_gpu_block_residual!(r_cu, blockDim, r)
copyto!(r_cu, r)
end

function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::LinearizedSystem)
return nothing
end

function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSystem)
@assert size(lsys.subsystems) == (2, 2)
@assert length(lsys.schur_buffer) == 2
buf_full_cpu = lsys.schur_buffer[1]
buf_1_cpu, buf_2_cpu = lsys.schur_buffer[2]
# B_cpu = lsys[1, 1].jac # Already transferred, not needed
C_cpu = lsys[1, 2].jac
D_cpu = lsys[2, 1].jac
# E_cpu = lsys[2, 2].jac # Only factorization needed I think.

E_factor = only(lsys.factor.factor)
# E_L_cpu = E_factor.L
# E_U_cpu = E_factor.U

to_gpu(x) = copy_to_gpu(x, Tv, Ti)

D = to_gpu(D_cpu)
# E_L = to_gpu(E_L_cpu)
# E_U = to_gpu(E_U_cpu)
C = to_gpu(C_cpu)

buf = to_gpu(buf_1_cpu)
# buf_2 = to_gpu(buf_2_cpu)
# Operations and format:
# B C
# D E
# mul!(b_buf_2, D_i, x)
# ldiv!(b_buf_1, E_i, b_buf_2)
# mul!(res, C_i, b_buf_1, -α, true)

return Dict(
:C => C,
:D => D,
:buf => buf,
:buf_1_cpu => buf_1_cpu,
:buf_2_cpu => buf_2_cpu,
:E_factor => E_factor
)
end

function JutulDarcy.update_gpu_schur_system!(schur::Nothing, lsys)
return schur
end

function JutulDarcy.update_gpu_schur_system!(schur, lsys)
C_cpu = lsys[1, 2].jac
copyto!(schur[:C].nzVal, C_cpu.nzval)
D_cpu = lsys[2, 1].jac
copyto!(schur[:D].nzVal, D_cpu.nzval)
return schur
end

function copy_to_gpu(x, Tv, Ti)
error("Internal converter not set up for $(typeof(x))")
end

function copy_to_gpu(x::SparseMatrixCSC{Tvc, Tic}, Tv, Ti) where {Tvc, Tic}
if Tvc != Tv || Tic != Ti
x = SparseMatrixCSC{Tv, Ti}(x)
end
return CUDA.CUSPARSE.CuSparseMatrixCSC(x)
end

function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
return convert(CuVector{Tv}, x)
end

function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor)
@tic "schur apply" @sync begin
# Working on GPU
@async mul!(y, J, x, α, β)
mul!(buf, D, x)
# Move over to CPU for this part
copyto!(buf1_cpu, buf)
ldiv!(buf2_cpu, E_factor, buf1_cpu)
# Back to GPU for the big array...
copyto!(buf, buf2_cpu)
end
mul!(y, C, buf, -α, true)
return y
end
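Spelled out, schur_mul_gpu! computes the Schur-complement product sketched in the comment block of build_gpu_schur_system:

```math
y \leftarrow \alpha \left( J - C E^{-1} D \right) x + \beta y
```

with the J, C and D multiplies staying on the GPU and only the E^{-1} solve reusing the existing CPU factorization via the two host buffers.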