Tentative CUDA/AMGX support #76

Status: Open. Wants to merge 30 commits into base branch dev.

Commits (30)
ef6c0a1  Add a basic reservoir version of CUDA linear solver (moyner, Oct 23, 2024)
f778b4f  Update utils.jl (moyner, Oct 24, 2024)
e2ff69f  Generalize CUDA a bit (moyner, Oct 24, 2024)
93bb1e1  Put in place some Schur stuff (moyner, Oct 24, 2024)
b55ba61  Put in place some more factorization updates (moyner, Oct 24, 2024)
e3c230e  Schur additions (moyner, Oct 24, 2024)
ca0572f  Update mul (moyner, Oct 24, 2024)
267a1ca  Rough draft of schur mostly on GPU (moyner, Oct 24, 2024)
391b16f  Make GPU and CPU operations independent in schur multiply (moyner, Oct 24, 2024)
a5d12e8  Add timing to CUDA calls (moyner, Oct 24, 2024)
281e652  Update interface (moyner, Oct 25, 2024)
e5e20ea  Draft extension to upcoming AMGX release (moyner, Oct 25, 2024)
5b3251b  Add some AMGX init code (moyner, Oct 25, 2024)
d914ceb  More CPR testing (moyner, Oct 25, 2024)
253ff5c  More work on GPU CPR (moyner, Oct 25, 2024)
7ee843d  CPR apply seems to run once... (moyner, Oct 25, 2024)
03a5655  Fixes to kernel launches (moyner, Oct 25, 2024)
eba4fb6  Sort-of-working GPU CPR with wells (moyner, Oct 25, 2024)
d0cffe6  Update format for AMGX config (moyner, Oct 25, 2024)
3833251  Update copy of buffer in CUDA (moyner, Oct 27, 2024)
b5ab2cc  Updates to CPR code (moyner, Oct 27, 2024)
e54d62b  Updates to CUDA (moyner, Oct 27, 2024)
f7f7d39  Fix for wells (moyner, Oct 28, 2024)
a2c05c7  Some CUDA code backup (moyner, Oct 28, 2024)
017c1ef  Use CUDA threads in CPR code (moyner, Oct 28, 2024)
c6f15b1  Use view instead of custom kernel for CPR dp (moyner, Oct 28, 2024)
b1a070c  Performance tweaks (moyner, Oct 28, 2024)
a58af9c  Remove some code duplication (moyner, Oct 28, 2024)
96c9495  Reorder destructors for AMGX (moyner, Oct 28, 2024)
0824cfc  Bump Jutul compat & version (moyner, Oct 28, 2024)
14 changes: 11 additions & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "JutulDarcy"
uuid = "82210473-ab04-4dce-b31b-11573c4f8e0a"
authors = ["Olav Møyner <[email protected]>"]
version = "0.2.35"
version = "0.2.36"

[deps]
AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
@@ -34,20 +34,26 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"

[weakdeps]
AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"

[extensions]
JutulDarcyAMGXExt = "AMGX"
JutulDarcyCUDAExt = "CUDA"
JutulDarcyGLMakieExt = "GLMakie"
JutulDarcyMakieExt = "Makie"
JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"]

[compat]
AlgebraicMultigrid = "0.5.1, 0.6.0"
Artifacts = "1"
AMGX = "0.2"
CUDA = "5"
DataStructures = "0.18.13"
Dates = "1"
DelimitedFiles = "1.6"
@@ -56,7 +62,7 @@ ForwardDiff = "0.10.30"
GLMakie = "0.10.13"
GeoEnergyIO = "1.1.12"
HYPRE = "1.6.0, 1.7"
Jutul = "0.2.40"
Jutul = "0.2.41"
Krylov = "0.9.1"
LazyArtifacts = "1"
LinearAlgebra = "1"
@@ -78,6 +84,8 @@ Tullio = "0.3.4"
julia = "1.7"

[extras]
AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
@@ -87,4 +95,4 @@ TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"

[targets]
test = ["Test", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
test = ["Test", "CUDA", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
14 changes: 14 additions & 0 deletions ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
@@ -0,0 +1,14 @@
module JutulDarcyAMGXExt
using JutulDarcy, Jutul, AMGX, AMGX.CUDA, LinearAlgebra, SparseArrays
import JutulDarcy: AMGXPreconditioner
import Jutul: @tic

timeit_debug_enabled() = Jutul.timeit_debug_enabled()

include("cpr.jl")

function __init__()
# TODO: Figure out a way to not always do this?
AMGX.initialize()
end
end
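One possible answer to the TODO above is to defer AMGX.initialize() until the preconditioner is first set up instead of calling it unconditionally in __init__. A rough sketch under that assumption; ensure_amgx_initialized! is a hypothetical helper that is not part of this PR:

```julia
# Hypothetical lazy-initialization guard (not in the PR): call
# ensure_amgx_initialized!() from the preconditioner setup path
# instead of initializing AMGX eagerly in __init__.
const AMGX_INITIALIZED = Ref(false)

function ensure_amgx_initialized!()
    if !AMGX_INITIALIZED[]
        AMGX.initialize()
        AMGX_INITIALIZED[] = true
    end
    return nothing
end
```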
112 changes: 112 additions & 0 deletions ext/JutulDarcyAMGXExt/cpr.jl
@@ -0,0 +1,112 @@
function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
data = amgx.data
update_pressure_system!(amgx, A_p, Tv)
return amgx
end

mutable struct AMGXStorage{C, R, V, M, S}
config::C
resources::R
r::V
x::V
matrix::M
solver::S
function AMGXStorage(config, amgx_mode = AMGX.dDDI)
resources = AMGX.Resources(config)
r = AMGX.AMGXVector(resources, amgx_mode)
x = AMGX.AMGXVector(resources, amgx_mode)
matrix = AMGX.AMGXMatrix(resources, amgx_mode)
solver = AMGX.Solver(resources, amgx_mode, config)
function finalize_storage!(amgx_s::AMGXStorage)
@async println("Finalizing AMGXStorage")
close(amgx_s.solver)
close(amgx_s.r)
close(amgx_s.x)
close(amgx_s.matrix)
close(amgx_s.resources)
close(amgx_s.config)
end
s = new{
typeof(config),
typeof(resources),
typeof(r),
typeof(matrix),
typeof(solver)
}(config, resources, r, x, matrix, solver)
return finalizer(finalize_storage!, s)
end
end

function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
# TODO: This sync call is maybe needed?
AMGX.CUDA.synchronize()
s = amgx.data[:storage]
n = length(r_p)
AMGX.upload!(s.r, r_p)
AMGX.set_zero!(s.x, n)
AMGX.solve!(s.x, s.solver, s.r)
AMGX.download!(r_p, s.x)
return r_p
end

function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
if !haskey(amgx.data, :storage)
if Tv == Float64
amgx_mode = AMGX.dDDI
else
amgx_mode = AMGX.dFFI
end
n, m = size(A)
@assert n == m
config = AMGX.Config(amgx.settings)
s = AMGXStorage(config, amgx_mode)
# Zero the RHS and solution vectors at the right size, just in case
AMGX.set_zero!(s.x, n)
AMGX.set_zero!(s.r, n)

row_ptr = Cint.(A.At.colptr .- 1)
colval = Cint.(A.At.rowval .- 1)
nzval = A.At.nzval
# TODO: Support for other types than Float64, should be done in setup of
# pressure system
AMGX.pin_memory(nzval)
AMGX.upload!(s.matrix,
row_ptr,
colval,
nzval
)
amgx.data[:storage] = s
amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n)
else
pb = amgx.data[:buffer_p]
s = amgx.data[:storage]
A_gpu = s.matrix
@assert nnz(A) == nnz(A_gpu)
AMGX.replace_coefficients!(A_gpu, A.At.nzval)
end
AMGX.setup!(s.solver, s.matrix)
return amgx
end

function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op)
# TODO: Rename this.
data = cpr.pressure_precond.data
cpr_s = cpr.storage
if !haskey(data, :w_p)
Tv = eltype(r_cu)
n = length(r_cu)
bz = cpr_s.block_size
cpr.pressure_precond.data[:buffer_full] = similar(r_cu)
bz_w, n_w = size(cpr_s.w_p)
@assert n == bz*n_w
data[:w_p] = AMGX.CUDA.CuMatrix{Tv}(undef, bz_w, n_w)
data[:main_system] = J_bsr
data[:operator] = op
end
# Put updated weights on GPU
w_p_cpu = cpr_s.w_p
w_p_gpu = data[:w_p]
@assert size(w_p_cpu) == size(w_p_gpu)
copyto!(w_p_gpu, w_p_cpu)
return cpr
end
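The row_ptr/colval construction in update_pressure_system! shifts Julia's 1-based CSR indices (stored in the transposed CSC fields of the StaticSparsityMatrixCSR) to the 0-based layout AMGX expects. A tiny standalone illustration, not taken from the PR:

```julia
# 2×2 CSR matrix with three stored entries: row 1 has columns 1 and 2,
# row 2 has column 2 (1-based, as stored in A.At.colptr / A.At.rowval).
colptr = [1, 3, 4]
rowval = [1, 2, 2]

row_ptr = Cint.(colptr .- 1)   # Cint[0, 2, 3], 0-based row pointers for AMGX
colval  = Cint.(rowval .- 1)   # Cint[0, 1, 1], 0-based column indices for AMGX
```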
11 changes: 11 additions & 0 deletions ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
@@ -0,0 +1,11 @@
module JutulDarcyCUDAExt
using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays
import Jutul: @tic

timeit_debug_enabled() = Jutul.timeit_debug_enabled()

include("ilu0.jl")
include("krylov.jl")
include("cuda_utils.jl")
include("cpr.jl")
end
65 changes: 65 additions & 0 deletions ext/JutulDarcyCUDAExt/cpr.jl
@@ -0,0 +1,65 @@
function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r)
@assert eltype(r_p) == eltype(w_p) == eltype(r)
@assert length(w_p) == length(r)
@assert size(w_p, 2) == length(r_p)
kernel = CUDA.@cuda launch = false reduce_residual_kernel!(r_p, w_p, r)
threads, blocks = kernel_helper(kernel, r_p)
CUDA.@sync begin
kernel(r_p, w_p, r; threads, blocks)
end
return r_p
end

function reduce_residual_kernel!(r_p, w_p, r_ps)
index = threadIdx().x
stride = blockDim().x
T = eltype(r_p)
ncomp = size(w_p, 1)
for cell in index:stride:length(r_p)
v = zero(T)
for comp in 1:ncomp
ix = (cell-1)*ncomp + comp
@inbounds wi = w_p[comp, cell]
@inbounds ri = r_ps[ix]
v += wi*ri
end
@inbounds r_p[cell] = v
end
return nothing
end

function JutulDarcy.gpu_increment_pressure!(x, dp)
n = length(x)
bz = div(n, length(dp))
@assert bz > 0
# Kernel-based increment kept for reference but disabled; the view-based update below is used instead
if false
kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
threads, blocks = kernel_helper(kernel, dp)
threads = blocks = 1

CUDA.@sync begin
kernel(x, dp, bz; threads, blocks)
end
else
x_view = view(x, 1:bz:(n+1-bz))
@. x_view += dp
end
return x
end

function gpu_increment_pressure_kernel!(x, dp, bz)
index = threadIdx().x
stride = blockDim().x
for cell in index:stride:length(dp)
x[(cell-1)*bz + 1] += dp[cell]
end
return nothing
end

function kernel_helper(kernel, V)
config = launch_configuration(kernel.fun)
threads = min(length(V), config.threads)
blocks = cld(length(V), threads)

return (threads, blocks)
end
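For reference, the weighted restriction performed by reduce_residual_kernel! matches the plain CPU loop below; reduce_residual_reference! is a hypothetical helper, useful only for checking the kernel on small cases:

```julia
# Hypothetical CPU reference for reduce_residual_kernel! (not part of the PR).
# w_p is ncomp × ncell and r holds ncomp entries per cell, stored contiguously.
function reduce_residual_reference!(r_p::AbstractVector, w_p::AbstractMatrix, r::AbstractVector)
    ncomp, ncell = size(w_p)
    @assert length(r_p) == ncell
    @assert length(r) == ncomp*ncell
    for cell in 1:ncell
        v = zero(eltype(r_p))
        for comp in 1:ncomp
            v += w_p[comp, cell]*r[(cell - 1)*ncomp + comp]
        end
        r_p[cell] = v
    end
    return r_p
end
```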
117 changes: 117 additions & 0 deletions ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -0,0 +1,117 @@
function JutulDarcy.build_gpu_block_system(Ti, Tv, sz::Tuple{Int, Int}, blockDim::Int, rowptr, colval, nzval, r0)
rowPtr = CuVector{Ti}(rowptr)
colVal = CuVector{Ti}(colval)
nzVal = CuVector{Tv}(nzval)
dims = blockDim.*sz
dir = 'C'
nnz = length(nzVal)÷(blockDim^2)
J_bsr = CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz)
r_cu = CuVector{Tv}(vec(r0))
return (J_bsr, r_cu)
end

function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12)
if ϵ > 0.0
for (i, v) in enumerate(nzval)
if abs(v) < ϵ
# Clamp tiny values away from zero while preserving their sign
if v > 0
v = ϵ
else
v = -ϵ
end
@inbounds nzval[i] = v
end
end
end
copyto!(J.nzVal, nzval)
end

function JutulDarcy.update_gpu_block_residual!(r_cu, blockDim, r)
copyto!(r_cu, r)
end

function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::LinearizedSystem)
return nothing
end

function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSystem)
@assert size(lsys.subsystems) == (2, 2)
@assert length(lsys.schur_buffer) == 2
buf_full_cpu = lsys.schur_buffer[1]
buf_1_cpu, buf_2_cpu = lsys.schur_buffer[2]
# B_cpu = lsys[1, 1].jac # Already transferred, not needed
C_cpu = lsys[1, 2].jac
D_cpu = lsys[2, 1].jac
# E_cpu = lsys[2, 2].jac # Only factorization needed I think.

E_factor = only(lsys.factor.factor)
# E_L_cpu = E_factor.L
# E_U_cpu = E_factor.U

to_gpu(x) = copy_to_gpu(x, Tv, Ti)

D = to_gpu(D_cpu)
# E_L = to_gpu(E_L_cpu)
# E_U = to_gpu(E_U_cpu)
C = to_gpu(C_cpu)

buf = to_gpu(buf_1_cpu)
# buf_2 = to_gpu(buf_2_cpu)
# Operations and format:
# B C
# D E
# mul!(b_buf_2, D_i, x)
# ldiv!(b_buf_1, E_i, b_buf_2)
# mul!(res, C_i, b_buf_1, -α, true)

return Dict(
:C => C,
:D => D,
:buf => buf,
:buf_1_cpu => buf_1_cpu,
:buf_2_cpu => buf_2_cpu,
:E_factor => E_factor
)
end

function JutulDarcy.update_gpu_schur_system!(schur::Nothing, lsys)
return schur
end

function JutulDarcy.update_gpu_schur_system!(schur, lsys)
C_cpu = lsys[1, 2].jac
copyto!(schur[:C].nzVal, C_cpu.nzval)
D_cpu = lsys[2, 1].jac
copyto!(schur[:D].nzVal, D_cpu.nzval)
return schur
end

function copy_to_gpu(x, Tv, Ti)
error("Internal converter not set up for $(typeof(x))")
end

function copy_to_gpu(x::SparseMatrixCSC{Tvc, Tic}, Tv, Ti) where {Tvc, Tic}
if Tvc != Tv || Tic != Ti
x = SparseMatrixCSC{Tv, Ti}(x)
end
return CUDA.CUSPARSE.CuSparseMatrixCSC(x)
end

function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
return convert(CuVector{Tv}, x)
end

function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor)
@tic "schur apply" @sync begin
# Working on GPU
@async mul!(y, J, x, α, β)
mul!(buf, D, x)
# Move over to CPU for this part
copyto!(buf1_cpu, buf)
ldiv!(buf2_cpu, E_factor, buf1_cpu)
# Back to GPU for the big array...
copyto!(buf, buf2_cpu)
end
mul!(y, C, buf, -α, true)
return y
end
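Spelled out, schur_mul_gpu! computes the Schur-complement product sketched in the comment block of build_gpu_schur_system:

```math
y \leftarrow \alpha \left( J - C E^{-1} D \right) x + \beta y
```

with the J, C and D multiplies staying on the GPU and only the E^{-1} solve reusing the existing CPU factorization via the two host buffers.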