Bugfix in distributed GPU tests and Distributed set!
#3880
@@ -1,7 +1,7 @@
agents:
  queue: new-central
  slurm_mem: 8G
  modules: climacommon/2024_10_09
  modules: climacommon/2024_10_08

env:
  JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite/distributed"
@@ -16,60 +16,74 @@ steps:
    key: "init_central"
    env:
      TEST_GROUP: "init"
      GPU_TEST: "true"
    command:
      - echo "--- Instantiate project"
      - "julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
      - echo "--- Initialize tests"
      - "julia -O0 --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_gpus: 1
      slurm_cpus_per_task: 8
      slurm_mem: 8G
      slurm_ntasks: 1
      slurm_gpus_per_task: 1

  - wait

  - label: "🐉 cpu distributed unit tests"
    key: "distributed_cpu"
    env:
      TEST_GROUP: "distributed"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 8G
      slurm_ntasks: 4
    retry:
      automatic:
        - exit_status: 1
          limit: 1

  - label: "🐲 gpu distributed unit tests"
    key: "distributed_gpu"
    env:
      TEST_GROUP: "distributed"
      GPU_TEST: "true"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 50G
Review thread on slurm_mem: 50G:
- we can probably reduce the memory usage of the tests, right? I think often a bigger grid is used than needed
- Right, unit tests do not require too much memory. I have seen that 32G was not enough for the regression tests on the GPU.
- they might be too big
      slurm_ntasks: 4
      slurm_gpus_per_task: 1
    retry:
      automatic:
        - exit_status: 1
          limit: 1


  - label: "🦾 cpu distributed solvers tests"
    key: "distributed_solvers_cpu"
    env:
      TEST_GROUP: "distributed_solvers"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 50G
      slurm_ntasks: 4
    retry:
      automatic:
        - exit_status: 1
          limit: 1

  - label: "🛸 gpu distributed solvers tests"
    key: "distributed_solvers_gpu"
    env:
      TEST_GROUP: "distributed_solvers"
      GPU_TEST: "true"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 50G
      slurm_ntasks: 4
      slurm_gpus_per_task: 1
    retry:
@@ -81,20 +95,27 @@ steps:
    key: "distributed_hydrostatic_model_cpu"
    env:
      TEST_GROUP: "distributed_hydrostatic_model"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 50G
      slurm_ntasks: 4
    retry:
      automatic:
        - exit_status: 1
          limit: 1

  - label: "🦏 gpu distributed hydrostatic model tests"
    key: "distributed_hydrostatic_model_gpu"
    env:
      TEST_GROUP: "distributed_hydrostatic_model"
      GPU_TEST: "true"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 50G
      slurm_ntasks: 4
      slurm_gpus_per_task: 1
    retry:
@@ -106,20 +127,27 @@ steps:
    key: "distributed_nonhydrostatic_regression_cpu"
    env:
      TEST_GROUP: "distributed_nonhydrostatic_regression"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 50G
      slurm_ntasks: 4
    retry:
      automatic:
        - exit_status: 1
          limit: 1

  - label: "🕺 gpu distributed nonhydrostatic regression"
    key: "distributed_nonhydrostatic_regression_gpu"
    env:
      TEST_GROUP: "distributed_nonhydrostatic_regression"
      GPU_TEST: "true"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      slurm_mem: 120G
      slurm_mem: 50G
      slurm_ntasks: 4
      slurm_gpus_per_task: 1
    retry:
Review thread on the reduced slurm_mem requests:
- why?
- 120G is much more than we need for those tests. After some frustration with tests that were extremely slow to start, I noticed that the agents started much more quickly when requesting a smaller memory amount. So I am deducing that the tests run on shared nodes instead of exclusive ones, and requesting lower resources allows us to squeeze in when the cluster is busy.
- good reason. might warrant a comment
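Following the suggestion above, one way to record this reasoning is an inline comment next to the memory request in the pipeline itself. The snippet below is a hypothetical sketch, not part of this PR's diff: the step layout and the 50G value are taken from the diff, but the comment wording is illustrative.

  # Hypothetical sketch: documenting the memory choice inline (not in this PR).
  - label: "🐲 gpu distributed unit tests"
    key: "distributed_gpu"
    env:
      TEST_GROUP: "distributed"
      GPU_TEST: "true"
      MPI_TEST: "true"
    commands:
      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
    agents:
      # Keep this request modest: these jobs appear to run on shared nodes,
      # so smaller memory requests let agents start sooner when the cluster
      # is busy. Unit tests need little memory; GPU regression tests have
      # needed more than 32G, so size those groups separately.
      slurm_mem: 50G
      slurm_ntasks: 4
      slurm_gpus_per_task: 1
    retry:
      automatic:
        - exit_status: 1
          limit: 1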