merzlab · ohearnk · Apr 15, 2024 · Apr 18, 2024 · Apr 19, 2024 · Apr 19, 2024
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ Features
 * Supports QM/MM calculations with Amber22 and later
 * Fortran API to use QUICK as QM energy and force engine
 * MPI parallelization for CPU platforms
-* Massively parallel GPU implementation via CUDA/HIP for Nvidia/AMD GPUs (HIP available in QUICK-23.08, currently disabled)
+* Massively parallel GPU implementation via CUDA/HIP for Nvidia/AMD GPUs
 * Multi-GPU support via MPI + CUDA/HIP, also across multiple compute nodes
 
 Limitations
@@ -36,7 +36,6 @@ Limitations
 * Effective core potentials (ECPs) are not supported
 * DFT calculations are performed exclusively using the SG1 grid system 
 * No meta-GGA functionals, no range-separated hybrid functionals
-* HIP (AMD GPU support) is currently disabled (available in QUICK-23.08 but not QUICK-24.03)
 
 Installation
 ------------
@@ -61,9 +60,9 @@ Citation
 --------
 Please cite QUICK-24.03 as follows.
 
-Manathunga, M.; O'Hearn, K. A., Shajan, A.; Smith, J.; Miao, Y.; He, X.; Ayers, K;
-Brothers, E.; Götz, A. W.; Merz, K. M. QUICK-24.03 
-University of California San Diego, CA and
+Manathunga, M.; O'Hearn, K. A.; Shajan, A.; Smith, J.; Miao, Y.; He, X.; Ayers, K.;
+Brothers, E.; Götz, A. W.; Merz, K. M. QUICK-24.03.
+University of California, San Diego, CA and
 Michigan State University, East Lansing, MI, 2024.
 
 If you perform density functional theory calculations please also cite:

diff --git a/configure b/configure
@@ -1306,7 +1306,7 @@ for buildtype in $buildtypes; do
 
     if [ "$enablef" = 'yes' ]; then
       echo "F functions will be compiled in the $buildtype version."
-      cuda_incl_flags="$cuda_incl_flags -DCUDA_SPDF"
+      cuda_incl_flags="$cuda_incl_flags -DGPU_SPDF"
     fi
 
     # set cew flag for nvcc
@@ -1331,7 +1331,7 @@ for buildtype in $buildtypes; do
 
     if [ "$enablef" = 'yes' ]; then
       echo "F functions will be compiled in the $buildtype version."
-      hip_incl_flags="$hip_incl_flags -DHIP_SPDF"
+      hip_incl_flags="$hip_incl_flags -DGPU_SPDF"
     fi
 
   fi
@@ -1345,31 +1345,31 @@ for buildtype in $buildtypes; do
 
   elif [ "$buildtype" = 'cuda' ]; then
 
-    fort_flags="$fort_flags -DCUDA"
-    cc_flags="$cc_flags -DCUDA"
-    cxx_flags="$cxx_flags -DCUDA"
-    cuda_incl_flags="$cuda_incl_flags -DCUDA"
+    fort_flags="$fort_flags -DGPU -DCUDA"
+    cc_flags="$cc_flags -DGPU -DCUDA"
+    cxx_flags="$cxx_flags -DGPU -DCUDA"
+    cuda_incl_flags="$cuda_incl_flags -DGPU -DCUDA"
 
   elif [ "$buildtype" = 'cudampi' ]; then
 
-    fort_flags="$fort_flags -DMPIV -DCUDA_MPIV"
-    cc_flags="$cc_flags -DMPIV -DCUDA_MPIV"
-    cxx_flags="$cxx_flags -DMPIV -DCUDA_MPIV"
-    cuda_incl_flags="$cuda_incl_flags -DMPIV -DCUDA_MPIV"
+    fort_flags="$fort_flags -DMPIV -DMPIV_GPU -DCUDA_MPIV"
+    cc_flags="$cc_flags -DMPIV -DMPIV_GPU -DCUDA_MPIV"
+    cxx_flags="$cxx_flags -DMPIV -DMPIV_GPU -DCUDA_MPIV"
+    cuda_incl_flags="$cuda_incl_flags -DMPIV -DMPIV_GPU -DCUDA_MPIV"
 
   elif [ "$buildtype" = 'hip' ]; then
 
-    fort_flags="$fort_flags -DHIP"
-    cc_flags="$cc_flags -DHIP"
-    cxx_flags="$cxx_flags -DHIP"
-    hip_incl_flags="$hip_incl_flags -DHIP"
+    fort_flags="$fort_flags -DGPU -DHIP"
+    cc_flags="$cc_flags -DGPU -DHIP"
+    cxx_flags="$cxx_flags -DGPU -DHIP"
+    hip_incl_flags="$hip_incl_flags -DGPU -DHIP"
 
   elif [ "$buildtype" = 'hipmpi' ]; then
 
-    fort_flags="$fort_flags -DMPIV -DHIP_MPIV"
-    cc_flags="$cc_flags -DMPIV -DHIP_MPIV"
-    cxx_flags="$cxx_flags -DMPIV -DHIP_MPIV"
-    hip_incl_flags="$hip_incl_flags -DMPIV -DHIP_MPIV"
+    fort_flags="$fort_flags -DMPIV -DMPIV_GPU -DHIP_MPIV"
+    cc_flags="$cc_flags -DMPIV -DMPIV_GPU -DHIP_MPIV"
+    cxx_flags="$cxx_flags -DMPIV -DMPIV_GPU -DHIP_MPIV"
+    hip_incl_flags="$hip_incl_flags -DMPIV -DMPIV_GPU -DHIP_MPIV"
 
   fi
 
@@ -1438,13 +1438,13 @@ for buildtype in $buildtypes; do
   if [ "$buildtype" = 'mpi' ]; then
     fort_ext_lib_flags="$fort_ext_lib_flags -DMPIV"
   elif [ "$buildtype" = 'cuda' ]; then
-    fort_ext_lib_flags="$fort_ext_lib_flags -DCUDA"
+    fort_ext_lib_flags="$fort_ext_lib_flags -DGPU -DCUDA"
   elif [ "$buildtype" = 'cudampi' ]; then
-    fort_ext_lib_flags="$fort_ext_lib_flags -DMPIV -DCUDA_MPIV"
+    fort_ext_lib_flags="$fort_ext_lib_flags -DMPIV -DMPIV_GPU -DCUDA_MPIV"
   elif [ "$buildtype" = 'hip' ]; then
-    fort_ext_lib_flags="$fort_ext_lib_flags -DHIP"
+    fort_ext_lib_flags="$fort_ext_lib_flags -DGPU -DHIP"
   elif [ "$buildtype" = 'hipmpi' ]; then
-    fort_ext_lib_flags="$fort_ext_lib_flags -DMPIV -DHIP_MPIV"
+    fort_ext_lib_flags="$fort_ext_lib_flags -DMPIV -DMPIV_GPU -DHIP_MPIV"
   fi
 
   # set the installer

diff --git a/quick-cmake/FindHipCUDA.cmake b/quick-cmake/FindHipCUDA.cmake
@@ -890,12 +890,12 @@ endif()
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath")
 
 # Use target ID syntax if supported for AMDGPU_TARGETS
-if(TARGET_ID_SUPPORT)
+#if(TARGET_ID_SUPPORT)
 #  set(AMDGPU_TARGETS gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack- CACHE STRING "List of specific machine types for library to target")
-  set(AMDGPU_TARGETS ${QUICK_USER_ARCH} CACHE STRING "List of specific machine types for library to target")
-else()
-  set(AMDGPU_TARGETS gfx803;gfx900;gfx906;gfx908;gfx90a CACHE STRING "List of specific machine types for library to target")
-endif()
+#else()
+#  set(AMDGPU_TARGETS gfx803;gfx900;gfx906;gfx908;gfx90a CACHE STRING "List of specific machine types for library to target")
+#endif()
+set(AMDGPU_TARGETS "${QUICK_USER_ARCH}" CACHE STRING "List of specific machine types for library to target")
 set(AMDGPU_TEST_TARGETS "" CACHE STRING "List of specific device types to test for") # Leave empty for default system device
 
 list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip)

diff --git a/quick-cmake/QUICKCudaConfig.cmake b/quick-cmake/QUICKCudaConfig.cmake
@@ -7,7 +7,6 @@ set(QUICK_GPU_TARGET_NAME "cuda")
 set(GPU_LD_FLAGS "") # hipcc requires special flags for linking (see below)
 
 if(CUDA)
-
     find_package(CUDA REQUIRED)
 
     if(NOT CUDA_FOUND)
@@ -126,7 +125,7 @@ if(CUDA)
         if("${QUICK_USER_ARCH}" MATCHES "maxwell")
 	    message(STATUS "Configuring QUICK for SM5.0")
             list(APPEND CUDA_NVCC_FLAGS ${SM50FLAGS})
-	    list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
+            list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
             set(DISABLE_OPTIMIZER_CONSTANTS TRUE)
             set(FOUND "TRUE")
         endif()
@@ -266,15 +265,15 @@ if(CUDA)
     endif()
 
     # extra CUDA flags
-    list(APPEND CUDA_NVCC_FLAGS -use_fast_math)
+    list(APPEND CUDA_NVCC_FLAGS --use_fast_math)
 
     if(TARGET_LINUX OR TARGET_OSX)
         list(APPEND CUDA_NVCC_FLAGS --compiler-options -fPIC)
     endif()
 
     # SPDF
     if(ENABLEF)
-        list(APPEND CUDA_NVCC_FLAGS -DCUDA_SPDF)
+        list(APPEND CUDA_NVCC_FLAGS -DGPU_SPDF)
     endif()
 
     if(DISABLE_OPTIMIZER_CONSTANTS)
@@ -299,9 +298,6 @@ endif()
 #option(HIP_RDC "Build relocatable device code, also known as separate compilation mode." FALSE)
 #option(HIP_WARP64 "Build for CDNA AMD GPUs (warp size 64) or RDNA (warp size 32)" TRUE)
 if(HIP)
-    # HIP builds currently unavailable (TODO: fix post release)
-    message(FATAL_ERROR "Error: HIP support is currently unavailable in this QUICK release. Support will be added back in a future release.")
-
     set(QUICK_GPU_PLATFORM "HIP")
     set(QUICK_GPU_TARGET_NAME "hip")
     set(GPU_LD_FLAGS -fgpu-rdc --hip-link)
@@ -325,19 +321,15 @@ if(HIP)
     endif()
 
     list(APPEND AMD_HIP_FLAGS -fPIC -std=c++14)
-    set(TARGET_ID_SUPPORT ON)
+    #set(TARGET_ID_SUPPORT ON)
 
 #    if(HIP_WARP64)
 #        add_compile_definitions(QUICK_PLATFORM_AMD_WARP64)
 #    endif()
 
-    # HIP codes currently do not support f-functions with -DUSE_LEGACY_ATOMICS targets (gfx906 and gfx908)
-    if(ENABLEF AND (("${QUICK_USER_ARCH}" STREQUAL "") OR ("${QUICK_USER_ARCH}" MATCHES "gfx906") OR ("${QUICK_USER_ARCH}" MATCHES "gfx908")))
-	    message(FATAL_ERROR "Error: Unsupported HIP options (ENABLEF with -DUSE_LEGACY_ATOMICS). ${PROJECT_NAME} support for f-functions requires newer HIP architecture targets not using LEGACY_ATOMICS.  Please specify architectures with QUICK_USER_ARCH not needing LEGACY_ATOMICS (post-gfx908) or disable f-function support.")
-    endif()
-
     if( NOT "${QUICK_USER_ARCH}" STREQUAL "")
         set(FOUND "FALSE")
+
         if("${QUICK_USER_ARCH}" MATCHES "gfx908")
             message(STATUS "Configuring QUICK for gfx908")
             list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS)
@@ -346,25 +338,78 @@ if(HIP)
 
         if("${QUICK_USER_ARCH}" MATCHES "gfx90a")
             message(STATUS "Configuring QUICK for gfx90a")
-            list(APPEND AMD_HIP_FLAGS -munsafe-fp-atomics -DAMD_ARCH_GFX90a)
+            list(APPEND AMD_HIP_FLAGS -DAMD_ARCH_GFX90a)
+            set(FOUND "TRUE")
+        endif()
+
+        if("${QUICK_USER_ARCH}" MATCHES "gfx942")
+            message(STATUS "Configuring QUICK for gfx942")
+            list(APPEND AMD_HIP_FLAGS -DAMD_ARCH_GFX90a)
             set(FOUND "TRUE")
         endif()
 
         if (NOT ${FOUND})
-            message(FATAL_ERROR "Invalid value for QUICK_USER_ARCH. Possible values are gfx908, gfx90a.")
+            message(FATAL_ERROR "Invalid value for QUICK_USER_ARCH. Possible values are gfx908, gfx90a, gfx942.")
         endif()
     else()
-        list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS)
         set(QUICK_USER_ARCH "gfx908")
+        list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS)
         message(STATUS "AMD GPU architecture not specified. Code will be optimized for gfx908.")
     endif()
 
     find_package(HipCUDA REQUIRED)
 
+    execute_process(COMMAND "${HIP_HIPCC_EXECUTABLE}" --version
+        OUTPUT_VARIABLE HIPCC_VERSION_OUTPUT
+        RESULT_VARIABLE HIPCC_VERSION_RESULT)
+    if(NOT HIPCC_VERSION_RESULT EQUAL "0")
+        message(FATAL_ERROR "Failed to get ROCm/HIP version.")
+    endif()
+
+    # Parse "rocm-X.Y.Z" from the hipcc output (dots escaped to match literally); warn if absent.
+    string(REPLACE "\n" ";" HIPCC_VERSION_OUTPUT "${HIPCC_VERSION_OUTPUT}")
+    if(NOT "${HIPCC_VERSION_OUTPUT}" MATCHES "rocm-([0-9]+)\\.([0-9]+)\\.([0-9]+)")
+        message(WARNING "Could not parse ROCm/HIP version from hipcc output; ROCm compatibility check is skipped.")
+    endif()
+    set(HIP_VERSION_MAJOR ${CMAKE_MATCH_1})
+    set(HIP_VERSION_MINOR ${CMAKE_MATCH_2})
+    set(HIP_VERSION_PATCH ${CMAKE_MATCH_3})
+    set(HIP_VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_PATCH}" CACHE STRING "ROCm/HIP version (reported by hipcc).")
+    mark_as_advanced(HIP_VERSION)
+    message(STATUS "Detected ROCm/HIP version: ${HIP_VERSION}")
+
+    # Reject ROCm >= 5.4.3 (as reported by hipcc): per the message below, the QUICK HIP
+    # codes trigger a known compiler scalar register fill/spill bug in those versions.
+    if("${HIP_VERSION}" VERSION_GREATER_EQUAL "5.4.3")
+        message(STATUS "")
+        message("************************************************************")
+        message("Error: Incompatible ROCm/HIP version: ${HIP_VERSION}")
+        message("  The QUICK HIP codes trigger a known compiler scalar register ")
+        message("  fill/spill bug in ROCm >= v5.4.3.")
+        message("  Please build QUICK with a known working ROCm version.")
+        message("************************************************************")
+        message(STATUS "")
+        message(FATAL_ERROR)
+    endif()
+
     list(APPEND CUDA_NVCC_FLAGS ${AMD_HIP_FLAGS})
 
+    if(QUICK_DEBUG_HIP_ASAN)
+	set(QUICK_USER_ARCH "${QUICK_USER_ARCH}:xnack+")
+	list(APPEND CUDA_NVCC_FLAGS -fsanitize=address -fsanitize-recover=address -shared-libsan -g --offload-arch=${QUICK_USER_ARCH})
+    endif()
+
+    # SPDF
+    if(ENABLEF)
+        list(APPEND CUDA_NVCC_FLAGS -DGPU_SPDF)
+    endif()
+
+    if(USE_LEGACY_ATOMICS)
+        list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
+    endif()
+
     set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-    set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+    set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
 
 #    if(HIP_RDC)
 #        # Only hipcc can link a library compiled using RDC mode