[MINOR] Push loading Cuda libraries until when required

This patch pushes out the loading of cuDNN and cuSPARSE libraries until required. Moreover, we now record the unusable GPU memory due to fragmentation and use that to avoid unnecessary cudaMalloc failures.
apache · Aug 8, 2023 · 8033619 · 8033619
1 parent 6b3a6ce
commit 8033619
Show file tree

Hide file tree

Showing 5 changed files with 58 additions and 17 deletions.
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CudaMemoryAllocator.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CudaMemoryAllocator.java
@@ -32,7 +32,11 @@
 import static jcuda.runtime.JCuda.cudaFree;
 
 public class CudaMemoryAllocator implements GPUMemoryAllocator {
-
+	// Record the unusable free memory to avoid unnecessary cudaMalloc calls.
+	// An allocation request may fail due to fragmented memory even if cudaMemGetInfo
+	// says enough memory is available. CudaMalloc is expensive even when fails.
+	private static long unusableFreeMem = 0;
+
 	/**
 	 * Allocate memory on the device. 
 	 * 
@@ -41,10 +45,15 @@ public class CudaMemoryAllocator implements GPUMemoryAllocator {
 	 * @throws jcuda.CudaException if unable to allocate
 	 */
 	@Override
-	public void allocate(Pointer devPtr, long size) throws CudaException {
-		int status = cudaMalloc(devPtr, size);
-		if(status != cudaSuccess) {
-			throw new jcuda.CudaException("cudaMalloc failed:" + cudaError.stringFor(status));
+	public void allocate(Pointer devPtr, long size) {
+		try {
+			int status = cudaMalloc(devPtr, size);
+		}
+		catch(CudaException e) {
+			if (e.getMessage().equals("cudaErrorMemoryAllocation"))
+				// Update unusable memory
+				unusableFreeMem = getAvailableMemory();
+			throw new jcuda.CudaException("cudaMalloc failed: " + e.getMessage());
 		}
 	}
 
@@ -70,7 +79,7 @@ public void free(Pointer devPtr) throws CudaException {
 	 */
 	@Override
 	public boolean canAllocate(long size) {
-		return size <= getAvailableMemory();
+		return size <= (getAvailableMemory() - unusableFreeMem);
 	}
 
 	/**
@@ -86,4 +95,8 @@ public long getAvailableMemory() {
 		return (long) (free[0] * DMLScript.GPU_MEMORY_UTILIZATION_FACTOR);
 	}
 
+	public static void resetUnusableFreeMemory() {
+		unusableFreeMem = 0;
+	}
+
 }
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -107,7 +107,7 @@ protected GPUContext(int deviceNum) {
 
 
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
+			GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
 
 		memoryManager = new GPUMemoryManager(this);
 	}
@@ -139,10 +139,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
 		// This has a huge performance impact on scripts that has large number of layers (i.e. FunctionCallCP) for example ResNet.
 		// If this is absolutely required for parfor, please add appropriate safeguard for non-parfor scripts. 
 		// deleteCudaLibraryHandles();
-		if (cudnnHandle == null) {
+		/*if (cudnnHandle == null) {
 			cudnnHandle = new cudnnHandle();
 			cudnnCreate(cudnnHandle);
-		}
+		}*/
 
 		if (cublasHandle == null) {
 			cublasHandle = new cublasHandle();
@@ -152,10 +152,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
 		// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
 		// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
 
-		if (cusparseHandle == null) {
+		/*if (cusparseHandle == null) {
 			cusparseHandle = new cusparseHandle();
 			cusparseCreate(cusparseHandle);
-		}
+		}*/
 
 		if (kernels == null) {
 			kernels = new JCudaKernels();
@@ -340,6 +340,15 @@ public int getWarpSize() {
 	 * @return cudnnHandle for current thread
 	 */
 	public cudnnHandle getCudnnHandle() {
+		if (cudnnHandle == null) {
+			// Load the library if not done already
+			GPUContext.LOG.info("Initializing cuDNN Library Handle");
+			long start = System.nanoTime();
+			cudnnHandle = new cudnnHandle();
+			cudnnCreate(cudnnHandle);
+			if (DMLScript.STATISTICS)
+				GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
+		}
 		return cudnnHandle;
 	}
 
@@ -349,6 +358,15 @@ public cudnnHandle getCudnnHandle() {
 	 * @return cublasHandle for current thread
 	 */
 	public cublasHandle getCublasHandle() {
+		if (cublasHandle == null) {
+			// Load the library if not done already
+			GPUContext.LOG.info("Initializing cuBLAS Library Handle");
+			long start = System.nanoTime();
+			cublasHandle = new cublasHandle();
+			cublasCreate(cublasHandle);
+			if (DMLScript.STATISTICS)
+				GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
+		}
 		return cublasHandle;
 	}
 
@@ -358,6 +376,15 @@ public cublasHandle getCublasHandle() {
 	 * @return cusparseHandle for current thread
 	 */
 	public cusparseHandle getCusparseHandle() {
+		if (cusparseHandle == null) {
+			// Load the library if not done already
+			GPUContext.LOG.info("Initializing cuSPARSE Library Handle");
+			long start = System.nanoTime();
+			cusparseHandle = new cusparseHandle();
+			cusparseCreate(cusparseHandle);
+			if (DMLScript.STATISTICS)
+				GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
+		}
 		return cusparseHandle;
 	}
 

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryAllocator.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryAllocator.java
@@ -30,7 +30,7 @@ public interface GPUMemoryAllocator {
 	 * @param size size in bytes
 	 * @throws jcuda.CudaException if unable to allocate
 	 */
-	public void allocate(Pointer devPtr, long size) throws jcuda.CudaException;
+	public void allocate(Pointer devPtr, long size);
 
 	/**
 	 * Frees memory on the device

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -97,9 +97,9 @@ private Set<Pointer> getNonMatrixLockedPointers() {
 	 * To record size of all allocated pointers allocated by above memory managers
 	 */
 	protected final HashMap<Pointer, PointerInfo> allPointers = new HashMap<>();
-	
+
 	/*****************************************************************************************/
-	
+
 
 	/**
 	 * Get size of allocated GPU Pointer
@@ -415,6 +415,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
 			LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
 			LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
 			matrixMemoryManager.clearAllUnlocked(opcode);
+			CudaMemoryAllocator.resetUnusableFreeMemory();
 			LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
 			A = cudaMallocNoWarn(tmpA, size, null);
 		}

diff --git a/src/main/java/org/apache/sysds/utils/GPUStatistics.java b/src/main/java/org/apache/sysds/utils/GPUStatistics.java
@@ -36,7 +36,7 @@ public class GPUStatistics {
 	private static int iNoOfExecutedGPUInst = 0;
 
 	public static long cudaInitTime = 0;
-	public static long cudaLibrariesInitTime = 0;
+	public static LongAdder cudaLibrariesInitTime = new LongAdder();
 	public static LongAdder cudaSparseToDenseTime = new LongAdder();		// time spent in converting sparse matrix block to dense
 	public static LongAdder cudaDenseToSparseTime = new LongAdder();		// time spent in converting dense matrix block to sparse
 	public static LongAdder cudaSparseConversionTime = new LongAdder();	// time spent in converting between sparse block types
@@ -96,7 +96,7 @@ public static void resetMiscTimers(){
 	 */
 	public static void reset(){
 		cudaInitTime = 0;
-		cudaLibrariesInitTime = 0;
+		cudaLibrariesInitTime.reset();
 		cudaAllocTime.reset();
 		cudaDeAllocTime.reset();
 		cudaMemSet0Time.reset();
@@ -183,7 +183,7 @@ public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
 	public static String getStringForCudaTimers() {
 		StringBuffer sb = new StringBuffer();
 		sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
-				+ String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
+				+ String.format("%.3f", cudaLibrariesInitTime.longValue()*1e-9) + " sec.\n");
 		sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
 		// cudaSparseConversionCount
 		sb.append("GPU mem alloc time  (alloc(success/fail) / dealloc / set0):\t"