Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cache GPU objects on second hit #1876

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/main/java/org/apache/sysds/lops/UnaryCP.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,12 @@ public String getOpCode() {

@Override
public String getInstructions(String input, String output) {
return InstructionUtils.concatOperands(
String ret = InstructionUtils.concatOperands(
getExecType().name(), getOpCode(),
getInputs().get(0).prepScalarInputOperand(getExecType()),
prepOutputOperand(output),
Integer.toString(_numThreads));
prepOutputOperand(output));
if (getExecType() == ExecType.CP || getExecType() == ExecType.FED)
ret = InstructionUtils.concatOperands(ret, Integer.toString(_numThreads));
return ret;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@
import static jcuda.runtime.JCuda.cudaFree;

public class CudaMemoryAllocator implements GPUMemoryAllocator {

// Record the unusable free memory to avoid unnecessary cudaMalloc calls.
// An allocation request may fail due to fragmented memory even if cudaMemGetInfo
// says enough memory is available. cudaMalloc is expensive even when it fails.
private static long unusableFreeMem = 0;

/**
* Allocate memory on the device.
*
Expand All @@ -41,10 +45,15 @@ public class CudaMemoryAllocator implements GPUMemoryAllocator {
* @throws jcuda.CudaException if unable to allocate
*/
@Override
public void allocate(Pointer devPtr, long size) throws CudaException {
int status = cudaMalloc(devPtr, size);
if(status != cudaSuccess) {
throw new jcuda.CudaException("cudaMalloc failed:" + cudaError.stringFor(status));
public void allocate(Pointer devPtr, long size) {
try {
int status = cudaMalloc(devPtr, size);
}
catch(CudaException e) {
if (e.getMessage().equals("cudaErrorMemoryAllocation"))
// Update unusable memory
unusableFreeMem = getAvailableMemory();
throw new jcuda.CudaException("cudaMalloc failed: " + e.getMessage());
}
}

Expand All @@ -70,7 +79,7 @@ public void free(Pointer devPtr) throws CudaException {
*/
@Override
public boolean canAllocate(long size) {
return size <= getAvailableMemory();
return size <= (getAvailableMemory() - unusableFreeMem);
}

/**
Expand All @@ -86,4 +95,8 @@ public long getAvailableMemory() {
return (long) (free[0] * DMLScript.GPU_MEMORY_UTILIZATION_FACTOR);
}

public static void resetUnusableFreeMemory() {
unusableFreeMem = 0;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ protected GPUContext(int deviceNum) {


if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);

memoryManager = new GPUMemoryManager(this);
}
Expand Down Expand Up @@ -139,10 +139,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
// This has a huge performance impact on scripts that have a large number of layers (i.e. FunctionCallCP) for example ResNet.
// If this is absolutely required for parfor, please add appropriate safeguard for non-parfor scripts.
// deleteCudaLibraryHandles();
if (cudnnHandle == null) {
/*if (cudnnHandle == null) {
cudnnHandle = new cudnnHandle();
cudnnCreate(cudnnHandle);
}
}*/

if (cublasHandle == null) {
cublasHandle = new cublasHandle();
Expand All @@ -152,10 +152,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);

if (cusparseHandle == null) {
/*if (cusparseHandle == null) {
cusparseHandle = new cusparseHandle();
cusparseCreate(cusparseHandle);
}
}*/

if (kernels == null) {
kernels = new JCudaKernels();
Expand Down Expand Up @@ -340,6 +340,15 @@ public int getWarpSize() {
* @return cudnnHandle for current thread
*/
public cudnnHandle getCudnnHandle() {
if (cudnnHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuDNN Library Handle");
long start = System.nanoTime();
cudnnHandle = new cudnnHandle();
cudnnCreate(cudnnHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cudnnHandle;
}

Expand All @@ -349,6 +358,15 @@ public cudnnHandle getCudnnHandle() {
* @return cublasHandle for current thread
*/
public cublasHandle getCublasHandle() {
if (cublasHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuBLAS Library Handle");
long start = System.nanoTime();
cublasHandle = new cublasHandle();
cublasCreate(cublasHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cublasHandle;
}

Expand All @@ -358,6 +376,15 @@ public cublasHandle getCublasHandle() {
* @return cusparseHandle for current thread
*/
public cusparseHandle getCusparseHandle() {
if (cusparseHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuSPARSE Library Handle");
long start = System.nanoTime();
cusparseHandle = new cusparseHandle();
cusparseCreate(cusparseHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cusparseHandle;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public interface GPUMemoryAllocator {
* @param size size in bytes
* @throws jcuda.CudaException if unable to allocate
*/
public void allocate(Pointer devPtr, long size) throws jcuda.CudaException;
public void allocate(Pointer devPtr, long size);

/**
* Frees memory on the device
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ private Set<Pointer> getNonMatrixLockedPointers() {
* To record size of all allocated pointers allocated by above memory managers
*/
protected final HashMap<Pointer, PointerInfo> allPointers = new HashMap<>();

/*****************************************************************************************/


/**
* Get size of allocated GPU Pointer
Expand Down Expand Up @@ -415,6 +415,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
matrixMemoryManager.clearAllUnlocked(opcode);
CudaMemoryAllocator.resetUnusableFreeMemory();
LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
A = cudaMallocNoWarn(tmpA, size, null);
}
Expand Down
55 changes: 43 additions & 12 deletions src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ else if (e.isRDDPersist()) {
//putValueRDD method will save the RDD and call persist
e.setCacheStatus(LineageCacheStatus.PERSISTEDRDD);
//Cannot reuse RDD as already garbage collected
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
case PERSISTEDRDD:
//Reuse the persisted intermediate at the executors
Expand All @@ -180,6 +181,12 @@ else if (e.isRDDPersist()) {
Pointer gpuPtr = e.getGPUPointer();
if (gpuPtr == null && e.getCacheStatus() == LineageCacheStatus.NOTCACHED)
return false; //the executing thread removed this entry from cache
if (e.getCacheStatus() == LineageCacheStatus.TOCACHEGPU) { //second hit
//Cannot reuse as already garbage collected
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
}
//Reuse from third hit onwards (status == GPUCACHED)
//Create a GPUObject with the cached pointer
GPUObject gpuObj = new GPUObject(ec.getGPUContext(0),
ec.getMatrixObject(outName), gpuPtr);
Expand Down Expand Up @@ -295,9 +302,20 @@ else if (e.isScalarValue()) {
if (reuse) {
//Additional maintenance for GPU pointers and RDDs
for (LineageCacheEntry e : funcOutLIs) {
if (e.isGPUObject())
//Increment the live count for this pointer
LineageGPUCacheEviction.incrementLiveCount(e.getGPUPointer());
if (e.isGPUObject()) {
switch(e.getCacheStatus()) {
case TOCACHEGPU:
//Cannot reuse as already garbage collected; the putValue method
// will save the pointer while caching the original instruction
return false;
case GPUCACHED:
//Increment the live count for this pointer
LineageGPUCacheEviction.incrementLiveCount(e.getGPUPointer());
break;
default:
return false;
}
}
else if (e.isRDDPersist()) {
//Reuse the cached RDD (local or persisted at the executors)
switch(e.getCacheStatus()) {
Expand Down Expand Up @@ -598,7 +616,7 @@ public static void putValue(Instruction inst, ExecutionContext ec, long starttim
}
}
else if (inst instanceof GPUInstruction) {
// TODO: gpu multiretrun instructions
// TODO: gpu multi-return instructions
Data gpudata = ec.getVariable(((GPUInstruction) inst)._output);
liGPUObj = gpudata instanceof MatrixObject ?
ec.getMatrixObject(((GPUInstruction)inst)._output).
Expand Down Expand Up @@ -708,14 +726,27 @@ private static void putValueGPU(GPUObject gpuObj, LineageItem instLI, long compu
removePlaceholder(instLI);
return;
}
// Update the total size of lineage cached gpu objects
// The eviction is handled by the unified gpu memory manager
LineageGPUCacheEviction.updateSize(gpuObj.getAllocatedSize(), true);
// Set the GPUObject in the cache
centry.setGPUValue(gpuObj.getDensePointer(), gpuObj.getAllocatedSize(),
gpuObj.getMatrixObject().getMetaData(), computetime);
// Maintain order for eviction
LineageGPUCacheEviction.addEntry(centry);
switch(centry.getCacheStatus()) {
case EMPTY: //first hit
// Set the GPUObject in the cache. Will be garbage collected
centry.setGPUValue(gpuObj.getDensePointer(), gpuObj.getAllocatedSize(),
gpuObj.getMatrixObject().getMetaData(), computetime);
centry.setCacheStatus(LineageCacheStatus.TOCACHEGPU);
break;
case TOCACHEGPU: //second hit
// Update the total size of lineage cached gpu objects
// The eviction is handled by the unified gpu memory manager
LineageGPUCacheEviction.updateSize(gpuObj.getAllocatedSize(), true);
// Set the GPUObject in the cache and update the status
centry.setGPUValue(gpuObj.getDensePointer(), gpuObj.getAllocatedSize(),
gpuObj.getMatrixObject().getMetaData(), computetime);
centry.setCacheStatus(LineageCacheStatus.GPUCACHED);
// Maintain order for eviction
LineageGPUCacheEviction.addEntry(centry);
break;
default:
throw new DMLRuntimeException("Execution should not reach here: "+centry._key);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public class LineageCacheConfig
"^", "uamax", "uark+", "uacmean", "eigen", "ctableexpand", "replace",
"^2", "*2", "uack+", "tak+*", "uacsqk+", "uark+", "n+", "uarimax", "qsort",
"qpick", "transformapply", "uarmax", "n+", "-*", "castdtm", "lowertri",
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*"
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*", "=="
//TODO: Reuse everything.
};

Expand Down Expand Up @@ -152,6 +152,7 @@ protected enum LineageCacheStatus {
SPILLED, //Data is in disk. Empty value. Cannot be evicted.
RELOADED, //Reloaded from disk. Can be evicted.
PINNED, //Pinned to memory. Cannot be evicted.
TOCACHEGPU, //To be cached in GPU if the instruction reoccurs
GPUCACHED, //Points to GPU intermediate
PERSISTEDRDD, //Persisted at the Spark executors
TOPERSISTRDD, //To be persisted if the instruction reoccurs
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/apache/sysds/utils/GPUStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class GPUStatistics {
private static int iNoOfExecutedGPUInst = 0;

public static long cudaInitTime = 0;
public static long cudaLibrariesInitTime = 0;
public static LongAdder cudaLibrariesInitTime = new LongAdder();
public static LongAdder cudaSparseToDenseTime = new LongAdder(); // time spent in converting sparse matrix block to dense
public static LongAdder cudaDenseToSparseTime = new LongAdder(); // time spent in converting dense matrix block to sparse
public static LongAdder cudaSparseConversionTime = new LongAdder(); // time spent in converting between sparse block types
Expand Down Expand Up @@ -96,7 +96,7 @@ public static void resetMiscTimers(){
*/
public static void reset(){
cudaInitTime = 0;
cudaLibrariesInitTime = 0;
cudaLibrariesInitTime.reset();
cudaAllocTime.reset();
cudaDeAllocTime.reset();
cudaMemSet0Time.reset();
Expand Down Expand Up @@ -183,7 +183,7 @@ public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
public static String getStringForCudaTimers() {
StringBuffer sb = new StringBuffer();
sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
+ String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
+ String.format("%.3f", cudaLibrariesInitTime.longValue()*1e-9) + " sec.\n");
sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
// cudaSparseConversionCount
sb.append("GPU mem alloc time (alloc(success/fail) / dealloc / set0):\t"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.sysds.runtime.lineage.Lineage;
import org.apache.sysds.runtime.lineage.LineageCacheConfig;
import org.apache.sysds.runtime.lineage.LineageCacheStatistics;
import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
Expand All @@ -42,7 +43,7 @@ public class LineageReuseSparkTest extends AutomatedTestBase {

protected static final String TEST_DIR = "functions/async/";
protected static final String TEST_NAME = "LineageReuseSpark";
protected static final int TEST_VARIANTS = 6;
protected static final int TEST_VARIANTS = 7;
protected static String TEST_CLASS_DIR = TEST_DIR + LineageReuseSparkTest.class.getSimpleName() + "/";

@Override
Expand All @@ -54,44 +55,48 @@ public void setUp() {

@Test
public void testlmdsHB() {
runTest(TEST_NAME+"1", ExecMode.HYBRID, 1);
runTest(TEST_NAME+"1", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 1);
}

@Test
public void testlmdsSP() {
// Only reuse the actions
runTest(TEST_NAME+"1", ExecMode.SPARK, 1);
runTest(TEST_NAME+"1", ExecMode.SPARK, ReuseCacheType.REUSE_MULTILEVEL, 1);
}

@Test
public void testlmdsRDD() {
// Cache all RDDs and persist shuffle-based Spark operations (eg. rmm, cpmm)
runTest(TEST_NAME+"2", ExecMode.HYBRID, 2);
runTest(TEST_NAME+"2", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 2);
}

@Test
public void testL2svm() {
runTest(TEST_NAME+"3", ExecMode.HYBRID, 3);
runTest(TEST_NAME+"3", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 3);
}

@Test
public void testlmdsMultiLevel() {
// Cache RDD and matrix block function returns and reuse
runTest(TEST_NAME+"4", ExecMode.HYBRID, 4);
runTest(TEST_NAME+"4", ExecMode.HYBRID, ReuseCacheType.REUSE_MULTILEVEL, 4);
}

@Test
public void testEnsemble() {
runTest(TEST_NAME+"5", ExecMode.HYBRID, 5);
runTest(TEST_NAME+"5", ExecMode.HYBRID, ReuseCacheType.REUSE_MULTILEVEL, 5);
}

//FIXME: Collecting a persisted RDD still needs the broadcast vars. Debug.
/*@Test
public void testHyperband() {
runTest(TEST_NAME+"6", ExecMode.HYBRID, 6);
runTest(TEST_NAME+"6", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 6);
}*/
/*@Test
public void testBroadcastBug() {
runTest(TEST_NAME+"7", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 7);
}*/

public void runTest(String testname, ExecMode execMode, int testId) {
public void runTest(String testname, ExecMode execMode, LineageCacheConfig.ReuseCacheType reuse, int testId) {
boolean old_simplification = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
boolean old_sum_product = OptimizerUtils.ALLOW_SUM_PRODUCT_REWRITES;
boolean old_trans_exec_type = OptimizerUtils.ALLOW_TRANSITIVE_SPARK_EXEC_TYPE;
Expand Down Expand Up @@ -126,7 +131,7 @@ public void runTest(String testname, ExecMode execMode, int testId) {
//proArgs.add("recompile_runtime");
proArgs.add("-stats");
proArgs.add("-lineage");
proArgs.add(LineageCacheConfig.ReuseCacheType.REUSE_MULTILEVEL.name().toLowerCase());
proArgs.add(reuse.name().toLowerCase());
proArgs.add("-args");
proArgs.add(output("R"));
programArgs = proArgs.toArray(new String[proArgs.size()]);
Expand Down
Loading
Loading