Skip to content

Commit

Permalink
[MINOR] Push loading Cuda libraries until when required
Browse files Browse the repository at this point in the history
This patch pushes out the loading of cuDNN and cuSPARSE libraries
until required. Moreover, we now record the unusable GPU memory due
to fragmentation and use that to avoid unnecessary cudaMalloc failures.
  • Loading branch information
phaniarnab committed Aug 8, 2023
1 parent 6b3a6ce commit 8033619
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@
import static jcuda.runtime.JCuda.cudaFree;

public class CudaMemoryAllocator implements GPUMemoryAllocator {

// Record the unusable free memory to avoid unnecessary cudaMalloc calls.
// An allocation request may fail due to fragmented memory even if cudaMemGetInfo
// says enough memory is available. CudaMalloc is expensive even when fails.
private static long unusableFreeMem = 0;

/**
* Allocate memory on the device.
*
Expand All @@ -41,10 +45,15 @@ public class CudaMemoryAllocator implements GPUMemoryAllocator {
* @throws jcuda.CudaException if unable to allocate
*/
@Override
public void allocate(Pointer devPtr, long size) throws CudaException {
int status = cudaMalloc(devPtr, size);
if(status != cudaSuccess) {
throw new jcuda.CudaException("cudaMalloc failed:" + cudaError.stringFor(status));
public void allocate(Pointer devPtr, long size) {
try {
int status = cudaMalloc(devPtr, size);
}
catch(CudaException e) {
if (e.getMessage().equals("cudaErrorMemoryAllocation"))
// Update unusable memory
unusableFreeMem = getAvailableMemory();
throw new jcuda.CudaException("cudaMalloc failed: " + e.getMessage());
}
}

Expand All @@ -70,7 +79,7 @@ public void free(Pointer devPtr) throws CudaException {
*/
@Override
public boolean canAllocate(long size) {
return size <= getAvailableMemory();
return size <= (getAvailableMemory() - unusableFreeMem);
}

/**
Expand All @@ -86,4 +95,8 @@ public long getAvailableMemory() {
return (long) (free[0] * DMLScript.GPU_MEMORY_UTILIZATION_FACTOR);
}

public static void resetUnusableFreeMemory() {
unusableFreeMem = 0;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ protected GPUContext(int deviceNum) {


if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);

memoryManager = new GPUMemoryManager(this);
}
Expand Down Expand Up @@ -139,10 +139,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
// This has a huge performance impact on scripts that has large number of layers (i.e. FunctionCallCP) for example ResNet.
// If this is absolutely required for parfor, please add appropriate safeguard for non-parfor scripts.
// deleteCudaLibraryHandles();
if (cudnnHandle == null) {
/*if (cudnnHandle == null) {
cudnnHandle = new cudnnHandle();
cudnnCreate(cudnnHandle);
}
}*/

if (cublasHandle == null) {
cublasHandle = new cublasHandle();
Expand All @@ -152,10 +152,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);

if (cusparseHandle == null) {
/*if (cusparseHandle == null) {
cusparseHandle = new cusparseHandle();
cusparseCreate(cusparseHandle);
}
}*/

if (kernels == null) {
kernels = new JCudaKernels();
Expand Down Expand Up @@ -340,6 +340,15 @@ public int getWarpSize() {
* @return cudnnHandle for current thread
*/
public cudnnHandle getCudnnHandle() {
if (cudnnHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuDNN Library Handle");
long start = System.nanoTime();
cudnnHandle = new cudnnHandle();
cudnnCreate(cudnnHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cudnnHandle;
}

Expand All @@ -349,6 +358,15 @@ public cudnnHandle getCudnnHandle() {
* @return cublasHandle for current thread
*/
public cublasHandle getCublasHandle() {
if (cublasHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuBLAS Library Handle");
long start = System.nanoTime();
cublasHandle = new cublasHandle();
cublasCreate(cublasHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cublasHandle;
}

Expand All @@ -358,6 +376,15 @@ public cublasHandle getCublasHandle() {
* @return cusparseHandle for current thread
*/
public cusparseHandle getCusparseHandle() {
if (cusparseHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuSPARSE Library Handle");
long start = System.nanoTime();
cusparseHandle = new cusparseHandle();
cusparseCreate(cusparseHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cusparseHandle;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public interface GPUMemoryAllocator {
* @param size size in bytes
* @throws jcuda.CudaException if unable to allocate
*/
public void allocate(Pointer devPtr, long size) throws jcuda.CudaException;
public void allocate(Pointer devPtr, long size);

/**
* Frees memory on the device
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ private Set<Pointer> getNonMatrixLockedPointers() {
* To record size of all allocated pointers allocated by above memory managers
*/
protected final HashMap<Pointer, PointerInfo> allPointers = new HashMap<>();

/*****************************************************************************************/


/**
* Get size of allocated GPU Pointer
Expand Down Expand Up @@ -415,6 +415,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
matrixMemoryManager.clearAllUnlocked(opcode);
CudaMemoryAllocator.resetUnusableFreeMemory();
LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
A = cudaMallocNoWarn(tmpA, size, null);
}
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/apache/sysds/utils/GPUStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class GPUStatistics {
private static int iNoOfExecutedGPUInst = 0;

public static long cudaInitTime = 0;
public static long cudaLibrariesInitTime = 0;
public static LongAdder cudaLibrariesInitTime = new LongAdder();
public static LongAdder cudaSparseToDenseTime = new LongAdder(); // time spent in converting sparse matrix block to dense
public static LongAdder cudaDenseToSparseTime = new LongAdder(); // time spent in converting dense matrix block to sparse
public static LongAdder cudaSparseConversionTime = new LongAdder(); // time spent in converting between sparse block types
Expand Down Expand Up @@ -96,7 +96,7 @@ public static void resetMiscTimers(){
*/
public static void reset(){
cudaInitTime = 0;
cudaLibrariesInitTime = 0;
cudaLibrariesInitTime.reset();
cudaAllocTime.reset();
cudaDeAllocTime.reset();
cudaMemSet0Time.reset();
Expand Down Expand Up @@ -183,7 +183,7 @@ public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
public static String getStringForCudaTimers() {
StringBuffer sb = new StringBuffer();
sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
+ String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
+ String.format("%.3f", cudaLibrariesInitTime.longValue()*1e-9) + " sec.\n");
sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
// cudaSparseConversionCount
sb.append("GPU mem alloc time (alloc(success/fail) / dealloc / set0):\t"
Expand Down

0 comments on commit 8033619

Please sign in to comment.