Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cache GPU objects on second hit #1876

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/main/java/org/apache/sysds/lops/UnaryCP.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,12 @@ public String getOpCode() {

@Override
public String getInstructions(String input, String output) {
return InstructionUtils.concatOperands(
String ret = InstructionUtils.concatOperands(
getExecType().name(), getOpCode(),
getInputs().get(0).prepScalarInputOperand(getExecType()),
prepOutputOperand(output),
Integer.toString(_numThreads));
prepOutputOperand(output));
if (getExecType() == ExecType.CP || getExecType() == ExecType.FED)
ret = InstructionUtils.concatOperands(ret, Integer.toString(_numThreads));
return ret;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@
import static jcuda.runtime.JCuda.cudaFree;

public class CudaMemoryAllocator implements GPUMemoryAllocator {

// Record the unusable free memory to avoid unnecessary cudaMalloc calls.
// An allocation request may fail due to fragmented memory even if cudaMemGetInfo
// says enough memory is available. cudaMalloc is expensive even when it fails.
private static long unusableFreeMem = 0;

/**
* Allocate memory on the device.
*
Expand All @@ -41,10 +45,15 @@ public class CudaMemoryAllocator implements GPUMemoryAllocator {
* @throws jcuda.CudaException if unable to allocate
*/
@Override
public void allocate(Pointer devPtr, long size) throws CudaException {
int status = cudaMalloc(devPtr, size);
if(status != cudaSuccess) {
throw new jcuda.CudaException("cudaMalloc failed:" + cudaError.stringFor(status));
public void allocate(Pointer devPtr, long size) {
try {
int status = cudaMalloc(devPtr, size);
}
catch(CudaException e) {
if (e.getMessage().equals("cudaErrorMemoryAllocation"))
// Update unusable memory
unusableFreeMem = getAvailableMemory();
throw new jcuda.CudaException("cudaMalloc failed: " + e.getMessage());
}
}

Expand All @@ -70,7 +79,7 @@ public void free(Pointer devPtr) throws CudaException {
*/
@Override
public boolean canAllocate(long size) {
return size <= getAvailableMemory();
return size <= (getAvailableMemory() - unusableFreeMem);
}

/**
Expand All @@ -86,4 +95,8 @@ public long getAvailableMemory() {
return (long) (free[0] * DMLScript.GPU_MEMORY_UTILIZATION_FACTOR);
}

public static void resetUnusableFreeMemory() {
unusableFreeMem = 0;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ protected GPUContext(int deviceNum) {


if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);

memoryManager = new GPUMemoryManager(this);
}
Expand Down Expand Up @@ -139,10 +139,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
// This has a huge performance impact on scripts that have a large number of layers (i.e. FunctionCallCP) for example ResNet.
// If this is absolutely required for parfor, please add appropriate safeguard for non-parfor scripts.
// deleteCudaLibraryHandles();
if (cudnnHandle == null) {
/*if (cudnnHandle == null) {
cudnnHandle = new cudnnHandle();
cudnnCreate(cudnnHandle);
}
}*/

if (cublasHandle == null) {
cublasHandle = new cublasHandle();
Expand All @@ -152,10 +152,10 @@ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);

if (cusparseHandle == null) {
/*if (cusparseHandle == null) {
cusparseHandle = new cusparseHandle();
cusparseCreate(cusparseHandle);
}
}*/

if (kernels == null) {
kernels = new JCudaKernels();
Expand Down Expand Up @@ -340,6 +340,15 @@ public int getWarpSize() {
* @return cudnnHandle for current thread
*/
public cudnnHandle getCudnnHandle() {
if (cudnnHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuDNN Library Handle");
long start = System.nanoTime();
cudnnHandle = new cudnnHandle();
cudnnCreate(cudnnHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cudnnHandle;
}

Expand All @@ -349,6 +358,15 @@ public cudnnHandle getCudnnHandle() {
* @return cublasHandle for current thread
*/
public cublasHandle getCublasHandle() {
if (cublasHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuBLAS Library Handle");
long start = System.nanoTime();
cublasHandle = new cublasHandle();
cublasCreate(cublasHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cublasHandle;
}

Expand All @@ -358,6 +376,15 @@ public cublasHandle getCublasHandle() {
* @return cusparseHandle for current thread
*/
public cusparseHandle getCusparseHandle() {
if (cusparseHandle == null) {
// Load the library if not done already
GPUContext.LOG.info("Initializing cuSPARSE Library Handle");
long start = System.nanoTime();
cusparseHandle = new cusparseHandle();
cusparseCreate(cusparseHandle);
if (DMLScript.STATISTICS)
GPUStatistics.cudaLibrariesInitTime.add(System.nanoTime() - start);
}
return cusparseHandle;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public interface GPUMemoryAllocator {
* @param size size in bytes
* @throws jcuda.CudaException if unable to allocate
*/
public void allocate(Pointer devPtr, long size) throws jcuda.CudaException;
public void allocate(Pointer devPtr, long size);

/**
* Frees memory on the device
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ private Set<Pointer> getNonMatrixLockedPointers() {
* To record size of all allocated pointers allocated by above memory managers
*/
protected final HashMap<Pointer, PointerInfo> allPointers = new HashMap<>();

/*****************************************************************************************/


/**
* Get size of allocated GPU Pointer
Expand Down Expand Up @@ -415,6 +415,7 @@ public Pointer malloc(String opcode, long size, boolean initialize) {
LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
matrixMemoryManager.clearAllUnlocked(opcode);
CudaMemoryAllocator.resetUnusableFreeMemory();
LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
A = cudaMallocNoWarn(tmpA, size, null);
}
Expand Down
55 changes: 43 additions & 12 deletions src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ else if (e.isRDDPersist()) {
//putValueRDD method will save the RDD and call persist
e.setCacheStatus(LineageCacheStatus.PERSISTEDRDD);
//Cannot reuse RDD as already garbage collected
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
case PERSISTEDRDD:
//Reuse the persisted intermediate at the executors
Expand All @@ -180,6 +181,12 @@ else if (e.isRDDPersist()) {
Pointer gpuPtr = e.getGPUPointer();
if (gpuPtr == null && e.getCacheStatus() == LineageCacheStatus.NOTCACHED)
return false; //the executing thread removed this entry from cache
if (e.getCacheStatus() == LineageCacheStatus.TOCACHEGPU) { //second hit
//Cannot reuse as already garbage collected
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
}
//Reuse from third hit onwards (status == GPUCACHED)
//Create a GPUObject with the cached pointer
GPUObject gpuObj = new GPUObject(ec.getGPUContext(0),
ec.getMatrixObject(outName), gpuPtr);
Expand Down Expand Up @@ -295,9 +302,20 @@ else if (e.isScalarValue()) {
if (reuse) {
//Additional maintenance for GPU pointers and RDDs
for (LineageCacheEntry e : funcOutLIs) {
if (e.isGPUObject())
//Increment the live count for this pointer
LineageGPUCacheEviction.incrementLiveCount(e.getGPUPointer());
if (e.isGPUObject()) {
switch(e.getCacheStatus()) {
case TOCACHEGPU:
//Cannot reuse as already garbage collected; the putValue method
// will save the pointer while caching the original instruction
return false;
case GPUCACHED:
//Increment the live count for this pointer
LineageGPUCacheEviction.incrementLiveCount(e.getGPUPointer());
break;
default:
return false;
}
}
else if (e.isRDDPersist()) {
//Reuse the cached RDD (local or persisted at the executors)
switch(e.getCacheStatus()) {
Expand Down Expand Up @@ -598,7 +616,7 @@ public static void putValue(Instruction inst, ExecutionContext ec, long starttim
}
}
else if (inst instanceof GPUInstruction) {
// TODO: gpu multiretrun instructions
// TODO: gpu multi-return instructions
Data gpudata = ec.getVariable(((GPUInstruction) inst)._output);
liGPUObj = gpudata instanceof MatrixObject ?
ec.getMatrixObject(((GPUInstruction)inst)._output).
Expand Down Expand Up @@ -708,14 +726,27 @@ private static void putValueGPU(GPUObject gpuObj, LineageItem instLI, long compu
removePlaceholder(instLI);
return;
}
// Update the total size of lineage cached gpu objects
// The eviction is handled by the unified gpu memory manager
LineageGPUCacheEviction.updateSize(gpuObj.getAllocatedSize(), true);
// Set the GPUObject in the cache
centry.setGPUValue(gpuObj.getDensePointer(), gpuObj.getAllocatedSize(),
gpuObj.getMatrixObject().getMetaData(), computetime);
// Maintain order for eviction
LineageGPUCacheEviction.addEntry(centry);
switch(centry.getCacheStatus()) {
case EMPTY: //first hit
// Set the GPUObject in the cache. Will be garbage collected
centry.setGPUValue(gpuObj.getDensePointer(), gpuObj.getAllocatedSize(),
gpuObj.getMatrixObject().getMetaData(), computetime);
centry.setCacheStatus(LineageCacheStatus.TOCACHEGPU);
break;
case TOCACHEGPU: //second hit
// Update the total size of lineage cached gpu objects
// The eviction is handled by the unified gpu memory manager
LineageGPUCacheEviction.updateSize(gpuObj.getAllocatedSize(), true);
// Set the GPUObject in the cache and update the status
centry.setGPUValue(gpuObj.getDensePointer(), gpuObj.getAllocatedSize(),
gpuObj.getMatrixObject().getMetaData(), computetime);
centry.setCacheStatus(LineageCacheStatus.GPUCACHED);
// Maintain order for eviction
LineageGPUCacheEviction.addEntry(centry);
break;
default:
throw new DMLRuntimeException("Execution should not reach here: "+centry._key);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public class LineageCacheConfig
"^", "uamax", "uark+", "uacmean", "eigen", "ctableexpand", "replace",
"^2", "*2", "uack+", "tak+*", "uacsqk+", "uark+", "n+", "uarimax", "qsort",
"qpick", "transformapply", "uarmax", "n+", "-*", "castdtm", "lowertri",
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*"
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*", "=="
//TODO: Reuse everything.
};

Expand Down Expand Up @@ -152,6 +152,7 @@ protected enum LineageCacheStatus {
SPILLED, //Data is in disk. Empty value. Cannot be evicted.
RELOADED, //Reloaded from disk. Can be evicted.
PINNED, //Pinned to memory. Cannot be evicted.
TOCACHEGPU, //To be cached in GPU if the instruction reoccurs
GPUCACHED, //Points to GPU intermediate
PERSISTEDRDD, //Persisted at the Spark executors
TOPERSISTRDD, //To be persisted if the instruction reoccurs
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/apache/sysds/utils/GPUStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class GPUStatistics {
private static int iNoOfExecutedGPUInst = 0;

public static long cudaInitTime = 0;
public static long cudaLibrariesInitTime = 0;
public static LongAdder cudaLibrariesInitTime = new LongAdder();
public static LongAdder cudaSparseToDenseTime = new LongAdder(); // time spent in converting sparse matrix block to dense
public static LongAdder cudaDenseToSparseTime = new LongAdder(); // time spent in converting dense matrix block to sparse
public static LongAdder cudaSparseConversionTime = new LongAdder(); // time spent in converting between sparse block types
Expand Down Expand Up @@ -96,7 +96,7 @@ public static void resetMiscTimers(){
*/
public static void reset(){
cudaInitTime = 0;
cudaLibrariesInitTime = 0;
cudaLibrariesInitTime.reset();
cudaAllocTime.reset();
cudaDeAllocTime.reset();
cudaMemSet0Time.reset();
Expand Down Expand Up @@ -183,7 +183,7 @@ public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
public static String getStringForCudaTimers() {
StringBuffer sb = new StringBuffer();
sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
+ String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
+ String.format("%.3f", cudaLibrariesInitTime.longValue()*1e-9) + " sec.\n");
sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
// cudaSparseConversionCount
sb.append("GPU mem alloc time (alloc(success/fail) / dealloc / set0):\t"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.sysds.runtime.lineage.Lineage;
import org.apache.sysds.runtime.lineage.LineageCacheConfig;
import org.apache.sysds.runtime.lineage.LineageCacheStatistics;
import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
Expand All @@ -42,7 +43,7 @@ public class LineageReuseSparkTest extends AutomatedTestBase {

protected static final String TEST_DIR = "functions/async/";
protected static final String TEST_NAME = "LineageReuseSpark";
protected static final int TEST_VARIANTS = 6;
protected static final int TEST_VARIANTS = 7;
protected static String TEST_CLASS_DIR = TEST_DIR + LineageReuseSparkTest.class.getSimpleName() + "/";

@Override
Expand All @@ -54,44 +55,48 @@ public void setUp() {

@Test
public void testlmdsHB() {
runTest(TEST_NAME+"1", ExecMode.HYBRID, 1);
runTest(TEST_NAME+"1", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 1);
}

@Test
public void testlmdsSP() {
// Only reuse the actions
runTest(TEST_NAME+"1", ExecMode.SPARK, 1);
runTest(TEST_NAME+"1", ExecMode.SPARK, ReuseCacheType.REUSE_MULTILEVEL, 1);
}

@Test
public void testlmdsRDD() {
// Cache all RDDs and persist shuffle-based Spark operations (eg. rmm, cpmm)
runTest(TEST_NAME+"2", ExecMode.HYBRID, 2);
runTest(TEST_NAME+"2", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 2);
}

@Test
public void testL2svm() {
runTest(TEST_NAME+"3", ExecMode.HYBRID, 3);
runTest(TEST_NAME+"3", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 3);
}

@Test
public void testlmdsMultiLevel() {
// Cache RDD and matrix block function returns and reuse
runTest(TEST_NAME+"4", ExecMode.HYBRID, 4);
runTest(TEST_NAME+"4", ExecMode.HYBRID, ReuseCacheType.REUSE_MULTILEVEL, 4);
}

@Test
public void testEnsemble() {
runTest(TEST_NAME+"5", ExecMode.HYBRID, 5);
runTest(TEST_NAME+"5", ExecMode.HYBRID, ReuseCacheType.REUSE_MULTILEVEL, 5);
}

//FIXME: Collecting a persisted RDD still needs the broadcast vars. Debug.
/*@Test
public void testHyperband() {
runTest(TEST_NAME+"6", ExecMode.HYBRID, 6);
runTest(TEST_NAME+"6", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 6);
}*/
/*@Test
public void testBroadcastBug() {
runTest(TEST_NAME+"7", ExecMode.HYBRID, ReuseCacheType.REUSE_FULL, 7);
}*/

public void runTest(String testname, ExecMode execMode, int testId) {
public void runTest(String testname, ExecMode execMode, LineageCacheConfig.ReuseCacheType reuse, int testId) {
boolean old_simplification = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
boolean old_sum_product = OptimizerUtils.ALLOW_SUM_PRODUCT_REWRITES;
boolean old_trans_exec_type = OptimizerUtils.ALLOW_TRANSITIVE_SPARK_EXEC_TYPE;
Expand Down Expand Up @@ -126,7 +131,7 @@ public void runTest(String testname, ExecMode execMode, int testId) {
//proArgs.add("recompile_runtime");
proArgs.add("-stats");
proArgs.add("-lineage");
proArgs.add(LineageCacheConfig.ReuseCacheType.REUSE_MULTILEVEL.name().toLowerCase());
proArgs.add(reuse.name().toLowerCase());
proArgs.add("-args");
proArgs.add(output("R"));
programArgs = proArgs.toArray(new String[proArgs.size()]);
Expand Down
Loading
Loading