From 20e4caedb3595f9d546c53db9b6065ef56d15c1e Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Wed, 26 Apr 2023 12:26:25 +0200 Subject: [PATCH 01/11] Only copy nbody if it was compiled. The nbody sample might not always compile. Added simple check to see if nbody executable is present. --- tests/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/Makefile b/tests/Makefile index 8adc5da7..be383ad0 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -26,7 +26,9 @@ bin: cpu samples test_apps cp test_apps/*.testapp bin cp samples/matrixMul/matrixMul bin cp samples/bandwidthTest/bandwidthTest bin +ifneq (,$(test -f "samples/nbody/nbody")) cp samples/nbody/nbody bin +endif clean: @echo -e "\033[31m----> Cleaning up tests/test_apps\033[0m" From 7ac026c397cf54c48878807daaa2ecb6327a8119 Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Wed, 26 Apr 2023 12:32:50 +0200 Subject: [PATCH 02/11] Add mpfr dependency The mpfr library needed to be linked --- gpu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/Makefile b/gpu/Makefile index a144539a..09a82d1e 100644 --- a/gpu/Makefile +++ b/gpu/Makefile @@ -20,7 +20,7 @@ INC_DIRS := -I$(CUDA_GDB_PATH)/bfd \ LIB_DIR := ../submodules/lib BUILD_DIR := build -DLIBS = -lncurses -lpthread -lm -lz -ldl -lexpat -llzma -Wl,--dynamic-list=utils/proc-service.list +DLIBS = -lncurses -lpthread -lm -lz -ldl -lexpat -llzma -lmpfr -Wl,--dynamic-list=utils/proc-service.list # Order of .a files is important! 
SLIBS = libgdb.a libbfd.a libiberty.a libreadline.a libdecnumber.a libcudacore.a libopcodes.a libgnu.a SLIBS:= $(addprefix $(LIB_DIR)/, $(SLIBS)) From 0db80c181d9ccf98d8d3cf94e4e99ec0872af3e2 Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Thu, 4 May 2023 12:30:37 +0200 Subject: [PATCH 03/11] Add colors to gpu/log.c --- gpu/src/log.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gpu/src/log.c b/gpu/src/log.c index e104890e..81ab6dc5 100644 --- a/gpu/src/log.c +++ b/gpu/src/log.c @@ -59,7 +59,12 @@ void now_time(char* buf) const char* to_string(log_level level) { +#ifdef NOCOLORS static const char* const buffer[] = {"ERROR", "WARNING", "INFO", "DEBUG"}; +#else + static const char* const buffer[] = {"\033[1m\033[31mERROR\033[0m", "\033[33mWARNING\033[0m", "\033[34mINFO\033[0m", "\033[32mDEBUG\033[0m"}; +#endif //NOCOLORS + if(level > LOG_DEBUG){ return buffer[LOG_DEBUG]; } @@ -90,5 +95,10 @@ void loggfe(log_level level, int line, const char* file, const char* formatstr, char stripped[64]; strcpy(stripped, file); str_strip(stripped, get_log_data()->project_offset); - printf("\tin %s(%d)\n", stripped, line); +#ifdef NOCOLORS + printf("\tin %s:%d\n", stripped, line); +#else + printf("\tin \033[4m%s:%d\033[0m\n", stripped, line); +#endif //NOCOLORS + } From 46c53dd4c10a0f8630aa3b3154e304178f13b47f Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Wed, 10 May 2023 12:49:23 +0200 Subject: [PATCH 04/11] Remove absolute path in cricket start command. - Removed two absolute paths in start command. - Fixes issue in said command. - The location of cricket can now be specified using CRICKET_PATH variable. 
--- gpu/src/main.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/gpu/src/main.c b/gpu/src/main.c index 7facb092..49e3bee1 100644 --- a/gpu/src/main.c +++ b/gpu/src/main.c @@ -31,7 +31,7 @@ #include "gdb.h" #ifndef LOG_LEVEL -#define LOG_LEVEL LOG_INFO +#define LOG_LEVEL LOG_DEBUG #endif #define CRICKET_PROFILE 1 @@ -1092,6 +1092,8 @@ int cricket_checkpoint(int argc, char *argv[]) int cricket_start(int argc, char *argv[]) { + char* cricket_path; + char cmd_str[1024]; struct cmd_list_element *alias = NULL; struct cmd_list_element *prefix_cmd = NULL; struct cmd_list_element *cmd = NULL; @@ -1101,12 +1103,19 @@ int cricket_start(int argc, char *argv[]) return -1; } + cricket_path = getenv("CRICKET_PATH"); + if (cricket_path == NULL) { + LOG(LOG_DEBUG, "no cricket path specified. assuming /usr/local/cricket\n"); + cricket_path = "/usr/local/cricket"; + } + gdb_init(argc, argv, argv[2], NULL); /* load files */ //exec_file_attach(argv[2], !batch_flag); // - execute_command("set exec-wrapper env 'LD_PRELOAD=/home/eiling/projects/cricket/bin/libtirpc.so.3:/home/eiling/projects/cricket/cpu/cricket-server.so'", !batch_flag); + snprintf(cmd_str, 1024, "set exec-wrapper env 'LD_PRELOAD=%s/bin/libtirpc.so.3:%s/cpu/cricket-server.so'", cricket_path, cricket_path); + execute_command(cmd_str, !batch_flag); //execute_command("break main", !batch_flag); execute_command("starti", !batch_flag); //execute_command("unset exec-wrapper", !batch_flag); From e8188a03102bfcc0aca43eeaa79597ca8d8b009c Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Thu, 11 May 2023 10:31:19 +0200 Subject: [PATCH 05/11] Prevent fatal make error. Make would stop compilation of program if a link existed. This is now fixed with a simple conditional. 
--- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9007482d..11482830 100644 --- a/Makefile +++ b/Makefile @@ -43,8 +43,9 @@ bin: mkdir bin bin/tests: bin tests +ifneq (, $(test -f "bin/tests")) ln -s ../tests/bin bin/tests - +endif bin/cricket-client.so: bin $(MAKE) -C cpu cricket-client.so cp cpu/cricket-client.so bin From 4e324e4f48ce3ad1b6857eefa0860e7bde5cece1 Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Thu, 11 May 2023 11:35:36 +0200 Subject: [PATCH 06/11] Improve cli help. Signed-off-by: Fritz Stracke --- gpu/src/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/src/main.c b/gpu/src/main.c index 49e3bee1..0cec4812 100644 --- a/gpu/src/main.c +++ b/gpu/src/main.c @@ -827,7 +827,7 @@ int cricket_checkpoint(int argc, char *argv[]) #endif if (argc != 3) { - printf("wrong number of arguments, use: %s \n", argv[0]); + printf("wrong number of arguments, use: %s checkpoint \n", argv[0]); return -1; } From 7bcc6ef281fcea1ed11e6403d76f3e6a0f96fb2f Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Wed, 24 May 2023 10:47:34 +0200 Subject: [PATCH 07/11] Repair make clean make clean would encounter an error in cpu. fixed. make clean now removes bin directory. 
Signed-off-by: Fritz Stracke --- Makefile | 3 +++ cpu/Makefile | 2 +- tests/cpu/Makefile | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/cpu/Makefile diff --git a/Makefile b/Makefile index 11482830..54f498ff 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,9 @@ clean: $(MAKE) -C cpu clean @echo -e "\033[31m----> Cleaning up test kernels\033[0m" $(MAKE) -C tests clean + @echo -e "\033[31m----> Removing bin...\033[0m" + rm -rf bin + @echo -e "\033[31m All done!\033[0m" cuda-gdb: @echo -e "\033[36m----> Building submodules\033[0m" diff --git a/cpu/Makefile b/cpu/Makefile index a2d38223..cc6cbd2f 100644 --- a/cpu/Makefile +++ b/cpu/Makefile @@ -80,7 +80,7 @@ LIB_FLAGS += -L$(CUDA_SRC)/lib64 CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2 # TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag CC_FLAGS += -fcommon -LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf +LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf ifdef WITH_DEBUG # use ASAN_OPTIONS=protect_shadow_gap=0 LSAN_OPTIONS=fast_unwind_on_malloc=0 when running diff --git a/tests/cpu/Makefile b/tests/cpu/Makefile new file mode 100644 index 00000000..ab492786 --- /dev/null +++ b/tests/cpu/Makefile @@ -0,0 +1,7 @@ +.PHONY: all clean + +all: + +clean: + $(MAKE) -C cubin clean + $(MAKE) -C unit clean From f615a79c2677360b7e9702b9a8282939b6757821 Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Wed, 24 May 2023 11:51:16 +0200 Subject: [PATCH 08/11] Remove absolute path from gpu-tests. 
Signed-off-by: Fritz Stracke --- tests/gpu/checkpoint.sh | 2 +- tests/gpu/restore.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/gpu/checkpoint.sh b/tests/gpu/checkpoint.sh index 2624b66b..5b75f81b 100755 --- a/tests/gpu/checkpoint.sh +++ b/tests/gpu/checkpoint.sh @@ -18,7 +18,7 @@ echo "using $CUDA_APP" CUDA_APP_NAME="$(basename -- $CUDA_APP)" CRICKET_CLIENT=${CRICKET_PATH}/cpu/cricket-client.so CRICKET_SERVER=${CRICKET_PATH}/cpu/cricket-server.so -CRIU=/home/eiling/tmp/criu/criu/criu +CRIU=${HOME}/tmp/criu/criu/criu export REMOTE_GPU_ADDRESS=localhost export CUDA_VISIBLE_DEVICES=0 diff --git a/tests/gpu/restore.sh b/tests/gpu/restore.sh index c34027fc..52764a12 100755 --- a/tests/gpu/restore.sh +++ b/tests/gpu/restore.sh @@ -15,7 +15,7 @@ echo "using $CUDA_APP" CUDA_APP_NAME="$(basename -- $CUDA_APP)" CRICKET_CLIENT=${CRICKET_PATH}/cpu/cricket-client.so CRICKET_SERVER=${CRICKET_PATH}/cpu/cricket-server.so -CRIU=/home/eiling/tmp/criu/criu/criu +CRIU=${HOME}/tmp/criu/criu/criu export REMOTE_GPU_ADDRESS=localhost export CUDA_VISIBLE_DEVICES=0 From eb7b396c75ab8b78626d2e19c8a3c47e3718851e Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Thu, 25 May 2023 11:53:59 +0200 Subject: [PATCH 09/11] Add debugging comments helpful for refactoring current errors. Signed-off-by: Fritz Stracke --- gpu/src/cricket-cr.c | 3 +++ gpu/src/main.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gpu/src/cricket-cr.c b/gpu/src/cricket-cr.c index fcbfd0d2..9069bee3 100644 --- a/gpu/src/cricket-cr.c +++ b/gpu/src/cricket-cr.c @@ -1170,6 +1170,8 @@ bool cricket_cr_ckp_params(CUDBGAPI cudbgAPI, const char *ckp_dir, /* Parameters are the same for all warps so just use warp 0 * TODO: use first valid warp, because warp 0 may not be in use (is that * possible?) + * + * This seems to cause issues right now. Needs a solution. 
*/ if ((param_mem = (uint8_t*)malloc(elf_info->param_size)) == NULL) return false; @@ -1482,6 +1484,7 @@ bool cricket_cr_ckp_globals(CUDBGAPI cudbgAPI, const char *ckp_dir) if (res != CUDBG_SUCCESS) { LOGE(LOG_ERROR, "cuda error: %s", cudbgGetErrorString(res)); + LOGE(LOG_DEBUG, "encountered in iteration %d of %d\n", i, globals_num); goto cleanup; } offset += globals[i].size; diff --git a/gpu/src/main.c b/gpu/src/main.c index 0cec4812..6e7ca605 100644 --- a/gpu/src/main.c +++ b/gpu/src/main.c @@ -831,10 +831,11 @@ int cricket_checkpoint(int argc, char *argv[]) return -1; } + printf("Initializing GDB!\n\n"); gdb_init(argc, argv, NULL, argv[2]); /* attach to process (both CPU and GPU) */ - // printf("attaching...\n"); + printf("attaching...\n"); // attach_command(argv[2], !batch_flag); if (cuda_api_get_state() != CUDA_API_STATE_INITIALIZED) { @@ -849,6 +850,9 @@ int cricket_checkpoint(int argc, char *argv[]) #ifdef CRICKET_PROFILE gettimeofday(&b, NULL); #endif + + printf("attached!\n\n"); + printf("trying to get CUDA debugger API\n"); /* get CUDA debugger API */ res = cudbgGetAPI(CUDBG_API_VERSION_MAJOR, CUDBG_API_VERSION_MINOR, @@ -858,7 +862,7 @@ int cricket_checkpoint(int argc, char *argv[]) goto cuda_error; } printf("got API\n"); - + printf("enumerating devices...\n"); if (!cricket_device_get_num(cudbgAPI, &numDev)) { printf("error getting device num\n"); @@ -1042,6 +1046,9 @@ int cricket_checkpoint(int argc, char *argv[]) //cricket_focus_kernel(!batch_flag); + /// TODO: Logic Error: first_warp might always be invalid! In line 889 first_warp is looped over + /// the number of available warps. As far as I can tell it is never reset afterwards. + /// That would explain why this line throws an invalid warp exception. Needs discussion. 
if (!cricket_cr_ckp_params(cudbgAPI, ckp_dir, &elf_info, 0, 0, first_warp)) { printf("cricket_cr_ckp_params unsuccessful\n"); From 54e63fd8e133f2abe8c40f1dd25ea91236af270f Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Fri, 2 Jun 2023 10:32:00 +0200 Subject: [PATCH 10/11] Remove global checkpointing temporarily. Signed-off-by: Fritz Stracke --- gpu/src/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gpu/src/main.c b/gpu/src/main.c index 6e7ca605..27fa1e48 100644 --- a/gpu/src/main.c +++ b/gpu/src/main.c @@ -1046,17 +1046,17 @@ int cricket_checkpoint(int argc, char *argv[]) //cricket_focus_kernel(!batch_flag); - /// TODO: Logic Error: first_warp might always be invalid! In line 889 first_warp is looped over - /// the number of available warps. As far as I can tell it is never reset afterwards. - /// That would explain why this line throws an invalid warp exception. Needs discussion. + /// TODO: verify if it is sufficient to set first_warp to 0 + first_warp = 0; if (!cricket_cr_ckp_params(cudbgAPI, ckp_dir, &elf_info, 0, 0, first_warp)) { printf("cricket_cr_ckp_params unsuccessful\n"); } - if (!cricket_cr_ckp_globals(cudbgAPI, ckp_dir)) { - printf("cricket_cr_ckp_globals unsuccessful\n"); - } + /// TODO: work out globals + //if (!cricket_cr_ckp_globals(cudbgAPI, ckp_dir)) { + // printf("cricket_cr_ckp_globals unsuccessful\n"); + //} #ifdef CRICKET_PROFILE gettimeofday(&f, NULL); From ebf722ba3810972e4070a3acc329ac5033227352 Mon Sep 17 00:00:00 2001 From: Fritz Stracke Date: Wed, 14 Jun 2023 12:48:05 +0200 Subject: [PATCH 11/11] Start moving gpu-checkpointing to cpu folder. 
Signed-off-by: Fritz Stracke --- cpu/Makefile | 19 ++++++++++--------- cpu/cpu-server.c | 6 ++++++ cpu/gpu/ckp-kernel.c | 7 +++++++ cpu/gpu/ckp-kernel.h | 6 ++++++ gpu/src/cricket-cr.c | 2 ++ gpu/src/main.c | 14 +++++++------- 6 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 cpu/gpu/ckp-kernel.c create mode 100644 cpu/gpu/ckp-kernel.h diff --git a/cpu/Makefile b/cpu/Makefile index cc6cbd2f..d6f74438 100644 --- a/cpu/Makefile +++ b/cpu/Makefile @@ -32,16 +32,17 @@ SRC_SERVER = $(RPC_XDR) \ cpu-server-driver-hidden.c \ log.c \ cpu-libwrap.c \ - cpu-server-cusolver.c \ - cpu-server-cublas.c \ + cpu-server-cusolver.c \ + cpu-server-cublas.c \ list.c \ - api-recorder.c \ - resource-mg.c \ - cr.c \ - gsched_none.c \ - oob.c \ - mt-memcpy.c \ - cpu-elf2.c + api-recorder.c \ + resource-mg.c \ + cr.c \ + gsched_none.c \ + oob.c \ + mt-memcpy.c \ + cpu-elf2.c \ + gpu/ckp-kernel.c SRC_SERVER_LIB = server-library.c SRC_SERVER_EXE = server-exe.c diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c index e5182324..ea10c4a8 100644 --- a/cpu/cpu-server.c +++ b/cpu/cpu-server.c @@ -18,6 +18,7 @@ #include "cpu-server-driver.h" #include "rpc/xdr.h" #include "cr.h" +#include "gpu/ckp-kernel.h" #include "cpu-elf2.h" #ifdef WITH_IB #include "cpu-ib.h" @@ -90,6 +91,11 @@ int cricket_server_checkpoint(int dump_memory) goto error; } + if ((ret = gpu_checkpoint(/*TODO*/)) != 0) { + LOGE(LOG_ERROR, "gpu_checkpoint returned %d", ret); + goto error; + } + LOG(LOG_INFO, "checkpoint successfully created."); return 0; error: diff --git a/cpu/gpu/ckp-kernel.c b/cpu/gpu/ckp-kernel.c new file mode 100644 index 00000000..45a3c045 --- /dev/null +++ b/cpu/gpu/ckp-kernel.c @@ -0,0 +1,7 @@ +#include "ckp-kernel.h" +#include <stdio.h> + +int gpu_checkpoint(void) { + printf("TESTING...\n"); + return 0; +} diff --git a/cpu/gpu/ckp-kernel.h b/cpu/gpu/ckp-kernel.h new file mode 100644 index 00000000..c535216c --- /dev/null +++ b/cpu/gpu/ckp-kernel.h @@ -0,0 +1,6 @@ +#ifndef __CKP_KERNEL_H__ +#define 
__CKP_KERNEL_H__ + +int gpu_checkpoint(void); + +#endif //!__CKP_KERNEL_H__ diff --git a/gpu/src/cricket-cr.c b/gpu/src/cricket-cr.c index 9069bee3..10e634b7 100644 --- a/gpu/src/cricket-cr.c +++ b/gpu/src/cricket-cr.c @@ -1113,6 +1113,7 @@ bool cricket_cr_rst_params(CUDBGAPI cudbgAPI, const char *ckp_dir, cudbgGetErrorString(res)); goto cleanup; } + /* for (int i = 0; i != elf_info->param_num; ++i) { if (elf_info->params[i].size != 8) continue; @@ -1150,6 +1151,7 @@ bool cricket_cr_rst_params(CUDBGAPI cudbgAPI, const char *ckp_dir, free(param_data); param_data = NULL; } + */ ret = true; cleanup: free(param_mem); diff --git a/gpu/src/main.c b/gpu/src/main.c index 27fa1e48..1fbe97e8 100644 --- a/gpu/src/main.c +++ b/gpu/src/main.c @@ -124,7 +124,7 @@ bool cricket_all_warps_broken(CUDBGAPI cudbgAPI, CricketDeviceProp *dev_prop) int cricket_analyze(int argc, char *argv[]) { if (argc != 3) { - LOG(LOG_ERROR, "wrong number of arguments, use: %s ", argv[0]); + LOG(LOG_ERROR, "wrong number of arguments, use: %s analyze ", argv[0]); return -1; } LOG(LOG_INFO, "Analyzing \"%s\"", argv[2]); @@ -155,7 +155,7 @@ int cricket_restore(int argc, char *argv[]) double bt, ct, dt, et, ft, gt, comt; #endif if (argc != 3) { - LOG(LOG_ERROR, "wrong number of arguments, use: %s ", argv[0]); + LOG(LOG_ERROR, "wrong number of arguments, use: %s restore ", argv[0]); return -1; } @@ -1046,17 +1046,17 @@ int cricket_checkpoint(int argc, char *argv[]) //cricket_focus_kernel(!batch_flag); - /// TODO: verify if it is sufficient to set first_warp to 0 - first_warp = 0; + /// TODO: There is a loop to determine the first warp, however + /// cricket_cr_ckp_params still causes errors over invalid warps... 
if (!cricket_cr_ckp_params(cudbgAPI, ckp_dir, &elf_info, 0, 0, first_warp)) { printf("cricket_cr_ckp_params unsuccessful\n"); } /// TODO: work out globals - //if (!cricket_cr_ckp_globals(cudbgAPI, ckp_dir)) { - // printf("cricket_cr_ckp_globals unsuccessful\n"); - //} + if (!cricket_cr_ckp_globals(cudbgAPI, ckp_dir)) { + printf("cricket_cr_ckp_globals unsuccessful\n"); + } #ifdef CRICKET_PROFILE gettimeofday(&f, NULL);