From 87137a4a6c3e12a22ae2a25d00d609d8ca9dae92 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Tue, 29 Oct 2024 16:23:01 -0700 Subject: [PATCH] expand support for CHPL_LAUNCHER_GPUS_PER_NODE Signed-off-by: Jade Abraham --- .../slurm-gasnetrun_common.h | 34 ++++++++++++++ .../src/launch/slurm-srun/launch-slurm-srun.c | 45 ++++++++++++++----- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h b/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h index c7f35dbc2459..9fd90e83c8d3 100644 --- a/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h +++ b/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h @@ -46,6 +46,7 @@ #define CHPL_NODELIST_FLAG "--nodelist" #define CHPL_PARTITION_FLAG "--partition" #define CHPL_EXCLUDE_FLAG "--exclude" +#define CHPL_GPUS_PER_NODE_FLAG "--gpus-per-node" #define CHPL_LPN_VAR "LOCALES_PER_NODE" @@ -54,6 +55,7 @@ static char* walltime = NULL; static char* nodelist = NULL; static char* partition = NULL; static char* exclude = NULL; +static char* gpusPerNode = NULL; char slurmFilename[FILENAME_MAX]; /* copies of binary to run per node */ @@ -137,6 +139,12 @@ static void genNumLocalesOptions(FILE* slurmFile, sbatchVersion sbatch, exclude = getenv("CHPL_LAUNCHER_EXCLUDE"); } + // command line gpus per node takes precedence over env var + if (!gpusPerNode) { + gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE"); + } + + if (walltime) fprintf(slurmFile, "#SBATCH --time=%s\n", walltime); if (nodelist) @@ -145,6 +153,8 @@ static void genNumLocalesOptions(FILE* slurmFile, sbatchVersion sbatch, fprintf(slurmFile, "#SBATCH --partition=%s\n", partition); if (exclude) fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude); + if (gpusPerNode) + fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode); switch (sbatch) { case slurm: { fprintf(slurmFile, "#SBATCH --nodes=%d\n", numNodes); @@ -237,6 +247,11 @@ static char* chpl_launch_create_command(int argc, char* argv[], exclude = getenv("CHPL_LAUNCHER_EXCLUDE"); } + // command line gpus per node takes precedence over env var + if (!gpusPerNode) { + gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE"); + } + // request exclusive node access by default, but allow user to override nodeAccessEnv = getenv("CHPL_LAUNCHER_NODE_ACCESS"); if (nodeAccessEnv == NULL || strcmp(nodeAccessEnv, "exclusive") == 0) { @@ -316,6 +331,8 @@ static char* chpl_launch_create_command(int argc, char* argv[], len += snprintf(iCom+len, sizeof(iCom)-len, "--partition=%s ", partition); if(exclude) len += snprintf(iCom+len, sizeof(iCom)-len, "--exclude=%s ", exclude); + if(gpusPerNode) + len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ", gpusPerNode); if(projectString && strlen(projectString) > 0) len += snprintf(iCom+len, sizeof(iCom)-len, "--account=%s ", projectString); @@ -410,6 +427,16 @@ int chpl_launch_handle_arg(int argc, char* argv[], int argNum, exclude = &(argv[argNum][strlen(CHPL_EXCLUDE_FLAG)+1]); return 1; } + + // handle --gpus-per-node or --gpus-per-node= + if (!strcmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG)) { + gpusPerNode = argv[argNum+1]; + return 2; + } else if (!strncmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG"=", strlen(CHPL_GPUS_PER_NODE_FLAG))) { + gpusPerNode = &(argv[argNum][strlen(CHPL_GPUS_PER_NODE_FLAG)+1]); + return 1; + } + return 0; } @@ -441,6 +468,13 @@ const argDescTuple_t* chpl_launch_get_help(void) { { "", "(or use $CHPL_LAUNCHER_EXCLUDE)" }, + { + CHPL_GPUS_PER_NODE_FLAG " ", + "specify the number of GPUs per node" + }, + { "", + "(or use $CHPL_LAUNCHER_GPUS_PER_NODE)" + }, { NULL, NULL }, }; return args; diff --git a/runtime/src/launch/slurm-srun/launch-slurm-srun.c b/runtime/src/launch/slurm-srun/launch-slurm-srun.c index 3208fdd14047..43ba7bb65b46 100644 --- a/runtime/src/launch/slurm-srun/launch-slurm-srun.c +++ b/runtime/src/launch/slurm-srun/launch-slurm-srun.c @@ -38,6 +38,8 @@ #define CHPL_NODELIST_FLAG "--nodelist" #define CHPL_PARTITION_FLAG "--partition" #define CHPL_EXCLUDE_FLAG "--exclude" +#define CHPL_GPUS_PER_NODE_FLAG "--gpus-per-node" + static char* debug = NULL; static char* walltime = NULL; @@ -46,6 +48,7 @@ static char* nodelist = NULL; static char* partition = NULL; static char* reservation = NULL; static char* exclude = NULL; +static char* gpusPerNode = NULL; char slurmFilename[FILENAME_MAX]; @@ -234,7 +237,6 @@ static char* chpl_launch_create_command(int argc, char* argv[], char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME"); char* errorfn = getenv("CHPL_LAUNCHER_SLURM_ERROR_FILENAME"); char* nodeAccessEnv = getenv("CHPL_LAUNCHER_NODE_ACCESS"); - char* gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE"); char* memEnv = getenv("CHPL_LAUNCHER_MEM"); const char* nodeAccessStr = NULL; const char* memStr = NULL; @@ -304,6 +306,11 @@ static char* chpl_launch_create_command(int argc, char* argv[], exclude = getenv("CHPL_LAUNCHER_EXCLUDE"); } + // command line gpus per node takes precedence over env var + if (!gpusPerNode) { + gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE"); + } + reservation = getenv("SLURM_RESERVATION"); // request exclusive node access by default, but allow user to override @@ -407,6 +414,11 @@ static char* chpl_launch_create_command(int argc, char* argv[], fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude); } + // Set the gpus per node if it was specified + if (gpusPerNode) { + fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode); + } + // If needed a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT if (constraint) { fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint); @@ -417,11 +429,6 @@ static char* chpl_launch_create_command(int argc, char* argv[], fprintf(slurmFile, "#SBATCH --account=%s\n", account); } - // set gpus-per-node if one was provided - if (gpusPerNode && strlen(gpusPerNode) > 0) { - fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode); - } - // set the output file name to either the user specified // name or to the binaryName..out if none specified if (outputfn != NULL) { @@ -556,6 +563,12 @@ static char* chpl_launch_create_command(int argc, char* argv[], len += snprintf(iCom+len, sizeof(iCom)-len, "--exclude=%s ", exclude); } + // Set the gpus per node if it was specified + if (gpusPerNode) { + len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ", + gpusPerNode); + } + // set any constraints if (constraint) { len += snprintf(iCom+len, sizeof(iCom)-len, "--constraint=%s ", constraint); @@ -566,11 +579,6 @@ static char* chpl_launch_create_command(int argc, char* argv[], len += snprintf(iCom+len, sizeof(iCom)-len, "--account=%s ", account); } - // set gpus-per-node if one was provided - if (gpusPerNode && strlen(gpusPerNode) > 0) { - len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ", gpusPerNode); - } - // add the (possibly wrapped) binary name len += snprintf(iCom+len, sizeof(iCom)-len, "%s %s", chpl_get_real_binary_wrapper(), chpl_get_real_binary_name()); @@ -685,6 +693,15 @@ int chpl_launch_handle_arg(int argc, char* argv[], int argNum, return 1; } + // handle --gpus-per-node or --gpus-per-node= + if (!strcmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG)) { + gpusPerNode = argv[argNum+1]; + return 2; + } else if (!strncmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG"=", strlen(CHPL_GPUS_PER_NODE_FLAG))) { + gpusPerNode = &(argv[argNum][strlen(CHPL_GPUS_PER_NODE_FLAG)+1]); + return 1; + } + // handle --generate-sbatch-script if (!strcmp(argv[argNum], CHPL_GENERATE_SBATCH_SCRIPT)) { generate_sbatch_script = 1; @@ -729,6 +746,12 @@ const argDescTuple_t* chpl_launch_get_help(void) { { "", "(or use $CHPL_LAUNCHER_EXCLUDE)" }, + { CHPL_GPUS_PER_NODE_FLAG " ", + "specify number of gpus per node" + }, + { "", + "(or use $CHPL_LAUNCHER_GPUS_PER_NODE)" + }, { NULL, NULL }, }; return args;