From 87137a4a6c3e12a22ae2a25d00d609d8ca9dae92 Mon Sep 17 00:00:00 2001
From: Jade Abraham <jade.abraham@hpe.com>
Date: Tue, 29 Oct 2024 16:23:01 -0700
Subject: [PATCH] expand support for CHPL_LAUNCHER_GPUS_PER_NODE

Signed-off-by: Jade Abraham <jade.abraham@hpe.com>
---
 .../slurm-gasnetrun_common.h                  | 34 ++++++++++++++
 .../src/launch/slurm-srun/launch-slurm-srun.c | 45 ++++++++++++++-----
 2 files changed, 68 insertions(+), 11 deletions(-)
diff --git a/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h b/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h
index c7f35dbc2459..9fd90e83c8d3 100644
--- a/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h
+++ b/runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h
@@ -46,6 +46,7 @@
 #define CHPL_NODELIST_FLAG "--nodelist"
 #define CHPL_PARTITION_FLAG "--partition"
 #define CHPL_EXCLUDE_FLAG "--exclude"
+#define CHPL_GPUS_PER_NODE_FLAG "--gpus-per-node"
 
 #define CHPL_LPN_VAR "LOCALES_PER_NODE"
 
@@ -54,6 +55,7 @@ static char* walltime = NULL;
 static char* nodelist = NULL;
 static char* partition = NULL;
 static char* exclude = NULL;
+static char* gpusPerNode = NULL;
 char slurmFilename[FILENAME_MAX];
 
 /* copies of binary to run per node */
@@ -137,6 +139,12 @@ static void genNumLocalesOptions(FILE* slurmFile, sbatchVersion sbatch,
     exclude = getenv("CHPL_LAUNCHER_EXCLUDE");
   }
 
+  // command line gpus per node takes precedence over env var
+  if (!gpusPerNode) {
+    gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
+  }
+
+
   if (walltime)
     fprintf(slurmFile, "#SBATCH --time=%s\n", walltime);
   if (nodelist)
@@ -145,6 +153,8 @@ static void genNumLocalesOptions(FILE* slurmFile, sbatchVersion sbatch,
     fprintf(slurmFile, "#SBATCH --partition=%s\n", partition);
   if (exclude)
     fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude);
+  if (gpusPerNode && strlen(gpusPerNode) > 0)  // skip unset or empty values
+    fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode);
   switch (sbatch) {
   case slurm: {
     fprintf(slurmFile, "#SBATCH --nodes=%d\n", numNodes);
@@ -237,6 +247,11 @@ static char* chpl_launch_create_command(int argc, char* argv[],
     exclude = getenv("CHPL_LAUNCHER_EXCLUDE");
   }
 
+  // command line gpus per node takes precedence over env var
+  if (!gpusPerNode) {
+    gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
+  }
+
   // request exclusive node access by default, but allow user to override
   nodeAccessEnv = getenv("CHPL_LAUNCHER_NODE_ACCESS");
   if (nodeAccessEnv == NULL || strcmp(nodeAccessEnv, "exclusive") == 0) {
@@ -316,6 +331,8 @@ static char* chpl_launch_create_command(int argc, char* argv[],
       len += snprintf(iCom+len, sizeof(iCom)-len, "--partition=%s ", partition);
     if(exclude)
       len += snprintf(iCom+len, sizeof(iCom)-len, "--exclude=%s ", exclude);
+    if(gpusPerNode && strlen(gpusPerNode) > 0)  // skip unset or empty values
+      len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ", gpusPerNode);
     if(projectString && strlen(projectString) > 0)
       len += snprintf(iCom+len, sizeof(iCom)-len, "--account=%s ",
                      projectString);
@@ -410,6 +427,16 @@ int chpl_launch_handle_arg(int argc, char* argv[], int argNum,
     exclude = &(argv[argNum][strlen(CHPL_EXCLUDE_FLAG)+1]);
     return 1;
   }
+
+  // handle --gpus-per-node <gpus> or --gpus-per-node=<gpus>; the '=' is part of the matched prefix
+  if (!strcmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG)) {
+    gpusPerNode = argv[argNum+1];
+    return 2;
+  } else if (!strncmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG"=", strlen(CHPL_GPUS_PER_NODE_FLAG"="))) {
+    gpusPerNode = &(argv[argNum][strlen(CHPL_GPUS_PER_NODE_FLAG)+1]);
+    return 1;
+  }
+
   return 0;
 }
 
@@ -441,6 +468,13 @@ const argDescTuple_t* chpl_launch_get_help(void) {
       { "",
         "(or use $CHPL_LAUNCHER_EXCLUDE)"
       },
+      {
+        CHPL_GPUS_PER_NODE_FLAG " <gpus>",
+        "specify the number of GPUs per node"
+      },
+      { "",
+        "(or use $CHPL_LAUNCHER_GPUS_PER_NODE)"
+      },
       { NULL, NULL },
     };
   return args;
diff --git a/runtime/src/launch/slurm-srun/launch-slurm-srun.c b/runtime/src/launch/slurm-srun/launch-slurm-srun.c
index 3208fdd14047..43ba7bb65b46 100644
--- a/runtime/src/launch/slurm-srun/launch-slurm-srun.c
+++ b/runtime/src/launch/slurm-srun/launch-slurm-srun.c
@@ -38,6 +38,8 @@
 #define CHPL_NODELIST_FLAG "--nodelist"
 #define CHPL_PARTITION_FLAG "--partition"
 #define CHPL_EXCLUDE_FLAG "--exclude"
+#define CHPL_GPUS_PER_NODE_FLAG "--gpus-per-node"
+
 
 static char* debug = NULL;
 static char* walltime = NULL;
@@ -46,6 +48,7 @@ static char* nodelist = NULL;
 static char* partition = NULL;
 static char* reservation = NULL;
 static char* exclude = NULL;
+static char* gpusPerNode = NULL;
 
 char slurmFilename[FILENAME_MAX];
 
@@ -234,7 +237,6 @@ static char* chpl_launch_create_command(int argc, char* argv[],
   char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME");
   char* errorfn = getenv("CHPL_LAUNCHER_SLURM_ERROR_FILENAME");
   char* nodeAccessEnv = getenv("CHPL_LAUNCHER_NODE_ACCESS");
-  char* gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
   char* memEnv = getenv("CHPL_LAUNCHER_MEM");
   const char* nodeAccessStr = NULL;
   const char* memStr = NULL;
@@ -304,6 +306,11 @@ static char* chpl_launch_create_command(int argc, char* argv[],
     exclude = getenv("CHPL_LAUNCHER_EXCLUDE");
   }
 
+  // command line gpus per node takes precedence over env var
+  if (!gpusPerNode) {
+    gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
+  }
+
   reservation = getenv("SLURM_RESERVATION");
 
   // request exclusive node access by default, but allow user to override
@@ -407,6 +414,11 @@ static char* chpl_launch_create_command(int argc, char* argv[],
       fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude);
     }
 
+    // Set the gpus per node if a non-empty value was specified
+    if (gpusPerNode && strlen(gpusPerNode) > 0) {
+      fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode);
+    }
+
     // If needed a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT
     if (constraint) {
       fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint);
@@ -417,11 +429,6 @@ static char* chpl_launch_create_command(int argc, char* argv[],
       fprintf(slurmFile, "#SBATCH --account=%s\n", account);
     }
 
-    // set gpus-per-node if one was provided
-    if (gpusPerNode && strlen(gpusPerNode) > 0) {
-      fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode);
-    }
-
     // set the output file name to either the user specified
     // name or to the binaryName.<jobID>.out if none specified
     if (outputfn != NULL) {
@@ -556,6 +563,12 @@ static char* chpl_launch_create_command(int argc, char* argv[],
       len += snprintf(iCom+len, sizeof(iCom)-len, "--exclude=%s ", exclude);
     }
 
+    // Set the gpus per node if a non-empty value was specified
+    if (gpusPerNode && strlen(gpusPerNode) > 0) {
+      len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ",
+                      gpusPerNode);
+    }
+
     // set any constraints
     if (constraint) {
       len += snprintf(iCom+len, sizeof(iCom)-len, "--constraint=%s ", constraint);
@@ -566,11 +579,6 @@ static char* chpl_launch_create_command(int argc, char* argv[],
       len += snprintf(iCom+len, sizeof(iCom)-len, "--account=%s ", account);
     }
 
-    // set gpus-per-node if one was provided
-    if (gpusPerNode && strlen(gpusPerNode) > 0) {
-      len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ", gpusPerNode);
-    }
-
     // add the (possibly wrapped) binary name
     len += snprintf(iCom+len, sizeof(iCom)-len, "%s %s",
         chpl_get_real_binary_wrapper(), chpl_get_real_binary_name());
@@ -685,6 +693,15 @@ int chpl_launch_handle_arg(int argc, char* argv[], int argNum,
     return 1;
   }
 
+  // handle --gpus-per-node <gpus> or --gpus-per-node=<gpus>; the '=' is part of the matched prefix
+  if (!strcmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG)) {
+    gpusPerNode = argv[argNum+1];
+    return 2;
+  } else if (!strncmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG"=", strlen(CHPL_GPUS_PER_NODE_FLAG"="))) {
+    gpusPerNode = &(argv[argNum][strlen(CHPL_GPUS_PER_NODE_FLAG)+1]);
+    return 1;
+  }
+
   // handle --generate-sbatch-script
   if (!strcmp(argv[argNum], CHPL_GENERATE_SBATCH_SCRIPT)) {
     generate_sbatch_script = 1;
@@ -729,6 +746,12 @@ const argDescTuple_t* chpl_launch_get_help(void) {
       { "",
         "(or use $CHPL_LAUNCHER_EXCLUDE)"
       },
+      { CHPL_GPUS_PER_NODE_FLAG " <gpus>",
+        "specify the number of GPUs per node"
+      },
+      { "",
+        "(or use $CHPL_LAUNCHER_GPUS_PER_NODE)"
+      },
       { NULL, NULL },
     };
   return args;