BF: CS-577 job requesting a host resource with different amount for master and slave sometimes is not scheduled
jgabler-hpc committed Sep 12, 2024
1 parent d207e4d commit b1be9f5
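
Context for the diff below: while matching a parallel job, the scheduler tracks per queue instance which role it can still serve via bit flags in the QU_tagged4schedule attribute (master now or later, slave now or later). When a job requests a host-level resource with different amounts for master and slave, the master task can be ruled out during host matching, where no queue element is at hand (queue == nullptr); the old code nevertheless cleared the flags directly on that queue element, so the queue instances of the rejected host could keep their master tags, which appears to be why such jobs sometimes were not scheduled. The fix routes all clearing through a new helper, host_or_queue_clear_tags(), which falls back to clearing every queue instance on the host, and adds checks that skip queues already tagged as unsuitable for the scope being matched. A minimal sketch of the bitmask semantics this relies on; the bit values, simplified types, and any-bit matching below are illustrative assumptions, not the real cull API:

#include <cstdint>

// Illustrative stand-ins for the scheduler's tag bits (assumed values).
constexpr uint32_t TAG4SCHED_MASTER       = 1u << 0; // can run the master task now
constexpr uint32_t TAG4SCHED_MASTER_LATER = 1u << 1; // can run the master task at a later time
constexpr uint32_t TAG4SCHED_SLAVE        = 1u << 2; // can run slave tasks now
constexpr uint32_t TAG4SCHED_SLAVE_LATER  = 1u << 3; // can run slave tasks at a later time

struct QInstance {
   uint32_t tagged4schedule = 0; // stands in for the QU_tagged4schedule attribute
};

// mirrors lClearUlongBitMask(qep, QU_tagged4schedule, mask): drop the given bits
void clear_tags(QInstance &q, uint32_t mask) {
   q.tagged4schedule &= ~mask;
}

// mirrors lMatchUlongBitMask(qep, QU_tagged4schedule, mask); any-bit
// semantics inferred from the checks in this diff
bool match_tags(const QInstance &q, uint32_t mask) {
   return (q.tagged4schedule & mask) != 0;
}
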
Showing 1 changed file with 73 additions and 25 deletions.
98 changes: 73 additions & 25 deletions source/libs/sched/sge_select_queue.cc
@@ -3969,7 +3969,19 @@ parallel_tag_queues_suitable4job(sge_assignment_t *a, category_use_t *use_catego
}
u_long32 host_seq_no = 0;
for_each_rw (qep, a->queue_list) {
- DPRINTF("AFTER SORT: %s soft violations %d\n", lGetString(qep, QU_full_name), lGetUlong(qep, QU_soft_violation));
+ /* @todo verify: if all.q is disabled or job requests queue != all.q, is it still on top of the list?
+  * looks incorrect to me, the unavailable queues should be at the end!
+  * AFTER SORT: all.q@ubuntu-22-amd64-1: available: 0, soft violations 0
+  * AFTER SORT: all.q@ubuntu-20-amd64-1: available: 0, soft violations 0
+  * AFTER SORT: all.q@solaris-11-1: available: 0, soft violations 0
+  * AFTER SORT: all.q@centos-7-amd64-1: available: 0, soft violations 0
+  * AFTER SORT: all.q@rocky-8-amd64-1: available: 0, soft violations 0
+  * AFTER SORT: all.q@ubuntu-24-amd64-1: available: 0, soft violations 0
+  * AFTER SORT: test.q@solaris-11-1: available: 1, soft violations 0
+  * AFTER SORT: test.q@centos-7-amd64-1: available: 1, soft violations 0
+  */
+ DPRINTF("AFTER SORT: %s: available: " sge_uu32 ", soft violations %d\n", lGetString(qep, QU_full_name),
+         lGetUlong(qep, QU_tag), lGetUlong(qep, QU_soft_violation));
if (lGetUlong(qep, QU_tag) == 0) {
continue;
}
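
Regarding the @todo just added above: the quoted debug output shows unavailable queues (available: 0) sorted to the top of the list. Below is a sketch of the ordering the comment argues for, over simplified stand-in fields (the scheduler keeps the availability tag in QU_tag and the soft violation count in QU_soft_violation); it illustrates the expected order, not the scheduler's actual sort routine:

#include <algorithm>
#include <cstdint>
#include <vector>

struct QueueSortKey {
   bool available;           // stands in for QU_tag != 0
   uint32_t soft_violations; // stands in for QU_soft_violation
};

// Available queues first, fewer soft violations first within each group:
// the order under which the loop above would see usable queues right away.
void sort_queues(std::vector<QueueSortKey> &queues) {
   std::sort(queues.begin(), queues.end(),
             [](const QueueSortKey &a, const QueueSortKey &b) {
                if (a.available != b.available) {
                   return a.available > b.available; // available sorts first
                }
                return a.soft_violations < b.soft_violations;
             });
}
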
@@ -4445,7 +4457,19 @@ static void host_clear_qinstance_tags(lList *queue_list, const char *host_name,
for (next_queue = lGetElemHostFirstRW(queue_list, QU_qhostname, host_name, &queue_iterator);
(qep = next_queue);
next_queue = lGetElemHostNextRW(queue_list, QU_qhostname, host_name, &queue_iterator)) {
- lClearUlongBitMask(qep, QU_tagged4schedule, TAG4SCHED_MASTER);
+ lClearUlongBitMask(qep, QU_tagged4schedule, bitmask);
+ print_tagged4schedule(qep);
}
}

+ static void
+ host_or_queue_clear_tags(const char *object_name, lListElem *queue, lList *queue_list, u_long32 bitmask) {
+    if (queue != nullptr) {
+       lClearUlongBitMask(queue, QU_tagged4schedule, bitmask);
+       print_tagged4schedule(queue);
+    } else {
+       // we are coming from host matching, clear tags in all queues on this host
+       host_clear_qinstance_tags(queue_list, object_name, bitmask);
+    }
+ }
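
The new helper above is used from both matching paths later in this diff. A hedged sketch of the two call shapes; the argument expressions are illustrative, pieced together from attributes used elsewhere in this file:

// Queue matching: a concrete qinstance is at hand; only its tags are cleared.
host_or_queue_clear_tags(lGetString(qep, QU_full_name), qep, a->queue_list,
                         TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);

// Host matching: no qinstance yet (queue == nullptr); the helper clears the
// tags of every qinstance on the named host via host_clear_qinstance_tags().
host_or_queue_clear_tags(host_name, nullptr, a->queue_list,
                         TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);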

@@ -5311,7 +5335,7 @@ parallel_queue_slots(sge_assignment_t *a, lListElem *qep, int *slots, int *slots
ar_queue_config_attr = lGetList(ar_queue, QU_consumable_config_list);
ar_queue_actual_attr = lGetList(ar_queue, QU_resource_utilization);

- DPRINTF("verifing AR queue\n");
+ DPRINTF("verifying AR queue\n");
lSetUlong(ar_queue, QU_tagged4schedule, lGetUlong(qep, QU_tagged4schedule));

result = parallel_rc_slots_by_time(a, &qslots, &qslots_qend,
@@ -5323,7 +5347,7 @@ parallel_queue_slots(sge_assignment_t *a, lListElem *qep, int *slots, int *slots
if (a->is_advance_reservation
|| (((a->pi)?a->pi->par_rqs++:0), result = parallel_rqs_slots_by_time(a, &lslots, &lslots_qend, qep,
need_master, is_master_queue)) == DISPATCH_OK) {
- DPRINTF("verifing normal queue\n");
+ DPRINTF("verifying normal queue\n");

SCHED_PROF_INC(a->pi, par_qdyn);

@@ -6425,23 +6449,46 @@ parallel_rc_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend
lListElem *jrs;
for_each_rw (jrs, lGetListRW(a->job, JB_request_set_list)) {
u_long32 scope = lGetUlong(jrs, JRS_scope);
- if (!need_master && scope == JRS_SCOPE_MASTER) {
-    DPRINTF("%s: parallel_rc_slots_by_time() no need to handle master requests\n", object_name);
-    if (is_master_host) {
-       // we already have the master host, need to respect the master usage when looking at slave requests
-       // in a second round for allocation rule round_robin
-       DPRINTF("%s: parallel_rc_slots_by_time() we already have the master task, considering master usage in slave matching\n", object_name);
-       master_usage = lGetListRW(jrs, JRS_hard_resource_list);
-       // @todo only if we actually have master_usage? We might have a master JRS with just queues
-       // @todo is this really correct even if master_usage has different entries than the slave requests?
-       // need to do it per request? Or only if there is some overlap between the master and slave requests?
-       // We have such scenarios in the scope_basic test (-scope master -l int=1 -scope slave -l dbl=2,
-       // and they work fine.
-       master_slot = 1; // already have the master task, need to add it to the possible slave tasks
-       // @todo anything to do about master_slot_qend?
+ if (scope == JRS_SCOPE_MASTER) {
+    if (!need_master) {
+       DPRINTF("%s: parallel_rc_slots_by_time() no need to handle master requests\n", object_name);
+       if (is_master_host) {
+          // we already have the master host, need to respect the master usage when looking at slave requests
+          // in a second round for allocation rule round_robin
+          DPRINTF("%s: parallel_rc_slots_by_time() we already have the master task, considering master usage in slave matching\n", object_name);
+          master_usage = lGetListRW(jrs, JRS_hard_resource_list);
+          // @todo only if we actually have master_usage? We might have a master JRS with just queues
+          // @todo is this really correct even if master_usage has different entries than the slave requests?
+          // need to do it per request? Or only if there is some overlap between the master and slave requests?
+          // We have such scenarios in the scope_basic test (-scope master -l int=1 -scope slave -l dbl=2,
+          // and they work fine.
+          master_slot = 1; // already have the master task, need to add it to the possible slave tasks
+          // @todo anything to do about master_slot_qend?
+       }
+       continue;
+    } else {
+       if (queue != nullptr) {
+          DPRINTF("=====> we are processing master requests and have a queue\n");
+          print_tagged4schedule(queue);
+          if (!lMatchUlongBitMask(queue, QU_tagged4schedule, TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER)) {
+             // we still need the master task, but the queue is already tagged not to match (from exechost matching)
+             DPRINTF("%s: parallel_rc_slots_by_time() master queue already tagged not to match, skipping\n", object_name);
+             continue;
+          }
+       }
+    }
-    continue;
}
+ if (scope == JRS_SCOPE_SLAVE && queue != nullptr) {
+    DPRINTF("=====> we are processing slave requests and have a queue\n");
+    print_tagged4schedule(queue);
+    if (!lMatchUlongBitMask(queue, QU_tagged4schedule, TAG4SCHED_SLAVE | TAG4SCHED_SLAVE_LATER)) {
+       // we still need slave tasks, but the queue is already tagged not to match (from exechost matching)
+       DPRINTF("%s: parallel_rc_slots_by_time() slave queue already tagged not to match, skipping\n", object_name);
+       continue;
+    }
+ }
+ // @todo do the same check as above also for the global requests

DPRINTF("%s: parallel_rc_slots_by_time() testing %s requests\n", object_name, job_scope_name(scope));

lList *requests = lGetListRW(jrs, JRS_hard_resource_list);
@@ -6457,7 +6504,7 @@ parallel_rc_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend
if (scope == JRS_SCOPE_GLOBAL && lGetUlong(req, CE_consumable) == CONSUMABLE_JOB) {
DPRINTF("===> CONSUMABLE_JOB %s does not match - can still use %s %s as slave queue\n",
name, queue == nullptr ? "host" : "qinstance", object_name);
- lClearUlongBitMask(queue, QU_tagged4schedule, TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);
+ host_or_queue_clear_tags(object_name, queue, a->queue_list, TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);
/* misuse of the DISPATCH_MISSING_ATTR
* add a new DISPATCH result, e.g. DISPATCH_HANDLE_CONSUMABLE_JOB
*/
@@ -6474,7 +6521,7 @@ parallel_rc_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend
if (scope == JRS_SCOPE_GLOBAL && lGetUlong(req, CE_consumable) == CONSUMABLE_JOB) {
DPRINTF("===> CONSUMABLE_JOB %s does not match now - can still use %s %s as slave queue (and master later)\n",
name, queue == nullptr ? "host" : "qinstance", object_name);
- lClearUlongBitMask(queue, QU_tagged4schedule, TAG4SCHED_MASTER);
+ host_or_queue_clear_tags(object_name, queue, a->queue_list, TAG4SCHED_MASTER);
} else {
DSTRING_STATIC(dstr, 1024);
const char *req_str = centry_list_append_to_dstring(requests, &dstr);
@@ -6500,7 +6547,7 @@ parallel_rc_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend
break;
case DISPATCH_NOT_AT_TIME:
// not suitable now, but later
- lClearUlongBitMask(queue, QU_tagged4schedule, TAG4SCHED_MASTER);
+ host_or_queue_clear_tags(object_name, queue, a->queue_list, TAG4SCHED_MASTER);
master_usage = requests; // for slave matching need to consider what the master task would consume
if (lGetBool (a->pe, PE_job_is_first_task)) {
master_slot = 0;
@@ -6509,9 +6556,10 @@ parallel_rc_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend
break;
case DISPATCH_NEVER_CAT:
case DISPATCH_NEVER_JOB:
- lClearUlongBitMask(queue, QU_tagged4schedule, TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);
+ DPRINTF(" --> we matched the master requests, -> never_cat or never_job, clearing master tags\n");
+ host_or_queue_clear_tags(object_name, queue, a->queue_list, TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);
master_slot = master_slot_qend = 0; // no master slot
- // continue with slave tasks, they might match and we can use the queue as slave only queue
+ // continue with slave tasks, they might match, and we can use the queue as slave only queue
continue;
default:
master_slot = master_slot_qend = 0; // no master slot
@@ -6571,7 +6619,7 @@ parallel_rc_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend
max_slots_qend = MIN(max_slots_qend, avail_qend);
// only suitable for slave tasks
// @todo we did this already above, why repeat it? Can it get overwritten in between?
- lClearUlongBitMask(queue, QU_tagged4schedule, TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);
+ host_or_queue_clear_tags(object_name, queue, a->queue_list, TAG4SCHED_MASTER | TAG4SCHED_MASTER_LATER);
} else {
DPRINTF("%s: parallel_rc_slots_by_time(%s) <never found>\n", object_name, name);
*slots = *slots_qend = 0;
