Skip to content

Commit

Permalink
Try to loop on run status
Browse files Browse the repository at this point in the history
  • Loading branch information
cmd-ntrf committed Jan 10, 2025
1 parent 2a4253a commit 561ebff
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
15 changes: 7 additions & 8 deletions src/slurm_autoscale_tfe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import re
import sys
import time

from enum import Enum
from os import environ
Expand Down Expand Up @@ -189,21 +190,19 @@ def main(command, set_op, hostlist):

_, address_prefix = get_instances_from_tfe(tfe_client)
try:
tfe_client.apply(
run_id = tfe_client.apply(
f"Slurm {command.value} {hostlist}".strip(),
targets=[f'module.{address_prefix}["{hostname}"]' for hostname in hosts]
)
except Timeout as exc:
raise AutoscaleException("Connection to Terraform cloud timeout (5s)") from exc
logging.info("%s %s", command.value, hostlist)

# next_instances = set_op(tfe_instances, hosts)
# if tfe_instances != next_instances:
# logging.warning(
# 'TFE state was potentially unchanged following the issue of "%s %s"',
# command.value,
# hostlist,
# )
while tfe_client.get_run_status(run_id) not in ("applied", "discarded", "errored", "canceled", "force_canceled"):
time.sleep(5)

if tfe_client.get_run_status(run_id) == "errored":
raise AutoscaleException("Terraform Cloud errored while applying the change")

if command == Commands.RESUME_FAIL:
change_host_state(hostlist, "IDLE")
9 changes: 8 additions & 1 deletion src/slurm_autoscale_tfe/tfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,13 @@ def apply(self, message, targets):
},
}
}
return requests.post(
resp = requests.post(
RUNS_API, headers=self.headers, json=run_data, timeout=self.timeout
)
return resp.json()["data"]["id"]

def get_run_status(self, run_id):
    """Return the status of the run identified by ``run_id``.

    Sends a GET request to ``RUNS_API/<run_id>`` with the client's headers
    and timeout, then returns the ``data.status`` field of the JSON body.

    Args:
        run_id: Identifier of the run, as a string (it is joined into the
            request URL).

    Returns:
        The value found under ``["data"]["status"]`` in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
    """
    url = "/".join((RUNS_API, run_id))
    resp = requests.get(url, headers=self.headers, timeout=self.timeout)
    # Surface HTTP failures explicitly; otherwise an error response that may
    # not carry a "data" key would fail below with an opaque KeyError.
    resp.raise_for_status()
    return resp.json()["data"]["status"]

0 comments on commit 561ebff

Please sign in to comment.