Skip to content

Commit

Permalink
feat: update values for 1 hour of downtime
Browse files Browse the repository at this point in the history
  • Loading branch information
gmertes committed Nov 5, 2024
1 parent ceb197c commit 77c26e6
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions src/anemoi/training/diagnostics/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,15 @@ def get_mlflow_logger(config: DictConfig) -> None:
 )
 log_hyperparams = False
 
-http_max_retries = config.diagnostics.log.mlflow.get("http_max_retries", 15)
-http_timeout = config.diagnostics.log.mlflow.get("http_timeout", 600)
+# 35 retries allow for 1 hour of server downtime
+http_max_retries = config.diagnostics.log.mlflow.get("http_max_retries", 35)
 
 os.environ["MLFLOW_HTTP_REQUEST_MAX_RETRIES"] = str(http_max_retries)
 os.environ["_MLFLOW_HTTP_REQUEST_MAX_RETRIES_LIMIT"] = str(http_max_retries + 1)
-os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = str(http_timeout)
+
+# these are the default values, but set them explicitly in case they change
+os.environ["MLFLOW_HTTP_REQUEST_BACKOFF_FACTOR"] = "2"
+os.environ["MLFLOW_HTTP_REQUEST_BACKOFF_JITTER"] = "1"
 
 LOGGER.info("AnemoiMLFlow logging to %s", tracking_uri)
 logger = AnemoiMLflowLogger(
Expand Down

0 comments on commit 77c26e6

Please sign in to comment.