From 0b5dfada30c1ced6b99a3c042c44027ad4c85b9f Mon Sep 17 00:00:00 2001
From: Zach Newell
Date: Mon, 3 Jun 2024 15:40:05 +0000
Subject: [PATCH] Added retries to EHOSTUNREACH socket error.

A connection attempt that fails with EHOSTUNREACH (no route to host) is now
retried, both when connect() reports it directly (socketStartConnect) and
when it is reported through SO_ERROR after polling (socketPollConnect),
instead of putting the socket in the error state on the first occurrence.

The attempt limit defaults to RETRY_NO_ROUTE_TIMES and can be overridden with
NCCL_NO_ROUTE_RETRY_COUNT. A value that is not a positive integer falls back
to the default.
---
Review notes (this section is ignored by git am):
 * Fixed the "socker" typo in the subject.
 * The limit is now read inside the EHOSTUNREACH branches. Previously it was
   read at the top of both functions: in socketStartConnect that placed a
   getenv/log call between connect() and the errno checks, and in both
   functions it logged on every call when the variable was set.
 * NCCL_NO_ROUTE_RETRY_COUNT is parsed with strtol() and validated. atoi()
   turned garbage into 0, and with the == comparison a limit of 0 or less
   could not be reached.
 * The limit check uses >= instead of ==.
 * socketPollConnect sets ncclSocketStateConnecting when retrying, as the
   retry branch just above it does, and its messages no longer claim to come
   from socketStartConnect.
 * Whitespace in context lines was reconstructed from a whitespace-collapsed
   copy of this patch; use "git am --ignore-whitespace" if it does not match.
   The post-image blob id on the socket.cc index line predates these changes.

 src/include/socket.h |  2 ++
 src/misc/socket.cc   | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/src/include/socket.h b/src/include/socket.h
index 60a413875..519e8e265 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -20,6 +20,7 @@
 #define SLEEP_INT 1000 // connection retry sleep interval in usec
 #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
 #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
+#define RETRY_NO_ROUTE_TIMES 3 // connection no route to host retry times (each one can take 20s)
 #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
 #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
 
@@ -57,6 +58,7 @@ struct ncclSocket {
   int acceptFd;
   int timedOutRetries;
   int refusedRetries;
+  int noRouteRetries;
   union ncclSocketAddress addr;
   volatile uint32_t* abortFlag;
   int asyncFlag;
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
index 6e9fb0790..05620478d 100644
--- a/src/misc/socket.cc
+++ b/src/misc/socket.cc
@@ -98,6 +98,29 @@ static int envSocketFamily(void) {
   return family;
 }
 
+/* Return how many connect attempts are allowed when a connection fails with
+ * EHOSTUNREACH (no route to host). Defaults to RETRY_NO_ROUTE_TIMES; override
+ * with NCCL_NO_ROUTE_RETRY_COUNT. A value that is not a positive decimal
+ * integer fitting in an int falls back to the default, so the result is >= 1.
+ * May log and may modify errno: call it only after errno has been consumed. */
+static int envNoRouteRetryCount(void) {
+  const char* env = ncclGetEnv("NCCL_NO_ROUTE_RETRY_COUNT");
+  if (env == NULL)
+    return RETRY_NO_ROUTE_TIMES;
+
+  char* end = NULL;
+  long value = strtol(env, &end, 10);
+  int retries = (int)value;
+  /* The round-trip cast rejects values that do not fit in an int. */
+  if (end == env || *end != '\0' || value < 1 || (long)retries != value) {
+    WARN("NCCL_NO_ROUTE_RETRY_COUNT=%s is not a positive integer, using default %d", env, RETRY_NO_ROUTE_TIMES);
+    return RETRY_NO_ROUTE_TIMES;
+  }
+
+  INFO(NCCL_ENV, "NCCL_NO_ROUTE_RETRY_COUNT set by environment to %s", env);
+  return retries;
+}
+
 static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
 #ifdef ENABLE_TRACE
   char line[SOCKET_NAME_MAXLEN+1];
@@ -478,6 +501,19 @@ static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
     }
     usleep(SLEEP_INT);
     return ncclSuccess;
+  } else if (errno == EHOSTUNREACH) {
+    /* Fetch the limit only here: the lookup may log and touch errno, so it
+     * must not run between connect() and the errno checks above. */
+    int maxNoRouteRetries = envNoRouteRetryCount();
+    /* >= (not ==) so a limit below the current count cannot retry forever. */
+    if (++sock->noRouteRetries >= maxNoRouteRetries) {
+      sock->state = ncclSocketStateError;
+      WARN("socketStartConnect: exceeded no route retries (%d/%d)", sock->noRouteRetries, maxNoRouteRetries);
+      return ncclRemoteError;
+    }
+    INFO(NCCL_ALL, "socketStartConnect: no route retry (%d/%d)", sock->noRouteRetries, maxNoRouteRetries);
+    usleep(SLEEP_INT);
+    return ncclSuccess;
   } else {
     char line[SOCKET_NAME_MAXLEN+1];
     sock->state = ncclSocketStateError;
@@ -527,6 +563,18 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
     }
     usleep(SLEEP_INT);
     sock->state = ncclSocketStateConnecting;
+  } else if (ret == EHOSTUNREACH) {
+    int maxNoRouteRetries = envNoRouteRetryCount();
+    /* >= (not ==) so a limit below the current count cannot retry forever. */
+    if (++sock->noRouteRetries >= maxNoRouteRetries) {
+      sock->state = ncclSocketStateError;
+      WARN("socketPollConnect: exceeded no route retries (%d/%d)", sock->noRouteRetries, maxNoRouteRetries);
+      return ncclRemoteError;
+    }
+    INFO(NCCL_ALL, "socketPollConnect: no route retry (%d/%d)", sock->noRouteRetries, maxNoRouteRetries);
+    usleep(SLEEP_INT);
+    /* Request a new connect attempt, like the retry branch just above. */
+    sock->state = ncclSocketStateConnecting;
   } else if (ret != EINPROGRESS) {
     sock->state = ncclSocketStateError;
     char line[SOCKET_NAME_MAXLEN+1];
@@ -698,6 +746,7 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
   if (sock == NULL) goto exit;
   sock->timedOutRetries = 0;
   sock->refusedRetries = 0;
+  sock->noRouteRetries = 0;
   sock->abortFlag = abortFlag;
   sock->asyncFlag = asyncFlag;
   sock->state = ncclSocketStateInitialized;