Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added retries to EHOSTUNREACH socket error. #1311

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/include/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
#define RETRY_NO_ROUTE_TIMES 3 // connection no route to host retry times (each one can take 20s)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better to have configurable

Copy link
Author

@newellz2 newellz2 Jun 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. Maybe something similar to NCCL_IB_RETRY_CNT? What do you think about making RETRY_REFUSED_TIMES and RETRY_TIMEDOUT_TIMES also configurable via environmental variables?

#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL

Expand Down Expand Up @@ -57,6 +58,7 @@ struct ncclSocket {
int acceptFd;
int timedOutRetries;
int refusedRetries;
int noRouteRetries;
union ncclSocketAddress addr;
volatile uint32_t* abortFlag;
int asyncFlag;
Expand Down
38 changes: 38 additions & 0 deletions src/misc/socket.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,21 @@ static int envSocketFamily(void) {
return family;
}

/* Set the number of retries for no route to host*/
static int envNoRouteRetryCount(void) {
Copy link
Member

@sjeaugey sjeaugey Jun 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to not use the standard NCCL param definition:

NCCL_PARAM(NoRouteRetryCount, "NO_ROUTE_RETRY_COUNT", RETRY_NO_ROUTE_TIMES);

Then call ncclParamNoRouteRetryCount() instead of envNoRouteRetryCount().

int retries = RETRY_NO_ROUTE_TIMES;
const char* env = ncclGetEnv("NCCL_NO_ROUTE_RETRY_COUNT");

if (env == NULL)
return retries;

retries = atoi(env);

INFO(NCCL_ENV, "NCCL_NO_ROUTE_RETRY_COUNT set by environment to %s", env);

return retries;
}

static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
#ifdef ENABLE_TRACE
char line[SOCKET_NAME_MAXLEN+1];
Expand Down Expand Up @@ -456,6 +471,8 @@ static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
/* blocking/non-blocking connect() is determined by asyncFlag. */
int ret = connect(sock->fd, &sock->addr.sa, sock->salen);

int noRouteRetriesCount = envNoRouteRetryCount();

if (ret == 0) {
sock->state = ncclSocketStateConnected;
return ncclSuccess;
Expand All @@ -479,6 +496,15 @@ static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
}
usleep(SLEEP_INT);
return ncclSuccess;
} else if (errno == EHOSTUNREACH) {
if (++sock->noRouteRetries == noRouteRetriesCount) {
sock->state = ncclSocketStateError;
WARN("socketStartConnect: exceeded no route retries (%d/%d)", sock->noRouteRetries, noRouteRetriesCount);
return ncclRemoteError;
}
INFO(NCCL_ALL, "socketStartConnect: no route retry (%d/%d)", sock->noRouteRetries, noRouteRetriesCount);
usleep(SLEEP_INT);
return ncclSuccess;
} else {
char line[SOCKET_NAME_MAXLEN+1];
sock->state = ncclSocketStateError;
Expand All @@ -492,6 +518,8 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
int timeout = 1, ret;
socklen_t rlen = sizeof(int);

int noRouteRetriesCount = envNoRouteRetryCount();

memset(&pfd, 0, sizeof(struct pollfd));
pfd.fd = sock->fd;
pfd.events = POLLOUT;
Expand Down Expand Up @@ -529,6 +557,15 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
}
usleep(SLEEP_INT);
sock->state = ncclSocketStateConnecting;
} else if (ret == EHOSTUNREACH) {
if (++sock->noRouteRetries == noRouteRetriesCount) {
sock->state = ncclSocketStateError;
WARN("socketStartConnect: exceeded no route retries (%d/%d)", sock->noRouteRetries, noRouteRetriesCount);
return ncclRemoteError;
}
INFO(NCCL_ALL, "socketStartConnect: no route retry (%d/%d)", sock->noRouteRetries, noRouteRetriesCount);
usleep(SLEEP_INT);
return ncclSuccess;
} else if (ret != EINPROGRESS) {
sock->state = ncclSocketStateError;
char line[SOCKET_NAME_MAXLEN+1];
Expand Down Expand Up @@ -700,6 +737,7 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
if (sock == NULL) goto exit;
sock->timedOutRetries = 0;
sock->refusedRetries = 0;
sock->noRouteRetries = 0;
sock->abortFlag = abortFlag;
sock->asyncFlag = asyncFlag;
sock->state = ncclSocketStateInitialized;
Expand Down