This repository has been archived by the owner on Jul 19, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nccl_communicator.cc
95 lines (81 loc) · 2.65 KB
/
nccl_communicator.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include <iostream>
#include <stdexcept>
#include <string>
#include <nccl.h>
#include <absl/strings/str_format.h>
#include "communicator.h"
#include "kvs.h"
#include "nccl_communicator.h"
namespace elf {
void nccl_check(const char *filename, int lineno, ncclResult_t result) {
if (result) {
throw std::runtime_error(absl::StrFormat(
"%s:%d: ncclResult_t(%d): %s", filename, lineno, result, ncclGetErrorString(result)));
}
}
// Evaluates the NCCL call `expr` and forwards its status to nccl_check()
// together with the file and line of the call site, so a failure surfaces as
// an exception that names where it happened.
#define NCCL_CHECK(expr) nccl_check(__FILE__, __LINE__, expr)
/// Maps a Communicator element type onto the matching NCCL data type.
///
/// @param type  Element type requested by the caller.
/// @return The ncclDataType_t with the same signedness and width.
/// @throws std::runtime_error if `type` is not one of the handled values.
ncclDataType_t comm_type_to_nccl(Communicator::DataType type) {
    switch (type) {
        // Signed integers.
        case Communicator::i8:
            return ncclInt8;
        case Communicator::i32:
            return ncclInt32;
        case Communicator::i64:
            return ncclInt64;
        // Unsigned integers.
        case Communicator::u8:
            return ncclUint8;
        case Communicator::u32:
            return ncclUint32;
        case Communicator::u64:
            return ncclUint64;
        // Floating point.
        case Communicator::f32:
            return ncclFloat32;
        case Communicator::f64:
            return ncclFloat64;
        default:
            break;
    }
    // Only reached for values without a mapping above.
    throw std::runtime_error(absl::StrFormat("invalid type %d", type));
}
class NcclCommunicator : public Communicator {
KeyValueStore *kvs;
const std::string identifier;
const int rank;
const int size;
ncclComm_t comm;
public:
NcclCommunicator(KeyValueStore *kvs, const std::string &identifier, int rank, int size)
: kvs(kvs), identifier(identifier), rank(rank), size(size) {
init();
}
~NcclCommunicator() {
ncclCommDestroy(comm);
}
void
allreduce(const void *src, void *dst, size_t count, Communicator::DataType datatype) override {
NCCL_CHECK(ncclAllReduce(src, dst, count, comm_type_to_nccl(datatype), ncclSum, comm, 0));
}
void broadcast(const void *src,
void *dst,
int root,
size_t count,
Communicator::DataType datatype) override {
NCCL_CHECK(ncclBroadcast(src, dst, count, comm_type_to_nccl(datatype), root, comm, 0));
}
private:
void init() {
ncclUniqueId nccl_id;
if (rank == 0) {
NCCL_CHECK(ncclGetUniqueId(&nccl_id));
kvs->set(identifier, std::string(nccl_id.internal, NCCL_UNIQUE_ID_BYTES));
} else {
std::string id_str = kvs->get(identifier).get();
memcpy(nccl_id.internal, id_str.c_str(), NCCL_UNIQUE_ID_BYTES);
}
NCCL_CHECK(ncclCommInitRank(&comm, size, nccl_id, rank));
}
};
/// Factory for the NCCL-backed Communicator.
///
/// Forwards all arguments to the NcclCommunicator constructor and returns the
/// new object through the Communicator base interface. Any exception thrown
/// by that constructor propagates to the caller.
std::unique_ptr<Communicator>
create_nccl_communicator(KeyValueStore *kvs, const std::string &identifier, int rank, int size) {
    std::unique_ptr<Communicator> communicator =
        std::make_unique<NcclCommunicator>(kvs, identifier, rank, size);
    return communicator;
}
} // namespace elf