-
Notifications
You must be signed in to change notification settings - Fork 16
/
Dockerfile.ubuntu20
202 lines (182 loc) · 8.73 KB
/
Dockerfile.ubuntu20
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# CUDA base image. CUDA_VERSION_MINOR is the full x.y.z release used in the
# image tag. ARGs declared before FROM are only visible to FROM lines.
ARG CUDA_VERSION_MINOR=11.7.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-cudnn8-devel-ubuntu20.04
FROM ${BASE_IMAGE} AS base

# CUDA_VERSION_MAJOR (x.y) and TARGET_NCCL_VERSION are combined below into the
# pinned NCCL package version "<nccl>+cuda<x.y>".
ARG CUDA_VERSION_MAJOR=11.7
ARG TARGET_NCCL_VERSION=2.14.3-1
# Declared as ARG (not ENV) so it is a build-time setting only.
ARG DEBIAN_FRONTEND=noninteractive

# Toolchain, networking/debug utilities, DCGM and the pinned NCCL packages.
# --allow-downgrades / --allow-change-held-packages let apt replace the NCCL
# version shipped in the base image with the pinned one.
# The package lists are removed in the same layer so they do not stay in the image.
RUN apt-get -qq update && \
    apt-get -qq install -y \
        --allow-change-held-packages \
        --no-install-recommends \
        --allow-downgrades \
        build-essential libtool autoconf automake autotools-dev unzip \
        ca-certificates \
        wget curl openssh-server vim environment-modules \
        iputils-ping net-tools \
        libnuma1 libsubunit0 libpci-dev \
        libpmix-dev \
        datacenter-gpu-manager \
        libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        git && \
    rm -rf /var/lib/apt/lists/*
# Mellanox OFED (latest) userspace libraries from the vendor apt repository.
# NOTE: the "latest" repository path is a moving target, so rebuilds may pick
# up newer package versions.
# The signing key is downloaded to a file first instead of being piped into
# apt-key, so a failed download aborts the build rather than being masked by
# the pipeline's exit status.
RUN wget -qO /tmp/RPM-GPG-KEY-Mellanox https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox && \
    apt-key add /tmp/RPM-GPG-KEY-Mellanox && \
    rm /tmp/RPM-GPG-KEY-Mellanox
# Register the repository definition with apt.
RUN wget -q -P /etc/apt/sources.list.d/ \
        https://linux.mellanox.com/public/repo/mlnx_ofed/latest/ubuntu20.04/mellanox_mlnx_ofed.list
# Install the verbs / umad / rdmacm libraries and diagnostics, then drop the
# package lists in the same layer.
RUN apt-get -qq update \
    && apt-get -qq install -y --no-install-recommends \
        ibverbs-utils libibverbs-dev libibumad3 libibumad-dev librdmacm-dev rdmacm-utils infiniband-diags \
    && rm -rf /var/lib/apt/lists/*
# mlnx-ofed-hpc-user-only
# InfiniBand perftest built with CUDA support (pointed at cuda.h for GDR).
# The source is pinned to a specific commit of the coreweave fork; the hash is
# an ENV, so it is also visible in the final image's environment.
ENV PERFTEST_VERSION_HASH=5b47ede
RUN mkdir /tmp/build \
    && git clone https://github.com/coreweave/perftest /tmp/build/perftest \
    && cd /tmp/build/perftest \
    && git checkout $PERFTEST_VERSION_HASH \
    && ./autogen.sh \
    && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h \
    && make install \
    # Leave the build tree before deleting it so nothing is left in the layer
    && cd /tmp \
    && rm -r /tmp/build
# Build the bandwidthTest utility from NVIDIA's cuda-samples and install it
# into /usr/bin.
# CUDA_SAMPLES_VERSION has no default and must be passed with --build-arg; it
# is the cuda-samples release tag without the leading "v".
ARG CUDA_SAMPLES_VERSION
# The first command fails fast with a clear message when the build-arg is
# missing. curl -f turns an HTTP error (e.g. unknown tag) into a build failure
# instead of saving the error page as master.zip.
RUN : "${CUDA_SAMPLES_VERSION:?the CUDA_SAMPLES_VERSION build-arg must be set}" && \
    mkdir /tmp/build && \
    cd /tmp/build && \
    curl -fsSLo master.zip https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v${CUDA_SAMPLES_VERSION}.zip && \
    unzip master.zip && \
    cd cuda-samples-${CUDA_SAMPLES_VERSION}/Samples/1_Utilities/bandwidthTest && \
    make && \
    install bandwidthTest /usr/bin/ && \
    cd /tmp && \
    rm -r /tmp/build
# HPC-X: unpack the distribution tarball and relocate it to /opt/hpcx.
# The unpacked tree contains text files (pkg-config files, libtool archives,
# CMake configs) that hard-code the original /build-result/<distribution>
# prefix. grep lists the non-binary files containing that prefix and sed
# rewrites it to the final install directory before the tree is moved.
ARG HPCX_DISTRIBUTION="hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.16-x86_64"
RUN cd /tmp \
    && export HPCX_DIR="/opt/hpcx" \
    && wget -qO- https://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz \
        | tar -xjf - \
    && grep -IrlF "/build-result/${HPCX_DISTRIBUTION}" ${HPCX_DISTRIBUTION} \
        | xargs -rd'\n' sed -i -e "s:/build-result/${HPCX_DISTRIBUTION}:${HPCX_DIR}:g" \
    && mv ${HPCX_DISTRIBUTION} ${HPCX_DIR}
# Separate build stage that produces the GDRCopy .deb packages. Only the
# resulting packages are collected in /tmp/gdrcopy for later stages to copy.
FROM base AS gdrcopy
# GDRCopy release to build; used for the source tarball, the unpacked
# directory name and the names of the generated packages.
ARG GDRCOPY_VERSION=2.4
# Debian packaging toolchain needed by the GDRCopy packaging script.
RUN apt-get -qq update && \
    apt-get -qq install -y --no-install-recommends \
        build-essential devscripts debhelper fakeroot pkg-config check && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# GDRCopy userspace components.
# NOTE(review): -k presumably skips the kernel-module (dkms) package; confirm
# against the usage text of build-deb-packages.sh.
RUN mkdir /tmp/build /tmp/gdrcopy && \
    cd /tmp/build && \
    wget -qO- "https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz" | tar xzf - && \
    CUDA=/usr/local/cuda ./gdrcopy-${GDRCOPY_VERSION}/packages/build-deb-packages.sh -k && \
    mv ./gdrcopy-tests_${GDRCOPY_VERSION}*.deb ./libgdrapi_${GDRCOPY_VERSION}*.deb /tmp/gdrcopy/ && \
    cd /tmp && \
    rm -r /tmp/build
# Final image: start again from the base stage and install the GDRCopy
# packages that the gdrcopy stage left in /tmp/gdrcopy.
FROM base
COPY --from=gdrcopy /tmp/gdrcopy /tmp/gdrcopy/
# Install every copied package, then remove the staging directory.
RUN dpkg -i /tmp/gdrcopy/*.deb \
    && rm -r /tmp/gdrcopy
# HPC-X Environment variables
# This section persists the environment set up by HPC-X's init script into
# three system files, so it is available outside of this build step:
#   /etc/profile.d/hpcx-env.sh  - output of "printpaths.sh export"
#   /etc/sudoers.d/hpcx-env     - env_keep directives for every variable whose
#                                 name does not contain PATH
#   /etc/ld.so.conf.d/hpcx.conf - the LD_LIBRARY_PATH entries, one per line
# printpaths.sh is a helper script from the build context (its contents are
# not part of this file); it is removed again at the end of the RUN step.
COPY ./printpaths.sh /tmp
# bash is required for this step because it uses the "source" builtin.
# NOTE(review): the shell is started without "-o pipefail", so a failure of
# printpaths.sh on the left-hand side of the pipelines below would not fail
# the build.
SHELL ["/bin/bash", "-c"]
# NOTE(review): bash only expands aliases in non-interactive shells when
# expand_aliases is enabled, so the "alias install=..." below likely has no
# effect here - confirm whether the --owner/--group options are actually needed.
# Lines starting with "#" inside this multi-line RUN (including those inside
# the quoted sed script) are removed by the Dockerfile parser before the
# command reaches the shell.
RUN source /opt/hpcx/hpcx-init.sh && \
hpcx_load && \
# Uncomment to stop a run early with the ENV definitions for the below section
# /tmp/printpaths.sh ENV && false && \
# Preserve environment variables in new login shells \
alias install='install --owner=0 --group=0' && \
/tmp/printpaths.sh export \
| install --mode=644 /dev/stdin /etc/profile.d/hpcx-env.sh && \
# Preserve environment variables (except *PATH*) when sudoing
install -d --mode=0755 /etc/sudoers.d && \
/tmp/printpaths.sh \
| sed -E -e '{ \
# Convert NAME=value to just NAME \
s:^([^=]+)=.*$:\1:g ; \
# Filter out any variables with PATH in their names \
/PATH/d ; \
# Format them into /etc/sudoers env_keep directives \
s:^.*$:Defaults env_keep += "\0":g \
}' \
| install --mode=440 /dev/stdin /etc/sudoers.d/hpcx-env && \
# Register shared libraries with ld regardless of LD_LIBRARY_PATH
echo $LD_LIBRARY_PATH | tr ':' '\n' \
| install --mode=644 /dev/stdin /etc/ld.so.conf.d/hpcx.conf && \
rm /tmp/printpaths.sh
# Switch back to the default shell for the remaining instructions.
SHELL ["/bin/sh", "-c"]
# The ENV instructions below were copied from the output of "printpaths.sh ENV".
# To regenerate them, uncomment the "/tmp/printpaths.sh ENV && false" line in
# the HPC-X environment RUN step earlier in this Dockerfile, run a build
# (it stops at that step after printing the definitions), and paste the
# printed output over this section.
# They must be refreshed whenever a different HPC-X release is installed, the
# base image changes, or any earlier step in this Dockerfile alters paths.
# NOTE(review): the OLD_* entries appear to record the values from before
# HPC-X was loaded (each one is the tail of its non-OLD counterpart) - confirm
# against printpaths.sh / hpcx-init.sh.
# Begin auto-generated paths
ENV HPCX_DIR=/opt/hpcx
ENV HPCX_UCX_DIR=/opt/hpcx/ucx
ENV HPCX_UCC_DIR=/opt/hpcx/ucc
ENV HPCX_SHARP_DIR=/opt/hpcx/sharp
ENV HPCX_NCCL_RDMA_SHARP_PLUGIN_DIR=/opt/hpcx/nccl_rdma_sharp_plugin
ENV HPCX_HCOLL_DIR=/opt/hpcx/hcoll
ENV HPCX_MPI_DIR=/opt/hpcx/ompi
ENV HPCX_OSHMEM_DIR=/opt/hpcx/ompi
ENV HPCX_MPI_TESTS_DIR=/opt/hpcx/ompi/tests
ENV HPCX_OSU_DIR=/opt/hpcx/ompi/tests/osu-micro-benchmarks-5.8
ENV HPCX_OSU_CUDA_DIR=/opt/hpcx/ompi/tests/osu-micro-benchmarks-5.8-cuda
ENV HPCX_IPM_DIR=/opt/hpcx/ompi/tests/ipm-2.0.6
ENV HPCX_CLUSTERKIT_DIR=/opt/hpcx/clusterkit
ENV OMPI_HOME=/opt/hpcx/ompi
ENV MPI_HOME=/opt/hpcx/ompi
ENV OSHMEM_HOME=/opt/hpcx/ompi
ENV OPAL_PREFIX=/opt/hpcx/ompi
ENV OLD_PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV PATH=/opt/hpcx/sharp/bin:/opt/hpcx/clusterkit/bin:/opt/hpcx/hcoll/bin:/opt/hpcx/ucc/bin:/opt/hpcx/ucx/bin:/opt/hpcx/ompi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV OLD_LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV LD_LIBRARY_PATH=/opt/hpcx/nccl_rdma_sharp_plugin/lib:/opt/hpcx/ucc/lib/ucc:/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib/ucx:/opt/hpcx/ucx/lib:/opt/hpcx/sharp/lib:/opt/hpcx/hcoll/lib:/opt/hpcx/ompi/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV OLD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs
ENV LIBRARY_PATH=/opt/hpcx/nccl_rdma_sharp_plugin/lib:/opt/hpcx/ompi/lib:/opt/hpcx/sharp/lib:/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib:/opt/hpcx/hcoll/lib:/opt/hpcx/ompi/lib:/usr/local/cuda/lib64/stubs
ENV OLD_CPATH=""
# NOTE(review): CPATH and PKG_CONFIG_PATH end with a trailing ":" (an empty
# last element), exactly as pasted from the generated output - confirm this is
# intended before hand-editing.
ENV CPATH=/opt/hpcx/ompi/include:/opt/hpcx/ucc/include:/opt/hpcx/ucx/include:/opt/hpcx/sharp/include:/opt/hpcx/hcoll/include:
ENV PKG_CONFIG_PATH=/opt/hpcx/hcoll/lib/pkgconfig:/opt/hpcx/sharp/lib/pkgconfig:/opt/hpcx/ucx/lib/pkgconfig:/opt/hpcx/ompi/lib/pkgconfig:
# End of auto-generated paths
# Disable UCX VFS to stop errors about fuse mount failure
ENV UCX_VFS_ENABLE=no
# Rebuild OpenMPI to support SLURM
# The OpenMPI tree shipped with HPC-X is removed and rebuilt from the sources
# bundled in /opt/hpcx/sources, installing back into the same prefix
# (/opt/hpcx/ompi) so the HPC-X environment variables keep pointing at it.
# For Ubuntu 22, we can replace PMI2 (--with-pmi) with PMIx
# --with-pmix=/usr/lib/x86_64-linux-gnu/pmix2
# PMI2 runtime and headers used by --with-pmi. The package lists are removed
# in the same layer so they do not stay in the image.
RUN apt-get -qq update && \
    apt-get -qq install -y --no-install-recommends libpmi2-0 libpmi2-0-dev && \
    rm -rf /var/lib/apt/lists/*
RUN cd /opt/hpcx/sources/ && rm -r /opt/hpcx/ompi && tar -zxvf openmpi-gitclone.tar.gz && cd openmpi-gitclone && \
    ./configure --prefix=/opt/hpcx/ompi \
        --with-hcoll=/opt/hpcx/hcoll --with-ucx=/opt/hpcx/ucx \
        --with-platform=contrib/platform/mellanox/optimized \
        --with-slurm --with-hwloc --with-libevent \
        --with-pmi \
        --without-xpmem --with-cuda --with-ucc=/opt/hpcx/ucc && \
    make -j14 && \
    make -j14 install && \
    # Remove the extracted source tree once the install has finished
    cd .. && \
    rm -r openmpi-gitclone
# NCCL SHARP Plugin (master)
### Disabled as HPC-X has a recent enough version at this time
# RUN cd /tmp && \
# wget -q https://github.com/Mellanox/nccl-rdma-sharp-plugins/archive/refs/heads/master.zip && \
# unzip master.zip && \
# cd nccl-rdma-sharp-plugins-master && \
# ./autogen.sh && \
# ./configure --with-cuda=/usr/local/cuda-${CUDA_VERSION_MAJOR} --prefix=/usr && \
# make && \
# make install && \
# rm /opt/hpcx/nccl_rdma_sharp_plugin/lib/* && \
# rm -r /tmp/*
# NCCL Tests: fetch a pinned commit of NVIDIA/nccl-tests, unpack it directly
# into /opt/nccl-tests (top-level archive directory stripped) and build it
# with MPI support. /opt/nccl_tests is a symlink alias for the same directory.
ENV NCCL_TESTS_COMMITISH=2cbb968
WORKDIR /opt/nccl-tests
RUN wget -qO- https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz \
        | tar -xzf - --strip-components=1 \
    && make MPI=1 \
    && ln -s /opt/nccl-tests /opt/nccl_tests
# Refresh the dynamic linker cache.
RUN ldconfig
# SSH setup for MPI:
#  - client: set StrictHostKeyChecking to "no" and send known-hosts entries to /dev/null
#  - server: set StrictModes to "no"
#  - create the /var/run/sshd directory
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
    && mkdir -p /var/run/sshd