Skip to content

Commit acd3ada

Browse files
authored
Merge pull request #68 from coreweave/es/blas
feat(torch): Build with AOCL-BLAS and AOCL-LAPACK, and HPC-X v2.19
2 parents 696032c + 79b3cb5 commit acd3ada

File tree

2 files changed

+98
-17
lines changed

2 files changed

+98
-17
lines changed

.github/configurations/torch-nccl.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,28 @@ image:
33
- cuda: 12.2.2
44
os: ubuntu22.04
55
nccl: 2.19.3-1
6-
nccl-tests-hash: 868dc3d
6+
nccl-tests-hash: 85f9143
77
- cuda: 12.1.1
88
os: ubuntu22.04
99
nccl: 2.18.3-1
10-
nccl-tests-hash: 868dc3d
10+
nccl-tests-hash: 85f9143
1111
- cuda: 12.0.1
1212
os: ubuntu22.04
1313
nccl: 2.18.5-1
14-
nccl-tests-hash: 868dc3d
14+
nccl-tests-hash: 85f9143
1515
# Ubuntu 20.04
1616
- cuda: 12.2.2
1717
os: ubuntu20.04
1818
nccl: 2.21.5-1
19-
nccl-tests-hash: 027b52a
19+
nccl-tests-hash: 85f9143
2020
- cuda: 12.1.1
2121
os: ubuntu20.04
2222
nccl: 2.18.3-1
23-
nccl-tests-hash: 868dc3d
23+
nccl-tests-hash: 85f9143
2424
- cuda: 12.0.1
2525
os: ubuntu20.04
2626
nccl: 2.19.3-1
27-
nccl-tests-hash: 868dc3d
27+
nccl-tests-hash: 85f9143
2828
- cuda: 11.8.0
2929
os: ubuntu20.04
3030
nccl: 2.16.5-1

torch/Dockerfile

Lines changed: 92 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ ARG BUILD_TRITON_VERSION=""
99
ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
1010
# 8.7 is supported in the PyTorch main branch, but not 2.0.0
1111

12+
# AOCL settings declared before the first FROM so that stages can opt in by
# re-declaring them: the install prefix, the release version, and the URL of
# the release tarball.
ARG AOCL_BASE=/opt/aocl
ARG AOCL_VER=4.2.0
ARG AOCL_URL=https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-aocc-4.2.0.tar.gz
15+
1216
# Clone PyTorch repositories independently from all other build steps
1317
# for cache-friendliness and parallelization
1418
FROM alpine/git:2.40.1 as downloader-base
@@ -60,6 +64,30 @@ RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \
6064
mkdir triton; \
6165
fi;
6266

67+
# Download the AOCL release and install the BLAS (blis), LAPACK (libflame)
# and utils libraries under ${AOCL_BASE}, so that later stages only need to
# COPY that one directory.
FROM alpine/curl:8.7.1 AS aocl-downloader
WORKDIR /tmp/install

# bash is added before running the AOCL installer script below.
# NOTE(review): presumably install.sh needs bash rather than ash; confirm.
RUN apk add --no-cache bash

# Re-declare the global build arguments to make them visible in this stage.
ARG AOCL_BASE
ARG AOCL_VER
ARG AOCL_URL

# `set -o pipefail` makes a failed download abort the build instead of being
# masked by the exit status of `tar` at the end of the pipeline.
#
# Steps:
#   1. Stream the release tarball into the working directory.
#   2. Install each library in LP64 mode under ${AOCL_BASE}.
#   3. Source amd-libs.cfg (expected to define AOCL_ROOT; TODO confirm) and
#      remove the ILP64 headers and libraries.
#   4. Expose the config file, headers and libraries at stable paths
#      directly under ${AOCL_BASE}.
#   5. Write an ld.so.conf.d-style snippet listing the library directory.
#   6. Remove the unpacked installer files from the working directory.
RUN set -o pipefail && \
    curl -sSfo- "${AOCL_URL}" | tar xzf - --strip-components 1 && \
    INSTALL_LIB() { ./install.sh -l "$1" -t "${AOCL_BASE}" -i lp64; } && \
    INSTALL_LIB blis && \
    INSTALL_LIB libflame && \
    INSTALL_LIB utils && \
    . ./amd-libs.cfg && \
    rm -r "${AOCL_ROOT}/include_ILP64" && \
    rm -r "${AOCL_ROOT}/lib_ILP64" && \
    ln -s "${AOCL_ROOT}/amd-libs.cfg" "${AOCL_BASE}/amd-libs.cfg" && \
    ln -s "${AOCL_ROOT}/include" "${AOCL_BASE}/include" && \
    ln -s "${AOCL_ROOT}/lib" "${AOCL_BASE}/lib" && \
    echo "${AOCL_BASE}/lib" \
      | install -m 0644 /dev/stdin "${AOCL_BASE}/aocl.conf" && \
    rm -r ./*
90+
6391

6492
## Build PyTorch on a builder image.
6593
FROM ${BUILDER_BASE_IMAGE} as builder
@@ -70,12 +98,14 @@ ARG BUILD_CCACHE_SIZE="1Gi"
7098
# ninja-build, ccache, and lld are optional but improve the build
7199
# Installs the base build dependencies, upgrades pip, and registers the
# `python` and `pip` alternatives. The added lines also install libomp5,
# create an unversioned libomp.so symlink pointing at libomp.so.5, and run
# ldconfig afterwards.
# NOTE(review): the symlink path hard-codes the x86_64 multiarch directory;
# confirm that no other architecture is built from this file.
RUN apt-get -qq update && apt-get -qq install -y \
72100
libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \
73-
libpng-dev libjpeg-dev pkg-config python3-distutils \
101+
libomp5 libpng-dev libjpeg-dev pkg-config python3-distutils \
74102
build-essential ninja-build && \
75103
apt-get clean && \
76104
/usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
77105
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
78-
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
106+
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
107+
ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \
108+
ldconfig
79109

80110
RUN mkdir /tmp/ccache-install && \
81111
cd /tmp/ccache-install && \
@@ -116,6 +146,37 @@ RUN CODENAME="$(lsb_release -cs)" && \
116146
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
117147
update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
118148

149+
# AOCL-BLAS and AOCL-LAPACK, copied in from the aocl-downloader stage.
# Reference: https://www.amd.com/en/developer/aocl/dense.html
ARG AOCL_BASE
COPY --from=aocl-downloader "${AOCL_BASE}" "${AOCL_BASE}"

# Register the AOCL library directory with the dynamic linker: place
# aocl.conf in ld.so.conf.d, then rebuild the linker cache.
RUN install -t /etc/ld.so.conf.d -m 0644 "${AOCL_BASE}/aocl.conf" \
    && ldconfig

# Search paths for the AOCL headers and libraries, set only in the build
# stage. Sourcing "${AOCL_BASE}/amd-libs.cfg" in every compiling RUN step
# would also work, but setting them once here means no step can forget it.
#
# LD_LIBRARY_PATH is set in addition to LIBRARY_PATH because PyTorch's
# FindLAPACK logic, during CMake configuration, only looks for LAPACK in:
#   - /usr/local/lib
#   - /usr/lib
#   - /usr/local/lib64
#   - /usr/lib64
#   - /usr/lib/aarch64-linux-gnu
#   - the directories listed in $LD_LIBRARY_PATH
# It consults neither $LIBRARY_PATH nor ld's normally configured paths.
# See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindLAPACK.cmake#L56-L59
ENV LD_LIBRARY_PATH="${AOCL_BASE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
    LIBRARY_PATH="${AOCL_BASE}/lib${LIBRARY_PATH:+:$LIBRARY_PATH}" \
    C_INCLUDE_PATH="${AOCL_BASE}/include${C_INCLUDE_PATH:+:$C_INCLUDE_PATH}" \
    CPLUS_INCLUDE_PATH="${AOCL_BASE}/include${CPLUS_INCLUDE_PATH:+:$CPLUS_INCLUDE_PATH}"
179+
119180
RUN mkdir /build /build/dist
120181
WORKDIR /build
121182
COPY --chmod=755 effective_cpu_count.sh .
@@ -190,6 +251,19 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
190251
#
191252
# This step is itself cacheable as long as the downloaded files (and ARCH_LIST)
192253
# remain the same.
254+
#
255+
# NB: This cannot specify BLAS=FLAME directly, because PyTorch (v2.3.0)'s code
256+
# to explicitly choose a BLAS implementation is missing that option
257+
# (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Dependencies.cmake#L195-L266),
258+
# and using BLAS=blis makes it ignore the libflame LAPACK library, because
259+
# that triggers its FindBLIS logic rather than FindBLAS, and FindLAPACK depends
260+
# on a variable set only during FindBLAS (BLAS_INFO=FLAME)
261+
# (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindLAPACK.cmake#L176-L189).
262+
# Thus, we have to force it to use its generic FindBLAS logic,
263+
# and narrow it down from there by specifying WITH_BLAS=FLAME
264+
# (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271).
265+
# Without WITH_BLAS, it would detect the BLAS implementation as
266+
# BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either.
193267
RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
194268
--mount=type=cache,target=/ccache \
195269
export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
@@ -208,16 +282,16 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
208282
UCC_HOME=${HPCX_UCC_DIR} UCX_HOME=${HPCX_UCX_DIR} \
209283
USE_NCCL_WITH_UCC=1 \
210284
USE_UCC=1 USE_SYSTEM_UCC=1; fi; } && \
211-
USE_OPENCV=1 \
212285
BUILD_TORCH=ON \
213286
BUILD_TEST=0 \
214287
CUDA_HOST_COMPILER=cc \
215288
USE_CUDA=1 \
216289
USE_NNPACK=1 \
217290
CC=cc \
218291
CXX=c++ \
219-
USE_EIGEN_FOR_BLAS=ON \
220-
USE_MKL=OFF \
292+
USE_BLAS=1 \
293+
USE_LAPACK=1 \
294+
WITH_BLAS=FLAME \
221295
PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \
222296
PYTORCH_BUILD_NUMBER=0 \
223297
TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
@@ -254,8 +328,6 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi
254328
USE_NNPACK=1 \
255329
CC=cc \
256330
CXX=c++ \
257-
USE_EIGEN_FOR_BLAS=ON \
258-
USE_MKL=OFF \
259331
BUILD_VERSION="$(../version-string.sh "$TORCH_VISION_VERSION")" \
260332
TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
261333
python3 setup.py bdist_wheel --dist-dir ../dist
@@ -290,8 +362,6 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/
290362
USE_NNPACK=1 \
291363
CC=cc \
292364
CXX=c++ \
293-
USE_EIGEN_FOR_BLAS=ON \
294-
USE_MKL=OFF \
295365
BUILD_VERSION="$(../version-string.sh "$TORCH_AUDIO_VERSION")" \
296366
TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
297367
python3 setup.py bdist_wheel --dist-dir ../dist
@@ -304,14 +374,16 @@ ENV DEBIAN_FRONTEND=noninteractive
304374
# Install core packages
305375
# Installs the runtime packages, upgrades pip, and registers the `python`,
# `pip` and `vim` alternatives. The changed lines add libomp5, move
# `apt-get clean` to directly after the package install, create an
# unversioned libomp.so symlink pointing at libomp.so.5, and run ldconfig
# afterwards.
# NOTE(review): the symlink path hard-codes the x86_64 multiarch directory;
# confirm that no other architecture is built from this file.
RUN apt-get -qq update && apt-get -qq install -y \
306376
libncurses5 python3 python3-pip python3-distutils \
307-
libpng16-16 libjpeg-turbo8 libsodium23 \
377+
libomp5 libpng16-16 libjpeg-turbo8 libsodium23 \
308378
curl git apt-utils ssh ca-certificates tmux nano vim-tiny sudo bash \
309379
rsync htop wget unzip tini && \
380+
apt-get clean && \
310381
/usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
311382
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
312383
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
313384
update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \
314-
apt-get clean
385+
ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \
386+
ldconfig
315387

316388
RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
317389
software-properties-common && \
@@ -323,6 +395,15 @@ RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
323395
} && \
324396
{ SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; }
325397

398+
# AOCL-BLAS and AOCL-LAPACK, copied in from the aocl-downloader stage.
# Reference: https://www.amd.com/en/developer/aocl/dense.html
ARG AOCL_BASE
COPY --from=aocl-downloader "${AOCL_BASE}" "${AOCL_BASE}"

# Register the AOCL library directory with the dynamic linker: place
# aocl.conf in ld.so.conf.d, then rebuild the linker cache.
RUN install -t /etc/ld.so.conf.d -m 0644 "${AOCL_BASE}/aocl.conf" \
    && ldconfig
406+
326407
ARG BUILD_TORCH_VERSION
327408
ARG BUILD_TORCH_VISION_VERSION
328409
ARG BUILD_TORCH_AUDIO_VERSION

0 commit comments

Comments
 (0)