
Commit 467a303

Merge pull request #70 from coreweave/es/vllm-tensorizer
build(vllm-tensorizer): Compile `vllm-flash-attn` from source
2 parents 93acafd + 52fea59 · commit 467a303

2 files changed: +32 −7 lines

.github/workflows/vllm-tensorizer.yml

Lines changed: 5 additions & 1 deletion
@@ -4,6 +4,9 @@ on:
     commit:
       description: 'Commit to build'
       required: true
+    vllm-flash-attn-version:
+      description: 'vllm-flash-attn version to build'
+      required: true
   push:
     paths:
       - "vllm-tensorizer/**"
@@ -20,4 +23,5 @@ jobs:
       folder: vllm-tensorizer
       tag-suffix: ${{ inputs.commit || '51602eefd38250325e541abd28f051ffd7676c3f'}}
       build-args: |
-        COMMIT_HASH=${{ inputs.commit || '51602eefd38250325e541abd28f051ffd7676c3f'}}
+        COMMIT_HASH=${{ inputs.commit || '51602eefd38250325e541abd28f051ffd7676c3f'}}
+        VLLM_FLASH_ATTN_VERSION=${{ inputs.vllm-flash-attn-version || '2.5.9'}}
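A minimal sketch of dispatching this workflow by hand with the GitHub CLI; the commit hash shown is just the pinned fallback from the diff, and both inputs must be supplied for a manual run since they are marked required:

    # Trigger the vllm-tensorizer workflow manually, passing both inputs.
    # Values are illustrative; adjust the commit and version as needed.
    gh workflow run vllm-tensorizer.yml \
      -f commit=51602eefd38250325e541abd28f051ffd7676c3f \
      -f vllm-flash-attn-version=2.5.9

On push-triggered runs no dispatch inputs exist, so the `|| '…'` fallbacks in the workflow expressions supply the pinned commit and the 2.5.9 default instead.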

vllm-tensorizer/Dockerfile

Lines changed: 27 additions & 6 deletions
@@ -47,17 +47,38 @@ RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
     git submodule update --init --recursive --jobs 8 \
         --depth 1 --filter=blob:none
 
+FROM alpine/git:2.36.3 as vllm-flash-attn-downloader
+WORKDIR /git
+ARG VLLM_FLASH_ATTN_VERSION
+RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
+        https://github.com/vllm-project/flash-attention.git && \
+    cd flash-attention && \
+    git checkout "v${VLLM_FLASH_ATTN_VERSION}" && \
+    git submodule update --init --recursive --jobs 8 \
+        --depth 1 --filter=blob:none
+
 FROM builder-base as vllm-builder
 WORKDIR /workspace
-RUN --mount=type=bind,from=vllm-downloader,source=/git/vllm,target=/workspace,rw \
-    --mount=type=bind,from=freezer,target=/tmp/frozen,rw \
-    /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/frozen/constraints.txt && \
-    LIBRARY_PATH="/usr/local/cuda/lib64/stubs${LIBRARY_PATH:+:$LIBRARY_PATH}" \
-    python3 -m pip wheel -w /wheels \
+
+ENV LIBRARY_PATH="/usr/local/cuda/lib64/stubs${LIBRARY_PATH:+:$LIBRARY_PATH}"
+
+RUN --mount=type=bind,from=freezer,target=/tmp/frozen,rw \
+    /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/constraints.txt
+
+RUN --mount=type=bind,from=vllm-flash-attn-downloader,source=/git/flash-attention,target=/workspace,rw \
+    python3 -m pip wheel -w /wheels \
         -v --no-cache-dir --no-build-isolation --no-deps \
-        -c /tmp/frozen/constraints.txt \
+        -c /tmp/constraints.txt \
         ./
 
+RUN --mount=type=bind,from=vllm-downloader,source=/git/vllm,target=/workspace,rw \
+    pip3 install /wheels/*.whl && \
+    python3 -m pip wheel -w /wheels \
+        -v --no-cache-dir --no-build-isolation --no-deps \
+        -c /tmp/constraints.txt \
+        ./ && \
+    pip3 uninstall -y vllm-flash-attn
+
 WORKDIR /wheels
 
 FROM ${BASE_IMAGE} as base
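The reworked `vllm-builder` stage now builds the `vllm-flash-attn` wheel from the checked-out tag first, installs it so the subsequent vLLM wheel build can compile against it, then uninstalls it so only the wheels under `/wheels` carry forward. A minimal local-build sketch, assuming the build context is the `vllm-tensorizer/` folder and that any other declared `ARG`s (for example `BASE_IMAGE`) either have defaults or are passed the same way; the image tag is hypothetical:

    # Build the image locally; the build args mirror what the workflow passes.
    docker build \
      --build-arg COMMIT_HASH=51602eefd38250325e541abd28f051ffd7676c3f \
      --build-arg VLLM_FLASH_ATTN_VERSION=2.5.9 \
      -t vllm-tensorizer:local \
      vllm-tensorizer/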
