
Commit c785a2c

multiple BLAS backends support (#17)
1 parent d37e921 commit c785a2c

File tree

3 files changed: +27 -2 lines changed

.github/workflows/publish-release.yml

Lines changed: 17 additions & 0 deletions
@@ -11,6 +11,19 @@ on:
 jobs:
   push_to_dockerhub:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - suffix:
+            cmake_args: ""
+          - suffix: -openblas
+            cmake_args: "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+          - suffix: -cublas
+            cmake_args: "-DLLAMA_CUBLAS=on"
+          - suffix: -clblast
+            cmake_args: "-DLLAMA_CLBLAST=on"
+          - suffix: -hipblas
+            cmake_args: "-DLLAMA_HIPBLAS=on"
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -32,6 +45,8 @@ jobs:
         uses: docker/metadata-action@v4
         with:
           images: 1b5d/llm-api
+          flavor: |
+            suffix=${{ matrix.suffix }},onlatest=true
 
       - name: Build and push
         uses: docker/build-push-action@v4
@@ -43,6 +58,8 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=1b5d/llm-api:latest
           cache-to: type=inline
+          build-args: |
+            "CMAKE_ARGS=${{ matrix.cmake_args }}"
 
   push_gpu_to_dockerhub:
     runs-on: ubuntu-latest
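With `onlatest=true` in the metadata flavor, each matrix suffix is appended to the `latest` tag as well, so this matrix publishes one image per BLAS backend. A quick sketch of pulling the resulting variants (tag names match the README section updated below):

```
# Tags produced by the build matrix: one per BLAS backend, suffixed onto "latest"
docker pull 1b5d/llm-api:latest            # default build, no BLAS backend
docker pull 1b5d/llm-api:latest-openblas   # OpenBLAS
docker pull 1b5d/llm-api:latest-cublas     # cuBLAS
docker pull 1b5d/llm-api:latest-clblast    # CLBlast
docker pull 1b5d/llm-api:latest-hipblas    # hipBLAS
```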

Dockerfile

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@ WORKDIR /llm-api
 
 COPY ./requirements.txt /llm-api/requirements.txt
 ENV FORCE_CMAKE "1"
-ENV CMAKE_ARGS "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+ARG CMAKE_ARGS
+ENV CMAKE_ARGS=${CMAKE_ARGS:-""}
 
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
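Because `CMAKE_ARGS` is now a build argument with an empty default, a specific backend can also be built locally rather than pulled from Docker Hub. A minimal sketch, assuming a locally chosen tag name:

```
# Build a cuBLAS-enabled image locally; the Dockerfile forwards CMAKE_ARGS to
# the pip install step (FORCE_CMAKE=1 is already set in the image).
docker build --build-arg CMAKE_ARGS="-DLLAMA_CUBLAS=on" -t llm-api:local-cublas .
```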

README.md

Lines changed: 8 additions & 1 deletion
@@ -186,6 +186,13 @@ model_params:
 
 Ensure to specify the repo_id and filename parameters to point to a Hugging Face repository where the desired model is hosted. The application will then handle the download for you.
 
+Running in this mode can be done using the docker image `1b5d/llm-api:latest`, several images are also available to support different BLAS backends:
+- OpenBLAS: `1b5d/llm-api:latest-openblas`
+- cuBLAS: `1b5d/llm-api:latest-cublas`
+- CLBlast: `1b5d/llm-api:latest-clblast`
+- hipBLAS: `1b5d/llm-api:latest-hipblas`
+
+
 The following example demonstrates the various parameters that can be sent to the Llama generate and agenerate endpoints:
 
 ```
@@ -246,7 +253,7 @@ docker compose -f docker-compose.gpu.yaml up
 **Important Note**: Before running Llama or Llama 2 on GPU, make sure to install the [NVIDIA Driver](https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html) on your host machine. You can verify the NVIDIA environment by executing the following command:
 
 ```
-docker run --rm --gpus all nvidia/cuda:11.7.1-base-ubuntu20.04 nvidia-smi
+docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu20.04 nvidia-smi
 ```
 
 You should see a table displaying the current NVIDIA driver version and related information, confirming the proper setup.
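The suffixed tags are built from the same Dockerfile and differ only in the BLAS backend compiled in, so they can stand in for `1b5d/llm-api:latest`. A hedged sketch of running the OpenBLAS variant; the port and config mount path are assumptions, not taken from this diff:

```
# Assumed invocation: mount a config file into the image's /llm-api workdir,
# expose the API port, and use the OpenBLAS-backed tag instead of plain latest.
docker run --rm \
  -v "$PWD/config.yaml:/llm-api/config.yaml" \
  -p 8000:8000 \
  1b5d/llm-api:latest-openblas
```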
