eth-easl
diff --git a/.gitignore
Lines changed: 2 additions & 0 deletions b/.gitignore
Lines changed: 2 additions & 0 deletions
diff --git a/Makefile
Lines changed: 4 additions & 0 deletions b/Makefile
Lines changed: 4 additions & 0 deletions
diff --git a/docker/Dockerfile.aarch64-cuda
Lines changed: 12 additions & 2 deletions b/docker/Dockerfile.aarch64-cuda
Lines changed: 12 additions & 2 deletions
diff --git a/docker/Dockerfile.x86_64-cuda
Lines changed: 12 additions & 5 deletions b/docker/Dockerfile.x86_64-cuda
Lines changed: 12 additions & 5 deletions
diff --git a/docker/build_image.sh
Lines changed: 2 additions & 2 deletions b/docker/build_image.sh
Lines changed: 2 additions & 2 deletions
diff --git a/docker/prometheus.yaml
Lines changed: 2 additions & 2 deletions b/docker/prometheus.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/docs/sources/toppings.md
Lines changed: 13 additions & 0 deletions b/docs/sources/toppings.md
Lines changed: 13 additions & 0 deletions
diff --git a/meta/requirements-extra.txt
Lines changed: 2 additions & 1 deletion b/meta/requirements-extra.txt
Lines changed: 2 additions & 1 deletion
diff --git a/meta/requirements.txt
Lines changed: 5 additions & 0 deletions b/meta/requirements.txt
Lines changed: 5 additions & 0 deletions
diff --git a/scratchpad/__init__.py
Lines changed: 1 addition & 0 deletions b/scratchpad/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,5 @@ pyrightconfig.json
 .local
 .vscode
 .zed
+.data
+*.ipynb
@@ -14,3 +14,11 @@ html-docs:
 	sphinx-build -M html docs/sources docs/build
 cli-docs:
 	typer scratchpad.cli.sp utils docs --title "CLI Reference" --name "scratchpad" --output docs/sources/cli.md
+# monitor-up and monitor-down name commands, not files; declare them phony so a file with either name cannot make them look up to date.
+.PHONY: monitor-up monitor-down
+# Bring up the compose stack defined in docker/monitor.yaml in detached mode.
+monitor-up:
+	docker compose -f docker/monitor.yaml up -d
+# Bring down the compose stack defined in docker/monitor.yaml.
+monitor-down:
+	docker compose -f docker/monitor.yaml down
@@ -8,16 +8,26 @@ LABEL org.opencontainers.image.architecture=arm64
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.10
 ENV DEBIAN_FRONTEND=noninteractive
+ENV TRITEIA_COMPUTE_CAP=90
+ENV TORCH_CUDA_ARCH_LIST="9.0"
+ENV FLASHINFER_ENABLE_AOT="1"
 
 RUN apt update && apt upgrade -y
 
 WORKDIR /scratchpad
 
 COPY . /scratchpad
 
-RUN pip install https://filedn.eu/lougUsdPvd1uJK2jfOYWogH/pypi/flashinfer-0.1.6-cp310-cp310-linux_aarch64.whl
-RUN pip install https://filedn.eu/lougUsdPvd1uJK2jfOYWogH/pypi/triteia-0.1.0-cp310-cp310-linux_aarch64.whl
+RUN git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \
+    cd flashinfer/python && \
+    pip install --no-build-isolation --verbose --editable .
+
+RUN git clone https://github.com/eth-easl/triteia.git && \
+    cd triteia && \
+    git submodule update --init --recursive && \
+    pip install -e .
 RUN pip install -r meta/requirements-extra.txt
 RUN pip install .
+
 # todo(xiaozhe): figure out why pynvml is installed in the first place. We should use nvidia-ml-py instead.
 RUN pip uninstall pynvml -y
@@ -1,22 +1,33 @@
-FROM nvcr.io/nvidia/pytorch:24.07-py3 AS base
+FROM nvcr.io/nvidia/pytorch:24.05-py3 AS base
 
 LABEL org.opencontainers.image.source=https://github.com/xiaozheyao/Scratchpad
 LABEL org.opencontainers.image.description="Scratchpad: Adaptive Serving of LMs"
 LABEL org.opencontainers.image.licenses=Apache-2.0
 LABEL org.opencontainers.image.architecture=amd64
 
-ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
 ENV DEBIAN_FRONTEND=noninteractive
+# NOTE(review): presumably build-time settings read by the triteia/flashinfer source builds below (compute capability 8.0, ahead-of-time flashinfer kernels) -- confirm.
+ENV TRITEIA_COMPUTE_CAP=80
+ENV TORCH_CUDA_ARCH_LIST="8.0"
+ENV FLASHINFER_ENABLE_AOT="1"
 
 RUN apt update && apt upgrade -y
 
 WORKDIR /scratchpad
 
 COPY . /scratchpad
 
-RUN pip install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/
-RUN pip install https://filedn.eu/lougUsdPvd1uJK2jfOYWogH/pypi/triteia-0.1.0-cp310-cp310-linux_x86_64.whl
+# Install flashinfer v0.1.6 from a recursive git clone as an editable package, without pip build isolation.
+RUN git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \
+    cd flashinfer/python && \
+    pip install --no-build-isolation --verbose --editable .
+
+# Install triteia (with its submodules) as an editable package. NOTE(review): no tag or commit is pinned, so the result depends on when the image is built.
+RUN git clone https://github.com/eth-easl/triteia.git && \
+    cd triteia && \
+    git submodule update --init --recursive && \
+    pip install -e .
 RUN pip install -r meta/requirements-extra.txt
 RUN pip install .
+# todo(xiaozhe): figure out why pynvml is installed in the first place. We should use nvidia-ml-py instead.
 RUN pip uninstall pynvml -y
@@ -8,5 +8,5 @@ if [ -z "$version" ]; then
     exit 1
 fi
 echo "Building image for $arch, version $version"
-$buildtool build -f docker/Dockerfile.$arch-cuda . -t ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch --build-arg ARCH=$arch
-$buildtool push ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch
+DOCKER_BUILDKIT=0 $buildtool build -f docker/Dockerfile.$arch-cuda . -t ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch --build-arg ARCH=$arch
+$buildtool push ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch
@@ -5,5 +5,5 @@ scrape_configs:
     - job_name: scratchpad
       static_configs:
           - targets:
-                # - 'host.docker.internal:8080'
-                - "172.25.4.12:8080"
+                - 'host.docker.internal:8080'
+                # - "172.25.4.12:8080"
@@ -0,0 +1,13 @@
+# Toppings
+
+```bash
+sp serve meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0 --port 8080 \
+--enable-system-controller \
+--use-heterogeneous-pool \
+--enable-toppings \
+--init-toppings lora:ketchup123/llama-3.2-1B-instruct-gsm8k:ketchup123/llama-3.2-1B-instruct-gsm8k,delta:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs-1,delta:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs-2 \
+--attention-backend triton \
+--sampling-backend pytorch \
+--max-toppings-per-batch 2 \
+--disable-cuda-graph
+```
@@ -1,4 +1,5 @@
 gguf
-pillow
+matplotlib
 prompt_toolkit
 openai
+faiss-cpu
@@ -15,3 +15,8 @@ python-multipart
 rich
 humanize
 prometheus_client
+tqdm
+einops
+pillow
+tenacity
+orjson
@@ -0,0 +1 @@
+__version__ = "0.1.4"
-Original file line number
+Diff line change
 .local
 .vscode
 .zed
 +.data
 +*.ipynb