
Commit dade1e2

Add dockerfile (#41)
1 parent a9bc5fd commit dade1e2

4 files changed: 81 additions and 3 deletions
Dockerfile

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
FROM nvidia/cuda:11.6.0-devel-ubi8 as cuda

ENV PORT=5000

WORKDIR /src

FROM cuda as conda

# taken from pytorch's dockerfile
RUN curl -L -o ./miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    chmod +x ./miniconda.sh && \
    ./miniconda.sh -b -p /opt/conda && \
    rm ./miniconda.sh

ENV PYTHON_VERSION=3.9 \
    PATH=/opt/conda/envs/inference/bin:/opt/conda/bin:${PATH}

# create conda env
RUN conda create -n inference python=${PYTHON_VERSION} pip -y

# change shell to activate env
SHELL ["conda", "run", "-n", "inference", "/bin/bash", "-c"]

FROM conda as conda_env

# update conda
RUN conda update -n base -c defaults conda -y

COPY Makefile Makefile
COPY LICENSE LICENSE

# install the Python dependencies
RUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \
    transformers \
    deepspeed==0.7.5 \
    deepspeed-mii==0.0.2 \
    accelerate \
    gunicorn \
    flask \
    flask_api \
    pydantic \
    huggingface_hub \
    grpcio-tools==1.50.0 \
    --no-cache-dir

# copy the server code and compile the gRPC protos
COPY inference_server inference_server

RUN make gen-proto

# clean conda env
RUN conda clean -ya

EXPOSE ${PORT}

# change this as you like 🤗
ENV TRANSFORMERS_CACHE=/transformers_cache/ \
    HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE} \
    HOME=/homedir

# Runs as arbitrary user in OpenShift
RUN mkdir ${HOME} && chmod g+wx ${HOME} && \
    mkdir tmp && chmod -R g+w tmp
# RUN chmod g+w Makefile

CMD make bloom-176b
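
For reference, a minimal sketch of how this image might be built and run (not part of the commit: the image tag, host cache path, and port mapping are illustrative assumptions, and --gpus relies on the NVIDIA container toolkit being installed on the host):

# build the image from the repository root
docker build -t bloom-inference-server .

# run with all GPUs visible and a host directory mounted at the cache location configured above
docker run --gpus all \
    -v /data/transformers_cache:/transformers_cache \
    -p 5000:5000 \
    bloom-inference-server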

Makefile

Lines changed: 14 additions & 2 deletions
@@ -1,5 +1,5 @@
 gen-proto:
-	pip install grpcio-tools==1.50.0
+	pip install grpcio-tools==1.50.0 --no-cache-dir
 
 	mkdir -p inference_server/model_handler/grpc_utils/pb
 
@@ -13,6 +13,18 @@ gen-proto:
 	rm -rf inference_server/model_handler/grpc_utils/pb/*.py-e
 
 bloom-176b:
+	TOKENIZERS_PARALLELISM=false \
+	MODEL_NAME=bigscience/bloom \
+	MODEL_CLASS=AutoModelForCausalLM \
+	DEPLOYMENT_FRAMEWORK=ds_inference \
+	DTYPE=fp16 \
+	MAX_INPUT_LENGTH=2048 \
+	MAX_BATCH_SIZE=4 \
+	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
+
+# loads faster than the above one
+microsoft-bloom-176b:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \
 	MODEL_CLASS=AutoModelForCausalLM \
@@ -34,7 +46,7 @@ bloomz-176b:
 	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
 
-bloomz-176b-int8:
+bloom-176b-int8:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \
 	MODEL_CLASS=AutoModelForCausalLM \
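
Each target above only sets the model configuration through environment variables and then launches the gunicorn server, so a typical session (assuming the dependencies installed in the Dockerfile are available) might look like:

# compile the gRPC protos once
make gen-proto
# launch the fp16 server on port 5000 (per the comment in the diff, this checkpoint loads faster than bloom-176b)
make microsoft-bloom-176b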

inference_server/README.md

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ python -m inference_server.cli --model_name microsoft/bloom-deepspeed-inference-
 
 #### BLOOM server deployment
 
-[make <model_name>](../Makefile) can be used to launch a generation server. Please note that the serving method is synchronous and users have to wait in queue until the preceding requests have been processed.
+[make <model_name>](../Makefile) can be used to launch a generation server. Please note that the serving method is synchronous and users have to wait in queue until the preceding requests have been processed. An example showing how to send requests to the server is given [here](../server_request.py).
 
 #### Benchmark system for BLOOM inference
 
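Regarding the request example referenced above: the exact endpoint and payload live in server_request.py, which is not shown in this diff. A rough illustration, where the route and JSON fields are assumptions rather than details taken from this commit, might look like:

# hypothetical request shape; consult server_request.py for the real payload
curl -X POST http://127.0.0.1:5000/generate/ \
    -H "Content-Type: application/json" \
    -d '{"text": ["DeepSpeed is"], "max_new_tokens": 40}'
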
File renamed without changes.
