
Commit dade1e2

Add dockerfile (#41)
1 parent a9bc5fd commit dade1e2

4 files changed: 81 additions and 3 deletions
Dockerfile

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
FROM nvidia/cuda:11.6.0-devel-ubi8 as cuda

ENV PORT=5000

WORKDIR /src

FROM cuda as conda

# taken from pytorch's dockerfile
RUN curl -L -o ./miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    chmod +x ./miniconda.sh && \
    ./miniconda.sh -b -p /opt/conda && \
    rm ./miniconda.sh

ENV PYTHON_VERSION=3.9 \
    PATH=/opt/conda/envs/inference/bin:/opt/conda/bin:${PATH}

# create conda env
RUN conda create -n inference python=${PYTHON_VERSION} pip -y

# change shell to activate env
SHELL ["conda", "run", "-n", "inference", "/bin/bash", "-c"]

FROM conda as conda_env

# update conda
RUN conda update -n base -c defaults conda -y

COPY Makefile Makefile
COPY LICENSE LICENSE

# install the Python dependencies
RUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \
    transformers \
    deepspeed==0.7.5 \
    deepspeed-mii==0.0.2 \
    accelerate \
    gunicorn \
    flask \
    flask_api \
    pydantic \
    huggingface_hub \
    grpcio-tools==1.50.0 \
    --no-cache-dir

# copy the server code and compile the gRPC protos
COPY inference_server inference_server

RUN make gen-proto

# clean conda env
RUN conda clean -ya

EXPOSE ${PORT}

# change this as you like 🤗
ENV TRANSFORMERS_CACHE=/transformers_cache/ \
    HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE} \
    HOME=/homedir

# Runs as arbitrary user in OpenShift
RUN mkdir ${HOME} && chmod g+wx ${HOME} && \
    mkdir tmp && chmod -R g+w tmp
# RUN chmod g+w Makefile

CMD make bloom-176b
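
For reference, a minimal sketch of how this image might be built and run (not part of the commit: the image tag, host cache path, and port mapping are illustrative assumptions, and --gpus relies on the NVIDIA container toolkit being installed on the host):

# build the image from the repository root
docker build -t bloom-inference-server .

# run with all GPUs visible and a host directory mounted at the cache location configured above
docker run --gpus all \
    -v /data/transformers_cache:/transformers_cache \
    -p 5000:5000 \
    bloom-inference-server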

Makefile

Lines changed: 14 additions & 2 deletions
@@ -1,5 +1,5 @@
 gen-proto:
-	pip install grpcio-tools==1.50.0
+	pip install grpcio-tools==1.50.0 --no-cache-dir
 
 	mkdir -p inference_server/model_handler/grpc_utils/pb
 
@@ -13,6 +13,18 @@ gen-proto:
 	rm -rf inference_server/model_handler/grpc_utils/pb/*.py-e
 
 bloom-176b:
+	TOKENIZERS_PARALLELISM=false \
+	MODEL_NAME=bigscience/bloom \
+	MODEL_CLASS=AutoModelForCausalLM \
+	DEPLOYMENT_FRAMEWORK=ds_inference \
+	DTYPE=fp16 \
+	MAX_INPUT_LENGTH=2048 \
+	MAX_BATCH_SIZE=4 \
+	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
+
+# loads faster than the above one
+microsoft-bloom-176b:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \
 	MODEL_CLASS=AutoModelForCausalLM \
@@ -34,7 +46,7 @@ bloomz-176b:
 	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
 
-bloomz-176b-int8:
+bloom-176b-int8:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \
 	MODEL_CLASS=AutoModelForCausalLM \
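
Each target above only sets the model configuration through environment variables and then launches the gunicorn server, so a typical session (assuming the dependencies installed in the Dockerfile are available) might look like:

# compile the gRPC protos once
make gen-proto
# launch the fp16 server on port 5000 (per the comment in the diff, this checkpoint loads faster than bloom-176b)
make microsoft-bloom-176b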

inference_server/README.md

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ python -m inference_server.cli --model_name microsoft/bloom-deepspeed-inference-
 
 #### BLOOM server deployment
 
-[make <model_name>](../Makefile) can be used to launch a generation server. Please note that the serving method is synchronous and users have to wait in queue until the preceding requests have been processed.
+[make <model_name>](../Makefile) can be used to launch a generation server. Please note that the serving method is synchronous and users have to wait in queue until the preceding requests have been processed. An example showing how to send requests to the server is given [here](../server_request.py).
 
 #### Benchmark system for BLOOM inference
 
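Regarding the request example referenced above: the exact endpoint and payload live in server_request.py, which is not shown in this diff. A rough illustration, where the route and JSON fields are assumptions rather than details taken from this commit, might look like:

# hypothetical request shape; consult server_request.py for the real payload
curl -X POST http://127.0.0.1:5000/generate/ \
    -H "Content-Type: application/json" \
    -d '{"text": ["DeepSpeed is"], "max_new_tokens": 40}'
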
File renamed without changes.
