This repository was archived by the owner on Oct 9, 2024. It is now read-only.

Commit 134b703

fix num_generated_tokens
drop mii
1 parent 96cfc46 commit 134b703

14 files changed: +135 additions, −506 deletions

Dockerfile

Lines changed: 34 additions & 33 deletions
@@ -1,10 +1,6 @@
-FROM nvidia/cuda:11.6.0-devel-ubi8 as cuda
+FROM nvidia/cuda:11.6.1-runtime-ubi8 as base
 
-ENV PORT=5000
-
-WORKDIR /src
-
-FROM cuda as conda
+RUN dnf install -y --disableplugin=subscription-manager make git && dnf clean all --disableplugin=subscription-manager
 
 # taken form pytorch's dockerfile
 RUN curl -L -o ./miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
@@ -21,47 +17,52 @@ RUN conda create -n inference python=${PYTHON_VERSION} pip -y
 # change shell to activate env
 SHELL ["conda", "run", "-n", "inference", "/bin/bash", "-c"]
 
-FROM conda as conda_env
+FROM base as conda
+
+# update conda
+RUN conda update -n base -c defaults conda -y
+# cmake
+RUN conda install -c anaconda cmake -y
 
 # update conda
 RUN conda update -n base -c defaults conda -y
 
 # necessary stuff
 RUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \
-	transformers \
-	deepspeed==0.7.5 \
-	deepspeed-mii==0.0.2 \
-	accelerate \
-	gunicorn \
+	transformers==4.25.1 \
+	deepspeed==0.8.0 \
+	accelerate==0.15.0 \
+	gunicorn==20.1.0 \
 	flask \
-	flask_api \
-	pydantic \
-	huggingface_hub \
+	flask_api \
+	fastapi==0.89.1 \
+	uvicorn==0.19.0 \
+	jinja2==3.1.2 \
+	pydantic==1.10.2 \
+	huggingface_hub==0.10.1 \
 	grpcio-tools==1.50.0 \
 	--no-cache-dir
 
-# copy the code
-COPY inference_server inference_server
-COPY Makefile Makefile
-COPY LICENSE LICENSE
-
-# install grpc and compile protos
-RUN make gen-proto
-
 # clean conda env
 RUN conda clean -ya
 
-EXPOSE ${PORT}
-
 # change this as you like 🤗
-ENV TRANSFORMERS_CACHE=/transformers_cache/ \
-	HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE} \
-	HOME=/homedir
+ENV TRANSFORMERS_CACHE=/cos/HF_cache \
+	HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE}
 
-RUN mkdir ${HOME} && chmod g+wx ${HOME} && \
-	mkdir tmp && chmod -R g+w tmp
+FROM conda as app
 
-# for debugging
-# RUN chmod -R g+w inference_server && chmod g+w Makefile
+WORKDIR /src
+RUN chmod -R g+w /src
 
-CMD make bloom-176b
+ENV PORT=5000 \
+	UI_PORT=5001
+EXPOSE ${PORT}
+EXPOSE ${UI_PORT}
+
+CMD git clone https://github.com/huggingface/transformers-bloom-inference.git && \
+	cd transformers-bloom-inference && \
+	# install grpc and compile protos
+	make gen-proto && \
+	make ui && \
+	make bloom-560m

Makefile

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,4 @@
 gen-proto:
-	pip install grpcio-tools==1.50.0 --no-cache-dir
-
 	mkdir -p inference_server/model_handler/grpc_utils/pb
 
 	python -m grpc_tools.protoc -Iinference_server/model_handler/grpc_utils/proto --python_out=inference_server/model_handler/grpc_utils/pb --grpc_python_out=inference_server/model_handler/grpc_utils/pb inference_server/model_handler/grpc_utils/proto/generation.proto
@@ -100,3 +98,6 @@ codegen-mono:
 	MAX_BATCH_SIZE=4 \
 	CUDA_VISIBLE_DEVICES=0 \
 	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
+
+ui:
+	python -m ui &
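For reference, the protoc invocation in the gen-proto target above can also be driven from Python through grpc_tools; a minimal sketch (not part of this commit, paths taken from the Makefile):

# Sketch: programmatic equivalent of the `gen-proto` Makefile target.
import os
from grpc_tools import protoc

proto_dir = "inference_server/model_handler/grpc_utils/proto"
pb_dir = "inference_server/model_handler/grpc_utils/pb"
os.makedirs(pb_dir, exist_ok=True)

exit_code = protoc.main([
    "grpc_tools.protoc",           # argv[0] placeholder expected by protoc.main
    f"-I{proto_dir}",
    f"--python_out={pb_dir}",
    f"--grpc_python_out={pb_dir}",
    f"{proto_dir}/generation.proto",
])
assert exit_code == 0, "protoc failed"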

inference_server/constants.py

Lines changed: 0 additions & 4 deletions
@@ -3,8 +3,4 @@
 DS_INFERENCE = "ds_inference"
 DS_ZERO = "ds_zero"
 
-# model weights
-DS_INFERENCE_BLOOM_FP16 = "microsoft/bloom-deepspeed-inference-fp16"
-DS_INFERENCE_BLOOM_INT8 = "microsoft/bloom-deepspeed-inference-int8"
-
 # GRPC_MAX_MSG_SIZE = 2**30 # 1GB

inference_server/download_model.py

Lines changed: 12 additions & 2 deletions
@@ -1,6 +1,7 @@
 import argparse
 
-from .models import get_downloaded_model_path
+from inference_server.models import get_hf_model_class
+from transformers import AutoConfig, AutoTokenizer
 
 
 def get_args() -> argparse.Namespace:
@@ -12,6 +13,12 @@ def get_args() -> argparse.Namespace:
         required=True,
         help="model to use",
     )
+    parser.add_argument(
+        "--model_class",
+        type=str,
+        required=True,
+        help="model class to use",
+    )
 
     args = parser.parse_args()
 
@@ -20,7 +27,10 @@ def get_args() -> argparse.Namespace:
 
 def main() -> None:
     args = get_args()
-    get_downloaded_model_path(args.model_name)
+    print("downloading", args.model_name)
+    AutoConfig.from_pretrained(args.model_name)
+    AutoTokenizer.from_pretrained(args.model_name)
+    get_hf_model_class(args.model_class).from_pretrained(args.model_name)
 
 
 if __name__ == "__main__":
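get_hf_model_class is imported above but its body is not part of this diff; a plausible minimal implementation (an assumption, not taken from the commit) simply resolves the class name against the transformers namespace:

# Hypothetical sketch of get_hf_model_class (not shown in this commit).
import transformers

def get_hf_model_class(model_class: str):
    # e.g. "AutoModelForCausalLM" -> transformers.AutoModelForCausalLM
    return getattr(transformers, model_class)

# usage: get_hf_model_class("AutoModelForCausalLM").from_pretrained("bigscience/bloom-560m")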

inference_server/model_handler/deployment.py

Lines changed: 24 additions & 5 deletions
@@ -9,11 +9,9 @@
 from typing import List
 
 import grpc
-from mii.server_client import MIIServerClient
-from transformers import AutoTokenizer
 
 from ..constants import DS_INFERENCE, DS_ZERO
-from ..models import get_downloaded_model_path, get_model_class, load_tokenizer
+from ..models import get_model_class, load_tokenizer
 from ..utils import (
     GenerateResponse,
     TokenizeRequest,
@@ -25,14 +23,14 @@
 from .grpc_utils.pb import generation_pb2, generation_pb2_grpc
 
 
-class ModelDeployment(MIIServerClient):
+class ModelDeployment:
     def __init__(self, args: argparse.Namespace, use_grpc_server: bool = False, cuda_visible_devices: List[int] = [0]):
         self.cuda_visible_devices = cuda_visible_devices
         self.num_gpus = len(self.cuda_visible_devices)
         self.use_grpc_server = use_grpc_server
 
         if self.use_grpc_server:
-            self.tokenizer = load_tokenizer(get_downloaded_model_path(args.model_name))
+            self.tokenizer = load_tokenizer(args.model_name)
 
             self.initialize_ports()
 
@@ -57,6 +55,27 @@ def initialize_ports(self):
         for i in range(self.num_gpus):
             self.ports.append(50950 + self.cuda_visible_devices[i])
 
+    def _is_socket_open(self, port):
+        import socket
+
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        result = sock.connect_ex(("0.0.0.0", port))
+        sock.close()
+        return result == 0
+
+    def _is_server_process_alive(self):
+        if self.process is None:
+            return True
+        try:
+            self.process.wait(1)
+        except subprocess.TimeoutExpired as err:
+            # timeout means we're still running and all (probably) okay
+            is_alive = True
+        else:
+            # no exception case
+            is_alive = False
+        return is_alive
+
     def _wait_until_server_is_live(self):
         sockets_open = False
         while not sockets_open:
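The body of _wait_until_server_is_live is cut off by the hunk above; a hedged sketch of how it presumably ties the two new helpers together (an assumption, not part of the diff):

# Hypothetical continuation of ModelDeployment._wait_until_server_is_live (not shown above).
import time

def _wait_until_server_is_live(self):
    sockets_open = False
    while not sockets_open:
        # keep polling until every per-GPU gRPC port accepts connections
        sockets_open = all(self._is_socket_open(port) for port in self.ports)
        if not self._is_server_process_alive():
            raise RuntimeError("server process died before its gRPC ports opened")
        time.sleep(1)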

inference_server/models/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE
-from .model import Model, get_downloaded_model_path, load_tokenizer
+from .model import Model, get_hf_model_class, load_tokenizer
 
 
 def get_model_class(deployment_framework: str):
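Only the signature of get_model_class appears in this hunk; a plausible sketch of the dispatcher it implies (the DS_INFERENCE class name is assumed, not shown in this commit):

# Hypothetical body of get_model_class (signature taken from the diff, body assumed).
def get_model_class(deployment_framework: str):
    if deployment_framework == HF_ACCELERATE:
        from .hf_accelerate import HFAccelerateModel
        return HFAccelerateModel
    elif deployment_framework == DS_INFERENCE:
        from .ds_inference import DSInferenceModel  # class name assumed
        return DSInferenceModel
    elif deployment_framework == DS_ZERO:
        from .ds_zero import DSZeROModel
        return DSZeROModel
    raise ValueError(f"unknown deployment framework: {deployment_framework}")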

inference_server/models/ds_inference.py

Lines changed: 22 additions & 13 deletions
@@ -9,10 +9,11 @@
 import torch.distributed as dist
 
 import deepspeed
-from transformers import AutoConfig, AutoTokenizer
+from huggingface_hub import try_to_load_from_cache
+from transformers import AutoConfig
 
 from ..utils import print_rank_n, run_rank_n
-from .model import Model, get_downloaded_model_path, get_hf_model_class, load_tokenizer
+from .model import Model, get_hf_model_class
 
 
 # basic DeepSpeed inference model class for benchmarking
@@ -24,26 +25,22 @@ def __init__(self, args: Namespace) -> None:
 
         world_size = int(os.getenv("WORLD_SIZE", "1"))
 
-        downloaded_model_path = get_downloaded_model_path(args.model_name)
-
-        self.tokenizer = load_tokenizer(downloaded_model_path)
-        self.pad = self.tokenizer.pad_token_id
-
         # create dummy tensors for allocating space which will be filled with
         # the actual weights while calling deepspeed.init_inference in the
        # following code
         with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
             self.model = get_hf_model_class(args.model_class).from_config(
-                AutoConfig.from_pretrained(downloaded_model_path), torch_dtype=torch.bfloat16
+                AutoConfig.from_pretrained(args.model_name), torch_dtype=torch.bfloat16
             )
         self.model = self.model.eval()
 
+        downloaded_model_path = get_model_path(args.model_name)
+
         if args.dtype in [torch.float16, torch.int8]:
             # We currently support the weights provided by microsoft (which are
             # pre-sharded)
-            if args.use_pre_sharded_checkpoints:
-                checkpoints_json = os.path.join(downloaded_model_path, "ds_inference_config.json")
-
+            checkpoints_json = os.path.join(downloaded_model_path, "ds_inference_config.json")
+            if os.path.isfile(checkpoints_json):
                 self.model = deepspeed.init_inference(
                     self.model,
                     mp_size=world_size,
@@ -74,6 +71,8 @@ def __init__(self, args: Namespace) -> None:
         print_rank_n("Model loaded")
         dist.barrier()
 
+        self.post_init(args.model_name)
+
 
 
 class TemporaryCheckpointsJSON:
@@ -91,5 +90,15 @@ def __enter__(self):
         run_rank_n(partial(self.write_checkpoints_json, model_path=self.model_path), barrier=True)
         return self.tmp_file
 
-    def __exit__(self, type, value, traceback):
-        return
+
+def get_model_path(model_name: str):
+    config_file = "config.json"
+
+    # will fall back to HUGGINGFACE_HUB_CACHE
+    config_path = try_to_load_from_cache(model_name, config_file, cache_dir=os.getenv("TRANSFORMERS_CACHE"))
+
+    if config_path is not None:
+        return os.path.dirname(config_path)
+    # treat the model name as an explicit model path
+    elif os.path.isfile(os.path.join(model_name, config_file)):
+        return model_name
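For context, try_to_load_from_cache only inspects the local cache and never downloads; a small usage sketch mirroring get_model_path above (the model name is just an example, taken from the constants removed in this commit):

# Sketch: resolving a locally cached snapshot directory, as get_model_path does above.
import os
from huggingface_hub import try_to_load_from_cache

model_name = "microsoft/bloom-deepspeed-inference-fp16"  # example only
config_path = try_to_load_from_cache(model_name, "config.json", cache_dir=os.getenv("TRANSFORMERS_CACHE"))

# newer huggingface_hub versions may also return a "cached non-existence" sentinel,
# so checking for a string path is the safest test in a standalone script
if isinstance(config_path, str):
    print("cached snapshot:", os.path.dirname(config_path))
else:
    print("config.json is not in the local cache; download it first (see download_model.py)")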

inference_server/models/ds_zero.py

Lines changed: 6 additions & 11 deletions
@@ -5,11 +5,11 @@
 import torch.distributed as dist
 
 import deepspeed
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoConfig
 from transformers.deepspeed import HfDeepSpeedConfig
 
 from ..utils import print_rank_n
-from .model import Model, get_downloaded_model_path, get_hf_model_class, load_tokenizer
+from .model import Model, get_hf_model_class
 
 
 class DSZeROModel(Model):
@@ -18,9 +18,7 @@ def __init__(self, args: Namespace) -> None:
 
         super().__init__(args)
 
-        downloaded_model_path = get_downloaded_model_path(args.model_name)
-
-        config = AutoConfig.from_pretrained(downloaded_model_path)
+        config = AutoConfig.from_pretrained(args.model_name)
 
         world_size = int(os.getenv("WORLD_SIZE", "1"))
         train_batch_size = 1 * world_size
@@ -54,12 +52,7 @@ def __init__(self, args: Namespace) -> None:
         # this tells from_pretrained to instantiate directly on gpus
         dschf = HfDeepSpeedConfig(ds_config)
 
-        self.tokenizer = load_tokenizer(downloaded_model_path)
-        self.pad = self.tokenizer.pad_token_id
-
-        self.model = get_hf_model_class(args.model_class).from_pretrained(
-            downloaded_model_path, torch_dtype=args.dtype
-        )
+        self.model = get_hf_model_class(args.model_class).from_pretrained(args.model_name, torch_dtype=args.dtype)
         self.model = self.model.eval()
 
         # convert model to a fully sharded model using ZeRO
@@ -74,3 +67,5 @@ def __init__(self, args: Namespace) -> None:
 
         print_rank_n("Model loaded")
         dist.barrier()
+
+        self.post_init(args.model_name)
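The ds_config dict and the ZeRO wrapping step sit outside the hunks above; a minimal standalone sketch of the same ZeRO stage-3 inference pattern (config values and model name are illustrative assumptions, not taken from the commit):

# Sketch: ZeRO stage-3 inference with an HfDeepSpeedConfig created before from_pretrained.
import os
import torch
import deepspeed
from transformers import AutoModelForCausalLM
from transformers.deepspeed import HfDeepSpeedConfig

model_name = "bigscience/bloom-560m"  # example only
world_size = int(os.getenv("WORLD_SIZE", "1"))

ds_config = {
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 3},
    "train_micro_batch_size_per_gpu": 1,
    "train_batch_size": 1 * world_size,
}

# must exist before from_pretrained so weights are instantiated directly on the GPU shards
dschf = HfDeepSpeedConfig(ds_config)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).eval()

# wrap the model in the ZeRO engine; deepspeed.initialize returns (engine, optimizer, dataloader, scheduler)
model = deepspeed.initialize(model=model, config_params=ds_config)[0]
model.module.eval()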

inference_server/models/hf_accelerate.py

Lines changed: 4 additions & 9 deletions
@@ -2,10 +2,8 @@
 
 import torch
 
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
 from ..utils import print_rank_n
-from .model import Model, get_downloaded_model_path, get_hf_model_class, load_tokenizer
+from .model import Model, get_hf_model_class
 
 
 class HFAccelerateModel(Model):
@@ -14,12 +12,7 @@ def __init__(self, args: Namespace) -> None:
 
         super().__init__(args)
 
-        downloaded_model_path = get_downloaded_model_path(args.model_name)
-
-        self.tokenizer = load_tokenizer(downloaded_model_path)
-        self.pad = self.tokenizer.pad_token_id
-
-        kwargs = {"pretrained_model_name_or_path": downloaded_model_path, "device_map": "auto"}
+        kwargs = {"pretrained_model_name_or_path": args.model_name, "device_map": "auto"}
 
         if len(args.cuda_visible_devices) > 1:
             kwargs["device_map"] = "balanced_low_0"
@@ -39,3 +32,5 @@ def __init__(self, args: Namespace) -> None:
         self.input_device = "cuda:0"
 
         print_rank_n("Model loaded")
+
+        self.post_init(args.model_name)
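For reference, the kwargs built above feed straight into from_pretrained; a minimal sketch of the same accelerate-backed loading outside the server (model name and dtype are illustrative, not part of the commit):

# Sketch: loading a model with accelerate's device_map, mirroring the kwargs above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "bigscience/bloom-560m"  # example only

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # or "balanced_low_0" when more than one GPU is visible
    torch_dtype=torch.bfloat16,  # illustrative; the server derives dtype from its args
).eval()

inputs = tokenizer("DeepSpeed is", return_tensors="pt").to("cuda:0")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))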

0 commit comments
