File tree Expand file tree Collapse file tree 3 files changed +28
-1
lines changed
docker_compose/intel/cpu/xeon Expand file tree Collapse file tree 3 files changed +28
-1
lines changed Original file line number Diff line number Diff line change @@ -73,6 +73,13 @@ CPU example with Open Telemetry feature:
73
73
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
74
74
```
75
75
76
+ To enable Xeon optimizations such as AMX or Tensor Parallel for vLLM, the compose.perf.yaml file needs to be merged with the default compose.yaml file.
77
+ CPU example with optimized vLLM feature:
78
+
79
+ ``` bash
80
+ docker compose -f compose.yaml -f compose.perf.yaml up -d
81
+ ```
82
+
76
83
**Note**: developers should build docker image from source when:
77
84
78
85
- Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
Original file line number Diff line number Diff line change
1
+ # Copyright (C) 2024 Intel Corporation
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ services:
5
+ vllm-service:
6
+ image: ${REGISTRY:-public.ecr.aws/q9t5s3a7}/vllm-cpu-release-repo:${TAG:-v0.9.2}
7
+ command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --dtype bfloat16 --distributed-executor-backend mp --block-size 128 --enforce-eager --tensor-parallel-size 2
Original file line number Diff line number Diff line change 2
2
# SPDX-License-Identifier: Apache-2.0
3
3
4
4
vllm:
5
+ image:
6
+ repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
7
+ tag: "v0.9.2"
8
+ resources: {}
5
9
LLM_MODEL_ID : meta-llama/Meta-Llama-3-8B-Instruct
6
10
# Uncomment the following model specific settings for DeepSeek models
7
- # VLLM_CPU_KVCACHE_SPACE: 40
11
+ VLLM_CPU_KVCACHE_SPACE: 40
12
+
13
+ extraCmdArgs: [
14
+ "--tensor-parallel-size", "2",
15
+ "--block-size", "128",
16
+ "--dtype", "bfloat16",
17
+ "--max-model-len", "5196",
18
+ "--distributed-executor-backend", "mp",
19
+ "--enable-chunked-prefill",
20
+ "--enforce-eager" ]
8
21
# resources:
9
22
# requests:
10
23
# memory: 60Gi # 40G for KV cache, and 20G for DeepSeek-R1-Distill-Qwen-7B, need to adjust it for other models
You can’t perform that action at this time.
0 commit comments