File tree Expand file tree Collapse file tree 3 files changed +26
-16
lines changed Expand file tree Collapse file tree 3 files changed +26
-16
lines changed Original file line number Diff line number Diff line change
1
+ # Copyright (C) 2025 Intel Corporation
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ vllm :
5
+ image :
6
+ repository : public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
7
+ tag : " v0.9.2"
8
+ resources : {}
9
+ LLM_MODEL_ID : meta-llama/Meta-Llama-3-8B-Instruct
10
+ # Model-specific settings for DeepSeek models: the KV cache space is set below; uncomment the resources block at the end to reserve memory
11
+ VLLM_CPU_KVCACHE_SPACE : 40
12
+
13
+ extraCmdArgs : [
14
+ " --tensor-parallel-size" , "2",
15
+ " --block-size" , "128",
16
+ " --dtype" , "bfloat16",
17
+ " --max-model-len" ,"5196",
18
+ " --distributed_executor_backend" , "mp",
19
+ " --enable_chunked_prefill" ,
20
+ " --enforce-eager" ]
21
+ # resources:
22
+ # requests:
23
+ # memory: 60Gi # 40G for the KV cache plus 20G for DeepSeek-R1-Distill-Qwen-7B; adjust for other models
Original file line number Diff line number Diff line change 2
2
# SPDX-License-Identifier: Apache-2.0
3
3
4
4
vllm :
5
- image :
6
- repository : public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
7
- tag : " v0.9.2"
8
- resources : {}
9
5
LLM_MODEL_ID : meta-llama/Meta-Llama-3-8B-Instruct
10
6
# Uncomment the following model-specific settings for DeepSeek models
11
- VLLM_CPU_KVCACHE_SPACE : 40
12
-
13
- extraCmdArgs : [
14
- " --tensor-parallel-size" , "2",
15
- " --block-size" , "128",
16
- " --dtype" , "bfloat16",
17
- " --max-model-len" ,"5196",
18
- " --distributed_executor_backend" , "mp",
19
- " --enable_chunked_prefill" ,
20
- " --enforce-eager" ]
7
+ # VLLM_CPU_KVCACHE_SPACE: 40
21
8
# resources:
22
9
# requests:
23
10
# memory: 60Gi # 40G for the KV cache plus 20G for DeepSeek-R1-Distill-Qwen-7B; adjust for other models
Original file line number Diff line number Diff line change @@ -43,7 +43,7 @@ function start_services() {
43
43
source set_env.sh
44
44
45
45
# Start Docker Containers
46
- docker compose -f compose.yaml -f compose.telemetry.yaml up -d --quiet-pull > ${LOG_PATH} /start_services_with_compose.log
46
+ docker compose -f compose.yaml -f compose.telemetry.yaml -f compose.perf.yaml up -d --quiet-pull > ${LOG_PATH} /start_services_with_compose.log
47
47
n=0
48
48
until [[ " $n " -ge 100 ]]; do
49
49
docker logs vllm-service > ${LOG_PATH} /vllm_service_start.log 2>&1
@@ -163,7 +163,7 @@ function validate_frontend() {
163
163
164
164
function stop_docker() {
165
165
cd $WORKPATH /docker_compose/intel/cpu/xeon
166
- docker compose -f compose.yaml -f compose.telemetry.yaml down
166
+ docker compose -f compose.yaml -f compose.telemetry.yaml -f compose.perf.yaml down
167
167
}
168
168
169
169
function main() {
You can’t perform that action at this time.
0 commit comments