
Commit 7072d2c

changes to enable optimization from vLLM 0.9.2

Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Parent: bed9d6b

File tree

3 files changed: +28 −1 lines changed
ChatQnA/docker_compose/intel/cpu/xeon/README.md

Lines changed: 7 additions & 0 deletions

````diff
@@ -73,6 +73,13 @@ CPU example with Open Telemetry feature:
 docker compose -f compose.yaml -f compose.telemetry.yaml up -d
 ```
 
+To enable Xeon optimizations such as AMX or tensor parallelism for vLLM, the compose.perf.yaml file needs to be merged with the default compose.yaml file.
+
+CPU example with the optimized vLLM feature:
+
+```bash
+docker compose -f compose.yaml -f compose.perf.yaml up -d
+```
 
 **Note**: developers should build docker image from source when:
 
 - Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
````
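The overlay above only pays off if the host Xeon actually exposes AMX. A minimal detection sketch, assuming a Linux host; the `flags` string below is an illustrative sample, on a real machine read the flags from `/proc/cpuinfo` instead:

```shell
# Sketch: check for AMX support before enabling the compose.perf.yaml overlay.
# On a real host, use: grep -o 'amx_[a-z0-9]*' /proc/cpuinfo | sort -u
# The flags string below is a hypothetical sample for illustration.
flags="fpu avx512f avx512_bf16 amx_bf16 amx_tile amx_int8"
if echo "$flags" | grep -qw amx_bf16; then
  echo "AMX bfloat16 supported"
else
  echo "No AMX - bfloat16 fast path unavailable"
fi
```

AMX tile and bfloat16 instructions appear on 4th-gen Xeon (Sapphire Rapids) and later; without them the `--dtype bfloat16` setting still works but without the AMX acceleration this commit targets.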
ChatQnA/docker_compose/intel/cpu/xeon/compose.perf.yaml (new file)

Lines changed: 7 additions & 0 deletions

```diff
@@ -0,0 +1,7 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  vllm-service:
+    image: ${REGISTRY:-public.ecr.aws/q9t5s3a7}/vllm-cpu-release-repo:${TAG:-v0.9.2}
+    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --dtype bfloat16 --distributed-executor-backend mp --block-size 128 --enforce-eager --tensor-parallel-size 2
```
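The `--tensor-parallel-size 2` and `--distributed-executor-backend mp` flags split the model weights across two worker processes on one host. A back-of-envelope sketch of the per-worker weight memory, assuming roughly 8e9 parameters for Meta-Llama-3-8B in bfloat16 (the parameter count is an approximation, not taken from this diff):

```shell
# Back-of-envelope: per-worker weight memory under tensor parallelism.
# Assumptions: ~8e9 parameters, bfloat16 = 2 bytes/param, TP degree 2.
params=8000000000
bytes_per_param=2        # --dtype bfloat16
tp=2                     # --tensor-parallel-size 2
per_rank_bytes=$(( params * bytes_per_param / tp ))
per_rank_gib=$(( per_rank_bytes / 1024 / 1024 / 1024 ))
echo "~${per_rank_gib} GiB of weights per worker"   # ~7 GiB
```

Halving the per-worker footprint is why tensor parallelism can help on dual-socket Xeon: each worker's weights and KV blocks can stay local to one socket's memory.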

ChatQnA/kubernetes/helm/cpu-values.yaml

Lines changed: 14 additions & 1 deletion

```diff
@@ -2,9 +2,22 @@
 # SPDX-License-Identifier: Apache-2.0
 
 vllm:
+  image:
+    repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+    tag: "v0.9.2"
+  resources: {}
   LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
   # Uncomment the following model specific settings for DeepSeek models
-  #VLLM_CPU_KVCACHE_SPACE: 40
+  VLLM_CPU_KVCACHE_SPACE: 40
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "2",
+    "--block-size", "128",
+    "--dtype", "bfloat16",
+    "--max-model-len", "5196",
+    "--distributed_executor_backend", "mp",
+    "--enable_chunked_prefill",
+    "--enforce-eager"]
 #resources:
 # requests:
 #  memory: 60Gi # 40G for KV cache, and 20G for DeepSeek-R1-Distill-Qwen-7B, need to adjust it for other models
```
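`VLLM_CPU_KVCACHE_SPACE: 40` reserves 40 GiB for the paged KV cache. A rough capacity sketch, assuming Meta-Llama-3-8B geometry (32 layers, 8 KV heads, head dim 128, bfloat16); these model-shape numbers are assumptions for illustration, not values from the diff:

```shell
# Rough KV-cache capacity for VLLM_CPU_KVCACHE_SPACE=40 (GiB).
# Assumed model geometry: 32 layers, 8 KV heads, head_dim 128,
# bfloat16 = 2 bytes; K and V are each stored per token per layer.
layers=32; kv_heads=8; head_dim=128; dtype_bytes=2
bytes_per_token=$(( 2 * layers * kv_heads * head_dim * dtype_bytes ))
cache_bytes=$(( 40 * 1024 * 1024 * 1024 ))
tokens=$(( cache_bytes / bytes_per_token ))
blocks=$(( tokens / 128 ))   # --block-size 128 from extraCmdArgs
echo "bytes/token=${bytes_per_token} tokens=${tokens} blocks=${blocks}"
```

This simple estimate ignores tensor-parallel sharding (with `--tensor-parallel-size 2` each worker holds half the KV heads), but it explains the commented `memory: 60Gi` request: 40 GiB of KV cache plus headroom for weights and activations.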
