Add CI test and new cpu-values-perf.yaml to address review feedback

louie-tsai · louie-tsai · commit 2ee36fc06a36 · 2025-07-07T11:25:08.000-07:00
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
diff --git a/ChatQnA/kubernetes/helm/cpu-values-perf.yaml b/ChatQnA/kubernetes/helm/cpu-values-perf.yaml
@@ -0,0 +1,23 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+vllm:
+  image:
+    repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+    tag: "v0.9.2"
+  resources: {}
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  # Model-specific setting: KV cache space (40G, see memory note below); adjust for other models
+  VLLM_CPU_KVCACHE_SPACE: 40
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "2",
+    "--block-size", "128",
+    "--dtype", "bfloat16",
+    "--max-model-len","5196",
+    "--distributed_executor_backend", "mp",
+    "--enable_chunked_prefill",
+    "--enforce-eager"]
+  #resources:
+  #  requests:
+  #    memory: 60Gi # 40G for KV cache, and 20G for DeepSeek-R1-Distill-Qwen-7B, need to adjust it for other models
diff --git a/ChatQnA/kubernetes/helm/cpu-values.yaml b/ChatQnA/kubernetes/helm/cpu-values.yaml
@@ -2,22 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 vllm:
-  image:
-    repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
-    tag: "v0.9.2"
-  resources: {}
   LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
   # Uncomment the following model specific settings for DeepSeek models
-  VLLM_CPU_KVCACHE_SPACE: 40
-
-  extraCmdArgs: [
-    "--tensor-parallel-size", "2",
-    "--block-size", "128",
-    "--dtype", "bfloat16",
-    "--max-model-len","5196",
-    "--distributed_executor_backend", "mp",
-    "--enable_chunked_prefill",
-    "--enforce-eager"]
+  #VLLM_CPU_KVCACHE_SPACE: 40
   #resources:
   #  requests:
   #    memory: 60Gi # 40G for KV cache, and 20G for DeepSeek-R1-Distill-Qwen-7B, need to adjust it for other models
diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh
@@ -43,7 +43,7 @@ function start_services() {
     source set_env.sh
 
     # Start Docker Containers
-    docker compose -f compose.yaml -f compose.telemetry.yaml up -d --quiet-pull > ${LOG_PATH}/start_services_with_compose.log
+    docker compose -f compose.yaml -f compose.telemetry.yaml -f compose.perf.yaml up -d --quiet-pull > ${LOG_PATH}/start_services_with_compose.log
     n=0
     until [[ "$n" -ge 100 ]]; do
         docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
@@ -163,7 +163,7 @@ function validate_frontend() {
 
 function stop_docker() {
     cd $WORKPATH/docker_compose/intel/cpu/xeon
-    docker compose -f compose.yaml -f compose.telemetry.yaml down
+    docker compose -f compose.yaml -f compose.telemetry.yaml -f compose.perf.yaml down
 }
 
 function main() {