File tree Expand file tree Collapse file tree 2 files changed +19
-1
lines changed
ChatQnA/docker_compose/intel/cpu/xeon Expand file tree Collapse file tree 2 files changed +19
-1
lines changed Original file line number Diff line number Diff line change @@ -8,3 +8,21 @@ services:
8
8
VLLM_CPU_SGL_KERNEL : 1
9
9
entrypoint : ["python3", "-m", "vllm.entrypoints.openai.api_server"]
10
10
command : --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --dtype bfloat16 --distributed-executor-backend mp --block-size 128 --enforce-eager --tensor-parallel-size $TP_NUM --pipeline-parallel-size $PP_NUM --max-num-batched-tokens $MAX_BATCHED_TOKENS --max-num-seqs $MAX_SEQS
11
+ vllm-ci-test :
12
+ image : public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f-cpu
13
+ container_name : vllm-ci-test
14
+ volumes :
15
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
16
+ shm_size : 128g
17
+ environment :
18
+ no_proxy : ${no_proxy}
19
+ http_proxy : ${http_proxy}
20
+ https_proxy : ${https_proxy}
21
+ HF_TOKEN : ${HF_TOKEN}
22
+ LLM_MODEL_ID : ${LLM_MODEL_ID}
23
+ VLLM_CPU_KVCACHE_SPACE : 40
24
+ ON_CPU : 1
25
+ REMOTE_HOST : vllm-service
26
+ REMOTE_PORT : 80
27
+ entrypoint : tail -f /dev/null
28
+
Original file line number Diff line number Diff line change @@ -104,7 +104,7 @@ services:
104
104
VLLM_TORCH_PROFILER_DIR : " /mnt"
105
105
VLLM_CPU_KVCACHE_SPACE : 40
106
106
healthcheck :
107
- test : ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
107
+ test : ["CMD-SHELL", "curl -f http://vllm-service:80/health || exit 1"]
108
108
interval : 10s
109
109
timeout : 10s
110
110
retries : 100
You can’t perform that action at this time.
0 commit comments