From 74d80ebd7bef1ac46f155677a9b8234435aa1c8d Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Thu, 25 Sep 2025 13:22:02 -0400 Subject: [PATCH 1/5] Add block quantization e2e test Signed-off-by: shanjiaz --- tests/e2e/vLLM/configs/fp8_block.yaml | 5 +++++ tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 tests/e2e/vLLM/configs/fp8_block.yaml create mode 100644 tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml diff --git a/tests/e2e/vLLM/configs/fp8_block.yaml b/tests/e2e/vLLM/configs/fp8_block.yaml new file mode 100644 index 0000000000..a468977e41 --- /dev/null +++ b/tests/e2e/vLLM/configs/fp8_block.yaml @@ -0,0 +1,5 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen2.5-0.5B +scheme: FP8_BLOCK +recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml diff --git a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml new file mode 100644 index 0000000000..ff0ac634cf --- /dev/null +++ b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml @@ -0,0 +1,6 @@ +quant_stage: + quant_modifiers: + QuantizationModifier: + targets: "Linear" + scheme: "FP8_BLOCK" + ignore: ["lm_head", "re:.*mlp.gate$"] From e262cc866a655356d2c50b7df64e7061f8d00dff Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Fri, 10 Oct 2025 18:45:59 +0000 Subject: [PATCH 2/5] use tinyllama instead Signed-off-by: shanjiaz --- tests/e2e/vLLM/configs/fp8_block.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/e2e/vLLM/configs/fp8_block.yaml b/tests/e2e/vLLM/configs/fp8_block.yaml index a468977e41..f1b9b15370 100644 --- a/tests/e2e/vLLM/configs/fp8_block.yaml +++ b/tests/e2e/vLLM/configs/fp8_block.yaml @@ -1,5 +1,4 @@ cadence: "nightly" -test_type: "regression" -model: Qwen/Qwen2.5-0.5B +model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scheme: FP8_BLOCK recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml From b054dd76e35213edd7de3c1ac73bde5a922f3d98 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Tue, 14 Oct 2025 13:14:51 +0000 Subject: [PATCH 3/5] remove recipe Signed-off-by: shanjiaz --- tests/e2e/vLLM/configs/fp8_block.yaml | 4 ++-- tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) delete mode 100644 tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml diff --git a/tests/e2e/vLLM/configs/fp8_block.yaml b/tests/e2e/vLLM/configs/fp8_block.yaml index f1b9b15370..838ac53b13 100644 --- a/tests/e2e/vLLM/configs/fp8_block.yaml +++ b/tests/e2e/vLLM/configs/fp8_block.yaml @@ -1,4 +1,4 @@ cadence: "nightly" -model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 +test_type: "regression" +model: meta-llama/Llama-3.2-1B-Instruct scheme: FP8_BLOCK -recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml diff --git a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml deleted file mode 100644 index ff0ac634cf..0000000000 --- a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_block.yaml +++ /dev/null @@ -1,6 +0,0 @@ -quant_stage: - quant_modifiers: - QuantizationModifier: - targets: "Linear" - scheme: "FP8_BLOCK" - ignore: ["lm_head", "re:.*mlp.gate$"] From df63ecae2f6e0252f21460ca6fd05b97bba7007f Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Tue, 14 Oct 2025 13:45:53 +0000 Subject: [PATCH 4/5] tiny llama Signed-off-by: shanjiaz --- tests/e2e/vLLM/configs/fp8_block.yaml | 2 +- tests/e2e/vLLM/test_vllm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/vLLM/configs/fp8_block.yaml b/tests/e2e/vLLM/configs/fp8_block.yaml index 838ac53b13..3d0c0512e9 100644 --- a/tests/e2e/vLLM/configs/fp8_block.yaml +++ b/tests/e2e/vLLM/configs/fp8_block.yaml @@ -1,4 +1,4 @@ cadence: "nightly" test_type: "regression" -model: meta-llama/Llama-3.2-1B-Instruct +model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scheme: FP8_BLOCK diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index 9c099a5aea..66ac5c335a 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -18,7 +18,7 @@ HF_MODEL_HUB_NAME = "nm-testing" TEST_DATA_FILE = os.environ.get( - "TEST_DATA_FILE", "tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml" + "TEST_DATA_FILE", "tests/e2e/vLLM/configs/fp8_block.yaml" ) SKIP_HF_UPLOAD = os.environ.get("SKIP_HF_UPLOAD", "") # vllm python environment From a1e5dca912882d012e03cffe9d22e0d60877e8f8 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Tue, 14 Oct 2025 13:47:52 +0000 Subject: [PATCH 5/5] minimal change Signed-off-by: shanjiaz --- tests/e2e/vLLM/test_vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index 66ac5c335a..9c099a5aea 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -18,7 +18,7 @@ HF_MODEL_HUB_NAME = "nm-testing" TEST_DATA_FILE = os.environ.get( - "TEST_DATA_FILE", "tests/e2e/vLLM/configs/fp8_block.yaml" + "TEST_DATA_FILE", "tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml" ) SKIP_HF_UPLOAD = os.environ.get("SKIP_HF_UPLOAD", "") # vllm python environment