Commit a5778ea

jwfromm authored and facebook-github-bot committed
Fix quantize kernels on rocm 6.4 (#4708)
Summary:

X-link: facebookresearch/FBGEMM#1731

Interestingly, ROCm 6.4 technically allows both the OCP and FNUZ floating-point formats. Our quantize kernels check whether the OCP formats are allowed and use them if so; however, for pretty much any integration FNUZ is still the expected format. This small diff fixes the behavior by checking the HIP FP8 format macros more carefully, and it exposes rowwise quantization on AMD to the unit tests.

Reviewed By: q10

Differential Revision: D80309166
1 parent 635ffe7 commit a5778ea
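
For context, here is a minimal sketch of the guard pattern this commit tightens. It is not the FBGEMM source: the fp8_e4m3_t alias is made up, and it assumes HIP's fp8 headers provide both the OCP and FNUZ e4m3 types; only the HIP_FP8_TYPE_* macros, ROCM_VERSION, and __hip_fp8_e4m3 appear in the actual diff below.

    // Illustrative sketch: on ROCm 6.4 both HIP_FP8_TYPE_OCP and HIP_FP8_TYPE_FNUZ
    // can be set, so the OCP aliases should only be chosen when FNUZ is absent.
    #if defined(USE_ROCM) && ROCM_VERSION >= 60200
    #if HIP_FP8_TYPE_OCP && !HIP_FP8_TYPE_FNUZ
    // OCP-only toolchain: map the portable alias to the OCP e4m3 type.
    using fp8_e4m3_t = __hip_fp8_e4m3;
    #else
    // FNUZ is available, which is what most integrations still expect: prefer it.
    using fp8_e4m3_t = __hip_fp8_e4m3_fnuz;  // assumed FNUZ type name from the HIP headers
    #endif
    #endif

The actual change in quantize.cu below simply adds && !HIP_FP8_TYPE_FNUZ to the existing #if, so the FNUZ path keeps winning whenever the toolchain exposes both formats.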

File tree

2 files changed: +12 -10 lines


fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu

Lines changed: 5 additions & 1 deletion
@@ -89,7 +89,7 @@ namespace fbgemm_gpu {
 // outputs are of size float[D]
 
 #if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
-#if HIP_FP8_TYPE_OCP
+#if HIP_FP8_TYPE_OCP && !HIP_FP8_TYPE_FNUZ
 using __nv_fp8x4_e4m3 = __hip_fp8x4_e4m3;
 using __nv_fp8x2_e4m3 = __hip_fp8x2_e4m3;
 using __nv_fp8_e4m3 = __hip_fp8_e4m3;
@@ -1075,7 +1075,11 @@ void invokeComputeScalesAndQuantizeMatrix(
     bool stochastic_rounding,
     cudaStream_t stream) {
   dim3 grid(numel / lda);
+#ifdef USE_ROCM
   bool use_shmem = true;
+#else
+  bool use_shmem = false;
+#endif
   auto const shmem_size = lda * sizeof(T_IN);
   if (shmem_size >= (48 << 10)) {
     cudaError_t ret;
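
The second hunk turns the shared-memory path on only for ROCm. The surrounding shmem_size >= (48 << 10) check is the usual opt-in point for large dynamic shared memory; below is a self-contained sketch of that pattern, assumed rather than taken from invokeComputeScalesAndQuantizeMatrix, with made-up kernel and launcher names.

    #include <cuda_runtime.h>

    // Hypothetical row-quantization kernel that stages one row in dynamic shared
    // memory; the body is elided because only the launch-side logic matters here.
    template <typename T_IN>
    __global__ void quantize_rows_shmem(const T_IN* in, int lda) {
      extern __shared__ unsigned char smem[];
      // ... compute a per-row scale and quantize the row using smem ...
    }

    template <typename T_IN>
    cudaError_t launch_quantize_rows(
        const T_IN* in, int numel, int lda, cudaStream_t stream) {
      dim3 grid(numel / lda);
    #ifdef USE_ROCM
      bool use_shmem = true;   // ROCm keeps the shared-memory variant enabled
    #else
      bool use_shmem = false;  // mirrors the change in the hunk above
    #endif
      size_t shmem_size = lda * sizeof(T_IN);
      if (use_shmem && shmem_size >= (48 << 10)) {
        // Kernels must opt in explicitly to use more than 48 KiB of dynamic
        // shared memory per block.
        cudaError_t ret = cudaFuncSetAttribute(
            quantize_rows_shmem<T_IN>,
            cudaFuncAttributeMaxDynamicSharedMemorySize,
            static_cast<int>(shmem_size));
        if (ret != cudaSuccess) {
          return ret;
        }
      }
      // A real implementation would dispatch to a non-shared-memory kernel when
      // use_shmem is false; that fallback is elided in this sketch.
      quantize_rows_shmem<T_IN>
          <<<grid, 256, use_shmem ? shmem_size : 0, stream>>>(in, lda);
      return cudaGetLastError();
    }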

fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py

Lines changed: 7 additions & 9 deletions
@@ -289,7 +289,9 @@ def test_f8f8bf16(self, kernel: str, use_fast_accum: bool) -> None:
             ["rowwise", "blockwise"]
             + (["tensorwise_broadcast", "tensorwise"] if torch.version.cuda else [])
         ),
-        QType=st.sampled_from([fp8_e4m3, fp8_e5m2]),
+        QType=(
+            st.sampled_from([fp8_e4m3, fp8_e5m2] if torch.version.cuda else [fp8_e4m3])
+        ),
         Bias=st.sampled_from([True, False]),
         CudaGraph=st.sampled_from([True, False]),
         UseTriton=st.sampled_from([False] + ([True] if torch.version.cuda else [])),
@@ -406,14 +408,10 @@ def f(
         def f(
             x: torch.Tensor, w: torch.Tensor, bias: Optional[torch.Tensor]
         ) -> torch.Tensor:
-            if torch.version.cuda:
-                xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
-                    x, output_dtype=QType
-                )
-                wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
-            else:
-                xq, x_scale = quantize_fp8_row(x)
-                wq, w_scale = quantize_fp8_row(w)
+            xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
+                x, output_dtype=QType
+            )
+            wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
             if UseTriton and torch.version.cuda:
                 zq = matmul_fp8_row(xq, wq, x_scale, w_scale)
                 if bias is not None:
