Skip to content

Commit 7f72de7

Browse files
committed
[PHI] Enable depthwise convolution cudnn
1 parent 9bf45f1 commit 7f72de7

File tree

6 files changed

+241
-55
lines changed

6 files changed

+241
-55
lines changed

paddle/phi/kernels/gpu/depthwise_conv.h

+70
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,76 @@ namespace cub = hipcub;
2929
#include "paddle/phi/backends/gpu/gpu_device_function.h"
3030
#include "paddle/phi/backends/gpu/gpu_primitives.h"
3131
#include "paddle/phi/kernels/funcs/math_function.h"
32+
#include "paddle/phi/kernels/gpudnn/conv_gpudnn.h"
33+
34+
namespace phi {
35+
// To determine use cudnn or not.
36+
struct DWConvParams {
37+
bool has_fuse_relu_;
38+
std::string data_format_;
39+
std::vector<int> strides_;
40+
std::vector<int> dilations_;
41+
42+
DWConvParams(const bool has_fuse_relu,
43+
const std::string& data_format,
44+
const std::vector<int>& strides,
45+
const std::vector<int>& dilations)
46+
: has_fuse_relu_(has_fuse_relu),
47+
data_format_(data_format),
48+
strides_(strides),
49+
dilations_(dilations) {}
50+
51+
bool is_strided() const {
52+
for (const auto& stride : strides_) {
53+
if (stride != 1) return true;
54+
}
55+
return false;
56+
}
57+
58+
bool is_dilated() const {
59+
for (const auto& dilation : dilations_) {
60+
if (dilation != 1) return true;
61+
}
62+
return false;
63+
}
64+
65+
// Use cudnn for NHWC and NCHW FP16.
66+
bool UseCudnnDepthwise(const DenseTensor& input,
67+
const DenseTensor& filter) const {
68+
// No fuse supported yet.
69+
if (has_fuse_relu_) {
70+
return false;
71+
}
72+
// Cudnn enable
73+
if (!dynload::HasCUDNN()) {
74+
return false;
75+
}
76+
// Use cudnn depthwise conv for channel last format.
77+
if (data_format_ == "NHWC") {
78+
return true;
79+
}
80+
// Only support FP16.
81+
if (input.type() != phi::DataType::FLOAT16 &&
82+
filter.type() != phi::DataType::FLOAT16) {
83+
return false;
84+
}
85+
// Only support depthwise 2D.
86+
if (input.dims().size() != 4) {
87+
return false;
88+
}
89+
// No dilation and stride.
90+
if (is_dilated() || is_strided()) {
91+
return false;
92+
}
93+
// Format here is NCHW, channel greater than 32, need benchmarks.
94+
if (input.dims()[1] < 32) {
95+
return false;
96+
}
97+
return true;
98+
}
99+
};
100+
101+
} // namespace phi
32102

33103
namespace paddle {
34104
namespace operators {

paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu

+31
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,37 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
5252
std::vector<int> paddings = paddings_t;
5353
std::vector<int> dilations = dilations_t;
5454

55+
// Enable if cudnn above 8.2, hip already has cudnn kernel.
56+
#if defined(CUDNN_VERSION) && CUDNN_VERSION_MIN(8, 2, 0) && \
57+
!defined(PADDLE_WITH_HIP)
58+
DWConvParams params(has_fuse_relu, data_format, strides, dilations);
59+
if (params.UseCudnnDepthwise(input, filter)) {
60+
// Keep same with original kernel.
61+
phi::funcs::SetConstant<Context, T> set_zero;
62+
if (input_grad) {
63+
dev_ctx.template Alloc<T>(input_grad);
64+
set_zero(dev_ctx, input_grad, static_cast<T>(0));
65+
}
66+
if (filter_grad) {
67+
dev_ctx.template Alloc<T>(filter_grad);
68+
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
69+
}
70+
phi::DepthwiseConvCudnnGradKernel<T>(dev_ctx,
71+
input,
72+
filter,
73+
*output_grad,
74+
strides_t,
75+
paddings_t,
76+
padding_algorithm,
77+
groups,
78+
dilations_t,
79+
data_format,
80+
input_grad,
81+
filter_grad);
82+
return;
83+
}
84+
#endif
85+
5586
// update padding and dilation
5687
auto in_dims = input.dims();
5788
auto filter_dims = filter.dims();

paddle/phi/kernels/gpu/depthwise_conv_kernel.cu

+19
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,25 @@ void DepthwiseConvKernel(const Context& dev_ctx,
7272
input.dims()[1]));
7373
}
7474

75+
// Enable if cudnn above 8.2, hip already has cudnn kernel.
76+
#if defined(CUDNN_VERSION) && CUDNN_VERSION_MIN(8, 2, 0) && \
77+
!defined(PADDLE_WITH_HIP)
78+
DWConvParams params(has_fuse_relu, data_format, strides, dilations);
79+
if (params.UseCudnnDepthwise(input, filter)) {
80+
phi::DepthwiseConvCudnnKernel<T>(dev_ctx,
81+
input,
82+
filter,
83+
strides_t,
84+
paddings_t,
85+
padding_algorithm,
86+
groups,
87+
dilations_t,
88+
data_format,
89+
out);
90+
return;
91+
}
92+
#endif
93+
7594
// update padding and dilation
7695
auto in_dims = input.dims();
7796
auto filter_dims = filter.dims();
+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include "paddle/phi/kernels/conv_kernel.h"
18+
19+
#include "paddle/phi/backends/context_pool.h"
20+
#include "paddle/phi/backends/gpu/gpu_context.h"
21+
#include "paddle/phi/core/dense_tensor.h"
22+
#include "paddle/phi/core/kernel_registry.h"
23+
24+
#ifdef PADDLE_WITH_HIP
25+
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
26+
#else
27+
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
28+
#endif
29+
30+
#include "paddle/phi/common/bfloat16.h"
31+
#include "paddle/phi/common/float16.h"
32+
33+
#ifdef PADDLE_WITH_CUDNN_FRONTEND
34+
// clang-format off
35+
#include "paddle/phi/backends/dynload/cudnn_frontend.h"
36+
#include "paddle/phi/kernels/autotune/cache.h"
37+
#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h"
38+
// clang-format on
39+
#endif
40+
41+
namespace phi {
42+
43+
template <typename T, typename Context>
44+
void ConvCudnnKernel(const Context& ctx,
45+
const DenseTensor& input,
46+
const DenseTensor& filter,
47+
const std::vector<int>& strides,
48+
const std::vector<int>& paddings_t,
49+
const std::string& padding_algorithm,
50+
const std::vector<int>& dilations_t,
51+
int groups,
52+
const std::string& data_format,
53+
DenseTensor* output);
54+
55+
template <typename T, typename Context>
56+
void DepthwiseConvCudnnKernel(const Context& dev_ctx,
57+
const DenseTensor& input,
58+
const DenseTensor& filter,
59+
const std::vector<int>& strides,
60+
const std::vector<int>& paddings,
61+
const std::string& padding_algorithm,
62+
int groups,
63+
const std::vector<int>& dilations,
64+
const std::string& data_format,
65+
DenseTensor* out) {
66+
ConvCudnnKernel<T>(dev_ctx,
67+
input,
68+
filter,
69+
strides,
70+
paddings,
71+
padding_algorithm,
72+
dilations,
73+
groups,
74+
data_format,
75+
out);
76+
}
77+
78+
template <typename T, typename Context>
79+
void ConvCudnnGradKernel(const Context& ctx,
80+
const DenseTensor& input,
81+
const DenseTensor& filter,
82+
const DenseTensor& output_grad,
83+
const std::vector<int>& strides_t,
84+
const std::vector<int>& paddings_t,
85+
const std::string& padding_algorithm,
86+
const std::vector<int>& dilations_t,
87+
int groups,
88+
const std::string& data_format,
89+
DenseTensor* input_grad,
90+
DenseTensor* filter_grad);
91+
92+
template <typename T, typename Context>
93+
void DepthwiseConvCudnnGradKernel(const Context& dev_ctx,
94+
const DenseTensor& input,
95+
const DenseTensor& filter,
96+
const DenseTensor& out_grad,
97+
const std::vector<int>& strides,
98+
const std::vector<int>& paddings,
99+
const std::string& padding_algorithm,
100+
int groups,
101+
const std::vector<int>& dilations,
102+
const std::string& data_format,
103+
DenseTensor* input_grad,
104+
DenseTensor* filter_grad) {
105+
ConvCudnnGradKernel<T>(dev_ctx,
106+
input,
107+
filter,
108+
out_grad,
109+
strides,
110+
paddings,
111+
padding_algorithm,
112+
dilations,
113+
groups,
114+
data_format,
115+
input_grad,
116+
filter_grad);
117+
}
118+
119+
} // namespace phi

paddle/phi/kernels/gpudnn/conv_grad_kernel.cu

+1-31
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414

1515
#include "paddle/phi/kernels/conv_grad_kernel.h"
16+
#include "paddle/phi/kernels/gpudnn/conv_gpudnn.h"
1617

1718
#include "glog/logging.h"
1819

@@ -759,37 +760,6 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx,
759760
filter_grad);
760761
}
761762

762-
template <typename T, typename Context>
763-
void DepthwiseConvCudnnGradKernel(const Context& dev_ctx,
764-
const DenseTensor& input,
765-
const DenseTensor& filter,
766-
const DenseTensor& out_grad,
767-
const std::vector<int>& strides,
768-
const std::vector<int>& paddings,
769-
const std::string& padding_algorithm,
770-
int groups,
771-
const std::vector<int>& dilations,
772-
const std::string& data_format,
773-
bool use_addto,
774-
int workspace_size_MB,
775-
bool exhaustive_search,
776-
bool fuse_relu,
777-
DenseTensor* input_grad,
778-
DenseTensor* filter_grad) {
779-
ConvCudnnGradKernel<T>(dev_ctx,
780-
input,
781-
filter,
782-
out_grad,
783-
strides,
784-
paddings,
785-
padding_algorithm,
786-
dilations,
787-
groups,
788-
data_format,
789-
input_grad,
790-
filter_grad);
791-
}
792-
793763
template <typename T, typename Context>
794764
void ConvCudnnGradGradKernel(
795765
const Context& ctx,

paddle/phi/kernels/gpudnn/conv_kernel.cu

+1-24
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414

1515
#include "paddle/phi/kernels/conv_kernel.h"
16+
#include "paddle/phi/kernels/gpudnn/conv_gpudnn.h"
1617

1718
#include "glog/logging.h"
1819

@@ -557,30 +558,6 @@ void Conv3DCudnnKernel(const Context& dev_ctx,
557558
data_format,
558559
out);
559560
}
560-
561-
template <typename T, typename Context>
562-
void DepthwiseConvCudnnKernel(const Context& dev_ctx,
563-
const DenseTensor& input,
564-
const DenseTensor& filter,
565-
const std::vector<int>& strides,
566-
const std::vector<int>& paddings,
567-
const std::string& padding_algorithm,
568-
int groups,
569-
const std::vector<int>& dilations,
570-
const std::string& data_format,
571-
DenseTensor* out) {
572-
ConvCudnnKernel<T>(dev_ctx,
573-
input,
574-
filter,
575-
strides,
576-
paddings,
577-
padding_algorithm,
578-
dilations,
579-
groups,
580-
data_format,
581-
out);
582-
}
583-
584561
} // namespace phi
585562

586563
#ifdef PADDLE_WITH_HIP

0 commit comments

Comments (0)