
Commit 2cd86ea

[CINN] Use cooperative_groups for grid reduce synchronization (#71999)
1 parent 3e00f80 commit 2cd86ea

13 files changed: +95 -326 lines

paddle/cinn/backends/codegen_cuda_dev.cc (+2, -1)

@@ -34,6 +34,7 @@ using cinn::common::float162;
 using cinn::common::bfloat168;
 using cinn::common::bfloat164;
 using cinn::common::bfloat162;
+#include <cooperative_groups.h>
 #include "cinn_cuda_runtime_source.cuh"
 )";
 const std::string CodeGenCudaDev::source_header_ = // NOLINT
@@ -55,8 +56,8 @@ using cinn::common::float162;
 using cinn::common::bfloat168;
 using cinn::common::bfloat164;
 using cinn::common::bfloat162;
+#include <cooperative_groups.h>
 #include <cinn_cuda_runtime_source_h>
-
 )";

 const std::string &CodeGenCudaDev::GetSourceHeader() { return source_header_; }
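Note on the added header: including <cooperative_groups.h> lets the generated device code issue a grid-wide barrier (grid.sync()) inside a single kernel instead of splitting a grid reduce across separate launches. A minimal illustrative sketch of that pattern follows; it is not code that CINN emits, and the kernel name and layout are hypothetical:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Each block folds its slice into a per-block partial (partials assumed
// zero-initialized by the caller), then a grid-wide barrier makes all
// partials visible before block 0 combines them into the final result.
__global__ void GridSumSketch(const float* in, float* partials, float* out,
                              int n) {
  cg::grid_group grid = cg::this_grid();
  float local = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    local += in[i];
  }
  atomicAdd(&partials[blockIdx.x], local);  // block-level reduction elided
  grid.sync();  // legal only when the kernel was launched cooperatively
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    float total = 0.0f;
    for (int b = 0; b < gridDim.x; ++b) total += partials[b];
    *out = total;
  }
}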

paddle/cinn/backends/codegen_device_util.cc (+11, -4)

@@ -177,6 +177,15 @@ static std::string CurTailFnName(const std::string &origin_fn_name) {
   return new_fn_name;
 }

+bool RequiresCooperativeLaunch(const ir::LoweredFunc &func) {
+  for (auto &space : func->temp_spaces) {
+    if (space.size() != ir::Expr(0)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 std::string
 detail::CollectBucketStrategyHostFunctionVisitor::GenDeviceKernelName(
     const std::string &fn_name, ir::Expr predicate) {
@@ -257,10 +266,8 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
         CINN_NOT_IMPLEMENTED;
       },
       [&](common::NVGPUArch) {
-        // TODO(liangshuhao): when cooperative group is supported, change the
-        // second call to `call_cuda_cooperative_kernel`.
-        call_kernel = func->temp_spaces.empty()
-                          ? runtime::intrinsic::call_cuda_kernel
+        call_kernel = RequiresCooperativeLaunch(func)
+                          ? runtime::intrinsic::call_cuda_cooperative_kernel
                           : runtime::intrinsic::call_cuda_kernel;
       },
       [&](common::HygonDCUArchHIP) {
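A kernel that calls grid.sync() is only valid under a cooperative launch, which is why the host stub now selects the call_cuda_cooperative_kernel intrinsic whenever the lowered function allocates grid-reduce temp spaces. A hedged sketch of the underlying CUDA runtime call (names are hypothetical; this is not CINN's actual runtime glue):

#include <cuda_runtime.h>

// Hypothetical wrapper: launch the GridSumSketch kernel from the previous
// sketch cooperatively so that grid.sync() is legal.
void LaunchGridSumSketch(const float* d_in, float* d_partials, float* d_out,
                         int n, int num_blocks, int threads_per_block,
                         cudaStream_t stream) {
  void* args[] = {&d_in, &d_partials, &d_out, &n};
  cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(GridSumSketch),
                              dim3(num_blocks), dim3(threads_per_block), args,
                              /*sharedMem=*/0, stream);
  // All blocks must be co-resident on the device at once, so num_blocks must
  // stay within the occupancy limit (e.g. via
  // cudaOccupancyMaxActiveBlocksPerMultiprocessor).
}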

paddle/cinn/common/target.cc (+45)

@@ -409,5 +409,50 @@ const Target &DefaultTarget() {
 #endif
 }

+bool GetSupportsCooperativeLaunchImpl(UnknownArch) {
+  LOG(FATAL)
+      << "The target is not GPU! Cannot get supports cooperative launch.";
+}
+
+bool GetSupportsCooperativeLaunchImpl(X86Arch) {
+  LOG(FATAL)
+      << "The target is not GPU! Cannot get supports cooperative launch.";
+}
+
+bool GetSupportsCooperativeLaunchImpl(ARMArch) {
+  LOG(FATAL)
+      << "The target is not GPU! Cannot get supports cooperative launch.";
+}
+
+bool GetSupportsCooperativeLaunchImpl(NVGPUArch) {
+  int supportsCoopLaunch = 0;
+#ifdef CINN_WITH_CUDA
+  cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, 0);
+#endif
+  return supportsCoopLaunch != 0;
+}
+
+bool GetSupportsCooperativeLaunchImpl(HygonDCUArchHIP) {
+  CINN_NOT_IMPLEMENTED
+  LOG(FATAL)
+      << "The target is not GPU! Cannot get supports cooperative launch.";
+}
+
+bool GetSupportsCooperativeLaunchImpl(HygonDCUArchSYCL) {
+  CINN_NOT_IMPLEMENTED
+  LOG(FATAL)
+      << "The target is not GPU! Cannot get supports cooperative launch.";
+}
+
+bool GetSupportsCooperativeLaunch(Arch arch) {
+  return std::visit(
+      [](const auto &impl) { return GetSupportsCooperativeLaunchImpl(impl); },
+      arch.variant());
+}
+
+bool Target::get_supports_cooperative_launch() const {
+  return GetSupportsCooperativeLaunch(arch);
+}
+
 } // namespace common
 } // namespace cinn
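Callers can now gate grid reduce on this capability query. A hedged usage sketch (it assumes the DefaultNVGPUTarget() helper; the actual call site in CINN is the BucketLower change below):

// Illustrative only: enable the grid-reduce path only when the device
// supports cooperative launch.
const cinn::common::Target& target = cinn::common::DefaultNVGPUTarget();
bool use_grid_reduce = FLAGS_cinn_enable_grid_reduce &&
                       target.get_supports_cooperative_launch();
if (!use_grid_reduce) {
  // fall back to the non-cooperative reduction strategy
}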

paddle/cinn/common/target.h (+2)

@@ -87,6 +87,8 @@ struct Target {

   std::vector<Lib> get_target_libs() const;

+  bool get_supports_cooperative_launch() const;
+
   std::string arch_str() const;

   std::string device_name_str() const;

paddle/cinn/hlir/framework/pir/op_lowering_impl.cc (+4)

@@ -141,6 +141,10 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower(
     fusion_group_info->can_apply_grid_reduce = false;
   }

+  if (!target_.get_supports_cooperative_launch()) {
+    fusion_group_info->can_apply_grid_reduce = false;
+  }
+
   if (FLAGS_cinn_check_tensor_buffer_map) {
     optim::CheckTensorBufferMap(func_bodies, "BucketLower OpFusion");
     VLOG(3) << "OpFusion tensor-buffer map check succeed";

paddle/cinn/hlir/framework/pir/trivial_op_impl.cc (+1, -2)

@@ -607,8 +607,7 @@ std::shared_ptr<FusionGroupInfo> GetFusionGroupInfo(
   }

   if (FLAGS_cinn_enable_grid_reduce) {
-    group_info->can_apply_grid_reduce =
-        GetCanApplyGridReduce(op_compute_bodies, group_info->reduce_axis);
+    group_info->can_apply_grid_reduce = true;
   }

   if (FLAGS_cinn_enable_vectorize) {

paddle/cinn/ir/group_schedule/config/group_tile_util.cc (-76)

@@ -386,82 +386,6 @@ std::vector<int64_t> GetLoopStrides(const ir::Expr& body) {
   return loop_strides;
 }

-bool GetCanApplyGridReduce(const std::vector<ir::Expr>& op_compute_bodies,
-                           const std::vector<int64_t>& reduce_axis) {
-  // Names of tensors that are downstream of reduce.
-  // A tensor is downstream of reduce either if it is produced by a reduce, or
-  // if it has data dependency on another tensor that is downstream of reduce.
-  std::unordered_set<std::string> reduce_downstream_tensor_names;
-
-  const auto IsReduceDownstream = [&](const ir::Expr& expr_block) {
-    for (auto& expr_load : GetRValueLoads(expr_block)) {
-      std::string load_tensor_name = expr_load.As<ir::Load>()->name();
-      if (reduce_downstream_tensor_names.count(load_tensor_name) > 0) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  const auto AddReduceDownstream = [&](const ir::Expr& expr_block) {
-    auto expr_store = analyzer::GetStoreOfSBlock(expr_block);
-    std::string store_tensor_name = expr_store.As<ir::Store>()->name();
-    reduce_downstream_tensor_names.insert(store_tensor_name);
-  };
-
-  const auto CheckOutputHasReduceAxis = [&](const ir::Expr& body,
-                                            const ir::Expr& expr_block) {
-    std::vector<ir::Var> all_loop_vars = GetAllForIters(body);
-    std::unordered_set<std::string> reduce_loop_vars;
-    for (int64_t axis : reduce_axis) {
-      reduce_loop_vars.insert(all_loop_vars[axis]->name);
-    }
-
-    std::unordered_set<std::string> reduce_iter_vars;
-    auto* block = expr_block.As<ir::ScheduleBlockRealize>();
-    auto& iter_vars = block->schedule_block.As<ir::ScheduleBlock>()->iter_vars;
-    for (int i = 0; i < iter_vars.size(); i++) {
-      if (block->iter_values[i].is_var() &&
-          reduce_loop_vars.count(block->iter_values[i].as_var()->name) > 0) {
-        reduce_iter_vars.insert(iter_vars[i]->name);
-      }
-    }
-
-    // The result is true if the indices of the output tensor contain any
-    // reduce iter vars.
-    auto expr_store = analyzer::GetStoreOfSBlock(expr_block);
-    for (auto& index_expr : expr_store.As<ir::Store>()->indices) {
-      if (index_expr.is_var() &&
-          reduce_iter_vars.count(index_expr.as_var_ref()->name) > 0) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  for (const auto& body : op_compute_bodies) {
-    ir::Expr expr_block =
-        (ChildScheduleBlockRealizes * ScheduleBlockRealizeIsNotInit)
-            .GetSingle(body);
-    bool is_reduce = analyzer::IsReductionSBlock(expr_block);
-    bool is_reduce_downstream = IsReduceDownstream(expr_block);
-    bool output_has_reduce_axis = CheckOutputHasReduceAxis(body, expr_block);
-
-    if (is_reduce_downstream || is_reduce) {
-      AddReduceDownstream(expr_block);
-    }
-
-    // When a block is downstream of reduce, its loop iters shouldn't contain
-    // any reduce axis. Otherwise, it broadcasts the result of reduce. If this
-    // is the case, we cannot apply grid reduce.
-    if (is_reduce_downstream && (is_reduce || output_has_reduce_axis)) {
-      VLOG(4) << "grid reduce is prohibited by block: " << expr_block;
-      return false;
-    }
-  }
-  return true;
-}
-
 GroupVectorizeInfo GetGroupVectorizeInfo(
     const std::vector<ir::Expr>& op_compute_bodies,
     const std::unordered_set<std::string>& group_args) {

paddle/cinn/ir/group_schedule/config/group_tile_util.h (-6)

@@ -56,12 +56,6 @@ namespace ir {
  */
 std::vector<int64_t> GetLoopStrides(const ir::Expr& reduce_compute_body);

-// Check whether we can apply grid reduce in this group.
-// We can apply grid reduce if there is no reduce-then-broadcast dependency
-// in this group.
-bool GetCanApplyGridReduce(const std::vector<ir::Expr>& op_compute_bodies,
-                           const std::vector<int64_t>& reduce_axis);
-
 // Check whether we can apply vectorize in this group.
 GroupVectorizeInfo GetGroupVectorizeInfo(
     const std::vector<ir::Expr>& op_compute_bodies,
