Commit 64d227c

【CINN】longlong2int for dynamic shape (#71072)
* longlong2int for dynamic shape
* change cuda func args type
* add args for grid reduce
* fix bug for ci
* remove ! in func name
* refine code
* ir copy on host module args
* update dynamic cast
* fix comment
* polish code
* refine code
1 parent 6317290 commit 64d227c

12 files changed: +353, -75 lines

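As background for the diff below, "longlong2int" refers to narrowing 64-bit index arithmetic to 32-bit when a kernel's iteration domain is known to fit into int32, which is cheaper on GPU. The following is a minimal, hypothetical C++ sketch of that safety check only; the names are invented for illustration and this is not CINN's actual pass.

#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Returns true when every extent and the total element count fit into int32,
// i.e. when it is safe to do the index arithmetic in 32-bit integers.
bool CanNarrowToInt32(const std::vector<int64_t>& extents) {
  int64_t total = 1;
  for (int64_t e : extents) {
    if (e < 0 || e > std::numeric_limits<int32_t>::max()) return false;
    total *= e;  // cannot overflow in int64: both factors are <= INT32_MAX here
    if (total > std::numeric_limits<int32_t>::max()) return false;
  }
  return true;
}

int main() {
  std::vector<int64_t> small = {128, 256, 32};      // 1,048,576 elements -> fits
  std::vector<int64_t> large = {1 << 20, 1 << 15};  // 2^35 elements -> too big
  std::cout << CanNarrowToInt32(small) << " " << CanNarrowToInt32(large) << "\n";
  return 0;
}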
paddle/cinn/backends/codegen_device_util.cc

Lines changed: 34 additions & 12 deletions
@@ -170,16 +170,24 @@ detail::CollectBucketStrategyHostFunctionVisitor::GenDeviceKernelName(
   std::string cond_str = Predicate2String(predicate);
   // replace '-' with 'NEG'
   size_t pos = cond_str.find("-", 0);
-  const std::string replacement = "NEG";
+  const std::string replacement_neg = "NEG";
   while (pos != std::string::npos) {
-    cond_str.replace(pos, 1, replacement);
-    pos = cond_str.find("-", pos + replacement.length());
+    cond_str.replace(pos, 1, replacement_neg);
+    pos = cond_str.find("-", pos + replacement_neg.length());
+  }
+
+  // replace '!' with 'NOT'
+  pos = cond_str.find("!", 0);
+  const std::string replacement_not = "NOT";
+  while (pos != std::string::npos) {
+    cond_str.replace(pos, 1, replacement_not);
+    pos = cond_str.find("!", pos + replacement_not.length());
   }
   VLOG(3) << "predicate string: " << cond_str;
   // NOTE(chenxi67): The kernel name is too long to be supported in cuda12.3 so
   // we need to curtail it.
   const std::string new_fn_name = CurTailFnName(fn_name);
-  return new_fn_name + "__COND_" + cond_str + "__kernel";
+  return new_fn_name + "_COND_" + cond_str + "__kernel";
 }
 
 void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
@@ -245,19 +253,33 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
     call_kernel = runtime::intrinsic::call_sycl_kernel;
   });
   // TODO(Dmovic): use new ir when backend update done.
+  // Author(liujinnan): Copy args instead of use func args directly in host
+  // func. because after longlong2int pass, some type of loweredfunc args may be
+  // changed to int32, it cause compile error when lower to LLVM IR.
+  std::vector<ir::Expr> kernel_args_int64 = {
+      ir::ir_utils::IRCopy(func_node->cuda_axis_info.grid_dim(0)),
+      ir::ir_utils::IRCopy(func_node->cuda_axis_info.grid_dim(1)),
+      ir::ir_utils::IRCopy(func_node->cuda_axis_info.grid_dim(2)),
+      ir::ir_utils::IRCopy(func_node->cuda_axis_info.block_dim(0)),
+      ir::ir_utils::IRCopy(func_node->cuda_axis_info.block_dim(1)),
+      ir::ir_utils::IRCopy(func_node->cuda_axis_info.block_dim(2)),
+      ir::ir_utils::IRCopy(shared_mem_bytes.value()),
+      cinn::common::make_const(Int(64), 0) /* enable TryElevateInt32ToInt64 */};
+  ir::TryElevateInt32ToInt64(kernel_args_int64);
+
   ir::Expr call_extern_api =
       ir::Call::Make(Void(),
                      call_kernel.value(),
                      {kernel_ptr,
                       kernel_args_,
                       kernel_args_num_,
-                      func_node->cuda_axis_info.grid_dim(0),   // grid_x
-                      func_node->cuda_axis_info.grid_dim(1),   // grid_y
-                      func_node->cuda_axis_info.grid_dim(2),   // grid_z
-                      func_node->cuda_axis_info.block_dim(0),  // block_x
-                      func_node->cuda_axis_info.block_dim(1),  // block_y
-                      func_node->cuda_axis_info.block_dim(2),  // block_z
-                      shared_mem_bytes.value(),                // shared_mem
+                      kernel_args_int64.at(0),  // grid_x
+                      kernel_args_int64.at(1),  // grid_y
+                      kernel_args_int64.at(2),  // grid_z
+                      kernel_args_int64.at(3),  // block_x
+                      kernel_args_int64.at(4),  // block_y
+                      kernel_args_int64.at(5),  // block_z
+                      kernel_args_int64.at(6),  // shared_mem
                       kernel_stream_},
                      {},
                      ir::CallType::Extern,
@@ -335,7 +357,7 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessArgs(
                          ir::CallType::Extern,
                          ir::FunctionRef(),
                          0);
-      ir::Expr let_symbol = ir::Expr(args[i].var_arg());
+      ir::Expr let_symbol = ir::ir_utils::IRCopy(args[i].var_arg());
       let_symbol->set_type(type_of<int64_t>());
       ir::stmt::StmtRef stmt =
           ir::stmt::Let(let_symbol, call_get_value_in_kernel_args);

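Read on its own, the predicate sanitization added above amounts to a small string rewrite; the sketch below mirrors the hunk as a standalone helper (the function name, the main() driver, and the example predicate are invented for illustration):

#include <iostream>
#include <string>

// Replace '-' with "NEG" and '!' with "NOT" so the predicate string can be
// embedded into a legal kernel name, as in the hunk above.
std::string SanitizePredicate(std::string cond_str) {
  size_t pos = cond_str.find("-", 0);
  const std::string replacement_neg = "NEG";
  while (pos != std::string::npos) {
    cond_str.replace(pos, 1, replacement_neg);
    pos = cond_str.find("-", pos + replacement_neg.length());
  }
  pos = cond_str.find("!", 0);
  const std::string replacement_not = "NOT";
  while (pos != std::string::npos) {
    cond_str.replace(pos, 1, replacement_not);
    pos = cond_str.find("!", pos + replacement_not.length());
  }
  return cond_str;
}

int main() {
  // "S0>-1&&!(S1==0)" becomes "S0>NEG1&&NOT(S1==0)"
  std::cout << SanitizePredicate("S0>-1&&!(S1==0)") << "\n";
  return 0;
}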
paddle/cinn/hlir/framework/pir/op_lowering_impl.cc

Lines changed: 91 additions & 36 deletions
@@ -48,6 +48,7 @@
 
 PD_DECLARE_bool(cinn_use_cuda_vectorize);
 PD_DECLARE_bool(cinn_check_tensor_buffer_map);
+PD_DECLARE_bool(cinn_longlong2int);
 const int default_priority = 100;
 
 namespace cinn {
@@ -195,49 +196,48 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower(
   // including preparing function args and temporary variables,
   // applying low-level optimization passes, etc.
   std::vector<ir::Expr> scheduled_func_bodies;
+  std::vector<ir::SymbolicPredicate> predicates;
   for (std::pair<ir::SymbolicPredicate, ir::Expr>& cond2body :
        cond2func_bodies) {
+    predicates.push_back(cond2body.first);
     scheduled_func_bodies.push_back(cond2body.second);
   }
   std::vector<ir::Tensor> group_func_arg_tensors_copy = group_func_arg_tensors;
   std::vector<ir::Argument> group_func_args;
   std::vector<ir::Tensor> infer_shape_tensor_args;
-  std::vector<ir::LoweredFunc> funcs = PostProcess(group,
-                                                   tensor_map,
-                                                   {scheduled_func_bodies},
-                                                   &group_func_arg_tensors_copy,
-                                                   &group_func_args,
-                                                   &infer_shape_tensor_args);
+
+  std::vector<CondFuncPriorWrapper> warps_processed =
+      PostProcess(group,
+                  tensor_map,
+                  fusion_group_info,
+                  {scheduled_func_bodies},
+                  {predicates},
+                  {priorities},
+                  &group_func_arg_tensors_copy,
+                  &group_func_args,
+                  &infer_shape_tensor_args);
   if (FLAGS_cinn_check_tensor_buffer_map) {
-    for (ir::LoweredFunc& func : funcs) {
-      optim::CheckTensorBufferMap(func->body, "BucketLower PostProcess");
+    for (auto& warp : warps_processed) {
+      optim::CheckTensorBufferMap(std::get<1>(warp)->body,
+                                  "BucketLower PostProcess");
     }
     VLOG(3) << "PostProcess tensor-buffer map check succeed";
   }
-  PADDLE_ENFORCE_EQ(funcs.size(),
-                    cond2func_bodies.size(),
-                    ::common::errors::InvalidArgument(
-                        "The size of funcs and cond2func_bodies should be "
-                        "the same."));
-  PADDLE_ENFORCE_EQ(funcs.size(),
-                    priorities.size() + 1,
-                    ::common::errors::InvalidArgument(
-                        "The size of funcs should equals to the "
-                        "size of priorities plus one."));
+
   BucketLoweredFuncsWrapper funcs_wrapper;
-  for (int i = 0; i < funcs.size() - 1; ++i) {
-    funcs_wrapper.predicate2funcs.emplace_back(
-        std::make_tuple(cond2func_bodies[i].first, funcs[i], priorities[i]));
+  for (int i = 0; i < warps_processed.size() - 1; ++i) {
+    funcs_wrapper.predicate2funcs.emplace_back(warps_processed[i]);
   }
+
   // The last func is x86 kernel.
-  for (size_t i = funcs.size() - 1; i < funcs.size(); ++i) {
-    if (funcs[i]->body == ir::Expr(-1)) {
-      continue;
-    }
-    funcs[i]->name = funcs[i]->name + "_CX86";
-    funcs_wrapper.predicate2funcsCX86.emplace_back(cond2func_bodies[i].first,
-                                                   funcs[i]);
+  auto [predicate_postprocessed, func_postprocessed, _] =
+      warps_processed[warps_processed.size() - 1];
+  if (func_postprocessed->body != ir::Expr(-1)) {
+    func_postprocessed->name = func_postprocessed->name + "_CX86";
+    funcs_wrapper.predicate2funcsCX86.emplace_back(predicate_postprocessed,
+                                                   func_postprocessed);
   }
+
   funcs_wrapper.infer_shape_func =
       GenerateInferShapeFunc(group, infer_shape_tensor_args, group_func_args);
 
@@ -258,13 +258,18 @@ std::unordered_set<std::string> CollectStoreBufferNames(
   return buffer_names;
 }
 
-std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
+std::vector<CondFuncPriorWrapper> OpLowererImpl::PostProcess(
     const OpLoweringGroupPtr& group,
    const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map,
+    const std::shared_ptr<FusionGroupInfo>& fusion_group_info,
    std::vector<ir::Expr> func_bodies,
+    std::vector<ir::SymbolicPredicate> predicates,
+    std::vector<int> priorities,
    std::vector<ir::Tensor>* group_func_arg_tensors,
    std::vector<ir::Argument>* group_func_args,
    std::vector<ir::Tensor>* infer_shape_arg_tensor) {
+  std::vector<ir::Expr> inputs_element_size;
+
   // 1.Prepare function args
   group->mut_input_names().clear();
   std::unordered_set<std::string> store_buffer_names =
@@ -280,6 +285,12 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
             ? ir::Argument::IO::kOutput
             : ir::Argument::IO::kInput;
     (*group_func_args).emplace_back(arg_tensor->buffer, io_type);
+    // collect element size for longlong2int pass.
+    if (FLAGS_cinn_longlong2int) {
+      inputs_element_size.push_back(common::FoldExpr(
+          [](const Expr& a, const Expr& b) { return ir::Mul::Make(a, b); },
+          arg_tensor->shape));
+    }
     arg_name_set.insert(arg_tensor->buffer->name);
   }
 
@@ -330,6 +341,7 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
   std::map<int, CINNKernelInfo::SymbolArgBindInfo> mps;
   // update args for dynamic dim
   int non_tensor_arg_idx = group_func_args->size();
+
   std::unordered_set<std::string> symbol_args_set;
   for (int tensor_arg_idx = 0; tensor_arg_idx < input_tensor_size;
        tensor_arg_idx++) {
@@ -381,7 +393,11 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
     AddDimSymbolArgs();
     AddValueSymbolArgs();
   }
-  std::vector<ir::LoweredFunc> lowered_funcs;
+
+  std::vector<ir::LoweredFunc> ret_lowered_funcs;
+  std::vector<ir::SymbolicPredicate> ret_predicates;
+  std::vector<int> ret_priorities;
+
   for (int i = 0; i < func_bodies.size(); ++i) {
     ir::Expr func_body = func_bodies[i];
     optim::EliminateDeadScheduleBlock(&(func_body), group->output_names());
@@ -416,14 +432,53 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
      func = optim::Optimize(func, common::DefaultHostTarget(), false);
    }
    func->num_output_tensors = infer_shape_arg_tensor->size();
-    lowered_funcs.push_back(std::move(func));
-  }
 
-  // 5. Unify temp_space args and set temp_space sizes
-  UnifyTempSpaceArgs(&lowered_funcs);
-  group->mut_temp_space_sizes() = CollectTempSpaceSizes(lowered_funcs);
+    // 5. Apply longlong2int pass
+    if (i != func_bodies.size() - 1) {
+      LongLong2Int(symbol_args_set,
+                   fusion_group_info->loop_ranges_expr,
+                   inputs_element_size,
+                   priorities[i],
+                   &predicates[i],
+                   &func,
+                   &ret_predicates,
+                   &ret_lowered_funcs,
+                   &ret_priorities);
+    }
+    ret_predicates.push_back(std::move(predicates[i]));
+    ret_lowered_funcs.push_back(std::move(func));
+    // host func has no priority, since tuples require alignment, set -1 here.
+    if (i != func_bodies.size() - 1) {
+      ret_priorities.push_back(std::move(priorities[i]));
+    } else {
+      ret_priorities.push_back(-1);
+    }
+  }
 
-  return lowered_funcs;
+  // 6. Unify temp_space args and set temp_space sizes
+  UnifyTempSpaceArgs(&ret_lowered_funcs);
+  group->mut_temp_space_sizes() = CollectTempSpaceSizes(ret_lowered_funcs);
+
+  PADDLE_ENFORCE_EQ(
+      ret_lowered_funcs.size(),
+      ret_predicates.size(),
+      ::common::errors::InvalidArgument(
+          "The size of ret_lowered_funcs and ret_predicates should be "
+          "the same."));
+  PADDLE_ENFORCE_EQ(
+      ret_lowered_funcs.size(),
+      ret_priorities.size(),
+      ::common::errors::InvalidArgument(
+          "The size of ret_lowered_funcs and ret_priorities should be "
+          "the same."));
+
+  std::vector<CondFuncPriorWrapper> ret;
+  for (size_t i = 0; i < ret_lowered_funcs.size(); ++i) {
+    ret.emplace_back(std::move(ret_predicates[i]),
+                     std::move(ret_lowered_funcs[i]),
+                     std::move(ret_priorities[i]));
+  }
+  return ret;
 }
 
 std::vector<ir::stmt::BlockRef> OpLowererImpl::LowerOps(

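In the PostProcess hunks above, each input tensor's shape is folded into a single product expression (via common::FoldExpr with ir::Mul::Make) and collected into inputs_element_size so the later LongLong2Int call can judge whether 32-bit indexing is safe. A self-contained sketch of just that folding step, with plain int64_t standing in for ir::Expr (the names here are illustrative, not CINN APIs):

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Fold a shape into its element count, analogous to folding the shape
// expressions with a multiplication operator.
int64_t FoldShape(const std::vector<int64_t>& shape) {
  return std::accumulate(shape.begin(), shape.end(), int64_t{1},
                         std::multiplies<int64_t>());
}

int main() {
  std::vector<std::vector<int64_t>> input_shapes = {{32, 128, 128}, {32, 128, 1}};
  std::vector<int64_t> inputs_element_size;
  for (const auto& shape : input_shapes) {
    inputs_element_size.push_back(FoldShape(shape));
  }
  for (int64_t n : inputs_element_size) {
    std::cout << n << "\n";  // prints 524288 and 4096
  }
  return 0;
}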
paddle/cinn/hlir/framework/pir/op_lowering_impl.h

Lines changed: 9 additions & 1 deletion
@@ -41,6 +41,8 @@ namespace pir {
 
 class PrettyNamer;
 using OpLoweringGroupPtr = std::shared_ptr<OpLoweringGroup>;
+using CondFuncPriorWrapper =
+    std::tuple<ir::SymbolicPredicate, ir::LoweredFunc, int>;
 
 using cinn::common::Target;
 class OpLowererImpl;
@@ -66,15 +68,21 @@ class OpLowererImpl : public OpLowererImplBase<OpLoweringGroupPtr> {
   * variables, applying low-level optimization passes, etc.
   * @param group The group to be lowered.
   * @param tensor_map All tensors used for calculating the group.
+   * @param fusion_group_info The info of the fusion group.
   * @param func_bodies The scheduled func bodies of group.
+   * @param predicates The symbolic predicate of each func.
+   * @param priorities The priority of each func.
   * @param group_func_arg_tensors Tensors used as the group function arguments.
   * @param group_func_args Arguments used as the group function arguments.
   * @return The lowered funcs after the post processing.
   */
-  std::vector<ir::LoweredFunc> PostProcess(
+  std::vector<CondFuncPriorWrapper> PostProcess(
      const OpLoweringGroupPtr& group,
      const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map,
+      const std::shared_ptr<FusionGroupInfo>& fusion_group_info,
      std::vector<ir::Expr> func_bodies,
+      std::vector<ir::SymbolicPredicate> predicates,
+      std::vector<int> priorities,
      std::vector<ir::Tensor>* group_func_arg_tensors,
      std::vector<ir::Argument>* group_func_args,
      std::vector<ir::Tensor>* infer_shape_arg_tensor);

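The header change above defines CondFuncPriorWrapper as a std::tuple of (symbolic predicate, lowered function, priority). A toy sketch of how such tuples can be grouped and consumed, echoing the BucketLower handling earlier in the diff; plain strings stand in for ir::SymbolicPredicate and ir::LoweredFunc, and all values below are made up:

#include <iostream>
#include <string>
#include <tuple>
#include <vector>

using CondFuncPrior =
    std::tuple<std::string /*predicate*/, std::string /*func*/, int /*priority*/>;

int main() {
  // PostProcess-style output: device variants first, the host (x86) function
  // last with the placeholder priority -1, as in the patch.
  std::vector<CondFuncPrior> warps;
  warps.emplace_back("S0 <= 1024", "fn_bucket_small", 100);
  warps.emplace_back("S0 > 1024", "fn_bucket_large", 100);
  warps.emplace_back("true", "fn_host", -1);

  // Every entry except the last becomes a device kernel.
  for (size_t i = 0; i + 1 < warps.size(); ++i) {
    const auto& [pred, fn, prior] = warps[i];
    std::cout << "device: " << fn << " if " << pred
              << " (priority " << prior << ")\n";
  }

  // The last entry is the x86 fallback; the patch appends "_CX86" to its name.
  const auto& [pred, fn, prior] = warps.back();
  std::cout << "host: " << fn + "_CX86" << " if " << pred
            << " (priority " << prior << ")\n";
  return 0;
}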