Skip to content

Commit b3cb80e

Browse files
authored
[CINN] Remove GPU-bound For loops more cleanly (#69417)
1 parent 3b27bf0 commit b3cb80e

File tree

8 files changed

+109
-70
lines changed

8 files changed

+109
-70
lines changed

paddle/cinn/backends/codegen_gpu_dev.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ void CodeGenGpuDev::Visit(const ir::_LoweredFunc_ *op) {
148148

149149
std::vector<Expr> new_body;
150150

151+
auto axis_range_assumptions = op->PrepareAxisRangeAssumptions();
151152
auto alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
152153
auto temp_buffer_alias = GenerateBufferAliasExprs(op, op->temp_bufs);
153154
auto alis_var_exprs = op->CudaAliasVarExprs();
@@ -156,6 +157,7 @@ void CodeGenGpuDev::Visit(const ir::_LoweredFunc_ *op) {
156157

157158
#define APPEND_TO_NEW_BODY(field__) \
158159
new_body.insert(std::end(new_body), std::begin(field__), std::end(field__));
160+
APPEND_TO_NEW_BODY(axis_range_assumptions)
159161
APPEND_TO_NEW_BODY(alloca_temp_buffers)
160162
APPEND_TO_NEW_BODY(temp_buffer_alias)
161163
APPEND_TO_NEW_BODY(alis_var_exprs)

paddle/cinn/hlir/framework/pir/op_lowering_impl.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,6 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
386386
// 4.Apply low level pass
387387
if (i != func_bodies.size() - 1) {
388388
func = optim::Optimize(func, target_, false);
389-
optim::RearrangeLoadInstruction(&(func->body));
390389
} else {
391390
func = optim::Optimize(func, common::DefaultHostTarget(), false);
392391
}

paddle/cinn/ir/lowered_func.cc

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,35 @@ void _LoweredFunc_::PrepareAllocOutputBufferExprs() {
151151
}
152152
}
153153

154+
std::vector<Expr> _LoweredFunc_::PrepareAxisRangeAssumptions() const {
155+
std::vector<Expr> assumption_exprs;
156+
157+
const auto AssumeAxisLT = [&](std::string axis, const Expr& dim_size) {
158+
if (!dim_size.defined()) {
159+
return;
160+
}
161+
if (dim_size == common::make_const(1)) {
162+
return;
163+
}
164+
Expr expr_lt = LT::Make(Var(axis), dim_size);
165+
Expr call_lt = Call::Make(Void(),
166+
runtime::intrinsic::cuda_builtin_assume,
167+
{expr_lt},
168+
{},
169+
CallType::Intrinsic);
170+
assumption_exprs.push_back(call_lt);
171+
};
172+
173+
AssumeAxisLT("blockIdx.x", cuda_axis_info.grid_dim(0));
174+
AssumeAxisLT("blockIdx.y", cuda_axis_info.grid_dim(1));
175+
AssumeAxisLT("blockIdx.z", cuda_axis_info.grid_dim(2));
176+
AssumeAxisLT("threadIdx.x", cuda_axis_info.block_dim(0));
177+
AssumeAxisLT("threadIdx.y", cuda_axis_info.block_dim(1));
178+
AssumeAxisLT("threadIdx.z", cuda_axis_info.block_dim(2));
179+
180+
return assumption_exprs;
181+
}
182+
154183
std::vector<Expr> _LoweredFunc_::PrepareAllocTempBufferExprs() const {
155184
std::vector<Expr> alloc_temp_buffer_exprs;
156185
for (auto& temp_buf : temp_bufs) {

paddle/cinn/ir/lowered_func.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ struct _LoweredFunc_ : public IrNode {
208208

209209
static const IrNodeTy _node_type_ = IrNodeTy::LoweredFunc;
210210

211+
//! Prepare the assumptions that a gpu axis should be less than its
212+
//! corresponding dim size, e.g. threadIdx.x < blockDim.x.
213+
std::vector<Expr> PrepareAxisRangeAssumptions() const;
211214
std::vector<Expr> PrepareCreateTempBufferExprs() const;
212215
//! Prepare the expressions for `alloc_tmp_buffer_exprs`.
213216
std::vector<Expr> PrepareAllocTempBufferExprs() const;

paddle/cinn/optim/optimize.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn,
7373
#ifdef CINN_WITH_CUDA
7474
ir::SetCudaAxisInfo(copied);
7575
if (remove_gpu_for_loops) {
76-
RemoveGpuForloopsAxis(copied);
76+
RemoveGpuForLoops(copied);
7777
}
7878
CudaSyncThreadsDropIfThenElse(copied);
7979
// CudaTransBufferWithDynamicShape(&copied);
@@ -83,7 +83,7 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn,
8383
#ifdef CINN_WITH_HIP
8484
ir::SetCudaAxisInfo(copied);
8585
if (remove_gpu_for_loops) {
86-
RemoveGpuForloopsAxis(copied);
86+
RemoveGpuForLoops(copied);
8787
}
8888
CudaSyncThreadsDropIfThenElse(copied);
8989
// CudaTransBufferWithDynamicShape(&copied);

paddle/cinn/optim/transform_gpu_forloop.cc

Lines changed: 47 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "paddle/cinn/backends/cuda_util.h"
2424
#include "paddle/cinn/common/cas.h"
25+
#include "paddle/cinn/common/integer_set.h"
2526
#include "paddle/cinn/common/ir_util.h"
2627
#include "paddle/cinn/ir/ir.h"
2728
#include "paddle/cinn/ir/ir_mutator.h"
@@ -43,27 +44,17 @@ PD_DECLARE_bool(cinn_longlong2int_for_integer);
4344
namespace cinn {
4445
namespace optim {
4546

46-
/**
47-
* 1. Determine the grid and block dimensions.
48-
* It takes the domains like `[0, 20]` or `[0, min(20, M/2)]`, the domain should
49-
* have a integer right bound.
50-
*
51-
* 2. Replace the grid/thread iterators with something like `threadIdx.x`,
52-
* `threadIdx.y`.
53-
*
54-
* 3. Remove the forloops owning the gpu axis.
55-
* 1. if the extent is an IntImm, just remove this forloop.
56-
* 2. if the extent is a Min, replace the forloop with an IfThenElse, with
57-
* forloop's condition, new check will add (if the min of forloop is not zero).
58-
*
59-
* @param expr The expression to mutate.
60-
*/
61-
void RemoveGpuForloopsAxis(ir::LoweredFunc fn) {
47+
void RemoveGpuForLoops(ir::LoweredFunc fn) {
6248
struct Mutator : public ir::IRMutator<Expr *> {
6349
using ir::IRMutator<>::Visit;
64-
void operator()(ir::LoweredFunc fn) { Visit(fn.As<ir::_LoweredFunc_>()); }
50+
void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); }
51+
52+
explicit Mutator(const ir::CudaAxisInfo &cuda_axis_info)
53+
: cuda_axis_info_(cuda_axis_info) {}
6554

6655
private:
56+
ir::CudaAxisInfo cuda_axis_info_;
57+
6758
void Visit(const ir::For *op, Expr *expr) override {
6859
switch (op->for_type()) {
6960
case ir::ForType::GPUBlock:
@@ -90,56 +81,64 @@ void RemoveGpuForloopsAxis(ir::LoweredFunc fn) {
9081
}
9182

9283
bool NeedToReplaceForloopWithIfThenElse(const ir::For *n) const {
84+
// If the loop doesn't start from 0.
85+
if (n->min != cinn::common::make_const(0)) {
86+
return true;
87+
}
88+
89+
// Get dim_size from the function's cuda_axis_info as pre-condition.
90+
ir::Expr dim_size;
91+
switch (n->bind_info().for_type) {
92+
case ir::ForType::GPUThread:
93+
dim_size = cuda_axis_info_.block_dim(n->bind_info().offset);
94+
break;
95+
case ir::ForType::GPUBlock:
96+
dim_size = cuda_axis_info_.grid_dim(n->bind_info().offset);
97+
break;
98+
}
99+
if (!dim_size.defined()) {
100+
return true;
101+
}
102+
103+
// If we can prove the loop's extent >= dim_size, then it's safe not
104+
// to add the IfThenElse guard.
105+
common::cas_intervals_t var_intervals =
106+
common::CollectVarIntervalsOfExprs({n->extent, dim_size});
107+
common::SymbolicExprAnalyzer analyzer{var_intervals};
108+
std::optional<bool> proved_ge = analyzer.ProveGE(n->extent, dim_size);
109+
if (proved_ge.value_or(false)) {
110+
return false;
111+
}
93112
return true;
94113
}
95114

96115
void ReplaceForloopWithIfThenElse(Expr *expr) {
97116
auto *for_n = expr->As<ir::For>();
98-
auto *poly_for_n = expr->As<ir::PolyFor>();
99-
PADDLE_ENFORCE_EQ(for_n || poly_for_n,
100-
true,
101-
::common::errors::InvalidArgument(
102-
"PolyFor is not exist, please check."));
103117

104118
Expr condition;
105-
106-
auto condition_append = [&](Expr new_cond) {
119+
const auto AppendCondition = [&](Expr new_cond) {
107120
if (condition.defined()) {
108121
condition = ir::And::Make(condition, new_cond);
109122
} else {
110123
condition = new_cond;
111124
}
112125
};
113126

114-
if (for_n) {
115-
// for(i, 2, 100);
116-
// ^
117-
if (for_n->min != cinn::common::make_const(0)) {
118-
condition_append(ir::GE::Make(for_n->loop_var, for_n->min));
119-
}
120-
121-
// for(i, 2, min(M/2, 20)
122-
// ^
123-
condition_append(ir::LT::Make(for_n->loop_var, for_n->extent));
124-
} else {
125-
if (poly_for_n->init != cinn::common::make_const(0)) {
126-
condition_append(
127-
ir::GE::Make(poly_for_n->iterator, poly_for_n->init));
128-
}
129-
130-
condition_append(poly_for_n->condition);
127+
// for(i, 2, 100);
128+
// ^
129+
if (for_n->min != cinn::common::make_const(0)) {
130+
AppendCondition(ir::GE::Make(for_n->loop_var, for_n->min));
131131
}
132+
// for(i, 2, min(M/2, 20))
133+
// ^
134+
AppendCondition(ir::LT::Make(for_n->loop_var, for_n->extent));
132135

133136
PADDLE_ENFORCE_EQ(condition.defined(),
134137
true,
135138
::common::errors::InvalidArgument(
136139
"Condition is not defined, please check."));
137140

138-
VLOG(3) << "GPU replacing\n" << *expr;
139-
VLOG(3) << "\nto\n";
140-
auto if_n = ir::IfThenElse::Make(condition, for_n->body);
141-
VLOG(3) << if_n;
142-
*expr = if_n;
141+
*expr = ir::IfThenElse::Make(condition, for_n->body);
143142
}
144143

145144
void Visit(const ir::PolyFor *op, Expr *expr) override {
@@ -163,8 +162,8 @@ void RemoveGpuForloopsAxis(ir::LoweredFunc fn) {
163162
}
164163
};
165164

166-
Mutator mutator;
167-
mutator(fn);
165+
Mutator mutator(fn->cuda_axis_info);
166+
mutator(&fn->body);
168167
}
169168

170169
/**

paddle/cinn/optim/transform_gpu_forloop.h

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,29 +33,34 @@ void OptimizeExprGPU(Expr* expr);
3333
*/
3434

3535
/**
36-
* Remove the forloops of block and thread axis, add the kernel launch thread
37-
* dimension information to the outermost LoweredFunc.
36+
* Remove the GPU block/thread-bound For loops, add IfThenElse guards if needed.
3837
*
39-
* For example, input the code:
40-
* \code
41-
* // Note here, the outermost expression should be a LoweredFunc
42-
* _LoweredFunc_:
43-
* for (blockIdx.x, 0, 10)
44-
* for (threadIdx.x, 0, 20)
45-
* A(blockIdx.x, threadIdx.x)
46-
* \endcode
38+
* It's usually safe to remove bound loops, because when launching the kernel,
39+
* we are expected to choose dim sizes that match the extents of these loops.
40+
* However, there are cases where we cannot simply remove a loop, but need to
41+
* add an IfThenElse as guard:
42+
* 1) if the loop doesn't start from 0.
43+
* 2) if we cannot prove that the loop's extent is always equal to or greater
44+
* than the corresponding dim size.
4745
*
48-
* will be modified to
49-
* \code
50-
* _LoweredFunc_<blockDim:10, threadDim:20>:
51-
* A(blockIdx.x, threadIdx.x)
52-
* \endcode
46+
* Example 1:
47+
* # assume blockDim.x == 256
48+
* thread_bind[threadIdx.x] for (k, 0, 256):
49+
* ScheduleBlock(A)
50+
* =>
51+
* ScheduleBlock(A)
5352
*
54-
* \note For that the dimensions of each threadIdx or blockIdx should be
55-
* constant, so this only takes For nodes, not \note PolyFor nodes is allowed to
56-
* be GPU related.
53+
* Example 2:
54+
* # assume gridDim.x == 8
55+
* thread_bind[blockIdx.x] for (k, 2, min(S0, 8)):
56+
* ScheduleBlock(A)
57+
* =>
58+
* if (blockIdx.x >= 2 && blockIdx.x < min(S0, 8)):
59+
* ScheduleBlock(A)
60+
*
61+
* @param fn The LoweredFunc to process.
5762
*/
58-
void RemoveGpuForloopsAxis(ir::LoweredFunc fn);
63+
void RemoveGpuForLoops(ir::LoweredFunc fn);
5964

6065
/**
6166
* Add __syncthreads() to shared memory producer.

paddle/cinn/runtime/intrinsic.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ static const char* debug_log_repr = "cinn_print_debug_string";
129129

130130
static const char* cuda_sync_threads = "__syncthreads";
131131

132+
static const char* cuda_builtin_assume = "__builtin_assume";
133+
132134
static const char* parallel_launch = "cinn_backend_parallel_launch";
133135

134136
} // namespace intrinsic

0 commit comments

Comments
 (0)