Fix the error where the loop index in Split/Fuse is not less than the total number of loops.

baoqiwen · baoqiwen · commit 6ea3df1f4671 · 2025-03-31T15:40:42.000+08:00
diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_broadcast_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_broadcast_tactic.cc
@@ -100,10 +100,6 @@ class TileBroadcastTactic final : public ScheduleTactic {
                                              const std::string& block_id,
                                              int block_size,
                                              int vetorize_factor);
-  std::vector<std::string> TileVectorizeNHWC(ir::IRSchedule* sch,
-                                             const std::string& block_id,
-                                             int block_size,
-                                             int vetorize_factor);
 
  private:
   ScheduleContext* context_;
@@ -549,60 +545,15 @@ std::vector<std::string> TileBroadcastTactic::TileVectorizeNCHW(
   }
 }
 
-std::vector<std::string> TileBroadcastTactic::TileVectorizeNHWC(
-    ir::IRSchedule* sch,
-    const std::string& block_id,
-    int block_size,
-    int vectorize_factor) {
-  // NHWC layout will have 2 fused loops, so we start with (blockIdx.x,
-  // threadIdx.x)
-  VLOG(4) << "TileBroadcastTactic using NHWC layout, block size = "
-          << block_size << ", broadcast_size_ = " << broadcast_size_
-          << ", preserved_size_ = " << preserved_size_;
-  int vectorize_p_size = preserved_size_ / vectorize_factor;
-  if (broadcast_size_ <= 64) {
-    /**
-     * if the broadcast size is smaller than 64
-     * this means we need more blocks to increase the occupancy
-     * so no thread coarsening anyway
-     */
-    sch->Split(block_id, 1, {-1, block_size, vectorize_factor});
-    sch->Fuse(block_id, {0, 1});
-    return {"blockIdx.x", "threadIdx.x", ""};
-  } else {
-    if (vectorize_p_size == block_size) {
-      sch->Split(block_id, 1, {-1, vectorize_factor});
-      return {"blockIdx.x", "threadIdx.x", ""};
-    } else if (vectorize_p_size < block_size) {
-      // block size is larger (deliberately, to have enough threads)
-      // than preserved size
-      sch->Split(block_id, 1, {-1, vectorize_factor});
-      sch->Fuse(block_id, {0, 1});
-      sch->Split(
-          block_id, 0, {-1, block_size / vectorize_p_size, vectorize_p_size});
-      return {"blockIdx.x", "threadIdx.y", "threadIdx.x", ""};
-    } else {
-      /**
-       * block size is not enough to cover the preserved size
-       * make the load index invariant to inner loop
-       * (-1, v_p_size / block_size, block_size, vectorize_factor)
-       */
-      block_size = 128;
-      sch->Split(block_id, 1, {-1, block_size, vectorize_factor});
-      sch->Fuse(block_id, {0, 1});
-      sch->Split(block_id, 0, {-1, vectorize_p_size / block_size});
-      return {"blockIdx.x", "blockIdx.y", "threadIdx.x", ""};
-    }
-  }
-}
-
 void TileBroadcastTactic::Apply(ir::IRSchedule* sch,
                                 const std::string& block_id) {
-  if (applied_layout_ == BroadcastLayout::NCHWLayout) {
-    if (ScheduleBlockEnableVectorize(context_->config, block_id)) {
-      ApplyVectorize(sch, block_id);
-      return;
-    }
+  if (applied_layout_ == BroadcastLayout::NCHWLayout &&
+      ScheduleBlockEnableVectorize(context_->config, block_id)) {
+    // TODO(baoqiwen): Due to register overflow issues, NHWC currently has
+    // performance problems. The current vectorization only supports NCHW, and
+    // future support for NHWC is needed.
+    ApplyVectorize(sch, block_id);
+    return;
   }
 
   if (applied_layout_ == BroadcastLayout::Invalid) return;
@@ -714,7 +665,7 @@ void TileBroadcastTactic::ApplyVectorize(ir::IRSchedule* sch,
     axis_bind = TileVectorizeNCHW(sch, block_id, block_size, vectorize_factor);
   } else {
     // [B, P] (for NHWC layout)
-    axis_bind = TileVectorizeNHWC(sch, block_id, block_size, vectorize_factor);
+    // TODO(baoqiwen): support TileVectorizeNHWC
   }
 
   // set vectorize schedule primitives