Skip to content

Commit 1f98e30

Browse files
authored
Refine bind gpu axis (PaddlePaddle#124)
2 parents cdb90c1 + 4817f5d commit 1f98e30

12 files changed

+204
-45
lines changed

cinn/backends/codegen_c.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,10 +524,12 @@ void CodeGenC::Visit(const ir::_LoweredFunc_ *op) {
524524
<< "the count of allocation and deallocaton expressions is not match";
525525

526526
std::vector<Expr> new_body;
527+
528+
auto alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
527529
#define APPEND_TO_NEW_BODY(field__) new_body.insert(std::end(new_body), std::begin(op->field__), std::end(op->field__));
528530
APPEND_TO_NEW_BODY(argument_prepare_exprs)
529531
APPEND_TO_NEW_BODY(alloc_output_buffer_exprs)
530-
APPEND_TO_NEW_BODY(alloc_tmp_buffer_exprs)
532+
new_body.insert(std::end(new_body), std::begin(alloca_temp_buffers), std::end(alloca_temp_buffers));
531533
APPEND_TO_NEW_BODY(buffer_data_cast_exprs)
532534
new_body.push_back(op->body);
533535
APPEND_TO_NEW_BODY(dealloc_output_buffer_exprs)

cinn/backends/codegen_cuda_dev.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) {
7979

8080
DoIndent();
8181

82-
Expr temp_buffer_alloc = ir::Block::Make(op->alloc_tmp_buffer_exprs);
82+
Expr temp_buffer_alloc = ir::Block::Make(op->PrepareAllocTempBufferExprs());
8383
Expr func_body = op->body;
8484
Expr temp_buffer_alias = ir::Block::Make(GenerateBufferAliasExprs(op, op->temp_bufs));
8585

cinn/backends/codegen_cuda_dev_test.cc

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,9 @@ TEST(CodeGenCUDA, basic) {
5858

5959
auto C = Compute(
6060
{M, N}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "C");
61-
C->WithBuffer();
6261

63-
C->stage()->GpuBlocks({C->stage()->axis(0)});
64-
C->stage()->GpuThreads({C->stage()->axis(1)});
62+
C->stage()->Bind(0, "blockIdx.x");
63+
C->stage()->Bind(1, "threadIdx.x");
6564

6665
CodeGenCUDA_Dev codegen(target);
6766

@@ -883,5 +882,71 @@ TEST(Conv, optimize) {
883882
LOG(INFO) << Lower("conv", {A, W, BL}, {}, {AA, WW, AL, WL, B});
884883
}
885884

885+
TEST(ElementwiseAdd, cache_read) {
886+
Expr M(100);
887+
Expr N(200);
888+
889+
Placeholder<float> A("A", {M, N});
890+
Placeholder<float> B("B", {M, N});
891+
892+
auto C = Compute(
893+
{M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C");
894+
C->stage()->Split(1, 10);
895+
896+
auto AL = A->stage()->CacheRead("local", {C});
897+
AL->stage()->Split(1, 10);
898+
899+
AL->stage()->ComputeAt(C->stage(), 1, poly::Stage::ComputeAtKind::kComputeAtUnk, A->name);
900+
C->stage()->Bind(0, "threadIdx.x");
901+
C->stage()->Bind(1, "blockIdx.x");
902+
903+
Target target;
904+
CodeGenCUDA_Dev codegen(target);
905+
906+
auto fn = Lower("fn", {A, B, C}, {}, {AL});
907+
908+
Module::Builder builder("module", target);
909+
builder.AddFunction(fn);
910+
911+
auto source_code = codegen.Compile(builder.Build());
912+
LOG(INFO) << "source:\n" << source_code;
913+
914+
std::string source_target = R"ROC(
915+
extern "C" {
916+
917+
#ifdef __CUDACC_RTC__
918+
typedef int int32_t;
919+
typedef char int8_t;
920+
#endif
921+
922+
923+
924+
__global__
925+
void fn_kernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C)
926+
{
927+
float _A_read_cache_3 [ 1 * 10 ];
928+
float* A_read_cache_3 = _A_read_cache_3;
929+
{
930+
if (((((threadIdx.x >= 0) && (threadIdx.x <= 99)) && (blockIdx.x >= 0)) && (blockIdx.x <= 19))) {
931+
for (int32_t j_inner = 0; j_inner < 10; j_inner += 1) {
932+
A_read_cache_3[j_inner] = A[((10 * blockIdx.x) + ((200 * threadIdx.x) + j_inner))];
933+
};
934+
};
935+
for (int32_t i = 0; i < 10; i += 1) {
936+
C[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))] = (A_read_cache_3[((10 * blockIdx.x) + ((10 * threadIdx.x) + i))] + B[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))]);
937+
};
938+
};
939+
}
940+
941+
}
942+
)ROC";
943+
// ASSERT_EQ(utils::Trim(source_target), source);
944+
945+
backends::NVRTC_Compiler compiler;
946+
947+
auto ptx = compiler(source_code);
948+
CHECK(!ptx.empty()) << "Compile error!";
949+
}
950+
886951
} // namespace backends
887952
} // namespace cinn

cinn/common/ir_util.cc

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,5 +337,49 @@ Expr cast(Expr e, Type type) {
337337
return ir::Cast::Make(type, e);
338338
}
339339

340+
std::vector<std::string> GatherItersToTensorProducer(const std::string &target_tensor_name, Expr *expr) {
341+
struct Visitor : public ir::IRMutator<> {
342+
std::vector<std::string> iters;
343+
const std::string &target_tensor_name;
344+
345+
Visitor(const std::string &target_tensor_name) : target_tensor_name(target_tensor_name) {}
346+
347+
std::vector<std::string> operator()(Expr *expr) {
348+
ir::IRMutator<>::Visit(expr, expr);
349+
return iters;
350+
}
351+
352+
void Visit(const ir::Store *op, Expr *expr) {
353+
if (op->tensor.as_tensor()->name == target_tensor_name) {
354+
CHECK(iters.empty());
355+
for (auto &e : for_stack) {
356+
auto *for_n = e->As<ir::For>();
357+
auto *polyfor_n = e->As<ir::PolyFor>();
358+
if (for_n) {
359+
iters.push_back(for_n->loop_var->name);
360+
} else {
361+
iters.push_back(polyfor_n->iterator->name);
362+
}
363+
}
364+
}
365+
}
366+
367+
void Visit(const ir::For *op, Expr *expr) {
368+
for_stack.push_back(expr);
369+
ir::IRMutator<>::Visit(op, expr);
370+
for_stack.pop_back();
371+
}
372+
void Visit(const ir::PolyFor *op, Expr *expr) {
373+
for_stack.push_back(expr);
374+
ir::IRMutator<>::Visit(op, expr);
375+
for_stack.pop_back();
376+
}
377+
378+
std::vector<Expr *> for_stack;
379+
};
380+
381+
return Visitor(target_tensor_name)(expr);
382+
}
383+
340384
} // namespace common
341385
} // namespace cinn

cinn/common/ir_util.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ void UnifyAllTensorsInExpr(Expr *expr);
7676
*/
7777
void UnifyAllBuffersInExpr(Expr *Expr);
7878

79+
std::vector<std::string> GatherItersToTensorProducer(const std::string &target_tensor_name, Expr *expr);
80+
7981
bool is_zero(Expr v);
8082

8183
bool MathEqual(const Expr &a, const Expr &b);

cinn/ir/lowered_func.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,12 @@ void _LoweredFunc_::PrepareAllocOutputBufferExprs() {
7171
}
7272
}
7373

74-
void _LoweredFunc_::PrepareAllocTempBufferExprs() {
74+
std::vector<Expr> _LoweredFunc_::PrepareAllocTempBufferExprs() const {
75+
std::vector<Expr> alloc_output_buffer_exprs;
7576
for (auto& temp_buf : temp_bufs) {
76-
alloc_tmp_buffer_exprs.push_back(Alloc::Make(temp_buf, temp_buf->type(), temp_buf->shape, Expr(), Expr()));
77+
alloc_output_buffer_exprs.push_back(Alloc::Make(temp_buf, temp_buf->type(), temp_buf->shape, Expr(), Expr()));
7778
}
79+
return alloc_output_buffer_exprs;
7880
}
7981

8082
void _LoweredFunc_::PrepareDeallocOutputBufferExprs() {

cinn/ir/lowered_func.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ struct _LoweredFunc_ : ExprNode<_LoweredFunc_> {
105105
std::vector<Expr> dealloc_output_buffer_exprs;
106106
// @}
107107

108-
std::vector<Expr> alloc_tmp_buffer_exprs;
109108
//! something like: float* A_data = (float*)(A->host_memory);
110109
std::vector<Expr> buffer_data_cast_exprs;
111110

@@ -123,12 +122,13 @@ struct _LoweredFunc_ : ExprNode<_LoweredFunc_> {
123122

124123
static const IrNodeTy _node_type_ = IrNodeTy::_LoweredFunc_;
125124

125+
//! Create and return the expressions that allocate the temporary buffers (`temp_bufs`).
126+
std::vector<Expr> PrepareAllocTempBufferExprs() const;
127+
126128
private:
127129
void CheckValid() const;
128130
//! Prepare the expressions for `alloc_output_buffer_exprs`.
129131
void PrepareAllocOutputBufferExprs();
130-
//! Prepare the expressions for `alloc_tmp_buffer_exprs`.
131-
void PrepareAllocTempBufferExprs();
132132
//! Prepare the expressions for `dealloc_output_buffer_exprs`.
133133
void PrepareDeallocOutputBufferExprs();
134134
//! Insert the allocation expr for temporary variables.

cinn/lang/lower_impl.cc

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,15 @@ Expr LowerGroup(const poly::ScheduleGroup& group, const std::map<std::string, Ex
105105
{
106106
optim::forloop_infos_t forloop_infos;
107107
for (auto* stage : stages) {
108-
forloop_infos[stage->id()] = stage->forloop_infos();
108+
// Transform the forloop infos keyed by loop level into ones keyed by iterator name.
109+
auto iters = common::GatherItersToTensorProducer(stage->id(), &e);
110+
std::map<std::string, poly::StageForloopInfo> for_infos;
111+
for (auto& item : stage->forloop_infos()) {
112+
CHECK_LT(item.first, iters.size());
113+
for_infos[iters[item.first]] = item.second;
114+
}
115+
116+
forloop_infos[stage->id()] = for_infos;
109117
}
110118
optim::TransformGpuForloop(forloop_infos, &e);
111119
}
@@ -772,13 +780,6 @@ void UpdateComputeAtBufferShape(Expr* expr) {
772780
process_buffer(Reference(&buf).operator->(), *compute_at_it->second);
773781
}
774782
}
775-
776-
for (auto& expr : node->alloc_tmp_buffer_exprs) {
777-
auto compute_at_it = buffer_to_compute_at_info.find(expr.As<ir::Alloc>()->destination.as_buffer()->name);
778-
if (compute_at_it != buffer_to_compute_at_info.end()) {
779-
process_alloca(Reference(&expr).As<ir::Alloc>(), *compute_at_it->second);
780-
}
781-
}
782783
}
783784
}
784785

cinn/optim/ir_copy.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,6 @@ struct IRCopyVisitor : public ir::IRVisitorBase<Expr> {
218218

219219
std::vector<Expr> alloc_output_buffer_exprs;
220220
std::vector<Expr> dealloc_output_buffer_exprs;
221-
std::vector<Expr> alloc_tmp_buffer_exprs;
222221
std::vector<Expr> buffer_data_cast_exprs;
223222
std::vector<Expr> argument_prepare_exprs;
224223

@@ -230,7 +229,6 @@ struct IRCopyVisitor : public ir::IRVisitorBase<Expr> {
230229

231230
COPY_ADD_FIELD(alloc_output_buffer_exprs);
232231
COPY_ADD_FIELD(dealloc_output_buffer_exprs);
233-
COPY_ADD_FIELD(alloc_tmp_buffer_exprs);
234232
COPY_ADD_FIELD(buffer_data_cast_exprs);
235233
COPY_ADD_FIELD(argument_prepare_exprs);
236234

cinn/poly/compute_at_transform.cc

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,10 +151,11 @@ ComputeAtTransform::ComputeAtTransform(
151151
ptransform_(ptransform),
152152
ctransform_(ctransform),
153153
level_(level) {
154-
LOG(INFO) << "pdomain: " << pdomain;
155-
LOG(INFO) << "ptransform: " << ptransform;
156-
LOG(INFO) << "cdomain: " << cdomain;
157-
LOG(INFO) << "ctransform: " << ctransform;
154+
VLOG(2) << "pdomain: " << pdomain;
155+
VLOG(2) << "ptransform: " << ptransform;
156+
VLOG(2) << "cdomain: " << cdomain;
157+
VLOG(2) << "ctransform: " << ctransform;
158+
VLOG(2) << "access: " << access;
158159

159160
adjusted_ctransform_ = isl::manage(AddParamsTo(ctransform_.copy()));
160161
adjusted_cdomain_ = isl::manage(AddParamsTo(cdomain_.copy()));

cinn/poly/stage.cc

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -172,14 +172,27 @@ void Stage::ComputeAtSchedule(Stage *other, int level, ComputeAtKind kind) {
172172
}
173173
}
174174

175-
void Stage::ComputeAt(Stage *other, int level, Stage::ComputeAtKind kind) {
176-
auto accesses = GatherAccesses(other, tensor_->name);
177-
if (accesses.empty()) return;
178-
auto access = accesses[0];
179-
for (int i = 1; i < accesses.size(); i++) {
180-
access = isl::manage(isl_map_union(access.release(), accesses[i].copy()));
175+
void Stage::ComputeAt(Stage *other, int level, Stage::ComputeAtKind kind, const std::string &cached_tensor_name) {
176+
isl::map access;
177+
isl_map *access_raw{};
178+
// The cache_read schedule replaces the producer tensor with its cache in the consumer, so rename the
179+
// output tuple of the access relation to the cache tensor's name.
180+
if (cached_tensor_name.empty())
181+
access_raw = GatherAccesses(other, tensor_->name);
182+
else
183+
access_raw = GatherAccesses(other, cached_tensor_name);
184+
185+
if (!access_raw) {
186+
LOG(ERROR) << "ComputeAt: " << other->tensor_->name << " has no access to " << tensor_->name << ", skipped it";
187+
return;
181188
}
182189

190+
if (!cached_tensor_name.empty()) {
191+
access_raw = isl_map_set_tuple_name(access_raw, isl_dim_out, tensor_->name.c_str());
192+
}
193+
access = isl::manage(access_raw);
194+
access_raw = nullptr;
195+
183196
ComputeAtTransform transform(domain_, other->domain(), access, transform_, other->transform(), level);
184197
transform();
185198

@@ -457,8 +470,9 @@ std::vector<std::string> Stage::axis_names() const { return GetDimNames(transfor
457470
void Stage::GpuThreads(const std::vector<Iterator> &iters, DeviceAPI device) {
458471
auto dim_names = axis_names();
459472
for (auto &iter : iters) {
460-
CHECK(std::find(dim_names.begin(), dim_names.end(), iter.id) != dim_names.end());
461-
forloop_infos_.emplace(iter.id, StageForloopInfo{ir::ForType::GPUThread, device});
473+
auto it = std::find(dim_names.begin(), dim_names.end(), iter.id);
474+
CHECK(it != dim_names.end());
475+
AddForloopInfo(it - dim_names.begin(), StageForloopInfo{ir::ForType::GPUThread, device});
462476
}
463477
}
464478

@@ -471,7 +485,6 @@ void Stage::GpuBlocks(const std::vector<int> &levels, DeviceAPI device) {
471485
levels.begin(), levels.end(), std::back_inserter(iters), [&](int i) { return Iterator(dim_names[i]); });
472486
GpuBlocks(iters, device);
473487
}
474-
475488
void Stage::GpuBlocks(const Iterator &block_x, DeviceAPI device) {
476489
GpuBlocks(std::vector<Iterator>({block_x}), device);
477490
}
@@ -484,8 +497,21 @@ void Stage::GpuBlocks(const Iterator &block_x, const Iterator &block_y, const It
484497
void Stage::GpuBlocks(const std::vector<Iterator> &iters, DeviceAPI device) {
485498
auto dim_names = axis_names();
486499
for (auto &iter : iters) {
487-
CHECK(std::find(dim_names.begin(), dim_names.end(), iter.id) != dim_names.end());
488-
forloop_infos_.emplace(iter.id, StageForloopInfo{ir::ForType::GPUBlock, device});
500+
auto it = std::find(dim_names.begin(), dim_names.end(), iter.id);
501+
CHECK(it != dim_names.end());
502+
AddForloopInfo(it - dim_names.begin(), StageForloopInfo{ir::ForType::GPUBlock, device});
503+
}
504+
}
505+
void Stage::Bind(int level, const std::string &axis) {
506+
auto dim_names = GetDimNames(transformed_domain().get());
507+
CHECK_LT(level, dim_names.size());
508+
509+
if (axis == "threadIdx.x" || axis == "threadIdx.y" || axis == "threadIdx.z") {
510+
AddForloopInfo(level, StageForloopInfo{ir::ForType::GPUThread, DeviceAPI::GPU});
511+
} else if (axis == "blockIdx.x" || axis == "blockIdx.y" || axis == "blockIdx.z") {
512+
AddForloopInfo(level, StageForloopInfo{ir::ForType::GPUBlock, DeviceAPI::GPU});
513+
} else {
514+
NOT_IMPLEMENTED
489515
}
490516
}
491517

@@ -573,7 +599,7 @@ void Stage::ShareBufferWith(ir::Tensor other) {
573599

574600
void Stage::CtrlDepend(const ir::Tensor &t) { add_extra_depend_stage(t->name); }
575601

576-
std::vector<isl::map> GatherAccesses(Stage *stage, const std::string &tensor_name) {
602+
isl_map *__isl_give GatherAccesses(Stage *stage, const std::string &tensor_name) {
577603
CHECK(stage->tensor_);
578604
auto loads = ir::CollectIRNodes(stage->tensor_->body(), [&](const Expr *x) {
579605
return x->As<ir::Load>() && x->As<ir::Load>()->tensor.as_tensor()->name == tensor_name;
@@ -588,16 +614,26 @@ std::vector<isl::map> GatherAccesses(Stage *stage, const std::string &tensor_nam
588614
std::transform(
589615
loads.begin(), loads.end(), std::back_inserter(out_loads), [](const Expr &x) { return utils::GetStreamCnt(x); });
590616

591-
std::vector<isl::map> res;
592-
617+
isl_map *res = nullptr;
593618
for (auto &load : out_loads) {
594619
std::string repr = utils::StringFormat(
595620
"{ %s[%s] -> %s }", in_tuple_name.c_str(), utils::Join(in_dim_names, ",").c_str(), load.c_str());
596-
res.push_back(isl::map(stage->domain().ctx(), repr));
621+
isl_map *access = isl_map_read_from_str(stage->domain().ctx().get(), repr.c_str());
622+
if (res) {
623+
res = isl_map_union(res, access);
624+
} else {
625+
res = access;
626+
}
597627
}
598628

599629
return res;
600630
}
601631

632+
void Stage::AddForloopInfo(int level, const StageForloopInfo &info) {
633+
int num_levels = isl_map_dim(transform_.get(), isl_dim_out);
634+
CHECK_LT(level, num_levels);
635+
forloop_infos_[level] = info;
636+
}
637+
602638
} // namespace poly
603639
} // namespace cinn

0 commit comments

Comments
 (0)