Skip to content

Commit b2325e7

Browse files
authored
【CINN】Optimize use of simplify (PaddlePaddle#71321)
* Optimize use of simplify
* Remove simplify of unit loop or block
1 parent 7cf36a5 commit b2325e7

15 files changed

+198
-246
lines changed

paddle/cinn/backends/codegen_c.cc

+2-2
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,7 @@ void CodeGenC::Visit(const ir::Mul *op) { IrPrinter::Visit(op); }
170170
void CodeGenC::Visit(const ir::Div *op) { IrPrinter::Visit(op); }
171171
void CodeGenC::Visit(const ir::Mod *op) {
172172
auto copied = op->b();
173-
optim::Simplify(&copied);
173+
copied = optim::ArithSimplify(copied);
174174
if (copied.is_constant()) {
175175
int temp = static_cast<int>(copied.get_constant());
176176
if ((temp & (temp - 1)) == 0) {
@@ -891,7 +891,7 @@ void CodeGenC::Visit(const ir::_LoweredFunc_ *op) {
891891

892892
Expr func_body = ir::Block::Make(new_body);
893893

894-
optim::SimplifyBlocks(&func_body);
894+
optim::SimplifyUnitBlock(&func_body);
895895

896896
IrPrinter::Visit(func_body);
897897
}

paddle/cinn/backends/codegen_gpu_dev.cc

+3-3
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ void CodeGenGpuDev::Visit(const ir::_LoweredFunc_ *op) {
183183
ir::stmt::BlockRef func_body_block = ir::stmt::BlockRef(new_body_stmts);
184184

185185
// Use ir_simplify when pass updated.
186-
// optim::SimplifyBlocks(&func_body);
186+
// optim::SimplifyUnitBlock(&func_body);
187187
// // Make sure that the function's body is wrapped by a block
188188
// if (!func_body.As<ir::Block>()) {
189189
// func_body = ir::Block::Make({func_body});
@@ -320,7 +320,7 @@ void CodeGenGpuDev::PrintTempBufferCreation(const ir::Buffer &buffer) {
320320
for (int i = 0; i < buffer->shape.size(); i++) {
321321
buffer_size = buffer_size * buffer->shape[i];
322322
}
323-
optim::Simplify(&buffer_size);
323+
buffer_size = optim::ArithSimplify(buffer_size);
324324
bool has_symbolic_constant = false;
325325
ir::ir_utils::CollectIRNodes(buffer_size, [&](const Expr *x) {
326326
if (x->as_var()) {
@@ -352,7 +352,7 @@ void CodeGenGpuDev::PrintTempBufferCreation(const ir::Buffer &buffer) {
352352
int type_bytes = buffer->dtype.bytes();
353353
dyn_shared_mem_offset_ =
354354
dyn_shared_mem_offset_ + buffer_size * Expr(type_bytes);
355-
optim::Simplify(&dyn_shared_mem_offset_);
355+
dyn_shared_mem_offset_ = optim::ArithSimplify(dyn_shared_mem_offset_);
356356
VLOG(6) << "dyn_shared_mem_offset_ = " << dyn_shared_mem_offset_;
357357
} else if (buffer->memory_type == ir::MemoryType::GPULocal) {
358358
// print func of static allocation

paddle/cinn/backends/sycl/codegen_sycl_dev.cc

+3-3
Original file line number | Diff line number | Diff line change
@@ -194,7 +194,7 @@ void CodeGenSyclDevice::PrintFunctionBody(const ir::_LoweredFunc_ *op) {
194194
APPEND_TO_NEW_BODY_STMTS(dealloc_temp_buffer_stmts);
195195
ir::stmt::BlockRef func_body_block = ir::stmt::BlockRef(new_body_stmts);
196196
// Use ir_simplify when pass updated.
197-
// optim::SimplifyBlocks(&func_body);
197+
// optim::SimplifyUnitBlock(&func_body);
198198
// // Make sure that the function's body is wrapped by a block
199199
// if (!func_body.As<ir::Block>()) {
200200
// func_body = ir::Block::Make({func_body});
@@ -253,7 +253,7 @@ void CodeGenSyclDevice::PrintTempBufferCreation(const ir::Buffer &buffer) {
253253
for (int i = 0; i < buffer->shape.size(); i++) {
254254
buffer_size = buffer_size * buffer->shape[i];
255255
}
256-
optim::Simplify(&buffer_size);
256+
buffer_size = optim::ArithSimplify(buffer_size);
257257
IrPrinter::Visit(buffer_size);
258258
str_ += " ]";
259259
};
@@ -268,7 +268,7 @@ void CodeGenSyclDevice::PrintTempBufferCreation(const ir::Buffer &buffer) {
268268
for (int i = 0; i < buffer->shape.size(); i++) {
269269
buffer_size = buffer_size * buffer->shape[i];
270270
}
271-
optim::Simplify(&buffer_size);
271+
buffer_size = optim::ArithSimplify(buffer_size);
272272
IrPrinter::Visit(buffer_size);
273273
str_ += " ]>(item.get_group())";
274274
break;

paddle/cinn/hlir/pe/ir_schedule_pe.cc

+4-4
Original file line number | Diff line number | Diff line change
@@ -1300,9 +1300,9 @@ void IRCudaScheduleConv(ir::IRSchedule &ir_sch, // NOLINT
13001300

13011301
int n = output->shape[0].as_int32();
13021302
int c = output->shape[1].as_int32();
1303-
optim::Simplify(&(output->shape[2]));
1303+
output->shape[2] = optim::ArithSimplify(output->shape[2]);
13041304
int h = output->shape[2].as_int32();
1305-
optim::Simplify(&(output->shape[3]));
1305+
output->shape[3] = optim::ArithSimplify(output->shape[3]);
13061306
int w = output->shape[3].as_int32();
13071307
int rc = input_pad->shape[1].as_int32();
13081308

@@ -1480,8 +1480,8 @@ void IRCudaScheduleConv2(ir::IRSchedule &ir_sch, // NOLINT
14801480

14811481
// stages[input_pad]->ComputeInline();
14821482

1483-
optim::Simplify(&(output->shape[2]));
1484-
optim::Simplify(&(output->shape[3]));
1483+
output->shape[2] = optim::ArithSimplify(output->shape[2]);
1484+
output->shape[3] = optim::ArithSimplify(output->shape[3]);
14851485

14861486
VLOG(3) << "Begin IRCudaScheduleConv2 with expr : "
14871487
<< ir_sch.GetModule().GetExprs().at(0);

paddle/cinn/ir/group_schedule/config/group_tile_util.cc

+2-2
Original file line number | Diff line number | Diff line change
@@ -206,7 +206,7 @@ bool CheckTensorIsBroadcastAndContinuous(
206206
bool is_broadcast = false;
207207
for (int i = 0; i < indices.size(); ++i) {
208208
ir::Expr index = indices[i];
209-
cinn::optim::Simplify(&index);
209+
index = optim::ArithSimplify(index);
210210
if (index.is_constant() && index.get_constant() == 0) {
211211
is_broadcast = true;
212212
continue;
@@ -244,7 +244,7 @@ bool CheckTensorIsContinuous(
244244
const std::unordered_map<ir::Var, ir::Expr>& iter_var2value) {
245245
for (int i = 0; i < indices.size(); ++i) {
246246
ir::Expr index = indices[i];
247-
cinn::optim::Simplify(&index);
247+
index = optim::ArithSimplify(index);
248248
if (index.is_constant()) return false;
249249
if (!index.is_var()) return false;
250250
ir::Var iter_var = index.as_var_ref();

paddle/cinn/ir/ir_base.h

+6-2
Original file line number | Diff line number | Diff line change
@@ -177,8 +177,12 @@ enum class StmtNodeTy { kUnk = -1, NODETY_FORALL_STMT(__m) };
177177
//! String representations for IrNodeTy.
178178
// @{
179179
#define __m(x__) #x__,
180-
const std::vector<std::string> kIrNodeTyReprs(
181-
{NODETY_FORALL(__m) "IterSplit", "IterSum", "IterMark", "None"});
180+
const std::vector<std::string> kIrNodeTyReprs({"Module",
181+
"LoweredFunc",
182+
"IterSplit",
183+
"IterSum",
184+
"IterMark",
185+
NODETY_FORALL(__m)});
182186
#undef __m
183187
// @}
184188

paddle/cinn/lang/compute.cc

+2-4
Original file line number | Diff line number | Diff line change
@@ -178,14 +178,12 @@ ir::Tensor Compute(const std::vector<Expr> &domain,
178178

179179
// construct the shape.
180180
for (auto dim : domain) {
181-
auto copied = dim;
182-
optim::Simplify(&copied);
181+
auto copied = optim::ArithSimplify(dim);
183182
domain_without_reduce_axis.push_back(copied);
184183
}
185184

186185
for (auto dim : shape) {
187-
auto copied = dim;
188-
optim::Simplify(&copied);
186+
auto copied = optim::ArithSimplify(dim);
189187
shape_simplified.push_back(copied);
190188
}
191189

paddle/cinn/lang/lower_impl.cc

+1-1
Original file line number | Diff line number | Diff line change
@@ -384,7 +384,7 @@ std::vector<ir::LoweredFunc> LowerImpl::operator()() {
384384

385385
if (support_ir_schedule_) {
386386
optim::TransformPolyForToFor(&func->body);
387-
optim::SimplifyBlocks(&func->body);
387+
optim::SimplifyUnitBlock(&func->body);
388388
func->body = ir::Block::Make({func->body});
389389
result.push_back(func);
390390
num_func++;

0 commit comments

Comments (0)