Skip to content

Commit fe0c647

Browse files
authored
fix CacheRead conflict with ComputeAt (PaddlePaddle#126)
1 parent 1f98e30 commit fe0c647

18 files changed

+174
-45
lines changed

cinn/backends/codegen_cuda_dev_test.cc

+77-7
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ namespace backends {
2424

2525
std::tuple<CUdeviceptr, CUdeviceptr, CUdeviceptr, std::vector<float>, std::vector<float>, std::vector<float>>
2626
CreateNVMemory(int M, int N) {
27-
CUDA_CALL(cudaThreadSynchronize());
27+
CUDA_CALL(cudaDeviceSynchronize());
2828

2929
CUdeviceptr Ad, Bd, Cd;
3030
cuMemAlloc(&Ad, M * N * sizeof(float));
@@ -419,7 +419,7 @@ TEST(CodeGenCUDA, jit_host_call_cuda_kernel) {
419419
B_buf->host_memory = reinterpret_cast<uint8_t*>(Bd);
420420
C_buf->host_memory = reinterpret_cast<uint8_t*>(Cd);
421421

422-
CUDA_CALL(cudaThreadSynchronize());
422+
CUDA_CALL(cudaDeviceSynchronize());
423423

424424
// call the kernel
425425
auto comp = reinterpret_cast<void (*)(cinn_pod_value_t*, int)>(fn_ptr);
@@ -428,7 +428,7 @@ TEST(CodeGenCUDA, jit_host_call_cuda_kernel) {
428428

429429
comp(args.data(), args.size());
430430

431-
CUDA_CALL(cudaThreadSynchronize());
431+
CUDA_CALL(cudaDeviceSynchronize());
432432

433433
CUDA_CALL(cudaMemcpy(host_data3.data(),
434434
reinterpret_cast<void*>(Cd),
@@ -716,7 +716,7 @@ TEST(elementwise_add, share_local_cache) {
716716
B_buf->host_memory = reinterpret_cast<uint8_t*>(Bd);
717717
C_buf->host_memory = reinterpret_cast<uint8_t*>(Cd);
718718

719-
CUDA_CALL(cudaThreadSynchronize());
719+
CUDA_CALL(cudaDeviceSynchronize());
720720

721721
// call the kernel
722722
auto comp = reinterpret_cast<void (*)(cinn_pod_value_t*, int)>(fn_ptr);
@@ -725,7 +725,7 @@ TEST(elementwise_add, share_local_cache) {
725725

726726
comp(args.data(), args.size());
727727

728-
CUDA_CALL(cudaThreadSynchronize());
728+
CUDA_CALL(cudaDeviceSynchronize());
729729
}
730730

731731
CUDA_CALL(cudaFree(reinterpret_cast<void*>(Ad)))
@@ -883,6 +883,8 @@ TEST(Conv, optimize) {
883883
}
884884

885885
TEST(ElementwiseAdd, cache_read) {
886+
Context::Global().ResetNameId();
887+
886888
Expr M(100);
887889
Expr N(200);
888890

@@ -933,14 +935,82 @@ void fn_kernel(const float* __restrict__ A, const float* __restrict__ B, float*
933935
};
934936
};
935937
for (int32_t i = 0; i < 10; i += 1) {
936-
C[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))] = (A_read_cache_3[((10 * blockIdx.x) + ((10 * threadIdx.x) + i))] + B[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))]);
938+
C[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))] = (A_read_cache_3[i] + B[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))]);
939+
};
940+
};
941+
}
942+
943+
}
944+
)ROC";
945+
ASSERT_EQ(utils::Trim(source_target), source_code);
946+
947+
backends::NVRTC_Compiler compiler;
948+
949+
auto ptx = compiler(source_code);
950+
CHECK(!ptx.empty()) << "Compile error!";
951+
}
952+
953+
TEST(ElementwiseAdd, cache_read1) {
954+
Expr M(100);
955+
Expr N(200);
956+
957+
Placeholder<float> A("A", {M, N});
958+
Placeholder<float> B("B", {M, N});
959+
960+
auto C = Compute(
961+
{M - 2, N}, [&](Expr i, Expr j) { return A(i, j) + A(i + 1, j) + A(i + 2, j) + B(i, j); }, "C");
962+
C->stage()->Split(1, 10);
963+
964+
auto AL = A->stage()->CacheRead("local", {C});
965+
AL->stage()->Split(1, 10);
966+
967+
AL->stage()->ComputeAt(C->stage(), 1, poly::Stage::ComputeAtKind::kComputeAtUnk, A->name);
968+
C->stage()->Bind(0, "threadIdx.x");
969+
C->stage()->Bind(1, "blockIdx.x");
970+
971+
Target target;
972+
CodeGenCUDA_Dev codegen(target);
973+
974+
auto fn = Lower("fn", {A, B, C}, {}, {AL});
975+
976+
Module::Builder builder("module", target);
977+
builder.AddFunction(fn);
978+
979+
auto source_code = codegen.Compile(builder.Build());
980+
std::cout << "source:\n" << source_code << std::endl;
981+
982+
std::string source_target = R"ROC(
983+
extern "C" {
984+
985+
#ifdef __CUDACC_RTC__
986+
typedef int int32_t;
987+
typedef char int8_t;
988+
#endif
989+
990+
991+
992+
__global__
993+
void fn_kernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C)
994+
{
995+
float _A_read_cache_6 [ 3 * 10 ];
996+
float* A_read_cache_6 = _A_read_cache_6;
997+
{
998+
if (((((threadIdx.x >= 0) && (threadIdx.x <= 97)) && (blockIdx.x >= 0)) && (blockIdx.x <= 19))) {
999+
for (int32_t i = threadIdx.x; i < (3 + threadIdx.x); i += 1) {
1000+
for (int32_t j_inner = 0; j_inner < 10; j_inner += 1) {
1001+
A_read_cache_6[((10 * i) + j_inner)] = A[((10 * blockIdx.x) + ((200 * i) + j_inner))];
1002+
};
1003+
};
1004+
};
1005+
for (int32_t i = 0; i < 10; i += 1) {
1006+
C[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))] = (A_read_cache_6[i] + (A_read_cache_6[(10 + i)] + (A_read_cache_6[(20 + i)] + B[((10 * blockIdx.x) + ((200 * threadIdx.x) + i))])));
9371007
};
9381008
};
9391009
}
9401010
9411011
}
9421012
)ROC";
943-
// ASSERT_EQ(utils::Trim(source_target), source);
1013+
ASSERT_EQ(utils::Trim(source_target), source_code);
9441014

9451015
backends::NVRTC_Compiler compiler;
9461016

cinn/common/cas.cc

+1
Original file line numberDiff line numberDiff line change
@@ -1478,6 +1478,7 @@ Expr SolveInequality(Expr inequality, Var val) {
14781478
} else {
14791479
return AutoSimplify(inequality);
14801480
}
1481+
return Expr();
14811482
}
14821483

14831484
} // namespace common

cinn/hlir/pe/add.h

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22
#include <string>
33
#include <vector>
4+
45
#include "cinn/common/common.h"
56
#include "cinn/ir/ir.h"
67
#include "cinn/ir/node.h"

cinn/lang/lower_impl.cc

+13-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "cinn/common/ir_util.h"
88
#include "cinn/ir/ir_printer.h"
99
#include "cinn/lang/tensor.h"
10+
#include "cinn/optim/cache_read_write_replace.h"
1011
#include "cinn/optim/ir_replace.h"
1112
#include "cinn/poly/compute_at_transform.h"
1213

@@ -30,7 +31,9 @@ void CheckNoIslCallRemains(Expr* expr) {
3031
}
3132
}
3233

33-
Expr LowerGroup(const poly::ScheduleGroup& group, const std::map<std::string, Expr>& tuple_to_expr) {
34+
Expr LowerGroup(const poly::ScheduleGroup& group,
35+
const std::map<std::string, Expr>& tuple_to_expr,
36+
std::map<std::string, ir::Tensor>* global_tensor_map) {
3437
std::vector<poly::Stage*> stages;
3538
for (auto& node : group.nodes) {
3639
if (node->stage->has_expression()) {
@@ -73,6 +76,8 @@ Expr LowerGroup(const poly::ScheduleGroup& group, const std::map<std::string, Ex
7376
}
7477
CheckNoIslCallRemains(&e);
7578

79+
optim::CacheReadWriteReplace(&e, global_tensor_map);
80+
7681
// deal with the compute_at relations
7782
ProcessComputeAtInfo(&e);
7883

@@ -375,6 +380,8 @@ Expr LowerImpl::GenerateFunctionBody(const poly::Schedule* schedule) {
375380
auto tensor_map = GenAllTensorMap();
376381
std::map<std::string, Expr> tuple_to_expr;
377382
CHECK(!schedule->groups.empty()) << "no group is generated";
383+
384+
std::map<std::string, ir::Tensor> global_tensor_map;
378385
for (auto& group : schedule->groups) {
379386
CHECK_GT(group.nodes.size(), 0) << "group is empty";
380387
for (auto& node : group.nodes) {
@@ -384,7 +391,7 @@ Expr LowerImpl::GenerateFunctionBody(const poly::Schedule* schedule) {
384391
tuple_to_expr[tensor->name] = tensor->tensor_store_expanded_body();
385392
}
386393

387-
Expr group_expr = LowerGroup(group, tuple_to_expr);
394+
Expr group_expr = LowerGroup(group, tuple_to_expr, &global_tensor_map);
388395
if (group_expr.defined()) {
389396
VLOG(3) << "group expr:\n" << group_expr;
390397
exprs.push_back(group_expr);
@@ -530,8 +537,10 @@ struct CorrectComputeAtRelatedIndiceMutator : public ir::IRMutator<> {
530537
auto* node = expr->As<ir::Store>();
531538
CHECK(node);
532539

540+
VLOG(3) << "SetProducerAxisToZeroInStore: " << *expr;
533541
for (auto& indice : node->indices) {
534542
for (auto& consumer_axis : consumer_axis) {
543+
VLOG(3) << indice << " set producer axis [" << consumer_axis << "] to 0";
535544
optim::IrReplace(&indice, consumer_axis, common::make_const(0));
536545
}
537546
}
@@ -626,11 +635,13 @@ struct CorrectComputeAtRelatedIndiceMutator : public ir::IRMutator<> {
626635
void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
627636

628637
void Visit(const ir::Load* op, Expr* expr) override {
638+
VLOG(3) << "Consumer modify Load " << *expr << "'s axis for producer [" << producer_tensor_name << "]";
629639
auto* node = expr->As<ir::Load>();
630640
if (op->tensor.as_tensor()->name == producer_tensor_name) {
631641
CHECK_LE(compute_at_info.preceding_offset_for_producer_load.size(), node->indices.size());
632642
for (auto axis : consumer_axis) {
633643
for (auto& indice : node->indices) {
644+
VLOG(3) << "Consumer Load " << indice << " set axis [" << axis << "] to 0";
634645
optim::IrReplace(&indice, axis, common::make_const(0));
635646
}
636647
}

cinn/lang/lower_impl.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ void CheckNoIslCallRemains(const Expr* expr);
4848
* @param group A single schedule group containing several Stages and the scheduling order.
4949
* @param tuple_to_expr A map from isl set tuple name to CINN expressions.
5050
*/
51-
Expr LowerGroup(const poly::ScheduleGroup& group, const std::map<std::string, Expr>& tuple_to_expr);
51+
Expr LowerGroup(const poly::ScheduleGroup& group,
52+
const std::map<std::string, Expr>& tuple_to_expr,
53+
std::map<std::string, Tensor>* global_tensor_map);
5254

5355
/**
5456
* A Computation graph node.

cinn/optim/cache_read_write_replace.cc

+10-2
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,13 @@ struct CacheReplaceMutator : public ir::IRMutator<> {
7676

7777
} // namespace
7878

79-
void CacheReadWriteReplace(Expr* expr) {
79+
void CacheReadWriteReplace(Expr* expr, std::map<std::string, ir::Tensor>* global_tensor_map) {
8080
auto cached_tensors = ir::CollectIRNodes(*expr, [](const Expr* x) {
8181
auto* t = x->as_tensor();
8282
return t && (t->read_cache_relation || t->write_cache_relation);
8383
});
8484

85+
LOG(INFO) << "expr: " << *expr;
8586
auto tensors = ir::CollectIRNodes(*expr, [](const Expr* x) { return x->as_tensor(); });
8687

8788
std::set<ir::Tensor> uniq_cached_tensors;
@@ -95,9 +96,16 @@ void CacheReadWriteReplace(Expr* expr) {
9596
tensor_map[t->name] = t;
9697
}
9798

99+
// update global_tensor_map
100+
for (auto& item : tensor_map) {
101+
if (!global_tensor_map->count(item.first)) {
102+
(*global_tensor_map)[item.first] = item.second;
103+
}
104+
}
105+
98106
for (auto& t : uniq_cached_tensors) {
99107
if (t->read_cache_relation) {
100-
auto cache = tensor_map.at(t->read_cache_relation->cache_name);
108+
auto cache = global_tensor_map->at(t->read_cache_relation->cache_name);
101109
CacheReplaceMutator(t->name, cache, t->read_cache_relation->readers, true /*read*/)(expr);
102110
}
103111
if (t->write_cache_relation) {

cinn/optim/cache_read_write_replace.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
#pragma once
2+
#include <string>
3+
24
#include "cinn/ir/ir.h"
35

46
namespace cinn {
57
namespace optim {
68

7-
void CacheReadWriteReplace(Expr* expr);
9+
void CacheReadWriteReplace(Expr* expr, std::map<std::string, ir::Tensor>* global_tensor_map);
810

911
} // namespace optim
1012
} // namespace cinn

cinn/optim/optimize.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Expr Optimize(Expr e, bool runtime_debug_info) {
3333
#ifdef CINN_WITH_CUDA
3434
RemoveGpuForloopsAxis(&copied);
3535
#endif
36-
CacheReadWriteReplace(&copied);
36+
// CacheReadWriteReplace(&copied);
3737

3838
RemoveNestedBlock(&copied);
3939

cinn/poly/compute_at_transform.cc

+16-16
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ void ComputeAtTransform::AdjustPdomain() {
99

1010
isl::set cdomain1 = isl::manage(AddParamsTo(cdomain_.copy()));
1111

12-
LOG(INFO) << "ct_domain: " << ct_domain.space();
13-
LOG(INFO) << "cdomain1: " << cdomain1.space();
12+
VLOG(3) << "ct_domain: " << ct_domain.space();
13+
VLOG(3) << "cdomain1: " << cdomain1.space();
1414

1515
ct_domain = ct_domain.intersect(cdomain1);
16-
LOG(INFO) << "ct_domain: " << ct_domain;
16+
VLOG(3) << "ct_domain: " << ct_domain;
1717

1818
// get producer domain from access
1919
isl::map access_with_params = isl::manage(AddParamsTo(access_.copy()));
@@ -22,11 +22,11 @@ void ComputeAtTransform::AdjustPdomain() {
2222

2323
// intersect with the original producer domain
2424
auto pdomain_params = isl::manage(AddParamsTo(pdomain_.copy()));
25-
LOG(INFO) << "pdomain: " << pdomain;
26-
LOG(INFO) << "pdomain_params: " << pdomain_params;
25+
VLOG(4) << "pdomain: " << pdomain;
26+
VLOG(4) << "pdomain_params: " << pdomain_params;
2727
adjusted_pdomain_ = isl::manage(isl_set_intersect(pdomain.release(), pdomain_params.release()));
2828
adjusted_pdomain_ = isl::manage(isl_simplify(adjusted_pdomain_.release()));
29-
LOG(INFO) << "adjusted pdomain: " << adjusted_pdomain_;
29+
VLOG(4) << "adjusted pdomain: " << adjusted_pdomain_;
3030
}
3131

3232
void ComputeAtTransform::AdjustPtransform() {
@@ -53,7 +53,7 @@ void ComputeAtTransform::AdjustPtransform() {
5353
ct_range1 = isl::manage(isl_set_set_tuple_name(ct_range1.release(), ptuple()));
5454

5555
adjusted_ptransform_ = adjusted_ptransform_.intersect_range(ct_range1);
56-
LOG(INFO) << "adjusted_ptransform: " << adjusted_ptransform_;
56+
VLOG(4) << "adjusted_ptransform: " << adjusted_ptransform_;
5757
}
5858

5959
{ // add params
@@ -86,8 +86,8 @@ isl::map ComputeAtTransform::ctransform_with_params() {
8686
}
8787

8888
void ComputeAtTransform::DisplayC(isl_map* pschedule, isl_map* cschedule) {
89-
LOG(INFO) << "adjusted cdomain: " << adjusted_cdomain_;
90-
LOG(INFO) << "adjusted ctransform: " << adjusted_ctransform_;
89+
VLOG(3) << "adjusted cdomain: " << adjusted_cdomain_;
90+
VLOG(3) << "adjusted ctransform: " << adjusted_ctransform_;
9191

9292
auto adjusted_ctransform = adjusted_ctransform_;
9393
auto adjusted_ptransform = adjusted_ptransform_;
@@ -101,11 +101,11 @@ void ComputeAtTransform::DisplayC(isl_map* pschedule, isl_map* cschedule) {
101101

102102
auto whole_domain = isl::manage(isl_union_set_from_set(adjusted_pdomain_.copy()));
103103
whole_domain = isl::manage(isl_union_set_add_set(whole_domain.release(), adjusted_cdomain_.copy()));
104-
LOG(INFO) << "whole domain: " << whole_domain;
104+
VLOG(3) << "whole domain: " << whole_domain;
105105

106106
auto whole_schedule = isl::manage(isl_union_map_from_map(adjusted_ptransform.copy()));
107107
whole_schedule = isl::manage(isl_union_map_add_map(whole_schedule.release(), adjusted_ctransform.copy()));
108-
LOG(INFO) << "whole_schedule: " << whole_schedule;
108+
VLOG(3) << "whole_schedule: " << whole_schedule;
109109

110110
isl::set context(whole_domain.ctx(), "{:}");
111111

@@ -166,7 +166,7 @@ std::string GenConsumerParamName(const char* tuple, int id) {
166166
}
167167

168168
std::vector<int> ComputeAtTransform::GetProducerAdjustedShape() const {
169-
LOG(INFO) << "domain: " << adjusted_pdomain();
169+
VLOG(3) << "domain: " << adjusted_pdomain();
170170
isl::set param_limit = isl::manage(isl_set_universe(adjusted_pdomain().space().release()));
171171
// set all the params to 0
172172
isl_local_space* local_space = isl_local_space_from_space(param_limit.space().release());
@@ -193,12 +193,12 @@ std::vector<int> ComputeAtTransform::GetAccessesPrecedingIndicesMinAssumingParam
193193
std::vector<int> res;
194194

195195
isl::set cdomain_with_param = isl::manage(AddParamsTo(cdomain_.copy()));
196-
LOG(INFO) << "cdomain_with_param: " << cdomain_with_param;
196+
VLOG(4) << "cdomain_with_param: " << cdomain_with_param;
197197
isl::map access_with_param = isl::manage(AddParamsTo(access_.copy()));
198198

199-
LOG(INFO) << "*** applied: " << cdomain_with_param.apply(access_with_param);
199+
VLOG(4) << "applied: " << cdomain_with_param.apply(access_with_param);
200200
isl::set param_limited_cdomain = ctransform_with_params().domain();
201-
LOG(INFO) << "ctransform.domain: " << param_limited_cdomain;
201+
VLOG(4) << "ctransform.domain: " << param_limited_cdomain;
202202
isl::set access_domain = param_limited_cdomain.apply(access_with_param);
203203

204204
// set all the params to 0
@@ -212,7 +212,7 @@ std::vector<int> ComputeAtTransform::GetAccessesPrecedingIndicesMinAssumingParam
212212

213213
access_domain = access_domain.intersect(adjusted_pdomain());
214214

215-
LOG(INFO) << "access_with_param: " << access_domain;
215+
VLOG(3) << "access_with_param: " << access_domain;
216216

217217
for (int i = 0; i < level_ + 1; i++) {
218218
auto [minv, maxv] = isl_set_get_axis_range(access_domain.get(), i);

0 commit comments

Comments
 (0)