From 7beba0f088076c4dd2fb6b58c5ffc8af327b596e Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Wed, 10 Apr 2024 21:22:58 +0800
Subject: [PATCH 1/3] update

---
 paddle/cinn/hlir/framework/pir/utils.cc       | 33 +++++++++----------
 paddle/cinn/hlir/framework/pir_compiler.cc    |  2 +-
 paddle/cinn/hlir/pe/broadcast.cc              |  5 +--
 .../pir/cinn/inference/test_llama_forward.py  | 13 +++++---
 test/ir/pir/cinn/llama_test_model.py          |  8 +++--
 5 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/paddle/cinn/hlir/framework/pir/utils.cc b/paddle/cinn/hlir/framework/pir/utils.cc
index 4ba4bc6d3b2762..5d7d1aa3ac0fae 100644
--- a/paddle/cinn/hlir/framework/pir/utils.cc
+++ b/paddle/cinn/hlir/framework/pir/utils.cc
@@ -125,23 +125,22 @@ class OpTransInfo {
   DeParamCondT deny_param_cond_{{"batch_norm", {"ReserveSpace"}},
                                 {"batch_norm_grad", {"ReserveSpace"}}};
 
-  std::unordered_set<std::string> default_deny_ops_{
-      "feed",
-      "fetch",
-      "conv2d",
-      "conv2d_grad",
-      "depthwise_conv2d",
-      "depthwise_conv2d_grad",
-      "dropout",
-      "pool2d",
-      "pool2d_grad",
-      "split",
-      "matmul",
-      "matmul_grad",
-      "embedding_grad",
-      "embedding",
-      "arange",
-  };
+  std::unordered_set<std::string> default_deny_ops_{"feed",
+                                                    "fetch",
+                                                    "conv2d",
+                                                    "conv2d_grad",
+                                                    "depthwise_conv2d",
+                                                    "depthwise_conv2d_grad",
+                                                    "dropout",
+                                                    "pool2d",
+                                                    "pool2d_grad",
+                                                    "split",
+                                                    "matmul",
+                                                    "matmul_grad",
+                                                    "embedding_grad",
+                                                    "embedding",
+                                                    "arange",
+                                                    "softmax"};
 };
 
 std::string OpNameAfterStripDialect(const ::pir::Operation& op) {
diff --git a/paddle/cinn/hlir/framework/pir_compiler.cc b/paddle/cinn/hlir/framework/pir_compiler.cc
index 2db39508ce1e10..6daba068ac660c 100644
--- a/paddle/cinn/hlir/framework/pir_compiler.cc
+++ b/paddle/cinn/hlir/framework/pir_compiler.cc
@@ -31,7 +31,7 @@ std::vector<pir::CINNKernelInfo> PirCompiler::Build(
     kernel_infos[index] = task.GetCINNKernelInfo();
   };
   utils::parallel_run(
-      worker_fn, utils::SequenceDispatcher(0, groups.size()), -1);
+      worker_fn, utils::SequenceDispatcher(0, groups.size()), 1);
   return kernel_infos;
 }
 
diff --git a/paddle/cinn/hlir/pe/broadcast.cc b/paddle/cinn/hlir/pe/broadcast.cc
index fb47ed737fdf3b..fab2af9c5f0dcf 100644
--- a/paddle/cinn/hlir/pe/broadcast.cc
+++ b/paddle/cinn/hlir/pe/broadcast.cc
@@ -400,10 +400,7 @@ Tensor BroadcastTo(const Tensor& A,
           } else if (MathEqual(a_shape_i, out_shape[idx])) {
             broadcast_indice.push_back(indice[idx]);
           } else {
-            std::stringstream ss;
-            ss << "fail to broad cast input shape " << a_shape_i
-               << " to output shape " << out_shape[idx];
-            PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+            broadcast_indice.push_back(indice[idx] % a_shape_i);
           }
         }
         return A(broadcast_indice);
diff --git a/test/ir/pir/cinn/inference/test_llama_forward.py b/test/ir/pir/cinn/inference/test_llama_forward.py
index 8c4753e6cff352..61943597f93c79 100644
--- a/test/ir/pir/cinn/inference/test_llama_forward.py
+++ b/test/ir/pir/cinn/inference/test_llama_forward.py
@@ -12,12 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import sys
 import unittest
 from os.path import dirname
 
 import numpy as np
 
+os.environ["FLAGS_prim_forward_blacklist"] = "pd_op.embedding;pd_op.softmax"
+
 import paddle
 from paddle.static import InputSpec
 
@@ -85,11 +88,11 @@ def eval(self, use_cinn):
 
     def test_eval(self):
         dy_out = self.eval(use_cinn=False)
-        if utils.unittest_use_cinn():
-            cinn_out = self.eval(use_cinn=True)
-            np.testing.assert_allclose(
-                cinn_out.numpy(), dy_out.numpy(), atol=1e-5, rtol=1e-6
-            )
+        # if utils.unittest_use_cinn():
+        cinn_out = self.eval(use_cinn=True)
+        np.testing.assert_allclose(
+            cinn_out.numpy(), dy_out.numpy(), atol=1e-5, rtol=1e-6
+        )
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/llama_test_model.py b/test/ir/pir/cinn/llama_test_model.py
index d02bd17acb16ca..32d34612672f30 100644
--- a/test/ir/pir/cinn/llama_test_model.py
+++ b/test/ir/pir/cinn/llama_test_model.py
@@ -84,8 +84,12 @@ def _set_cos_sin_cache(self, seq_len):
 
     def forward(self, x, seq_len=None):
         # x: [bs, num_attention_heads, seq_len, head_size]
-        cos = self.cos_cached[:, :seq_len, :, :]
-        sin = self.sin_cached[:, :seq_len, :, :]
+        # TODO(phlrain): cinn slice not support end is a DimExpr
+        # WIP for support it
+        # cos = self.cos_cached[:, :seq_len, :, :]
+        # sin = self.sin_cached[:, :seq_len, :, :]
+        cos = self.cos_cached
+        sin = self.sin_cached
         return (
             cos.cast(x.dtype) if cos.dtype != x.dtype else cos,
             sin.cast(x.dtype) if sin.dtype != x.dtype else sin,

From c93a99523d86fdddae1285468167e3c9f2edb3ab Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Wed, 10 Apr 2024 21:37:51 +0800
Subject: [PATCH 2/3] remove useless code

---
 test/ir/pir/cinn/inference/test_llama_forward.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/ir/pir/cinn/inference/test_llama_forward.py b/test/ir/pir/cinn/inference/test_llama_forward.py
index 61943597f93c79..eb41f6ce3f941b 100644
--- a/test/ir/pir/cinn/inference/test_llama_forward.py
+++ b/test/ir/pir/cinn/inference/test_llama_forward.py
@@ -88,7 +88,6 @@ def eval(self, use_cinn):
 
     def test_eval(self):
         dy_out = self.eval(use_cinn=False)
-        # if utils.unittest_use_cinn():
         cinn_out = self.eval(use_cinn=True)
         np.testing.assert_allclose(
             cinn_out.numpy(), dy_out.numpy(), atol=1e-5, rtol=1e-6

From 09219dbc50318827c6d1fe9f3e80119d4a97e62a Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Thu, 11 Apr 2024 14:04:20 +0800
Subject: [PATCH 3/3] fix auto parallel api bug

---
 .../paddle/distributed/auto_parallel/api.py   | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py
index f7c69e1fe64646..9a6f0af9c83ae3 100644
--- a/python/paddle/distributed/auto_parallel/api.py
+++ b/python/paddle/distributed/auto_parallel/api.py
@@ -1924,16 +1924,21 @@ def __convert_strategy(self, strategy):
         inner_strategy.gradient_merge = copy.deepcopy(strategy.gradient_merge)
         inner_strategy.pipeline = copy.deepcopy(strategy.pipeline)
         # The below are template interfaces
-        inner_strategy.recompute = copy.deepcopy(strategy._recompute)
-        inner_strategy.mp_optimization = copy.deepcopy(
-            strategy._mp_optimization
-        )
-        inner_strategy.dp_optimization = copy.deepcopy(
-            strategy._dp_optimization
-        )
-        inner_strategy.sp_optimization = copy.deepcopy(
-            strategy._sp_optimization
-        )
+        if hasattr(strategy, "_recompute"):
+            inner_strategy.recompute = copy.deepcopy(strategy._recompute)
+
+        if hasattr(strategy, "_mp_optimization"):
+            inner_strategy.mp_optimization = copy.deepcopy(
+                strategy._mp_optimization
+            )
+        if hasattr(strategy, "_dp_optimization"):
+            inner_strategy.dp_optimization = copy.deepcopy(
+                strategy._dp_optimization
+            )
+        if hasattr(strategy, "_sp_optimization"):
+            inner_strategy.sp_optimization = copy.deepcopy(
+                strategy._sp_optimization
+            )
 
         return inner_strategy