Skip to content

Commit 215d621

Browse files
authored
Merge branch 'PaddlePaddle:develop' into develop
2 parents 01b2f6e + feebd79 commit 215d621

File tree

374 files changed

+20068
-6163
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

374 files changed

+20068
-6163
lines changed

ci/run_sot_test.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ function run_sot_test() {
1818
PY_VERSION_NO_DOT=$(echo $PY_VERSION | sed 's/\.//g')
1919

2020
export STRICT_MODE=1
21-
export COST_MODEL=False
2221
export MIN_GRAPH_SIZE=0
2322
export SOT_LOG_LEVEL=0
2423
export FLAGS_cudnn_deterministic=True
24+
export SOT_ENABLE_STRICT_GUARD_CHECK=True
2525

2626
# Install PaddlePaddle
2727
echo "::group::Installing paddle wheel..."
@@ -54,7 +54,7 @@ function run_sot_test() {
5454
echo "skip ${PY_VERSION_NO_DOT} ${file}"
5555
continue
5656
fi
57-
echo Running:" STRICT_MODE=1 COST_MODEL=False MIN_GRAPH_SIZE=0 SOT_LOG_LEVEL=0 FLAGS_cudnn_deterministic=True python " $file
57+
echo Running:" STRICT_MODE=1 MIN_GRAPH_SIZE=0 SOT_LOG_LEVEL=0 FLAGS_cudnn_deterministic=True SOT_ENABLE_STRICT_GUARD_CHECK=True python " $file
5858
# run unittests
5959
python_output=$($PYTHON_WITH_SPECIFY_VERSION $file 2>&1)
6060

cmake/external/nvshmem.cmake

+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads, patches and builds NVSHMEM 3.1.7 as an ExternalProject, then
# exposes it to the rest of the build as the imported static target `nvshmem`.

include(ExternalProject)

# GDRCopy install prefix, taken from the environment by default. NVSHMEM's
# GDRCopy/IBGDA support (enabled below) needs it at configure time.
set(GDRCOPY_HOME
    $ENV{GDRCOPY_HOME}
    CACHE PATH "Path to GDRCOPY installation")
if(GDRCOPY_HOME)
  message(STATUS "GDRCOPY_HOME: ${GDRCOPY_HOME}")
else()
  message(
    WARNING
      "Set the GDRCOPY_HOME environment variable or CMake option to specify the GDRCOPY install path."
  )
endif()

set(NVSHMEM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nvshmem)
set(NVSHMEM_PREFIX_DIR ${THIRD_PARTY_PATH}/nvshmem)
set(NVSHMEM_SOURCE_DIR ${NVSHMEM_PREFIX_DIR}/src/extern_nvshmem)
message(STATUS "NVSHMEM_INSTALL_DIR: ${NVSHMEM_INSTALL_DIR}")

set(NVSHMEM_INCLUDE_DIR
    "${NVSHMEM_INSTALL_DIR}/include"
    CACHE PATH "nvshmem include directory." FORCE)

# Directory-scoped include kept for compatibility with existing targets; the
# imported `nvshmem` target below also carries this path as a usage
# requirement so modern target-based consumers work without it.
include_directories(${NVSHMEM_INCLUDE_DIR})

# Keep the archive name in one place so a version bump only touches here.
set(NVSHMEM_TAR_NAME "nvshmem_src_3.1.7-1.txz")

# Prefer a pre-downloaded tarball (NVSHMEM_SRC_TAR_PATH) over fetching from
# NVIDIA's server — useful for offline/CI builds.
# NOTE(review): neither path verifies a checksum of the archive — consider
# adding one.
if(NVSHMEM_SRC_TAR_PATH)
  set(NVSHMEM_DOWNLOAD_COMMAND
      rm -rf extern_nvshmem ${NVSHMEM_TAR_NAME} && cp ${NVSHMEM_SRC_TAR_PATH}
      . && tar xf ${NVSHMEM_TAR_NAME} && mv nvshmem_src extern_nvshmem)
else()
  set(NVSHMEM_URL
      "https://developer.download.nvidia.com/compute/redist/nvshmem/3.1.7/source/nvshmem_src_3.1.7-1.txz"
      CACHE STRING "" FORCE)
  set(NVSHMEM_DOWNLOAD_COMMAND
      rm -rf extern_nvshmem ${NVSHMEM_TAR_NAME} && wget --no-check-certificate
      -q ${NVSHMEM_URL} && tar xf ${NVSHMEM_TAR_NAME} && mv nvshmem_src
      extern_nvshmem)
endif()

# The unpacked source is turned into a throwaway git repo so Paddle's patch
# can be applied with `git apply` (which handles renames/mode bits).
set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/third_party/nvshmem.patch)
set(NVSHMEM_PATCH_COMMAND
    git init && git config user.name "PaddlePaddle" && git config user.email
    "paddle@baidu.com" && git config --add safe.directory . && git add . && git
    commit -m "init" && git apply ${NVSHMEM_PATCH_PATH})

# Artifacts produced by the NVSHMEM build: the main static library plus the
# bootstrap/transport plugins loaded at runtime.
set(NVSHMEM_LIB ${NVSHMEM_INSTALL_DIR}/lib/libnvshmem.a)
set(NVSHMEM_BOOTSTRAP_UID_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_uid.so)
set(NVSHMEM_BOOTSTRAP_MPI_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_mpi.so)
set(NVSHMEM_BOOTSTRAP_PMI_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_pmi.so)
set(NVSHMEM_BOOTSTRAP_PMI2_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_pmi2.so)
set(NVSHMEM_TRANSPORT_IBRC_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_transport_ibrc.so.3)
set(NVSHMEM_TRANSPORT_IBGDA_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_transport_ibgda.so.3)

# only compile nvshmem for sm90 (Hopper)
set(CUDA_ARCHITECTURES "90")

ExternalProject_Add(
  extern_nvshmem
  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
  PREFIX ${NVSHMEM_PREFIX_DIR}
  SOURCE_DIR ${NVSHMEM_SOURCE_DIR}
  DOWNLOAD_DIR ${NVSHMEM_PREFIX_DIR}/src
  DOWNLOAD_COMMAND ${NVSHMEM_DOWNLOAD_COMMAND}
  PATCH_COMMAND ${NVSHMEM_PATCH_COMMAND}
  UPDATE_COMMAND ""
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${NVSHMEM_INSTALL_DIR}
             -DGDRCOPY_HOME:PATH=${GDRCOPY_HOME}
             -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}
             -DNVSHMEM_ENABLE_ALL_DEVICE_INLINING=0
             -DNVSHMEM_SHMEM_SUPPORT=0
             -DNVSHMEM_UCX_SUPPORT=0
             -DNVSHMEM_USE_NCCL=0
             -DNVSHMEM_IBGDA_SUPPORT=1
             -DNVSHMEM_PMIX_SUPPORT=0
             -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0
             -DNVSHMEM_USE_GDRCOPY=1
             -DNVSHMEM_IBRC_SUPPORT=1
             -DNVSHMEM_BUILD_TESTS=0
  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${NVSHMEM_INSTALL_DIR}
  BUILD_BYPRODUCTS ${NVSHMEM_LIB})

add_definitions(-DPADDLE_WITH_NVSHMEM)
add_library(nvshmem STATIC IMPORTED GLOBAL)
# Attach the include dir as a usage requirement so targets that link
# `nvshmem` get it transitively, instead of relying on the directory-scoped
# include_directories() above.
set_target_properties(
  nvshmem PROPERTIES IMPORTED_LOCATION ${NVSHMEM_LIB}
                     INTERFACE_INCLUDE_DIRECTORIES ${NVSHMEM_INCLUDE_DIR})
add_dependencies(nvshmem extern_nvshmem)

cmake/external/xpu.cmake

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ set(XPU_XFA_LIB_NAME "libxpu_flash_attention.so")
3030
set(XPU_XPUDNN_LIB_NAME "libxpu_dnn.so")
3131

3232
if(NOT DEFINED XPU_XHPC_BASE_DATE)
33-
set(XPU_XHPC_BASE_DATE "dev/20250304")
33+
set(XPU_XHPC_BASE_DATE "dev/20250306")
3434
endif()
35-
set(XPU_XCCL_BASE_VERSION "3.0.2.3") # For XRE5
35+
set(XPU_XCCL_BASE_VERSION "3.0.2.5") # For XRE5
3636
if(NOT DEFINED XPU_XFT_BASE_VERSION)
3737
set(XPU_XFT_BASE_VERSION "20230602")
3838
endif()

cmake/third_party.cmake

+11
Original file line numberDiff line numberDiff line change
@@ -693,4 +693,15 @@ if(WITH_OPENVINO)
693693
list(APPEND third_party_deps extern_openvino)
694694
endif()
695695

696+
string(FIND "${CUDA_ARCH_BIN}" "90" ARCH_BIN_CONTAINS_90)
697+
if(NOT WITH_GPU
698+
OR NOT WITH_DISTRIBUTE
699+
OR (ARCH_BIN_CONTAINS_90 EQUAL -1))
700+
set(WITH_NVSHMEM OFF)
701+
endif()
702+
if(WITH_NVSHMEM)
703+
include(external/nvshmem)
704+
list(APPEND third_party_deps extern_nvshmem)
705+
endif()
706+
696707
add_custom_target(third_party ALL DEPENDS ${third_party_deps})

paddle/cinn/backends/codegen_cuda_dev.cc

+29-5
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616

1717
namespace cinn {
1818
namespace backends {
19-
20-
const std::string CodeGenCudaDev::source_header_ = // NOLINT
21-
R"(#include <cstdint>
22-
19+
const std::string CodeGenCudaDev::general_source_header_ = // NOLINT
20+
R"(
21+
#pragma once
22+
#include <cstdint>
2323
#define CINN_WITH_CUDA
2424
#include "bfloat16.h"
2525
#include "float16.h"
@@ -34,11 +34,35 @@ using cinn::common::float162;
3434
using cinn::common::bfloat168;
3535
using cinn::common::bfloat164;
3636
using cinn::common::bfloat162;
37-
3837
#include "cinn_cuda_runtime_source.cuh"
38+
)";
39+
const std::string CodeGenCudaDev::source_header_ = // NOLINT
40+
R"(
41+
#pragma once
42+
#include <cinn_with_cuda_h>
43+
44+
#include <bfloat16_h>
45+
#include <cstdint>
46+
#include <float16_h>
47+
using cinn::common::bfloat16;
48+
using cinn::common::float16;
49+
using cinn::common::float8;
50+
using cinn::common::half4;
51+
using cinn::common::half8;
52+
using cinn::common::float168;
53+
using cinn::common::float164;
54+
using cinn::common::float162;
55+
using cinn::common::bfloat168;
56+
using cinn::common::bfloat164;
57+
using cinn::common::bfloat162;
58+
#include <cinn_cuda_runtime_source_h>
59+
3960
)";
4061

4162
const std::string &CodeGenCudaDev::GetSourceHeader() { return source_header_; }
63+
const std::string &CodeGenCudaDev::GetGeneralSourceHeader() {
64+
return general_source_header_;
65+
}
4266

4367
CodeGenCudaDev::CodeGenCudaDev(Target target) : CodeGenGpuDev(target) {}
4468

paddle/cinn/backends/codegen_cuda_dev.h

+5
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,15 @@ class CodeGenCudaDev : public CodeGenGpuDev {
3131
public:
3232
explicit CodeGenCudaDev(Target target);
3333
static const std::string& GetSourceHeader();
34+
static const std::string& GetGeneralSourceHeader();
3435
void PrintIncludes() override;
3536

3637
private:
3738
static const std::string source_header_;
39+
// general_source_header_ is used for the more general situation, which load
40+
// some header files while compiling but not set them into header files while
41+
// creating the kernel function.
42+
static const std::string general_source_header_;
3843
};
3944

4045
} // namespace backends

paddle/cinn/backends/nvrtc/header_generator.cc

+50
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@
1414

1515
#include "paddle/cinn/backends/nvrtc/header_generator.h"
1616

17+
#include <fstream>
1718
#include "glog/logging.h"
1819
#include "jitify.hpp" // NOLINT
20+
#include "paddle/cinn/common/common.h"
1921
#include "paddle/common/enforce.h"
22+
2023
namespace cinn {
2124
namespace backends {
2225
namespace nvrtc {
@@ -34,12 +37,59 @@ const size_t JitSafeHeaderGenerator::size() const {
3437
return include_names_.size();
3538
}
3639

40+
std::string read_file_as_string(const std::string& file_path) {
41+
#ifdef RUNTIME_INCLUDE_DIR
42+
static constexpr char* defined_runtime_include_dir = RUNTIME_INCLUDE_DIR;
43+
#else
44+
static constexpr char* defined_runtime_include_dir = nullptr;
45+
#endif
46+
47+
#ifdef CINN_WITH_CUDA
48+
std::string cinn_path = defined_runtime_include_dir;
49+
std::ifstream file(cinn_path + '/' + file_path);
50+
51+
if (!file.is_open()) {
52+
VLOG(1) << "Unable to open file : " << cinn_path << '/' << file_path;
53+
return "";
54+
}
55+
std::stringstream buffer;
56+
buffer << file.rdbuf();
57+
file.close();
58+
return buffer.str();
59+
#else
60+
return "";
61+
#endif
62+
}
63+
#ifdef CINN_WITH_CUDA
64+
65+
static const std::string cinn_float16_header = // NOLINT
66+
read_file_as_string("float16.h");
67+
static const std::string cinn_bfloat16_header = // NOLINT
68+
read_file_as_string("bfloat16.h");
69+
static const std::string cinn_with_cuda_header = // NOLINT
70+
R"(
71+
#pragma once
72+
#define CINN_WITH_CUDA
73+
)";
74+
static const std::string cinn_cuda_runtime_source_header = // NOLINT
75+
read_file_as_string("cinn_cuda_runtime_source.cuh");
76+
#endif
3777
JitSafeHeaderGenerator::JitSafeHeaderGenerator() {
3878
const auto& headers_map = ::jitify::detail::get_jitsafe_headers_map();
3979
for (auto& pair : headers_map) {
4080
include_names_.emplace_back(pair.first.data());
4181
headers_.emplace_back(pair.second.data());
4282
}
83+
#ifdef CINN_WITH_CUDA
84+
include_names_.emplace_back("float16_h");
85+
headers_.emplace_back(cinn_float16_header.data());
86+
include_names_.emplace_back("bfloat16_h");
87+
headers_.emplace_back(cinn_bfloat16_header.data());
88+
include_names_.emplace_back("cinn_with_cuda_h");
89+
headers_.emplace_back(cinn_with_cuda_header.data());
90+
include_names_.emplace_back("cinn_cuda_runtime_source_h");
91+
headers_.emplace_back(cinn_cuda_runtime_source_header.data());
92+
#endif
4393
}
4494

4595
} // namespace nvrtc

paddle/cinn/backends/nvrtc/nvrtc_util.cc

+12
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <fstream>
2525
#include <iostream>
2626

27+
#include "paddle/cinn/backends/codegen_cuda_dev.h"
2728
#include "paddle/cinn/backends/cuda_util.h"
2829
#include "paddle/cinn/backends/nvrtc/header_generator.h"
2930
#include "paddle/cinn/common/common.h"
@@ -181,6 +182,17 @@ std::string Compiler::CompileCudaSource(const std::string& code,
181182
nvrtcResult compile_res =
182183
nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data());
183184

185+
if (compile_res != NVRTC_SUCCESS) {
186+
std::string new_code = CodeGenCudaDev::GetGeneralSourceHeader() + code;
187+
NVRTC_CALL(nvrtcCreateProgram(&prog,
188+
new_code.c_str(),
189+
nullptr,
190+
header_gen.size(),
191+
header_gen.headers().data(),
192+
header_gen.include_names().data()));
193+
compile_res =
194+
nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data());
195+
}
184196
{ // get log
185197
size_t log_size;
186198
NVRTC_CALL(nvrtcGetProgramLogSize(prog, &log_size));

paddle/cinn/common/simplify_special_pattern.cc

+12-12
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@
2323
#include "paddle/cinn/optim/simplify_util.h"
2424
namespace cinn {
2525
namespace common {
26-
using cinn::optim::CheckPattern;
2726
using cinn::optim::GetFlattenExprs;
2827
using cinn::optim::IsNegatedIndexExpr;
2928
using cinn::optim::IsSumPartialBySymbol;
29+
using cinn::optim::MatchPattern;
3030
using cinn::optim::ProveDivisible;
3131
using cinn::optim::SimplifySymbolicAdd;
3232

@@ -51,7 +51,7 @@ static void MergeMulModInsertElements(
5151
*has_mult = true;
5252
mult_exprs->emplace_back(ele);
5353
} else {
54-
*no_opt_sum = no_opt_sum->get() ? *no_opt_sum + ele : ele;
54+
*no_opt_sum = no_opt_sum->get() ? ir::Add::Make(*no_opt_sum, ele) : ele;
5555
}
5656
}
5757
}
@@ -250,24 +250,24 @@ std::optional<ir::IndexExpr> AddMulCornerCase(
250250
// S0 / (S1 * S2) * S2 + S0 % (S1 * S2) / S1 ===> S0 / S1
251251
std::optional<ir::IndexExpr> DivMulAddModDivCase(const ir::IndexExpr& lhs,
252252
const ir::IndexExpr& rhs) {
253-
ir::Var a = ir::Var("a");
254-
ir::Var b = ir::Var("b");
255-
ir::Var c = ir::Var("c");
256-
ir::Var f = ir::Var("f");
257-
std::unordered_map<std::string, ir::IndexExpr> map;
258-
259-
ir::IndexExpr pattern = f / c * a + f % c / b;
253+
if (!MatchPattern(rhs, "f % c / b")) return std::nullopt;
260254

261255
auto flatten = GetFlattenExprs<ir::Add>(lhs);
262256
ir::IndexExpr res;
263257
bool find = false;
264258
for (const auto& expr : flatten) {
265259
if (!find) {
266260
ir::IndexExpr cand = ir::Add::Make(expr, rhs);
267-
map.clear();
261+
268262
// Check if the pattern is matched
269-
if (CheckPattern(cand, pattern, &map) &&
270-
map.at("c") == map.at("a") * map.at("b")) {
263+
auto opt_map = MatchPattern(
264+
cand,
265+
"f / c * a + f % c / b",
266+
[](const std::unordered_map<std::string, ir::IndexExpr>& m) {
267+
return m.at("c") == m.at("a") * m.at("b");
268+
});
269+
if (opt_map) {
270+
auto map = opt_map.value();
271271
ir::IndexExpr simplified = map.at("f") / map.at("b");
272272
res = res.defined() ? res + simplified : simplified;
273273
find = true;

paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc

-3
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h"
3232
#include "paddle/cinn/hlir/dialect/operator/transforms/accuracy_check_pass.h"
3333
#include "paddle/cinn/hlir/dialect/operator/transforms/add_broadcast_to_elementwise_pass.h"
34-
#include "paddle/cinn/hlir/dialect/operator/transforms/add_store_in_fusion_op_pass.h"
35-
#include "paddle/cinn/hlir/dialect/operator/transforms/add_store_in_group_op_pass.h"
3634
#include "paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h"
3735
#include "paddle/cinn/hlir/dialect/operator/transforms/conv2d_transpose_filter_pass.h"
3836
#include "paddle/cinn/hlir/dialect/operator/transforms/convert_fa_to_qkvmha_pass.h"
@@ -201,7 +199,6 @@ void ApplyDivideGroupOpToFusionOpPass(
201199
std::shared_ptr<pir::PassManager> pass_manager = CreatePassManager();
202200
pass_manager->AddPass(
203201
cinn::dialect::ir::CreateRemoveRedundantGroupOutputPass());
204-
pass_manager->AddPass(cinn::dialect::ir::CreateAddStoreInGroupOpPass());
205202
pass_manager->AddPass(cinn::dialect::ir::CreateCinnGroupClusterPass());
206203

207204
pass_manager->AddPass(cinn::dialect::ir::CreateSingleOpFallbackToPhiPass());

0 commit comments

Comments (0)