diff --git a/.gitmodules b/.gitmodules index 8b06f4fb771cbb..0c41450793fc2a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -110,3 +110,11 @@ path = third_party/cccl url = https://github.com/NVIDIA/cccl.git ignore = dirty +[submodule "third_party/cryptopp"] + path = third_party/cryptopp + url = https://github.com/weidai11/cryptopp.git + ignore = dirty +[submodule "third_party/cryptopp-cmake"] + path = third_party/cryptopp-cmake + url = https://github.com/noloader/cryptopp-cmake.git + ignore = dirty diff --git a/README.md b/README.md index 8f708334ed28f1..001352ea45fc4d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm ## Installation -### Latest PaddlePaddle Release: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) +### Latest PaddlePaddle Release: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. diff --git a/README_cn.md b/README_cn.md index a13fa5ba214503..cd45e4e3ecd2b7 100644 --- a/README_cn.md +++ b/README_cn.md @@ -18,9 +18,9 @@ ## 安装 -### PaddlePaddle最新版本: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) +### PaddlePaddle 最新版本: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) -跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) +跟进 PaddlePaddle 最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) ### 安装最新稳定版本: ``` diff --git a/README_ja.md b/README_ja.md index 22c78a1a79bbd9..dad60eb7ffcf87 100644 --- a/README_ja.md +++ b/README_ja.md @@ -20,7 +20,7 @@ PaddlePaddle は、工業化に対するコミットメントを持つ工業的 ## インストール -### PaddlePaddle の最新リリース: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) +### PaddlePaddle の最新リリース: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) 私たちのビジョンは、PaddlePaddle を通じて、誰もが深層学習を行えるようにすることです。 PaddlePaddle の最新機能を追跡するために、私たちの[リリースのお知らせ](https://github.com/PaddlePaddle/Paddle/releases)を参照してください。 diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 9daa4be7468e42..b3ec8f622923fd 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -14,12 +14,13 @@ include(ExternalProject) +set(CRYPTOPP_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp) +set(CRYPTOPP_CMAKE_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp-cmake) set(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp) set(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp) set(CRYPTOPP_INCLUDE_DIR "${CRYPTOPP_INSTALL_DIR}/include" CACHE PATH "cryptopp include directory." 
FORCE) -set(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git) set(CRYPTOPP_TAG CRYPTOPP_8_2_0) if(WIN32) @@ -63,17 +64,16 @@ include_directories(${CRYPTOPP_INCLUDE_DIR}) ExternalProject_Add( extern_cryptopp ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} - GIT_REPOSITORY ${CRYPTOPP_REPOSITORY} - GIT_TAG ${CRYPTOPP_TAG} PREFIX ${CRYPTOPP_PREFIX_DIR} + SOURCE_DIR ${CRYPTOPP_SOURCE_DIR} UPDATE_COMMAND "" PATCH_COMMAND - COMMAND ${CMAKE_COMMAND} -E remove_directory "/cmake/" - COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "/cmake" - COMMAND cd "/cmake" && git checkout tags/${CRYPTOPP_TAG} -b - ${CRYPTOPP_TAG} - COMMAND ${CMAKE_COMMAND} -E copy_directory "/cmake/" - "/" + COMMAND ${CMAKE_COMMAND} -E copy "${CRYPTOPP_CMAKE_SOURCE_DIR}/CMakeLists.txt" + "/CMakeLists.txt" + COMMAND + ${CMAKE_COMMAND} -E copy + "${CRYPTOPP_CMAKE_SOURCE_DIR}/cryptopp-config.cmake" + "/cryptopp-config.cmake" COMMAND ${CRYPTOPP_PATCH_COMMAND} INSTALL_DIR ${CRYPTOPP_INSTALL_DIR} CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 788237cc4699b4..e506f2e3714da5 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -882,12 +882,6 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) - # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" - OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) - set_source_files_properties(${hip_library_SRCS} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - endif() if(hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) else() @@ -901,6 +895,10 @@ function(hip_library TARGET_NAME) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) + if(NOT ${source_file} MATCHES "\\.cu$") + set_source_files_properties(${source_file} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + endif() string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) list(APPEND hip_library_HEADERS diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 06dc5d6173794a..517ac24cccc72e 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -237,6 +237,16 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") +if(WIN32) + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/common.*) +else() + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) +endif() +copy( + inference_lib_dist + SRCS ${paddle_common_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) + if(WIN32) if(WITH_STATIC_LIB) set(paddle_inference_lib @@ -268,11 +278,6 @@ else() SRCS ${paddle_phi_lib} DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() - set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) - copy( - inference_lib_dist - SRCS ${paddle_common_lib} - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() copy( diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index c333448d029ae0..0047100ebcfdfc 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -110,16 +110,23 @@ class Dim; macro__(Product) \ macro__(Sum) \ macro__(PrimitiveNode) \ - macro__(IntrinsicOp) \ macro__(_BufferRange_) \ macro__(ScheduleBlock) \ macro__(ScheduleBlockRealize) \ macro__(_Dim_) \ +#define 
NODETY_CONTROL_OP_FOR_INTRINSIC(macro__) \ + macro__(IntrinsicOp) \ #define NODETY_FORALL(__m) \ NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ NODETY_OP_FOR_EACH(__m) \ + NODETY_CONTROL_OP_FOR_INTRINSIC(__m) \ + NODETY_CONTROL_OP_FOR_EACH(__m) + +#define NODETY_FORALL_EXCEPT_INTRINSIC(__m) \ + NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ + NODETY_OP_FOR_EACH(__m) \ NODETY_CONTROL_OP_FOR_EACH(__m) // clang-format on diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index ac2f0317e9213f..e4ebaca653bae9 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -15,6 +15,8 @@ #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include +#include "paddle/cinn/ir/intrinsic_ops.h" +#include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" @@ -71,8 +73,71 @@ struct IrNodesCollector : public IRVisitorRequireReImpl { } \ } - NODETY_FORALL(__m) + NODETY_FORALL_EXCEPT_INTRINSIC(__m) #undef __m + + void Visit(const ir::IntrinsicOp* op) { + switch (op->getKind()) { +#define __(x) \ + case ir::IntrinsicKind::k##x: \ + Visit(llvm::dyn_cast(op)); \ + break; + + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + } + } + + void Visit(const ir::intrinsics::GetAddr* x) { + if (x->data.defined()) { + Visit(&(x->data)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataConstHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::PodValueToX* x) { + if (x->pod_value_ptr.defined()) { + Visit(&(x->pod_value_ptr)); + } + } + + void Visit(const ir::intrinsics::BufferCreate* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::ArgsConstruct* x) { + if (x->var.defined()) { + Expr convert = Expr(x->var); + Visit(&convert); + } + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + + void Visit(const ir::intrinsics::BuiltinIntrin* x) { + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + std::set visited_; }; diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h index 17f7bb14224d35..7b19b3a1098398 100644 --- a/paddle/fluid/distributed/common/chunk_allocator.h +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -14,6 +14,7 @@ #pragma once #include +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace distributed { @@ -77,9 +78,16 @@ class ChunkAllocator { void create_new_chunk() { Chunk* chunk; - posix_memalign(reinterpret_cast(&chunk), - std::max(sizeof(void*), alignof(Chunk)), - sizeof(Chunk) + sizeof(Node) * _chunk_size); + size_t alloc_size = sizeof(Chunk) + sizeof(Node) * _chunk_size; + int error = posix_memalign(reinterpret_cast(&chunk), + std::max(sizeof(void*), alignof(Chunk)), + alloc_size); + PADDLE_ENFORCE_EQ(error, + 0, + paddle::platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size, error code is %d.", + alloc_size, + error)); chunk->next = _chunks; _chunks = chunk; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index 2bd9213cae610d..47509d025722d8 100644 --- 
a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -61,8 +61,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); @@ -407,8 +408,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 2a96fddccbce70..75d6cb94c6b5f2 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -528,7 +528,7 @@ class {} : public egr::GradNodeBase {{ TYPE_PROMOTION_LOGIC_TEMPLATE = """ if (phi::NeedTypePromotion({x}.dtype(), {y}.dtype())) {{ VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) << "got different data type, run type protmotion automatically, this may cause data type been changed."; {op_name} auto promotion_type = phi::GetPromoteDtype(op_name, {x}.dtype(), {y}.dtype()); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 3f4e7a9344a30c..d2f834a5938e96 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) get_property(ir_targets GLOBAL PROPERTY IR_TARGETS) get_property(not_infer_modules GLOBAL PROPERTY NOT_INFER_MODULES) -set(utils_modules pretty_log string_helper benchmark utf8proc) +set(utils_modules pretty_log string_helper utf8proc) if(NOT WITH_GFLAGS) set(utils_modules ${utils_modules} paddle_flags) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index d9d7d5aa3659ad..9cec6ac6878dc2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -180,6 +180,11 @@ void AnalysisConfig::EnableXpu(int l3_size, bool transformer_encoder_adaptive_seqlen, bool enable_multi_stream) { #if defined(PADDLE_WITH_XPU) || defined(LITE_SUBGRAPH_WITH_XPU) + LOG_FIRST_N(WARNING, 1) + << "Parameters in EnableXpu/enable_xpu is deprecated since version " + "2.6.1, and will be removed in version 3.0! 
Please use " + "EnableXpu/enable_xpu without parameters, and use " + "SetXpuConfig/set_xpu_config to set options."; use_xpu_ = true; xpu_config_.l3_size = l3_size; xpu_config_.conv_autotune_level = conv_autotune; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6a3e943dec7e9a..b5a26ff9225aa4 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -111,6 +111,7 @@ struct PD_INFER_DECL XpuConfig { bool conv_autotune_file_writeback{false}; // Fc autotune level. The Optional values are 0-9. Default 0 means no + // autotune. int fc_autotune_level{0}; // Base fc autotune info is read from fc_autotune_file. std::string fc_autotune_file; @@ -367,7 +368,7 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableXpu(int l3_size = 0xfffc00, bool l3_locked = false, - bool conv_autotune = true, + bool conv_autotune = false, const std::string& conv_autotune_file = "", const std::string& transformer_encoder_precision = "int16", bool transformer_encoder_adaptive_seqlen = false, diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc old mode 100755 new mode 100644 index 8cf589541b1e04..10763eb911543a --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -47,6 +47,7 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); + int8_teller_set.insert("tile"); teller_set.insert("flatten_contiguous_range"); int8_teller_set.insert("flatten_contiguous_range"); teller_set.insert("rnn"); @@ -2302,15 +2303,20 @@ struct SimpleOpTypeSetTeller : public Teller { if (!with_dynamic_shape) { if (tile_inputs.find("repeat_times_tensor") != tile_inputs.end()) { if (!desc.Input("repeat_times_tensor").empty()) { + VLOG(3) << "Tile op: repeat_times_tensor is not empty."; return false; } } if (tile_inputs.find("RepeatTimes") != tile_inputs.end()) { if (!desc.Input("RepeatTimes").empty()) { + VLOG(3) << "Tile op: RepeatTimes is not empty."; return false; } } - if (!desc.HasAttr("repeat_times")) return false; + if (!desc.HasAttr("repeat_times")) { + VLOG(3) << "Tile op:`repeat_times` is not set."; + return false; + } } } #endif diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 3dbc06bfc11b7e..0ad2cb0e3f0c84 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,8 +1,3 @@ -cc_library( - benchmark - SRCS benchmark.cc - DEPS enforce common) -paddle_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_library( infer_io_utils SRCS io_utils.cc @@ -13,13 +8,5 @@ cc_library( DEPS proto_desc enforce common) cc_library(table_printer SRCS table_printer.cc) -paddle_test(test_table_printer SRCS table_printer_tester.cc) proto_library(shape_range_info_proto SRCS shape_range_info.proto) - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(test_benchmark) - copy_onnx(test_table_printer) -endif() diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc deleted file mode 100644 index 24bc99ed183fad..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/utils/benchmark.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { - -std::string Benchmark::SerializeToString() const { - std::stringstream ss; - ss << "-----------------------------------------------------\n"; - ss << "name\t"; - ss << "batch_size\t"; - ss << "num_threads\t"; - ss << "latency\t"; - ss << "qps"; - ss << '\n'; - - ss << name_ << "\t"; - ss << batch_size_ << "\t\t"; - ss << num_threads_ << "\t"; - ss << latency_ << "\t"; - ss << 1000.0 / latency_; - ss << '\n'; - return ss.str(); -} -void Benchmark::PersistToFile(const std::string &path) const { - std::ofstream file(path, std::ios::app); - PADDLE_ENFORCE_EQ( - file.is_open(), - true, - platform::errors::Unavailable("Can not open %s to add benchmark.", path)); - file << SerializeToString(); - file.flush(); - file.close(); -} - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h deleted file mode 100644 index 56789843c3728e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -#include "paddle/utils/test_macros.h" - -namespace paddle { -namespace inference { - -/* - * Helper class to calculate the performance. 
- */ -struct TEST_API Benchmark { - int batch_size() const { return batch_size_; } - void SetBatchSize(int x) { batch_size_ = x; } - - int num_threads() const { return num_threads_; } - void SetNumThreads(int x) { num_threads_ = x; } - - bool use_gpu() const { return use_gpu_; } - void SetUseGpu() { use_gpu_ = true; } - - float latency() const { return latency_; } - void SetLatency(float x) { latency_ = x; } - - const std::string& name() const { return name_; } - void SetName(const std::string& name) { name_ = name; } - - std::string SerializeToString() const; - void PersistToFile(const std::string& path) const; - - private: - bool use_gpu_{false}; - int batch_size_{0}; - float latency_; - int num_threads_{1}; - std::string name_; -}; - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc deleted file mode 100644 index 8f7614cb10a44e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/inference/utils/benchmark.h" - -using namespace paddle::inference; // NOLINT -TEST(Benchmark, basic) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString(); -} - -TEST(Benchmark, PersistToFile) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - - benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("2.log"); - benchmark.PersistToFile("3.log"); -} diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc deleted file mode 100644 index fc482807b2854c..00000000000000 --- a/paddle/fluid/inference/utils/table_printer_tester.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "paddle/fluid/inference/utils/table_printer.h" - -namespace paddle { -namespace inference {} // namespace inference -} // namespace paddle - -TEST(table_printer, output) { - std::vector header{"config", "value"}; - paddle::inference::TablePrinter table(header); - - // model_dir - table.InsertRow({"model_dir", "./model_dir"}); - // model - table.InsertRow({"model_file", "./model.pdmodel"}); - table.InsertRow({"params_file", "./model.pdiparams"}); - - table.InsetDivider(); - // gpu - table.InsertRow({"use_gpu", "true"}); - table.InsertRow({"gpu_device_id", "0"}); - table.InsertRow({"memory_pool_init_size", "100MB"}); - table.InsertRow({"thread_local_stream", "false"}); - table.InsetDivider(); - - // trt precision - table.InsertRow({"use_trt", "true"}); - table.InsertRow({"trt_precision", "fp32"}); - table.InsertRow({"enable_dynamic_shape", "true"}); - table.InsertRow({"DisableTensorRtOPs", "{}"}); - table.InsertRow({"EnableVarseqlen", "ON"}); - table.InsertRow({"tensorrt_dla_enabled", "ON"}); - table.InsetDivider(); - - // lite - table.InsertRow({"use_lite", "ON"}); - table.InsetDivider(); - - // xpu - table.InsertRow({"use_xpu", "true"}); - table.InsertRow({"xpu_device_id", "0"}); - table.InsetDivider(); - - // ir - table.InsertRow({"ir_optim", "true"}); - table.InsertRow({"ir_debug", "false"}); - table.InsertRow({"enable_memory_optim", "false"}); - table.InsertRow({"EnableProfile", "false"}); - table.InsertRow({"glog_info_disabled", "false"}); - table.InsetDivider(); - - // cpu - table.InsertRow({"CpuMathLibrary", "4"}); - // mkldnn - table.InsertRow({"enable_mkldnn", "false"}); - table.InsertRow({"mkldnn_cache_capacity", "10"}); - - // a long string - table.InsertRow( - {"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ a long string " - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", - "------------------------------------------ a long value " - "-----------------------------------------------------"}); - - LOG(INFO) << table.PrintTable(); -} diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 16864b80b5c765..a0aa1f589191ff 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -151,32 +151,26 @@ class SetValueGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { - if (this->HasInput("ValueTensor")) { - op->SetType("set_value_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("ValueTensor", this->Input("ValueTensor")); - if (this->HasInput("StartsTensorList")) { - op->SetInput("StartsTensorList", this->Input("StartsTensorList")); - } - if (this->HasInput("EndsTensorList")) { - op->SetInput("EndsTensorList", this->Input("EndsTensorList")); - } - if (this->HasInput("StepsTensorList")) { - op->SetInput("StepsTensorList", this->Input("StepsTensorList")); - } - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("ValueTensor"), - this->InputGrad("ValueTensor")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - - } else { - op->SetType("assign"); - op->SetInput("X", this->OutputGrad("Out")); - op->SetOutput("Out", this->InputGrad("Input")); + op->SetType("set_value_grad"); + op->SetInput("ValueTensor", this->Input("ValueTensor")); + op->SetOutput(framework::GradVarName("ValueTensor"), + this->InputGrad("ValueTensor")); + + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + if (this->HasInput("StartsTensorList")) { + 
op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); } + if (this->HasInput("StepsTensorList")) { + op->SetInput("StepsTensorList", this->Input("StepsTensorList")); + } + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); } }; diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index a35095c98d4a29..66f17168ec01a5 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -22,6 +22,9 @@ namespace paddle { namespace primitive { namespace details { +// empty_shape means x.shape=[] +static std::vector empty_shape; + template Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { auto org_dtype = x.dtype(); @@ -345,62 +348,66 @@ std::tuple layer_norm_decomp( // cast dtype to float32 if dtype =float16 or bfloat16 if (need_cast) { - x_cast = cast(x_cast, phi::DataType::FLOAT32); + x_cast = cast(x_cast, DataType::FLOAT32); } auto x_dim = common::vectorize(x.dims()); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { axis.push_back(static_cast(i)); } - auto mean_ = mean_decomp(x_cast, IntArray(axis), true); + auto mean_ = mean_decomp(x_cast, axis, true); auto difference = x_cast - mean_; auto var_tmp1 = difference * difference; - auto variance = mean_decomp(var_tmp1, IntArray(axis), true); + auto variance = mean_decomp(var_tmp1, axis, true); auto var_tmp3 = variance + epsilon; auto rsqrt_var = elementwise_pow( - var_tmp3, - full(common::vectorize(var_tmp3.dims()), -0.5, var_tmp3.dtype())); + var_tmp3, full(empty_shape, -0.5, var_tmp3.dtype())); auto out = difference * rsqrt_var; auto scale_ptr = scale.get_ptr(); auto bias_ptr = bias.get_ptr(); - std::vector slice_shape; - for (int64_t i = begin_norm_axis; i < static_cast(x_dim.size()); - i++) { - slice_shape.push_back(x_dim[i]); + std::vector slice_shape_l; + std::vector slice_shape_r; + for (int64_t i = 0; i < static_cast(x_dim.size()); i++) { + if (i < begin_norm_axis) { + slice_shape_l.push_back(x_dim[i]); + } else { + slice_shape_r.push_back(x_dim[i]); + } } Tensor scale_cast; if (scale_ptr) { - if (slice_shape != scale_ptr->shape()) { - scale_cast = reshape(*scale_ptr, slice_shape); + if (slice_shape_r != scale_ptr->shape()) { + scale_cast = reshape(*scale_ptr, slice_shape_r); } else { scale_cast = *scale_ptr; } if (need_cast) { - scale_cast = cast(scale_cast, phi::DataType::FLOAT32); + scale_cast = cast(scale_cast, DataType::FLOAT32); } out = out * scale_cast; } Tensor bias_cast; if (bias_ptr) { - if (slice_shape != bias_ptr->shape()) { - bias_cast = reshape(*bias_ptr, slice_shape); + if (slice_shape_r != bias_ptr->shape()) { + bias_cast = reshape(*bias_ptr, slice_shape_r); } else { bias_cast = *bias_ptr; } if (need_cast) { - bias_cast = cast(bias_cast, phi::DataType::FLOAT32); + bias_cast = cast(bias_cast, DataType::FLOAT32); } out = out + bias_cast; } - mean_ = reshape(mean_, std::vector({-1})); - variance = reshape(variance, std::vector({-1})); + mean_ = reshape(mean_, slice_shape_l); + variance = reshape(variance, slice_shape_l); + // same as LayerNormInferMeta + // x: float32 --> out: float32, mean: float32, variance: float32 + // x: float16 --> out: float16, mean: float32, variance: float32 if (need_cast) { out = cast(out, org_dtype); - mean_ = cast(mean_, org_dtype); - variance = cast(variance, org_dtype); } return 
std::make_tuple(out, mean_, variance); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5306d282e797ca..8a70396bddee6e 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -287,7 +287,7 @@ if(WITH_PYTHON) eager_legacy_op_function_generator.cc) set(GENERATOR_DEPS ${PYBIND_DEPS}) list(REMOVE_DUPLICATES GENERATOR_DEPS) - if(NOT WITH_ARM) + if(WIN32) list(REMOVE_ITEM GENERATOR_DEPS python) endif() target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS}) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 48a8fdc8daa700..617ed37f6fd816 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t; #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #include "paddle/common/ddim.h" #include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/fluid/framework/python_headers.h" @@ -1361,6 +1362,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Dealing with basic indexing + bool out_is_view = false; auto out = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1369,7 +1371,8 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); if (!has_advanced_index) { return ToPyObject(out); @@ -1377,7 +1380,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, // step3: Dealing with advanced indexing std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_tensor = dealWithAdvancedIndex(out, @@ -1387,7 +1390,9 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); if (transed_index.size() == 1 && transed_index[0].dtype() == phi::DataType::BOOL) { @@ -1417,14 +1422,14 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, if (pos_of_new_dim != 0) { std::vector perm(out.shape().size(), 0); - int tmp1 = pos_of_new_dim, tmp2 = 0, + int tmp1 = rank_of_new_dim, tmp2 = 0, tmp3 = pos_of_new_dim + rank_of_new_dim; for (int i = 0; i < static_cast(out.shape().size()); ++i) { - if (i < rank_of_new_dim) { + if (i < pos_of_new_dim) { perm[i] = - tmp1++; // range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim) - } else if (i >= rank_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { - perm[i] = tmp2++; // range(0, pos_of_new_dim) + tmp1++; // range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim) + } else if (i >= pos_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { + perm[i] = tmp2++; // range(0, rank_of_new_dim) } else { perm[i] = tmp3++; // range(pos_of_new_dim + rank_of_new_dim, out.ndim) } @@ -1609,12 +1614,9 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Parse values - PADDLE_ENFORCE( - PyCheckTensor(value_obj), - platform::errors::InvalidArgument("The value must be a Tensor")); - + std::vector values; paddle::Tensor value_tensor = - reinterpret_cast(value_obj)->tensor; + dealWithValues(tensor, 
value_obj, &values, has_advanced_index); if (!has_advanced_index) { // use set_value OP if there is no advanced index @@ -1622,45 +1624,60 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // Release gil and do tracing py::gil_scoped_release release; // use inplace set_value_ operator - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "set_value"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "set_value"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "set_value"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "set_value"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // step3.1: Only basic indexing, use OP set_value. - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { - ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); - } - self->tensor = set_value_with_tensor__ad_func(self->tensor, - value_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + // step3.1: Only basic indexing, use OP set_value. + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); } + self->tensor = set_value_with_tensor__ad_func(self->tensor, + value_tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes); + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. 
+ if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } + } + } else { + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor); + } + self->tensor = set_value__ad_func(self->tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes, + {1}, + values); } } else { // step3.2: Case for there are advanced indexing. @@ -1670,6 +1687,7 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // 3. assign values to the sliced result by index_put OP; // 4. transpose back and assign the result to original tensor by set_value // OP. + bool out_is_view = false; paddle::Tensor sub_tensor = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1678,12 +1696,13 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; - int pos_of_new_dim = 0, rank_of_new_dim = 0; + int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_sub_tensor = dealWithAdvancedIndex(sub_tensor, @@ -1693,61 +1712,127 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); // Release gil and do tracing py::gil_scoped_release release; - - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "index_put"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "index_put"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "index_put"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "index_put"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // TODO(zoooo0820) 1.Using inplace version index_put - // 2.Remove following code after backward bug fixed. 
- transed_sub_tensor = assign_ad_func(transed_sub_tensor); + if (value_tensor.dims().size() > 1 && pos_of_new_dim != 0) { + value_tensor = transpose_ad_func(value_tensor, trans_dim); + } - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor( - &mesh, self->tensor, transed_sub_tensor, value_tensor)) { - ConvertAllInputsToDistTensor( - mesh, self->tensor, transed_sub_tensor, value_tensor); - } + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor( + &mesh, self->tensor, transed_sub_tensor, value_tensor)) { + ConvertAllInputsToDistTensor( + mesh, self->tensor, transed_sub_tensor, value_tensor); + } - transed_sub_tensor = - index_put_ad_func(transed_sub_tensor, transed_index, value_tensor); - - paddle::Tensor transback_sub_tensor = - transpose_ad_func(transed_sub_tensor, trans_back_dim); - - self->tensor = set_value_with_tensor__ad_func(self->tensor, - transback_sub_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + if (transed_index.size() == 1 && + transed_index[0].dtype() == phi::DataType::BOOL && + transed_index[0].shape().size() == self->tensor.shape().size()) { + if (value_tensor.shape() != self->tensor.shape()) { + value_tensor = expand_ad_func(value_tensor, self->tensor.shape()); + } + transed_sub_tensor = + where__ad_func(logical_not_ad_func(transed_index[0]), + transed_sub_tensor, + value_tensor); + } else { + transed_sub_tensor = + index_put__ad_func(transed_sub_tensor, transed_index, value_tensor); + } + + if (out_is_view) { + // NOTE(zoooo0820): if out_is_view is true, it is a case of + // combined-indexing setitem, i.e. firstly we get a view of + // self->tensor, then modified it with inplace api index_put_ For now, + // in design of Paddle, the forward result is right. But the backward + // edge can not be established because the Base Tensor cannot sense + // whether it has been modified by other operations. Following codes are + // to add a new node (set_value_with_tensor_grad) to record the backward + // edge, with out ad_function which needs to do the forward calculation. + + egr::AutogradMeta* x_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(self->tensor); + egr::AutogradMeta* values_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(transed_sub_tensor); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, x_autograd_meta, values_autograd_meta); + // Node Declaration + std::shared_ptr grad_node; + // Set grad_node before API Call + if (require_any_grad) { + paddle::Tensor transback_sub_tensor = + transpose_ad_func(transed_sub_tensor, trans_back_dim); + const auto& values_tmp = + (require_any_grad && transback_sub_tensor.is_dense_tensor() && + !std::dynamic_pointer_cast( + transback_sub_tensor.impl()) + ->meta() + .is_contiguous()) + ? 
paddle::Tensor( + std::make_shared( + std::move(paddle::experimental::Trans2Contiguous( + *(std::dynamic_pointer_cast( + transback_sub_tensor.impl()))))), + transback_sub_tensor.mutable_autograd_meta()) + : transback_sub_tensor; + + grad_node = std::shared_ptr( + new SetValueWithTensorGradNode(1, 2)); // NOLINT + grad_node->SetAttributestarts(slice_starts); + grad_node->SetAttributeends(slice_ends); + grad_node->SetAttributesteps(slice_strides); + grad_node->SetAttributeaxes(slice_axes); + grad_node->SetAttributedecrease_axes(decrease_axis); + grad_node->SetAttributenone_axes(none_axes); + grad_node->SetTensorWrappervalues(values_tmp); + + paddle::memory::LogDeviceMemoryStats( + egr::Controller::Instance().GetExpectedPlace(), + "set_value_with_tensor"); + egr::EagerUtils::CheckInplace( + self->tensor, x_autograd_meta, require_any_grad); + egr::EagerUtils::PassStopGradient(false, x_autograd_meta); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(self->tensor, 0); + grad_node->SetGradOutMeta(transback_sub_tensor, 1); + if (x_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(x_autograd_meta, 0); + egr::EagerUtils::SetHistory(x_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(self->tensor, 0); + } + } + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. + if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } } } } diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 1db2ab7f871c69..20e644c11919ff 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -800,7 +800,7 @@ void BindAnalysisConfig(py::module *m) { &AnalysisConfig::EnableXpu, py::arg("l3_size") = 16 * 1024 * 1024, py::arg("l3_locked") = false, - py::arg("conv_autotune") = true, + py::arg("conv_autotune") = false, py::arg("conv_autotune_file") = "", py::arg("transformer_encoder_precision") = "int16", py::arg("transformer_encoder_adaptive_seqlen") = false, diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 918d2eeae4272a..919a3a4650d3e7 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -26,9 +26,11 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -345,11 +347,13 @@ static paddle::Tensor getTensorWithBasicIndexing( std::vector* decrease_axis, std::vector* none_axes, std::vector* infer_flags, - bool* use_strided_slice) { + bool* use_strided_slice, + bool* out_is_view) { paddle::Tensor out; if (slice_axes->empty()) { out = tensor; } else { + *out_is_view = true; if (!(*use_strided_slice)) { eager_gil_scoped_release guard; out = slice_ad_func(tensor, @@ -370,6 +374,7 @@ static paddle::Tensor getTensorWithBasicIndexing( } } if (!none_axes->empty()) { + *out_is_view = true; eager_gil_scoped_release guard; // Deal with cases that decrease_axes is not empty // For example: @@ -397,9 +402,9 @@ static paddle::Tensor 
dealWithAdvancedIndex( std::vector* transed_index, std::vector* trans_back_dim, int* pos_of_new_dim, - int* rank_of_new_dim) { - std::vector trans_dim; - + int* rank_of_new_dim, + std::vector* trans_dim, + bool* out_is_view) { int p = 0; for (size_t i = 0; i < advanced_index_dim->size(); ++i) { auto index_dim = (*advanced_index_dim)[i]; @@ -408,30 +413,28 @@ static paddle::Tensor dealWithAdvancedIndex( // advanced_index_dim auto index = (*advanced_index)[p++]; - if (!is_for_setitem) { - if (index_dim == 0) { - // case 1: advanced indices at axis 0, the new dim will be at first. - *pos_of_new_dim = 0; - } else if (index_dim > 0 && trans_dim.size() > 0 && - trans_dim[trans_dim.size() - 1] != index_dim - 1) { - // case 2: there are not adjacent advanced indices, the new dim will - // be at first. - *pos_of_new_dim = 0; - } else { - *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); - } - *rank_of_new_dim = - std::max(*rank_of_new_dim, static_cast(index.shape().size())); + if (index_dim == 0) { + // case 1: advanced indices at axis 0, the new dim will be at first. + *pos_of_new_dim = 0; + } else if (index_dim > 0 && trans_dim->size() > 0 && + (*trans_dim)[trans_dim->size() - 1] != index_dim - 1) { + // case 2: there are not adjacent advanced indices, the new dim will + // be at first. + *pos_of_new_dim = 0; + } else { + *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); } + *rank_of_new_dim = + std::max(*rank_of_new_dim, static_cast(index.shape().size())); - trans_dim.push_back(index_dim); + trans_dim->push_back(index_dim); transed_index->push_back(std::move(index)); } } for (size_t i = 0; i < tensor.shape().size(); ++i) { if ((*advanced_index_dim)[i] == -1) { - trans_dim.push_back(i); + trans_dim->push_back(i); } } @@ -441,19 +444,20 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector original_dim_order(tensor.shape().size()); std::iota(original_dim_order.begin(), original_dim_order.end(), 0); - if (original_dim_order == trans_dim) { + if (original_dim_order == *trans_dim) { transed_tensor = tensor; } else { - transed_tensor = transpose_ad_func(tensor, trans_dim); + *out_is_view = true; + transed_tensor = transpose_ad_func(tensor, *trans_dim); } if (is_for_setitem) { - trans_back_dim->resize(trans_dim.size()); + trans_back_dim->resize(trans_dim->size()); std::iota(trans_back_dim->begin(), trans_back_dim->end(), 0); std::sort(trans_back_dim->begin(), trans_back_dim->end(), [&trans_dim](int left, int right) { - return trans_dim[left] < trans_dim[right]; + return (*trans_dim)[left] < (*trans_dim)[right]; }); } return transed_tensor; @@ -511,5 +515,104 @@ static void ParseBoolAndBroadcastIndices( } } +static paddle::Tensor dealWithValues(const paddle::Tensor& tensor, + PyObject* value_obj, + std::vector* values, + const bool trans_to_tensor) { + paddle::Tensor value_tensor; + if (PyCheckTensor(value_obj)) { + value_tensor = reinterpret_cast(value_obj)->tensor; + } else if (py::isinstance(value_obj)) { + paddle::Tensor value_tensor_tmp( + std::make_shared(), + egr::Controller::Instance().GenerateUniqueName()); + py::object value_obj_tmp(py::handle(value_obj), true); + py::object value = value_obj_tmp; + if (tensor.dtype() == phi::DataType::FLOAT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT32) { + if 
(!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::BOOL) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "When assign a numpy.np value to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, " + "float32, float64, complex64, complex128, int32 or int64, " + "please check the type of tensor.")); + } + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, + tensor.place(), + false); + value_tensor = value_tensor_tmp; + } else { + py::object value_obj_tmp(py::handle(value_obj), true); + // convert the value to self data type + if (py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp) || + PyComplex_Check(value_obj)) { + if (tensor.dtype() == phi::DataType::FLOAT32 || + tensor.dtype() == phi::DataType::FLOAT16 || + tensor.dtype() == phi::DataType::BFLOAT16) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT32 || + tensor.dtype() == phi::DataType::INT16 || + tensor.dtype() == phi::DataType::INT8 || + tensor.dtype() == phi::DataType::UINT8) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::BOOL) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + values->push_back(value_obj_tmp.cast>()); + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + values->push_back(value_obj_tmp.cast>()); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Value type error. 
The assign value allows " + "Tensor, numpy.ndarray, integer, float, complex or bool, " + "but received %s.", + Py_TYPE(value_obj))); + } + + if (trans_to_tensor) { + value_tensor = + full_ad_func({1}, (*values)[0], tensor.dtype(), tensor.place()); + } + } + return value_tensor; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 3a87826337465b..81339a24c50de8 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -944,8 +944,6 @@ func : gather_nd_grad composite : gather_nd_grad(x, index, out_grad, x_grad) no_need_buffer : x - data_transform : - skip_transform : index - backward_op : gaussian_inplace_grad forward : gaussian_inplace(Tensor x, float mean=0, float std=1.0, int seed=0) -> Tensor(out) @@ -1762,8 +1760,8 @@ optional : boxes_num - backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) + forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor values, Tensor out, Tensor out_grad, int axis, str reduce, bool include_self) output : Tensor(arr_grad), Tensor(values_grad) infer_meta : func : GeneralBinaryGradInferMeta diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index 3769155eb27e11..c7ec9ace290ac7 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -425,6 +425,7 @@ def source_include(header_file_path, fw_header_file_path): #include "{fw_header_file_path}" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 04cf57a88bb7cb..3f11781dfe88eb 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -614,14 +614,14 @@ - backward_op : set_value_grad forward : set_value (Tensor x, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes, int64_t[] shape, Scalar[] values) -> Tensor(out) - args : (Tensor out_grad) + args : (Tensor out_grad, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) output : Tensor(x_grad) infer_meta: func: UnchangedInferMeta param: [out_grad] kernel: - func: assign - param: [out_grad] + func: set_value_with_scalar_grad + param: [out_grad, starts, ends, steps, axes, decrease_axes, none_axes] - backward_op : set_value_with_tensor_grad forward: set_value_with_tensor (Tensor x, Tensor values, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) -> Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e4bbb15073f418..dfcdf65673e208 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2432,7 +2432,7 @@ outputs : out : Result attrs : - {axis : Axis, reduce : Reduce} + {axis : Axis, reduce : Reduce, include_self: Include_self} - op : pylayer backward : 
pylayer_grad diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 092b3d71a60b4d..efc1b17714a854 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2032,7 +2032,7 @@ backward : psroi_pool_grad - op : put_along_axis - args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) output : Tensor(out) infer_meta : func : UnchangedInferMeta diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 48bedd1bd939e4..ddbfc60f19f083 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -383,7 +383,7 @@ class CustomDevice : public DeviceInterface { void* ptr = nullptr; const auto device = &devices_pool[dev_id]; - if (!pimpl_->unified_memory_allocate) { + if (!pimpl_->host_memory_allocate) { PADDLE_THROW(phi::errors::Unavailable( "MemoryAllocateHost is not supported on %s.", Type())); } else { diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index 98ebea87eedfd8..03c33a221c4d3e 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -399,6 +399,182 @@ CUDA_ATOMIC_WRAPPER(Add, complex) { CudaAtomicAdd(imag, val.imag)); } +// For atomicMul. +CUDA_ATOMIC_WRAPPER(Mul, int) { + int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, unsigned int) { + unsigned int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +CUDA_ATOMIC_WRAPPER(Mul, unsigned long long int) { // NOLINT + unsigned long long int old = *address, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; +} + +CUDA_ATOMIC_WRAPPER(Mul, int64_t) { + // Here, we check long long int must be int64_t. 
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + long long int res = *address, old = res; // NOLINT + do { + old = res; + res = (long long int)atomicCAS( // NOLINT + (unsigned long long int *)address, // NOLINT + (unsigned long long int)old, // NOLINT + (unsigned long long int)val * (unsigned long long int)old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, float) { + int *const address_as_i = reinterpret_cast<int *>(address); + int old = *address_as_i, assumed; + + do { + assumed = old; + old = atomicCAS( + address_as_i, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +CUDA_ATOMIC_WRAPPER(Mul, double) { + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast<unsigned long long int *>(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val * __longlong_as_double(assumed))); + } while (assumed != old); + + return __longlong_as_double(old); +} + +#ifdef PADDLE_CUDA_FP16 +inline static __device__ uint32_t mul_to_low_half(uint32_t val, float x) { + phi::dtype::float16 low_half; + // The float16 in lower 16bits + low_half.x = static_cast<uint16_t>(val & 0xFFFFu); + low_half = static_cast<phi::dtype::float16>(static_cast<float>(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t mul_to_high_half(uint32_t val, float x) { + phi::dtype::float16 high_half; + // The float16 in higher 16bits + high_half.x = static_cast<uint16_t>(val >> 16); + high_half = + static_cast<phi::dtype::float16>(static_cast<float>(high_half) * x); + return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::float16) { + if (*address >= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast<uint32_t *>( + reinterpret_cast<char *>(address) - + (reinterpret_cast<uintptr_t>(address) & 0x02)); + float val_f = static_cast<float>(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The float16 value stays at the lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The float16 value stays at the higher 16 bits of the address.
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +inline static __device__ uint32_t bf16_mul_to_low_half(uint32_t val, float x) { + phi::dtype::bfloat16 low_half; + // The bfloat16 in lower 16bits + low_half.x = static_cast<uint16_t>(val & 0xFFFFu); + low_half = + static_cast<phi::dtype::bfloat16>(static_cast<float>(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_mul_to_high_half(uint32_t val, float x) { + phi::dtype::bfloat16 high_half; + // The bfloat16 in higher 16bits + high_half.x = static_cast<uint16_t>(val >> 16); + high_half = + static_cast<phi::dtype::bfloat16>(static_cast<float>(high_half) * x); + return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::bfloat16) { + uint32_t *address_as_ui = reinterpret_cast<uint32_t *>( + reinterpret_cast<char *>(address) - + (reinterpret_cast<uintptr_t>(address) & 0x02)); + float val_f = static_cast<float>(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The bfloat16 value stays at the lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The bfloat16 value stays at the higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} + // For atomicMax USE_CUDA_ATOMIC(Max, int); USE_CUDA_ATOMIC(Max, unsigned int); diff --git a/paddle/phi/capi/include/c_meta_tensor.h b/paddle/phi/capi/include/c_meta_tensor.h index 08f01084c6abf3..f4c9a541e526aa 100644 --- a/paddle/phi/capi/include/c_meta_tensor.h +++ b/paddle/phi/capi/include/c_meta_tensor.h @@ -39,6 +39,13 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, size_t index, PD_Status *status); +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status); + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status); + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status); void PD_MetaTensorSetDims(PD_MetaTensor *tensor, @@ -46,6 +53,11 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, const int64_t *dims, PD_Status *status); +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index c4f706c70ccfb4..2df292c6b946b2 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -41,6 +41,12 @@ int64_t PD_TensorGetDim(const PD_Tensor *tensor, size_t index, PD_Status *status); +int64_t PD_TensorGetNumStrides(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetStride(const PD_Tensor *tensor, + size_t index, + PD_Status *status); + void PD_TensorGetLoD(const PD_Tensor *tensor, PD_List *data, PD_List *offset, @@ -52,11 +58,22 @@ bool PD_TensorIsValid(const PD_Tensor *tensor, PD_Status *status); void *PD_TensorGetHolder(const PD_Tensor *tensor, PD_Status *status); +size_t PD_TensorGetOffset(const PD_Tensor *tensor, PD_Status *status); + void PD_TensorSetDims(PD_Tensor *tensor,
int64_t ndims, const int64_t *dims, PD_Status *status); +void PD_TensorSetOffset(PD_Tensor *tensor, + const int64_t offset, + PD_Status *status); + +void PD_TensorSetStrides(PD_Tensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_TensorSetDataType(PD_Tensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 061561008a95e7..75f3e2d9e350eb 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -72,6 +72,19 @@ inline std::vector PD_TensorGetDims(PD_Tensor* tensor, return std::vector(); } +inline std::vector PD_TensorGetStrides(PD_Tensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_TensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_TensorGetStride(tensor, i, status); + } + return shape; + } + return std::vector(); +} + inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, PD_Status* status) { int64_t ndims = PD_MetaTensorGetNumDims(tensor, status); @@ -85,6 +98,19 @@ inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, return std::vector(); } +inline std::vector PD_MetaTensorGetStrides(PD_MetaTensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_MetaTensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_MetaTensorGetStride(tensor, i, status); + } + return shape; + } + return std::vector(); +} + template class WrapperBase { public: @@ -134,6 +160,13 @@ class DenseTensor : public WrapperBase { return holder; } + size_t offset() const { + C_Status status; + auto offset = PD_TensorGetOffset(raw_data(), &status); + PD_CHECK_STATUS(status); + return offset; + } + std::vector dims() const { C_Status status; auto dimension = PD_TensorGetDims(raw_data(), &status); @@ -141,6 +174,13 @@ class DenseTensor : public WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_TensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_TensorGetPDDataType(raw_data(), &status); @@ -207,6 +247,18 @@ class DenseTensor : public WrapperBase { PD_CHECK_STATUS(status); } + void set_offset(const int64_t& offset) { + C_Status status; + PD_TensorSetOffset(raw_data(), offset, &status); + PD_CHECK_STATUS(status); + } + + void set_strides(const std::vector& strides) { + C_Status status; + PD_TensorSetStrides(raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; PD_TensorSetDataType(raw_data(), data_type, &status); @@ -513,6 +565,13 @@ class MetaTensor : WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_MetaTensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_MetaTensorGetPDDataType(raw_data(), &status); @@ -540,6 +599,13 @@ class MetaTensor : WrapperBase { PD_CHECK_STATUS(status); } + void set_strides(const std::vector& strides) { + C_Status status; + PD_MetaTensorSetStrides( + raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; 
PD_MetaTensorSetDataType(raw_data(), data_type, &status); diff --git a/paddle/phi/capi/lib/c_meta_tensor.cc b/paddle/phi/capi/lib/c_meta_tensor.cc index 6ea6eda1a7f23e..f436ba9d3cde0d 100644 --- a/paddle/phi/capi/lib/c_meta_tensor.cc +++ b/paddle/phi/capi/lib/c_meta_tensor.cc @@ -88,6 +88,36 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, return cc_tensor->dims()[index]; } +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status) { if (status) { if (!tensor) { @@ -117,6 +147,22 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, cc_tensor->set_dims(common::make_ddim(shape)); } +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status) { diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index 31a724447b7c7f..eb8c8c6f4eb47d 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -111,6 +111,35 @@ int64_t PD_TensorGetDim(const PD_Tensor* tensor, return cc_tensor->dims()[index]; } +int64_t PD_TensorGetNumStrides(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_TensorGetStride(const PD_Tensor* tensor, + size_t index, + PD_Status* status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + void PD_TensorGetLoD(const PD_Tensor* tensor, PD_List* data, PD_List* offset, @@ -185,6 +214,19 @@ void* PD_TensorGetHolder(const PD_Tensor* tensor, PD_Status* status) { return cc_tensor->Holder().get(); } +size_t PD_TensorGetOffset(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->offset(); +} + void PD_TensorSetDims(PD_Tensor* tensor, int64_t ndims, const int64_t* dims, @@ -201,6 +243,36 @@ void PD_TensorSetDims(PD_Tensor* tensor, cc_tensor->Resize(common::make_ddim(shape)); } +void PD_TensorSetOffset(PD_Tensor* tensor, + const int64_t offset, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_offset(offset); +} + +void PD_TensorSetStrides(PD_Tensor* 
tensor, + int64_t nstrides, + const int64_t* strides, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_TensorSetDataType(PD_Tensor* tensor, PD_DataType dtype, PD_Status* status) { diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index a2fe426b0ec47b..978552e13c0e8a 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -278,7 +278,7 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; VLOG(10) << "cur_offset = " << cur_offset diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index 6c61c3964b52d6..296db9b1781987 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -355,7 +355,7 @@ namespace phi { "`"); \ } \ }() -#if defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP) #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 2df3f34b57936c..6432dc19f768e9 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -77,6 +77,8 @@ if(WITH_MUSA) "selected_rows/gpu/clip_by_norm_kernel.cu" "gpu/softmax_grad_kernel.cu" "gpu/softmax_kernel.cu" + "gpu/put_along_axis_grad_kernel.cu" + "gpu/put_along_axis_kernel.cu" ) endif() @@ -217,6 +219,32 @@ if(NOT WITH_CUDNN_FRONTEND) "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu") endif() +# Note(qili93): remove kernels not supported on DCU yet +if(WITH_ROCM) + list( + REMOVE_ITEM + kernel_cu + "gpu/affine_grid_grad_kernel.cu" + "gpu/apply_per_channel_scale_kernel.cu" + "gpu/cholesky_solve_kernel.cu" + "gpu/eigh_kernel.cu" + "gpu/eigvalsh_kernel.cu" + "gpu/lstsq_kernel.cu" + "gpu/lu_kernel.cu" + "gpu/matrix_rank_kernel.cu" + "gpu/matrix_rank_tol_kernel.cu" + "gpu/multiclass_nms3_kernel.cu" + "gpu/put_along_axis_grad_kernel.cu" + "gpu/put_along_axis_kernel.cu" + "gpu/qr_kernel.cu" + "gpu/svd_kernel.cu" + "gpudnn/mha_cudnn_frontend.cu" + "fusion/gpu/block_multi_head_attention_kernel.cu" + "fusion/gpu/fused_bn_add_activation_grad_kernel.cu" + "fusion/gpu/fused_bn_add_activation_kernel.cu" + "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu") +endif() + set(cc_search_pattern "*.cc" "cpu/*.cc" @@ -243,6 +271,10 @@ if(WITH_MKLDNN) "fusion/onednn/*.cc") endif() +if(WITH_CUSTOM_DEVICE) + set(cc_search_pattern ${cc_search_pattern} "custom/*.cc") +endif() + file( GLOB kernel_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" diff --git a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc index acd84a80be2ad1..47e804b7de2775 100644 --- a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc @@ -38,10 +38,10 @@ void CummaxGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == 
DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } @@ -61,10 +61,10 @@ void CumminGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc index dd7b762849d16b..aeb2071b136de8 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -25,11 +25,14 @@ namespace phi { template void PutAlongAxisGradKernel(const Context& dev_ctx, - const DenseTensor& x UNUSED, + const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, - const std::string& reduce UNUSED, + const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ( @@ -40,31 +43,135 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_input_grad_kernel( - // Here passing an unused argument out_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. - out_grad, - axis, - index, - *x_grad, - dev_ctx); - } else { - phi::funcs::cpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); + if (include_self == false || reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. 
+ out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } } } if (value_grad) { value_grad->Resize(index.dims()); dev_ctx.template Alloc(value_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); - } else { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); + auto* grad_data = value_grad->data(); + int64_t grad_size = value_grad->numel(); + memset(grad_data, 0, sizeof(T) * grad_size); + if (reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } + } else if (reduce == "add" || reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mul" || reduce == "multiply" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } } } } diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc index 5417f9463a62f8..4411755d61cbaf 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -30,6 +30,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out) { PADDLE_ENFORCE_EQ( dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, @@ -41,31 +42,56 @@ void PutAlongAxisKernel(const Context& dev_ctx, if (reduce == "add") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "assign") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, 
include_self, dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amin") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); } } else { PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "support reduce op: 'add', 'assign', 'mul', 'mean', 'amin', 'amax' and " + "'multiply', the " "default reduce " "op is 'assign' ", reduce)); diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc index b7b33d4290daec..66f3ef0cd790d1 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc @@ -104,7 +104,8 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_grad, CPU, @@ -113,4 +114,5 @@ PD_REGISTER_KERNEL(repeat_interleave_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc index 388e243eff42a0..8b00d7e38f304c 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, CPU, @@ -34,4 +35,5 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc index ed35513d985505..237a892dbb356c 100644 --- a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -35,3 +35,20 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + CPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + double, + int, + int64_t, + bool, + int16_t, + uint8_t, + int8_t, + phi::dtype::bfloat16, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc index 8a7238203ec647..4e5fc0c305100c 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc @@ -50,10 +50,11 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, axis, index, out_grad, + true, dev_ctx); 
// the gradient of gather is scatter } else if (index_type == phi::DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, index, out_grad, dev_ctx); + *x_grad, axis, index, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc index d1b4a24b54eba5..d006f688ae2434 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -38,9 +38,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (index_type == DataType::INT32) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else if (index_type == DataType::INT64) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } } diff --git a/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc new file mode 100644 index 00000000000000..ff61688513b139 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_grad_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingGradKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + const DenseTensor& out_grad, + int64_t start_index, + DenseTensor* w_grad) { + w_grad->Resize(w.dims()); + dev_ctx.template Alloc(w_grad, w.dtype()); + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->set_meta(w.meta()); + dev_ctx.Alloc(w_tmp.get(), w_tmp->dtype()); + auto out_grad_tmp = std::make_shared(); + out_grad_tmp->ShareDataWith(out_grad).Resize({K, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp), + out_grad_tensor(out_grad_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto real_ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_grad_tensor_mul_mask = + paddle::experimental::reshape(out_grad_tensor, {K, D}) + .multiply(paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w.dtype()), + {K, 1})); + paddle::Tensor w_grad_tensor; + paddle::experimental::embedding_grad(real_ids_tensor, + w_tensor, + out_grad_tensor_mul_mask, + -1, + false, + &w_grad_tensor); + w_grad->ShareDataWith( + *reinterpret_cast(w_grad_tensor.impl().get())); + + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding_grad ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding_grad, + Custom, + ALL_LAYOUT, + phi::CEmbeddingGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/custom/c_embedding_kernel.cc b/paddle/phi/kernels/custom/c_embedding_kernel.cc new file mode 100644 index 00000000000000..0cacf61d46f3a8 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + int64_t start_index, + int64_t vocab_size, + DenseTensor* out) { + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto out_dims = out->dims(); + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->ShareDataWith(w).Resize({N, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_tensor = + paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w_tensor.dtype()), + {K, 1}) + .multiply(paddle::experimental::reshape( + paddle::experimental::embedding( + ids_tensor, w_tensor, -1, false), + {K, D})); + out->ShareDataWith( + *reinterpret_cast(out_tensor.impl().get())) + .Resize(out_dims); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding, + Custom, + ALL_LAYOUT, + phi::CEmbeddingKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index f2d43a19a246d6..3b63d4f2ab407a 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -15,6 +15,9 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) "*.cu") endif() +if(WITH_ROCM) + list(REMOVE_ITEM func_cu_srcs "weight_only_gemv.cu") +endif() if(WITH_MUSA) list(REMOVE_ITEM func_cu_srcs "softmax.cu") diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc index 7be86351c47ff6..ca6c44dbdbd761 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc @@ -48,6 +48,24 @@ class ReduceMultiply { }; static ReduceMultiply reduce_mul; +class ReduceMax { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data > *self_data ? *src_data : *self_data; + } +}; +static ReduceMax reduce_max; + +class ReduceMin { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data < *self_data ? 
*src_data : *self_data; + } +}; +static ReduceMin reduce_min; + template @@ -55,10 +73,11 @@ struct cpu_gather_scatter_functor { template void operator()(phi::DenseTensor self, int dim, - const phi::DenseTensor& index UNUSED, + const phi::DenseTensor& index, const phi::DenseTensor& src, - const std::string& method_name UNUSED, + const std::string& method_name, const func_t& reduce_op, + bool include_self, const phi::DeviceContext& ctx UNUSED) { if (index.numel() == 0) { return; @@ -96,6 +115,7 @@ struct cpu_gather_scatter_functor { outer_dim_size_src *= src_dims[i]; } int64_t index_idx = 0; + std::vector nums_of_elements(self.numel(), 0); // N layer loop squeezed into 3 layers loop for (int64_t i = 0; i < inner_dim_size; i++) { for (int64_t j = 0; j < select_dim_size; j++) { @@ -132,12 +152,31 @@ struct cpu_gather_scatter_functor { replace_index_src = k + index * outer_dim_size_src + i * outer_dim_size_src * src_select_dim_size; } - reduce_op((tensor_t*)(self_data + replace_index_self), // NOLINT - (tensor_t*)(src_data + replace_index_src)); // NOLINT + if (include_self == false && + nums_of_elements[replace_index_self] == 0) { + self_data[replace_index_self] = src_data[replace_index_src]; + } else { + reduce_op((tensor_t*)(self_data + replace_index_self), // NOLINT + (tensor_t*)(src_data + replace_index_src)); // NOLINT + } + nums_of_elements[replace_index_self] += 1; index_idx++; } } } + if (method_name == "scatter_mean_cpu") { + for (int i = 0; i < self_size; i++) { + if (nums_of_elements[i]) { + if (include_self) { + self_data[i] = + self_data[i] / static_cast(nums_of_elements[i] + 1); + } else { + self_data[i] = + self_data[i] / static_cast(nums_of_elements[i]); + } + } + } + } } }; @@ -146,11 +185,18 @@ void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - result, dim, index, self, "gather_out_cpu", tensor_assign, ctx); + /*is_scatter_like=*/false>()(result, + dim, + index, + self, + "gather_out_cpu", + tensor_assign, + include_self, + ctx); } template @@ -158,11 +204,18 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_assign_cpu", tensor_assign, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_assign_cpu", + tensor_assign, + include_self, + ctx); } template @@ -170,11 +223,12 @@ void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_add_cpu", reduce_add, ctx); + self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx); } template @@ -182,11 +236,51 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx); +} + +template +void cpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx); +} + +template +void 
cpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx); +} + +template +void cpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_mul_cpu", reduce_mul, ctx); + self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx); } template @@ -194,6 +288,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx UNUSED) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -229,11 +324,135 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, } } +template +void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self UNUSED, + const phi::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + + int64_t grad_size = grad.numel(); + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + auto value_dims = value.dims(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_grad = 1; + int64_t outer_dim_size_value = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + int64_t value_select_dim_size = value_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + outer_dim_size_value *= value_dims[i]; + } + + int64_t index_idx = 0; + std::vector num_elements(grad_size, 0); + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index_grad = + k + index * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if ((reduce == "multiply" || reduce == "mul") && + num_elements[replace_index_grad] == 0) { + grad_data[replace_index_grad] = static_cast( + grad_data[replace_index_grad] * out_data[replace_index_grad] / + x_data[replace_index_grad]); + num_elements[replace_index_grad] += 1; + } else if (reduce == "amin" || reduce == "amax") { + if (out_data[replace_index_grad] != x_data[replace_index_grad]) { + grad_data[replace_index_grad] = 0; + } else { + int64_t replace_index_value = + k + j * outer_dim_size_value + + i * outer_dim_size_value * value_select_dim_size; + if (out_data[replace_index_grad] == value_data[replace_index_value]) + num_elements[replace_index_grad] += 1; + } + } + index_idx++; + } + } + } + if (reduce == "amin" || reduce == "amax") { + for (int64_t i = 0; i < grad_size; i++) { + grad_data[i] = grad_data[i] / static_cast(num_elements[i] + 1); + } + } +} + +template +void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED, + int dim, + const 
phi::DenseTensor& index, + phi::DenseTensor grad, + bool include_self UNUSED, + const phi::DeviceContext& ctx UNUSED) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + + int64_t grad_size = grad.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_data = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_data *= grad_dims[i]; + } + + int64_t index_idx = 0; + std::vector num_elements(grad_size, 0); + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index = k + index * outer_dim_size_data + + i * outer_dim_size_data * grad_select_dim_size; + num_elements[replace_index] += 1; + index_idx++; + } + } + } + for (int64_t i = 0; i < grad_size; i++) + if (num_elements[i]) + grad_data[i] = grad_data[i] / static_cast(num_elements[i] + 1); +} + template void cpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx UNUSED) { auto* self_data = self.data(); auto* index_data = index.data(); @@ -244,11 +463,75 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, auto grad_dims = grad.dims(); int64_t self_size = self.numel(); - int64_t grad_size = grad.numel(); - bool* is_self_grad_used = new bool[self_size]; + std::vector is_self_grad_used(self_size, false); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int64_t index_idx = index.numel() - 1; + for (int64_t i = inner_dim_size - 1; i >= 0; i--) { + for (int64_t j = select_dim_size - 1; j >= 0; j--) { + for (int64_t k = outer_dim_size - 1; k >= 0; k--) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = + k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if (!is_self_grad_used[replace_index_self]) { + grad_data[replace_index_grad] = self_data[replace_index_self]; + is_self_grad_used[replace_index_self] = true; + } + index_idx--; + } + } + } +} + +template +void cpu_scatter_add_mean_value_grad_kernel( + phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out UNUSED, + const phi::DenseTensor& x UNUSED, + const phi::DenseTensor& value UNUSED, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx UNUSED) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); - 
for (int i = 0; i < self_size; i++) { - is_self_grad_used[i] = false; + int64_t self_size = self.numel(); + int64_t grad_size = grad.numel(); + std::vector num_elements; + if (reduce == "mean") { + for (int i = 0; i < self_size; i++) { + if (include_self) + num_elements.push_back(1); + else + num_elements.push_back(0); + } } int64_t inner_dim_size = 1; @@ -267,10 +550,25 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, outer_dim_size_self *= self_dims[i]; outer_dim_size_grad *= grad_dims[i]; } - int64_t index_idx = index.numel() - 1; for (int i = 0; i < grad_size; i++) { grad_data[i] = static_cast(0); } + int64_t index_idx = index.numel() - 1; + if (reduce == "mean") { + for (int64_t i = inner_dim_size - 1; i >= 0; i--) { + for (int64_t j = select_dim_size - 1; j >= 0; j--) { + for (int64_t k = outer_dim_size - 1; k >= 0; k--) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + num_elements[replace_index_self] += 1; + index_idx--; + } + } + } + index_idx = index.numel() - 1; + } for (int64_t i = inner_dim_size - 1; i >= 0; i--) { for (int64_t j = select_dim_size - 1; j >= 0; j--) { for (int64_t k = outer_dim_size - 1; k >= 0; k--) { @@ -281,23 +579,131 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, int64_t replace_index_grad = k + j * outer_dim_size_grad + i * outer_dim_size_grad * grad_select_dim_size; - if (!is_self_grad_used[replace_index_self]) { + if (reduce == "add") grad_data[replace_index_grad] = self_data[replace_index_self]; - is_self_grad_used[replace_index_self] = true; - } + else if (reduce == "mean") + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(num_elements[replace_index_self]); index_idx--; } } } - delete[] is_self_grad_used; } -Instantiate_Template_Function(cpu_gather_kernel) - Instantiate_Template_Function(cpu_scatter_assign_kernel) - Instantiate_Template_Function(cpu_scatter_add_kernel) - Instantiate_Template_Function(cpu_scatter_mul_kernel) - Instantiate_Template_Function(cpu_scatter_input_grad_kernel) - Instantiate_Template_Function(cpu_scatter_value_grad_kernel) +template +void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); + + int64_t self_size = self.numel(); + std::vector num_elements; + if (reduce == "amin" || reduce == "amax") { + for (int i = 0; i < self_size; i++) { + num_elements.push_back(0); + } + } + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int64_t index_idx = 0; + for 
(int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = + k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if ((reduce == "amin" || reduce == "amax") && + out_data[replace_index_self] == value_data[replace_index_grad]) { + num_elements[replace_index_self] += 1; + } else if (reduce == "mul" || reduce == "multiply") { + grad_data[replace_index_grad] = + self_data[replace_index_self] * + (out_data[replace_index_self] / value_data[replace_index_grad]); + } + index_idx++; + } + } + } + if (reduce == "amin" || reduce == "amax") { + index_idx = 0; + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index_self = + k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = + k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if (out_data[replace_index_self] == value_data[replace_index_grad]) { + if (out_data[replace_index_self] == x_data[replace_index_self]) + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(num_elements[replace_index_self] + 1); + else + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(num_elements[replace_index_self]); + } + index_idx++; + } + } + } + } +} + +Instantiate_Template_Function(cpu_gather_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_assign_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_add_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_mul_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_mean_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_max_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_min_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_input_grad_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + cpu_scatter_mul_min_max_input_grad_kernel) // NOLINT + Instantiate_Template_Function(cpu_scatter_mean_input_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + cpu_scatter_add_mean_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + cpu_scatter_mul_min_max_value_grad_kernel) // NOLINT } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cu b/paddle/phi/kernels/funcs/gather_scatter_functor.cu index cbe866d4924d54..865b1d74e36c34 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cu +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cu @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/gather_scatter_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { namespace funcs { @@ -46,14 +47,58 @@ static ReduceAdd reduce_add; class ReduceMul { public: - template + template < + typename tensor_t, + std::enable_if_t::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + phi::CudaAtomicMul(self_data, *src_data); + } + template ::value>* = nullptr> __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { *self_data *= *src_data; - // TODO(huangxu96) platform::CudaAtomicMul(*self_data, *src_data); } }; static ReduceMul reduce_mul; +class ReduceMax { + public: + template < + typename tensor_t, + std::enable_if_t::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + phi::CudaAtomicMax(self_data, *src_data); + } + template ::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data > *self_data ? *src_data : *self_data; + } +}; +static ReduceMax reduce_max; + +class ReduceMin { + public: + template < + typename tensor_t, + std::enable_if_t::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + phi::CudaAtomicMin(self_data, *src_data); + } + template ::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data < *self_data ? *src_data : *self_data; + } +}; +static ReduceMin reduce_min; + +__global__ void CudaMemsetAsync(int* dest, int value, size_t size) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid * sizeof(int) >= size) return; + dest[tid] = value; +} + template = numel) return; - extern __shared__ int thread_ids[]; - - if (tid == 0) { - for (int i = 0; i < numel_data; i++) { - thread_ids[i] = 0; - } - } - __syncthreads(); int64_t i, j, k; // The i, j, k here is the index of the 3 layers loop // squeezed from the N layers loop. /* tid = i * select_dim_size * outer_dim_size + j * outer_dim_size + k */ @@ -143,9 +181,19 @@ __global__ void GatherScatterGPUKernel(tensor_t* self_data, int64_t outer_dim_size_src, int64_t numel, int64_t numel_data, - const func_t& reduce_op) { + bool include_self, + const func_t& reduce_op, + int* shared_mem) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= numel) return; + if (include_self == false) { + if (tid == 0) { + for (int i = 0; i < numel_data; i++) { + shared_mem[i] = numel + 1; // thread_ids + } + } + __syncthreads(); + } int64_t i, j, k; // The i, j, k here is the index of the 3 layers loop // squeezed from the N layers loop. 
/* tid = i * select_dim_size * outer_dim_size + j * outer_dim_size + k */ @@ -182,9 +230,95 @@ __global__ void GatherScatterGPUKernel(tensor_t* self_data, replace_index_src = k + index * outer_dim_size_src + i * outer_dim_size_src * src_select_dim_size; } + bool is_op_done = false; + if (include_self == false) { + phi::CudaAtomicMin(shared_mem + replace_index_self, tid); + __syncthreads(); + if (tid == shared_mem[replace_index_self]) { + self_data[replace_index_self] = src_data[replace_index_src]; + is_op_done = true; + } + __syncthreads(); + } + if (!is_op_done) + reduce_op(static_cast(self_data + replace_index_self), + static_cast(src_data + replace_index_src)); +} +template +__global__ void ScatterMeanGPUKernel(tensor_t* self_data, + int dim, + const index_t* index_data, + tensor_t* src_data, + int select_dim_size, + int self_select_dim_size, + int src_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_src, + int64_t numel, + int64_t numel_data, + bool include_self, + const func_t& reduce_op, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + + int64_t i, j, k; // The i, j, k here is the index of the 3 layers loop + // squeezed from the N layers loop. + /* tid = i * select_dim_size * outer_dim_size + j * outer_dim_size + k */ + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + /* + gather computation formula: + + self[i][j][k] = src[index[i][j][k]][j][k] # if dim == 0 + self[i][j][k] = src[i][index[i][j][k]][k] # if dim == 1 + self[i][j][k] = src[i][j][index[i][j][k]] # if dim == 2 + + scatter computation formula: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + */ + // index matrix has different shape with self matrix or src matrix. + int64_t replace_index_self, replace_index_src; + if (is_scatter_like) { + replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + + replace_index_src = k + j * outer_dim_size_src + + i * outer_dim_size_src * src_select_dim_size; + } else { + replace_index_self = tid; + + replace_index_src = k + index * outer_dim_size_src + + i * outer_dim_size_src * src_select_dim_size; + } + if (include_self == false) { + self_data[replace_index_self] = 0; + __syncthreads(); + } reduce_op(static_cast(self_data + replace_index_self), static_cast(src_data + replace_index_src)); + + phi::CudaAtomicMax(shared_mem + replace_index_self, tid); + phi::CudaAtomicAdd(shared_mem + numel_data + replace_index_self, 1); + __syncthreads(); + + if (tid == shared_mem[replace_index_self]) { + self_data[replace_index_self] = + self_data[replace_index_self] / + static_cast(shared_mem[replace_index_self + numel_data]); + } } template (ctx).stream(); + DenseTensor shared_mem_tensor; if (method_name == "scatter_assign_gpu") { - int shared_mem_size = - is_scatter_like ? 
sizeof(int) * self_size : sizeof(int) * index_size; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + + int* shared_mem = shared_mem_tensor.data(); ScatterAssignGPUKernel - <<>>(self_data, - dim, - index_data, - src_data, - select_dim_size, - self_select_dim_size, - src_select_dim_size, - outer_dim_size, - outer_dim_size_self, - outer_dim_size_src, - index_size, - self_size, - reduce_op); + <<>>(self_data, + dim, + index_data, + src_data, + select_dim_size, + self_select_dim_size, + src_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_src, + index_size, + self_size, + reduce_op, + shared_mem); + } else if (method_name == "scatter_mean_gpu") { + shared_mem_tensor.Resize({self_size * 2}); + ctx.Alloc(&shared_mem_tensor); + if (include_self) { + int64_t grid_memset = (self_size * 2 + block - 1) / block; + phi::funcs::set_constant(ctx, &shared_mem_tensor, 1); + } else { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + } + + int* shared_mem = shared_mem_tensor.data(); + ScatterMeanGPUKernel + <<>>(self_data, + dim, + index_data, + src_data, + select_dim_size, + self_select_dim_size, + src_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_src, + index_size, + self_size, + include_self, + reduce_op, + shared_mem); } else { + int* shared_mem = nullptr; + if (include_self == false) { + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, index_size + 1); + + shared_mem = shared_mem_tensor.data(); + } GatherScatterGPUKernel <<>>(self_data, dim, @@ -265,7 +440,9 @@ struct gpu_gather_scatter_functor { outer_dim_size_src, index_size, self_size, - reduce_op); + include_self, + reduce_op, + shared_mem); } } }; // struct gpu_gather_scatter_functor @@ -275,11 +452,18 @@ void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - result, dim, index, self, "gather_out_gpu", tensor_assign, ctx); + /*is_scatter_like=*/false>()(result, + dim, + index, + self, + "gather_out_gpu", + tensor_assign, + include_self, + ctx); return; } @@ -288,11 +472,18 @@ void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - self, dim, index, src, "scatter_assign_gpu", tensor_assign, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_assign_gpu", + tensor_assign, + include_self, + ctx); } template @@ -300,11 +491,12 @@ void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - self, dim, index, src, "scatter_add_gpu", reduce_add, ctx); + self, dim, index, src, "scatter_add_gpu", reduce_add, include_self, ctx); } template @@ -312,11 +504,51 @@ void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_gpu", reduce_mul, include_self, ctx); +} + +template +void gpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const 
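The per-slot bookkeeping array is now a regular device tensor instead of dynamic shared memory; one plausible reason is that an array with one entry per output element can easily exceed what a single block may request as shared memory. A minimal CUDA sketch of the same pattern, using plain cudaMalloc for the scratch buffer (sizes and names are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void FillCounts(int* counts, int value, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) counts[tid] = value;   // one element per output slot
}

int main() {
  // A per-output-slot counter array can be arbitrarily large, so it lives
  // in global memory rather than in the limited shared memory of a block.
  const int num_slots = 1 << 20;
  int* counts = nullptr;
  cudaMalloc(&counts, num_slots * sizeof(int));

  const int block = 512;
  const int grid = (num_slots + block - 1) / block;
  FillCounts<<<grid, block>>>(counts, /*value=*/1, num_slots);
  cudaDeviceSynchronize();

  int first = 0;
  cudaMemcpy(&first, counts, sizeof(int), cudaMemcpyDeviceToHost);
  printf("counts[0] = %d\n", first);  // expect 1
  cudaFree(counts);
  return 0;
}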
phi::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mean_gpu", reduce_add, include_self, ctx); +} + +template +void gpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx) { gpu_gather_scatter_functor()( - self, dim, index, src, "scatter_mul_gpu", reduce_mul, ctx); + self, dim, index, src, "scatter_max_gpu", reduce_max, include_self, ctx); +} + +template +void gpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_min_gpu", reduce_min, include_self, ctx); } template @@ -347,6 +579,7 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -374,17 +607,265 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); - int shared_mem_size = sizeof(int) * grad_size; ScatterInputGradGPUKernel - <<>>(grad_data, - dim, - index_data, - select_dim_size, - grad_select_dim_size, - outer_dim_size, - outer_dim_size_data, - index_size, - grad_size); + <<>>(grad_data, + dim, + index_data, + select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_data, + index_size, + grad_size); +} + +template +__global__ void ScatterMulInputGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* out_data, + const tensor_t* x_data, + int select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_grad, + int* thread_ids) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = k + index * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + atomicMax(thread_ids + replace_index, tid); + __syncthreads(); + if (tid == thread_ids[replace_index]) { + grad_data[replace_index] = grad_data[replace_index] * + out_data[replace_index] / x_data[replace_index]; + } +} + +template +__global__ void ScatterMinMaxInputGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* out_data, + const tensor_t* x_data, + const tensor_t* value_data, + const tensor_t* self_data, + int select_dim_size, + int grad_select_dim_size, + int value_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_grad, + int64_t outer_dim_size_value, + int64_t numel, + int64_t numel_grad, + const std::string& reduce, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = k + index * outer_dim_size_grad + + i * outer_dim_size_grad * 
grad_select_dim_size; + int64_t replace_index_value = + k + j * outer_dim_size_value + + i * outer_dim_size_value * value_select_dim_size; + if (value_data[replace_index_value] == out_data[replace_index]) + phi::CudaAtomicAdd(shared_mem + replace_index, 1); + __syncthreads(); + if (out_data[replace_index] != x_data[replace_index]) { + grad_data[replace_index] = 0; + } else { + grad_data[replace_index] = self_data[replace_index] / + static_cast(shared_mem[replace_index]); + } +} + +template +void gpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value + UNUSED, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self UNUSED, + const phi::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + auto* self_data = self.data(); + + int64_t grad_size = grad.numel(); + int64_t index_size = index.numel(); + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + auto x_dims = x.dims(); + auto value_dims = value.dims(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_grad = 1; + int64_t outer_dim_size_value = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + int64_t value_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + outer_dim_size_value *= value_dims[i]; + } + int block = 512; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({grad_size}); + ctx.Alloc(&shared_mem_tensor); + int* shared_mem = shared_mem_tensor.data(); + if (reduce == "mul" || reduce == "multiply") { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + ScatterMulInputGradGPUKernel + <<>>(grad_data, + dim, + index_data, + out_data, + x_data, + select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_grad, + index_size, + grad_size, + shared_mem); + } else if (reduce == "amin" || reduce == "amax") { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 1); + ScatterMinMaxInputGradGPUKernel + <<>>(grad_data, + dim, + index_data, + out_data, + x_data, + value_data, + self_data, + select_dim_size, + grad_select_dim_size, + value_select_dim_size, + outer_dim_size, + outer_dim_size_grad, + outer_dim_size_value, + index_size, + grad_size, + reduce, + shared_mem); + } +} + +template +__global__ void ScatterMeanInputGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + int select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_grad, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = k + index * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + atomicMax(shared_mem + 
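For amin/amax the gradient flows only to contributors that equal the reduced output, and the upstream gradient is split evenly among the ties (the kernels above count the ties with an atomic add). A host-side sketch of that rule, ignoring the include_self term for brevity:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> value = {3.f, 5.f, 5.f, 1.f};  // all scatter to one slot
  float out = 5.f;          // amax of the contributions
  float upstream = 1.2f;    // d(loss)/d(out)

  int ties = 0;
  for (float v : value)
    if (v == out) ++ties;

  // Only the tying contributions receive gradient, each an equal share.
  std::vector<float> grad(value.size(), 0.f);
  for (size_t i = 0; i < value.size(); ++i)
    if (value[i] == out) grad[i] = upstream / ties;

  for (float g : grad) printf("%f ", g);  // 0 0.6 0.6 0
  printf("\n");
  return 0;
}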
replace_index, tid); + phi::CudaAtomicAdd(shared_mem + numel_grad + replace_index, 1); + __syncthreads(); + if (tid == shared_mem[replace_index]) { + grad_data[replace_index] = + grad_data[replace_index] / + static_cast(shared_mem[numel_grad + replace_index]); + } +} + +template +void gpu_scatter_mean_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor grad, + bool include_self UNUSED, + const phi::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto grad_dims = grad.dims(); + + int64_t grad_size = grad.numel(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({grad_size * 2}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + int* shared_mem = shared_mem_tensor.data(); + + int block = 512; + int64_t grid_memset = (grad_size + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + CudaMemsetAsync<<>>( + shared_mem + grad_size, 1, sizeof(int) * grad_size); + + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + ScatterMeanInputGradGPUKernel + <<>>(grad_data, + dim, + index_data, + select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_grad, + index_size, + grad_size, + shared_mem); } template @@ -399,17 +880,11 @@ __global__ void ScatterValueGradGPUKernel(tensor_t* grad_data, int64_t outer_dim_size_self, int64_t outer_dim_size_grad, int64_t numel, - int64_t numel_data) { + int64_t numel_data, + int* thread_ids) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= numel) return; - extern __shared__ int thread_ids[]; - if (tid == 0) { - for (int i = 0; i < numel_data; i++) { - thread_ids[i] = 0; - } - } - __syncthreads(); int64_t i, j, k; i = tid / (select_dim_size * outer_dim_size); int64_t remind = tid % (select_dim_size * outer_dim_size); @@ -418,7 +893,6 @@ __global__ void ScatterValueGradGPUKernel(tensor_t* grad_data, index_t index = index_data[tid]; int64_t replace_index_self = k + index * outer_dim_size_self + i * outer_dim_size_self * self_select_dim_size; - atomicMax(thread_ids + replace_index_self, tid); __syncthreads(); @@ -433,6 +907,7 @@ void gpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self UNUSED, const phi::DeviceContext& ctx) { auto* self_data = self.data(); auto* index_data = index.data(); @@ -461,30 +936,362 @@ void gpu_scatter_value_grad_kernel(phi::DenseTensor self, outer_dim_size_grad *= grad_dims[i]; } + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + int* shared_mem = shared_mem_tensor.data(); + int block = 512; int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); - int shared_mem_size = sizeof(int) * self_size; ScatterValueGradGPUKernel - <<>>(grad_data, - dim, - 
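The mean variants follow an accumulate, count, normalize pattern: sum the contributions per destination slot, count them (including the original value when include_self is true), then divide. A CPU sketch with illustrative data:

#include <cstdio>
#include <vector>

int main() {
  // Two sources scatter into slot 0 and one into slot 1.
  std::vector<float> self = {10.f, 20.f};        // original values
  std::vector<int> index = {0, 0, 1};
  std::vector<float> src = {1.f, 3.f, 5.f};

  std::vector<float> sum = self;                 // include_self == true
  std::vector<int> count(self.size(), 1);        // self counts as one term
  for (size_t i = 0; i < src.size(); ++i) {
    sum[index[i]] += src[i];
    count[index[i]] += 1;
  }
  for (size_t s = 0; s < sum.size(); ++s)
    sum[s] /= count[s];

  // (10+1+3)/3 = 4.666667 and (20+5)/2 = 12.5
  printf("%f %f\n", sum[0], sum[1]);
  return 0;
}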
self_data, - index_data, - select_dim_size, - self_select_dim_size, - grad_select_dim_size, - outer_dim_size, - outer_dim_size_self, - outer_dim_size_grad, - index_size, - self_size); + <<>>(grad_data, + dim, + self_data, + index_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size, + self_size, + shared_mem); +} + +template +__global__ void ScatterMeanValueGradGPUKernel(tensor_t* grad_data, + int dim, + const tensor_t* self_data, + const index_t* index_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_self, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + + phi::CudaAtomicAdd(shared_mem + replace_index_self, 1); + __syncthreads(); + + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(shared_mem[replace_index_self]); +} + +template +__global__ void ScatterAddValueGradGPUKernel(tensor_t* grad_data, + int dim, + const tensor_t* self_data, + const index_t* index_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + grad_data[replace_index_grad] = self_data[replace_index_self]; +} + +template +void gpu_scatter_add_mean_value_grad_kernel( + phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out UNUSED, + const phi::DenseTensor& x UNUSED, + const phi::DenseTensor& value UNUSED, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx UNUSED) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); + + int64_t self_size = self.numel(); + int64_t grad_size = grad.numel(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= 
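For reduce == "add" the gradient with respect to value is simply the upstream gradient gathered at each scattered location, which is what ScatterAddValueGradGPUKernel computes; for "mean" the same gather is additionally divided by the per-slot count. A small sketch of the add case:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> grad_out = {0.5f, 2.0f};    // gradient w.r.t. the output
  std::vector<int> index = {1, 0, 1};            // where each value was added

  // d(out[index[i]]) / d(value[i]) == 1 for "add", so the value gradient
  // is a gather of the upstream gradient.
  std::vector<float> value_grad(index.size());
  for (size_t i = 0; i < index.size(); ++i)
    value_grad[i] = grad_out[index[i]];

  for (float g : value_grad) printf("%f ", g);   // 2.0 0.5 2.0
  printf("\n");
  return 0;
}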
index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int block = 512; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + if (reduce == "mean") { + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + if (include_self) { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 1); + } else { + phi::funcs::set_constant(ctx, &shared_mem_tensor, 0); + } + int* shared_mem = shared_mem_tensor.data(); + ScatterMeanValueGradGPUKernel + <<>>(grad_data, + dim, + self_data, + index_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size, + self_size, + shared_mem); + } else if (reduce == "add") { + ScatterAddValueGradGPUKernel + <<>>(grad_data, + dim, + self_data, + index_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size); + } +} + +template +__global__ void ScatterMulValueGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* self_data, + const tensor_t* value_data, + const tensor_t* out_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + grad_data[replace_index_grad] = + self_data[replace_index_self] * + (out_data[replace_index_self] / value_data[replace_index_grad]); } -Instantiate_Template_Function(gpu_gather_kernel) - Instantiate_Template_Function(gpu_scatter_assign_kernel) - Instantiate_Template_Function(gpu_scatter_add_kernel) - Instantiate_Template_Function(gpu_scatter_mul_kernel) - Instantiate_Template_Function(gpu_scatter_input_grad_kernel) - Instantiate_Template_Function(gpu_scatter_value_grad_kernel) + +template +__global__ void ScatterMinMaxValueGradGPUKernel(tensor_t* grad_data, + int dim, + const index_t* index_data, + const tensor_t* self_data, + const tensor_t* value_data, + const tensor_t* out_data, + const tensor_t* x_data, + int select_dim_size, + int self_select_dim_size, + int grad_select_dim_size, + int64_t outer_dim_size, + int64_t outer_dim_size_self, + int64_t outer_dim_size_grad, + int64_t numel, + int64_t numel_self, + bool include_self, + int* shared_mem) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index_self = k + index * outer_dim_size_self + + i * outer_dim_size_self * self_select_dim_size; + int64_t replace_index_grad = k + j * outer_dim_size_grad + + i * outer_dim_size_grad * grad_select_dim_size; + if (tid 
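For reduce == "mul" the output of a slot is the product of its contributions, so the gradient with respect to each contribution is the product of all the others, computed above as out / value (valid only for non-zero values). A scalar sketch:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> value = {2.f, 3.f, 4.f};    // all multiplied into one slot
  float init = 1.f;                              // starting value of the slot
  float out = init;
  for (float v : value) out *= v;                // forward: out = 24

  float upstream = 1.f;                          // d(loss)/d(out)
  std::vector<float> grad(value.size());
  for (size_t i = 0; i < value.size(); ++i)
    grad[i] = upstream * (out / value[i]);       // product of the other factors

  for (float g : grad) printf("%f ", g);         // 12 8 6
  printf("\n");
  return 0;
}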
== 0) { + for (int i = 0; i < numel_self; i++) { + if (include_self && + x_data[replace_index_self] == out_data[replace_index_self]) + shared_mem[i] = 1; + else + shared_mem[i] = 0; // number of elements + } + } + __syncthreads(); + grad_data[replace_index_grad] = 0; + if (value_data[replace_index_grad] == out_data[replace_index_self]) + phi::CudaAtomicAdd(shared_mem + replace_index_self, 1); + __syncthreads(); + if (value_data[replace_index_grad] == out_data[replace_index_self]) + grad_data[replace_index_grad] = + self_data[replace_index_self] / + static_cast(shared_mem[replace_index_self]); +} + +template +void gpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx) { + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* grad_data = grad.data(); + auto* out_data = out.data(); + auto* x_data = x.data(); + auto* value_data = value.data(); + + auto index_dims = index.dims(); + auto self_dims = self.dims(); + auto grad_dims = grad.dims(); + + int64_t self_size = self.numel(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int64_t outer_dim_size_self = 1; + int64_t outer_dim_size_grad = 1; + int64_t select_dim_size = index_dims[dim]; + int64_t self_select_dim_size = self_dims[dim]; + int64_t grad_select_dim_size = grad_dims[dim]; + for (int i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + outer_dim_size_self *= self_dims[i]; + outer_dim_size_grad *= grad_dims[i]; + } + int block = 512; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; + int64_t grid = (n + block - 1) / block; + auto stream = reinterpret_cast(ctx).stream(); + if (reduce == "mul" || reduce == "multiply") { + ScatterMulValueGradGPUKernel + <<>>(grad_data, + dim, + index_data, + self_data, + value_data, + out_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size); + } else if (reduce == "amin" || reduce == "amax") { + DenseTensor shared_mem_tensor; + shared_mem_tensor.Resize({self_size}); + ctx.Alloc(&shared_mem_tensor); + + int* shared_mem = shared_mem_tensor.data(); + ScatterMinMaxValueGradGPUKernel + <<>>(grad_data, + dim, + index_data, + self_data, + value_data, + out_data, + x_data, + select_dim_size, + self_select_dim_size, + grad_select_dim_size, + outer_dim_size, + outer_dim_size_self, + outer_dim_size_grad, + index_size, + self_size, + include_self, + shared_mem); + } +} + +Instantiate_Template_Function(gpu_gather_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_assign_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_add_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_mul_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_min_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_max_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_mean_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_input_grad_kernel) // NOLINT + Instantiate_Template_Function(gpu_scatter_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + gpu_scatter_mul_min_max_input_grad_kernel) // NOLINT + 
Instantiate_Template_Function(gpu_scatter_mean_input_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + gpu_scatter_add_mean_value_grad_kernel) // NOLINT + Instantiate_Template_Function_With_Out( + gpu_scatter_mul_min_max_value_grad_kernel) // NOLINT } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.h b/paddle/phi/kernels/funcs/gather_scatter_functor.h index 054ccc196fcd00..9fc50c44a79ead 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.h +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.h @@ -36,11 +36,46 @@ namespace funcs { int dim, \ const phi::DenseTensor& index, \ phi::DenseTensor result, \ + bool include_self, \ const phi::DeviceContext& ctx); \ template void func(phi::DenseTensor input, \ int dim, \ const phi::DenseTensor& index, \ phi::DenseTensor result, \ + bool include_self, \ + const phi::DeviceContext& ctx); + +#define Instantiate_Template_Function_With_Out(func) \ + Instantiate_Template_Function_index_t_With_Out(func, int) \ + Instantiate_Template_Function_index_t_With_Out(func, float) \ + Instantiate_Template_Function_index_t_With_Out(func, double) \ + Instantiate_Template_Function_index_t_With_Out(func, int64_t) \ + Instantiate_Template_Function_index_t_With_Out( \ + func, phi::dtype::float16) \ + Instantiate_Template_Function_index_t_With_Out( \ + func, phi::dtype::bfloat16) \ + Instantiate_Template_Function_index_t_With_Out( \ + func, unsigned char) +#define Instantiate_Template_Function_index_t_With_Out(func, tensor_t) \ + template void func(phi::DenseTensor input, \ + int dim, \ + const phi::DenseTensor& index, \ + const phi::DenseTensor& out, \ + const phi::DenseTensor& self, \ + const phi::DenseTensor& value, \ + phi::DenseTensor result, \ + const std::string& reduce, \ + bool include_self, \ + const phi::DeviceContext& ctx); \ + template void func(phi::DenseTensor input, \ + int dim, \ + const phi::DenseTensor& index, \ + const phi::DenseTensor& out, \ + const phi::DenseTensor& self, \ + const phi::DenseTensor& value, \ + phi::DenseTensor result, \ + const std::string& reduce, \ + bool include_self, \ const phi::DeviceContext& ctx); template @@ -48,6 +83,7 @@ void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx); template @@ -55,6 +91,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -62,6 +99,7 @@ void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -69,6 +107,31 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -76,20 +139,67 @@ void 
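The header relies on macros that stamp out explicit template instantiations for every supported tensor/index type pair, so the kernel definitions can stay in the .cc/.cu files while other translation units only see declarations. A reduced illustration of the pattern with placeholder names (fill_first, INSTANTIATE):

#include <cstdint>
#include <cstdio>

// Header side: declaration only.
template <typename tensor_t, typename index_t>
void fill_first(tensor_t* data, const index_t* index, tensor_t value);

// Source side: definition plus explicit instantiations, so callers link
// against these symbols without ever seeing the body.
template <typename tensor_t, typename index_t>
void fill_first(tensor_t* data, const index_t* index, tensor_t value) {
  data[index[0]] = value;
}

#define INSTANTIATE_FOR_INDEX(func, tensor_t)                            \
  template void func<tensor_t, int>(tensor_t*, const int*, tensor_t);    \
  template void func<tensor_t, int64_t>(tensor_t*, const int64_t*, tensor_t);

#define INSTANTIATE(func)            \
  INSTANTIATE_FOR_INDEX(func, float) \
  INSTANTIATE_FOR_INDEX(func, double)

INSTANTIATE(fill_first)

int main() {
  float data[2] = {0.f, 0.f};
  int index[1] = {1};
  fill_first(data, index, 7.f);
  printf("%f %f\n", data[0], data[1]);  // 0 7
  return 0;
}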
cpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor grad, + bool include_self, + const phi::DeviceContext& ctx); + template void cpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void cpu_scatter_add_mean_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + template void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor result, + bool include_self, const phi::DeviceContext& ctx); template @@ -97,6 +207,7 @@ void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -104,6 +215,7 @@ void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -111,6 +223,31 @@ void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_mean_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_max_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_min_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor src, + bool include_self, const phi::DeviceContext& ctx); template @@ -118,14 +255,60 @@ void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void gpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_mean_input_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + phi::DenseTensor grad, + bool 
include_self, + const phi::DeviceContext& ctx); + template void gpu_scatter_value_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, phi::DenseTensor grad, + bool include_self, const phi::DeviceContext& ctx); +template +void gpu_scatter_add_mean_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + +template +void gpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& ctx); + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index c81f716d0658b3..85aabb2adf3cd2 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -76,67 +76,73 @@ std::vector DealWithBoolIndices( const Context& dev_ctx, const std::vector& indices_v, std::vector* tmp_indices_v) { - std::vector res(indices_v.begin(), indices_v.end()); - bool contains_bool_tensor = false; + std::vector res; + bool contains_bool_tensor = false; for (size_t i = 0; i < indices_v.size(); ++i) { if (indices_v[i]->dtype() == phi::DataType::BOOL) { contains_bool_tensor = true; - int rank = indices_v[i]->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1UL, - phi::errors::InvalidArgument("the only bool tensor in indices should " - "have number of dimension at least 1")); - phi::DenseTensor nonzero_indices(phi::DataType::INT64); - nonzero_indices.Resize(common::make_ddim({-1, rank})); - NonZeroKernel(dev_ctx, *indices_v[i], &nonzero_indices); - - if (nonzero_indices.numel() == 0) { - std::vector empty_indices; - return empty_indices; - } + break; + } + } - std::vector integer_indices(rank, nullptr); - const int tmp_ix = tmp_indices_v->size(); - for (int i = 0; i < rank; ++i) { - tmp_indices_v->emplace_back( - DenseTensor(phi::DataType::INT64) - .Resize(common::make_ddim({nonzero_indices.dims()[0]}))); - } - for (int i = 0; i < rank; ++i) { - integer_indices[i] = &((*tmp_indices_v)[i + tmp_ix]); - } - SplitWithNumKernel( - dev_ctx, nonzero_indices, rank, 1, integer_indices); + if (contains_bool_tensor) { + for (size_t i = 0; i < indices_v.size(); ++i) { + if (indices_v[i]->dtype() == phi::DataType::BOOL) { + int rank = indices_v[i]->dims().size(); + PADDLE_ENFORCE_GE(rank, + 1UL, + phi::errors::InvalidArgument( + "the only bool tensor in indices should " + "have number of dimension at least 1")); + phi::DenseTensor nonzero_indices(phi::DataType::INT64); + nonzero_indices.Resize(common::make_ddim({-1, rank})); + NonZeroKernel(dev_ctx, *indices_v[i], &nonzero_indices); + + if (nonzero_indices.numel() == 0) { + std::vector empty_indices; + return empty_indices; + } + + std::vector integer_indices(rank, nullptr); + const int tmp_ix = tmp_indices_v->size(); + for (int i = 0; i < rank; ++i) { + tmp_indices_v->emplace_back( + DenseTensor(phi::DataType::INT64) + .Resize(common::make_ddim({nonzero_indices.dims()[0]}))); + } + for (int i = 0; i < rank; ++i) { + integer_indices[i] = &((*tmp_indices_v)[i + tmp_ix]); + } + SplitWithNumKernel( + dev_ctx, nonzero_indices, rank, 1, integer_indices); #ifdef PADDLE_WITH_XPU - auto 
place = dev_ctx.GetPlace(); - if (place.GetType() == phi::AllocationType::XPU) { - auto& pool = phi::DeviceContextPool::Instance(); - auto* xpu_ctx = static_cast(pool.Get(place)); - if (xpu_ctx->x_context()->xpu_stream) { - dev_ctx.Wait(); + auto place = dev_ctx.GetPlace(); + if (place.GetType() == phi::AllocationType::XPU) { + auto& pool = phi::DeviceContextPool::Instance(); + auto* xpu_ctx = static_cast(pool.Get(place)); + if (xpu_ctx->x_context()->xpu_stream) { + dev_ctx.Wait(); + } } - } #endif - } else if ((indices_v[i]->dtype() == phi::DataType::INT64) || - (indices_v[i]->dtype() == phi::DataType::INT32)) { - tmp_indices_v->emplace_back(*indices_v[i]); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "data type of tensor in indices must be int32, int64 or bool")); + } else if ((indices_v[i]->dtype() == phi::DataType::INT64) || + (indices_v[i]->dtype() == phi::DataType::INT32)) { + tmp_indices_v->emplace_back(*indices_v[i]); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "data type of tensor in indices must be int32, int64 or bool")); + } } - } - if (contains_bool_tensor) { - std::vector res_tmp(tmp_indices_v->size(), - nullptr); - for (size_t i = 0; i < res_tmp.size(); ++i) { - res_tmp[i] = &((*tmp_indices_v)[i]); + + res.reserve(tmp_indices_v->size()); + for (size_t i = 0; i < tmp_indices_v->size(); ++i) { + res.emplace_back(&((*tmp_indices_v)[i])); } - res.swap(res_tmp); + } else { + res = indices_v; } - return res; } @@ -215,62 +221,50 @@ void DealWithIndices(const Context& dev_ctx, res_dim_v->insert(res_dim_v->end(), tmp_x_dims.begin() + int_indices_v.size(), tmp_x_dims.end()); - - std::vector reshaped_indices_v; + phi::DDim res_dim = common::make_ddim(*res_dim_v); for (size_t i = 0; i < int_indices_v.size(); ++i) { + phi::DenseTensor index_tensor; if (int_indices_v[i]->dtype() == phi::DataType::INT32) { - reshaped_indices_v.emplace_back(phi::Cast( - dev_ctx, *int_indices_v[i], phi::DataType::INT64)); + index_tensor = phi::Cast( + dev_ctx, *int_indices_v[i], phi::DataType::INT64); } else { - reshaped_indices_v.emplace_back(*int_indices_v[i]); + index_tensor = *int_indices_v[i]; } + tmp_res_indices_v->emplace_back( + GetReshapeAndExpandTensor( + dev_ctx, index_tensor, res_dim, bd_dim, 0)); } - reshaped_indices_v.insert( - reshaped_indices_v.end(), range_tensor_v.begin(), range_tensor_v.end()); - - phi::DDim res_dim = common::make_ddim(*res_dim_v); - - for (size_t i = 0; i < reshaped_indices_v.size(); ++i) { + for (size_t i = 0; i < range_tensor_v.size(); ++i) { tmp_res_indices_v->emplace_back( GetReshapeAndExpandTensor( - dev_ctx, - reshaped_indices_v[i], - res_dim, - bd_dim, - ((i < int_indices_v.size()) - ? 
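The refactor keeps the original behaviour of DealWithBoolIndices: a boolean mask index is converted into the coordinates of its true elements, one integer column per mask dimension, and an empty result short-circuits the indexing. A host-side sketch of that conversion for a small 2-D mask:

#include <cstdio>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  bool mask[rows][cols] = {{false, true, false}, {true, false, true}};

  // "nonzero": one int64 column per mask dimension, one row per true entry.
  std::vector<long long> dim0, dim1;
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      if (mask[r][c]) {
        dim0.push_back(r);
        dim1.push_back(c);
      }

  for (size_t i = 0; i < dim0.size(); ++i)
    printf("(%lld, %lld)\n", dim0[i], dim1[i]);  // (0,1) (1,0) (1,2)
  return 0;
}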
0 - : i - int_indices_v.size() + len_bd_dim))); + dev_ctx, range_tensor_v[i], res_dim, bd_dim, i + len_bd_dim)); } for (size_t i = 0; i < res_indices_v->size(); ++i) { (*res_indices_v)[i] = &(*tmp_res_indices_v)[i]; } } else { - std::vector int_indices_v_tmp; - for (size_t i = 0; i < int_indices_v.size(); ++i) { + phi::DenseTensor index_tensor; + phi::DenseTensor expand_index; if (int_indices_v[i]->dtype() == phi::DataType::INT32) { - int_indices_v_tmp.emplace_back(phi::Cast( - dev_ctx, *int_indices_v[i], phi::DataType::INT64)); + index_tensor = phi::Cast( + dev_ctx, *int_indices_v[i], phi::DataType::INT64); } else { - int_indices_v_tmp.emplace_back(*int_indices_v[i]); + index_tensor = *int_indices_v[i]; } - } - - for (size_t i = 0; i < int_indices_v.size(); ++i) { if (bd_dim != int_indices_v[i]->dims()) { - tmp_res_indices_v->emplace_back( - DenseTensor(phi::DataType::INT64).Resize(bd_dim)); + expand_index = DenseTensor(phi::DataType::INT64).Resize(bd_dim); ExpandKernel( dev_ctx, - int_indices_v_tmp[i], + index_tensor, IntArray(common::vectorize(bd_dim)), - &(*tmp_res_indices_v)[i]); + &expand_index); } else { - tmp_res_indices_v->emplace_back(int_indices_v_tmp[i]); + expand_index = index_tensor; } + tmp_res_indices_v->emplace_back(expand_index); } - for (size_t i = 0; i < res_indices_v->size(); ++i) { (*res_indices_v)[i] = &(*tmp_res_indices_v)[i]; } diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h b/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h index 972f5ee633bbb0..0db16ffb7e20bc 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h +++ b/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h @@ -125,10 +125,18 @@ __global__ void VectorizedFusedRopeWithRotateEveryTwoKernel( MPType p0 = static_cast(input[pr_index]); MPType p1 = static_cast(input[ls_index]); - result[pr_index] = - cos_value[pr_index] * p0 - sign * sin_value[ls_index] * p1; - result[ls_index] = - cos_value[ls_index] * p1 + sign * sin_value[pr_index] * p0; + if (sign == 1) { + result[pr_index] = cos_value[pr_index] * p0; + result[pr_index] -= sin_value[pr_index] * p1; + + result[ls_index] = sin_value[ls_index] * p0; + result[ls_index] += cos_value[ls_index] * p1; + } else if (sign == -1) { + result[pr_index] = + cos_value[pr_index] * p0 + sin_value[ls_index] * p1; + result[ls_index] = + cos_value[ls_index] * p1 - sin_value[pr_index] * p0; + } store[pr_index] = static_cast(result[pr_index]); store[ls_index] = static_cast(result[ls_index]); diff --git a/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu index f8dc67f5bafe88..d7341e55e23490 100644 --- a/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_maxmin_grad_kernel.cu @@ -39,10 +39,10 @@ void CummaxGradKernel(const Context& dev_ctx, if (dtype == DataType::INT32) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } @@ -63,10 +63,10 @@ void CumminGradKernel(const Context& dev_ctx, if (dtype == DataType::INT32) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } 
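The rewritten rotate-every-two branch makes the sign convention explicit: sign == 1 rotates each (p0, p1) pair by the cached angle, and sign == -1 (the backward direction) applies the inverse rotation. A scalar sketch showing that the two cases undo each other:

#include <cmath>
#include <cstdio>

// Forward: (p0, p1) -> (c*p0 - s*p1, s*p0 + c*p1); backward uses the
// transposed rotation, (c*p0 + s*p1, c*p1 - s*p0).
void rotate_pair(float c, float s, float p0, float p1, int sign,
                 float* r0, float* r1) {
  if (sign == 1) {
    *r0 = c * p0 - s * p1;
    *r1 = s * p0 + c * p1;
  } else {  // sign == -1
    *r0 = c * p0 + s * p1;
    *r1 = c * p1 - s * p0;
  }
}

int main() {
  float theta = 0.3f, c = std::cos(theta), s = std::sin(theta);
  float p0 = 1.f, p1 = 2.f, q0, q1, b0, b1;
  rotate_pair(c, s, p0, p1, /*sign=*/1, &q0, &q1);
  rotate_pair(c, s, q0, q1, /*sign=*/-1, &b0, &b1);
  printf("%f %f\n", b0, b1);  // rotating forward then backward recovers 1, 2
  return 0;
}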
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu index d86e0493786ebd..c70812b473ee62 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -27,9 +27,12 @@ template void PutAlongAxisGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU, @@ -40,23 +43,118 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (index_type == DataType::INT32) { - phi::funcs::gpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); - } else { - phi::funcs::gpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); + if (!include_self || reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } else { + phi::funcs::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::gpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } else { + phi::funcs::gpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } } } if (value_grad) { value_grad->Resize(index.dims()); dev_ctx.template Alloc(value_grad); - if (index_type == DataType::INT32) { - phi::funcs::gpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); - } else { - phi::funcs::gpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); + auto* grad_data = value_grad->data(); + int64_t grad_size = value_grad->numel(); + cudaMemset(grad_data, 0, sizeof(T) * grad_size); + if (reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } + } else if (reduce == "add" || reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::gpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mul" || reduce == "multiply" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + 
phi::funcs::gpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::gpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } } } } diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu index b63047973e9b82..aff4eec7bff8dd 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -30,6 +30,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out) { PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU, true, @@ -42,31 +43,56 @@ void PutAlongAxisKernel(const Context& dev_ctx, if (reduce == "add") { if (index_type == DataType::INT32) { phi::funcs::gpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { if (index_type == DataType::INT32) { phi::funcs::gpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "assign") { if (index_type == DataType::INT32) { phi::funcs::gpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amin") { + if (index_type == DataType::INT32) { + phi::funcs::gpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::gpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); } } else { PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "support reduce op: 'add', 'assign', 'mul', 'mean', 'amin', 'amax' and " + "'multiply', the " "default reduce op is 'assign' ", reduce)); return; diff --git a/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu index 52a0e313398e8b..5ff1418b2732ad 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_grad, GPU, ALL_LAYOUT, @@ -33,4 +34,5 @@ PD_REGISTER_KERNEL(repeat_interleave_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu index ed62278f067e5f..7b0675b3a752df 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu +++ b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, GPU, @@ -34,4 +35,5 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu index 66688b417ae307..42ff5b912eccd0 100644 --- a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu @@ -35,3 +35,20 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + GPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + double, + int, + int64_t, + bool, + int16_t, + uint8_t, + int8_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu index 6cea7592836730..5993b11f638db2 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu @@ -46,10 +46,11 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, axis, index, out_grad, + true, dev_ctx); // the gradient of gather is scatter } else if (index_type == DataType::INT64) { phi::funcs::gpu_scatter_add_kernel( - *x_grad, axis, index, out_grad, dev_ctx); + *x_grad, axis, index, out_grad, true, dev_ctx); } else { PADDLE_THROW( phi::errors::InvalidArgument("The data type of input index is expected " diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index ba4c6ba27e6824..ea32c056d4016a 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -33,9 +33,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (index_type == DataType::INT32) { - phi::funcs::gpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::gpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else if (index_type == DataType::INT64) { - phi::funcs::gpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::gpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else { PADDLE_THROW( phi::errors::InvalidArgument("The data type of input index is expected " diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu index 6a4ad710fff0e7..eea67fc676d6df 100644 --- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu +++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu @@ -303,7 +303,7 @@ __device__ __forceinline__ void BlockReduce(Pair 
shared_max[], if (*beam >= MaxLength) break; } else { #ifdef PADDLE_WITH_HIP - uint64 mask = 0; + unsigned mask = 0u; mask = __ballot(true); if (tid_max / WARP_SIZE == wid) { if (__shfl_down(*beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength) diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index e3c6f1bd4c9ef7..c37c8a820aefa9 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -26,7 +26,14 @@ #include #include +#ifdef PADDLE_WITH_CUDA +#include "cub/cub.cuh" +#elif defined(PADDLE_WITH_MUSA) #include "cub/cub.cuh" +#else +#include +namespace cub = hipcub; +#endif #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" @@ -170,8 +177,11 @@ UniqueFlattendCUDATensor(const Context& context, #elif defined(PADDLE_WITH_MUSA) musaMemsetAsync(inv_loc_data_ptr, 0, sizeof(IndexT), context.stream()); #else - cudaMemsetAsync(inv_loc_data_ptr, 0, sizeof(IndexT), context.stream()); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault #endif + +#ifdef PADDLE_WITH_HIP size_t temp_storage_bytes = 0; cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes, @@ -187,6 +197,12 @@ UniqueFlattendCUDATensor(const Context& context, inv_loc_data_ptr, num_input, context.stream()); +#else + thrust::inclusive_scan(exec_policy, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + inv_loc_data_ptr); +#endif thrust::scatter(exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + num_input, @@ -390,9 +406,11 @@ static void ComputeUniqueDims(const Context& context, // 3. counts: 'counts' counts->Resize(common::make_ddim({num_out})); auto* count_data = context.template Alloc(counts); - thrust::fill(exec_policy, count_data, count_data + row, 0); - thrust::adjacent_difference( - exec_policy, range_data_ptr + 1, range_data_ptr + row + 1, count_data); + thrust::fill(exec_policy, count_data, count_data + num_out, 0); + thrust::adjacent_difference(exec_policy, + range_data_ptr + 1, + range_data_ptr + num_out + 1, + count_data); } // Calculate unique when 'axis' is set diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h index 3f78361b92b8bd..99f05f80c17ff7 100644 --- a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -341,4 +341,26 @@ void SetValueGradKernel(const Context& dev_ctx, } } +template +void SetValueWithScalarGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad) { + SetValueGradKernel(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + nullptr); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h index 5dafe445e2b461..201dd403270f36 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h @@ -105,6 +105,7 @@ void weight_permute_gpu(const GPUContext& dev_ctx, input_data, output_data, numel, total_k, total_n); } } + template __global__ void per_channel_quant_gpu(const T* weight_data, int8_t* quanted_weight_data, @@ -160,7 
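The unique kernel builds inverse indices with an inclusive scan over "new run starts" and per-value counts with an adjacent difference over run boundaries; the hunk above also fixes the count computation to iterate over num_out instead of row. A host-side sketch of the two steps using the standard library (thrust mirrors both calls on the device):

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Sorted input with duplicates.
  std::vector<int> sorted = {2, 2, 5, 7, 7, 7};

  // 1 where a new run starts, 0 otherwise; an inclusive scan then gives,
  // for each element, the index of the unique value it belongs to.
  std::vector<int> inv(sorted.size(), 0);
  for (size_t i = 1; i < sorted.size(); ++i)
    inv[i] = (sorted[i] != sorted[i - 1]) ? 1 : 0;
  std::partial_sum(inv.begin(), inv.end(), inv.begin());

  // Exclusive end position of each run; adjacent difference turns the
  // boundary positions into per-unique-value counts.
  std::vector<long long> ends = {2, 3, 6};
  std::vector<long long> counts(ends.size());
  std::adjacent_difference(ends.begin(), ends.end(), counts.begin());

  for (int v : inv) printf("%d ", v);             // 0 0 1 2 2 2
  printf("| ");
  for (long long c : counts) printf("%lld ", c);  // 2 1 3
  printf("\n");
  return 0;
}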
+161,6 @@ __global__ void per_channel_quant_gpu(const T* weight_data, } } } - template void weight_quant_gpu(const GPUContext& dev_ctx, const T* weight_data, @@ -174,8 +174,15 @@ void weight_quant_gpu(const GPUContext& dev_ctx, constexpr int kBlockSize = 64; constexpr int kWarpNum = kBlockSize / kWarpSize; constexpr int kVectorSize = 128 / sizeof(T) / 8; + PADDLE_ENFORCE_EQ(total_n % kVectorSize, + 0, + phi::errors::PreconditionNotMet( + "Currently, weight_quant_gpu kernel only support n " + "with multiple of %d, please use", + kVectorSize)); int vec_total_n = total_n / kVectorSize; - int kGridSize = max(vec_total_n / kBlockSize, static_cast(1)); + int kGridSize = + max((vec_total_n + kBlockSize - 1) / kBlockSize, static_cast(1)); per_channel_quant_gpu<<>>( weight_data, quanted_weight_data, scale_data, total_k, vec_total_n); } diff --git a/paddle/phi/kernels/put_along_axis_grad_kernel.h b/paddle/phi/kernels/put_along_axis_grad_kernel.h index 2141443da7ab17..07c39941ce8d83 100644 --- a/paddle/phi/kernels/put_along_axis_grad_kernel.h +++ b/paddle/phi/kernels/put_along_axis_grad_kernel.h @@ -24,9 +24,12 @@ template void PutAlongAxisGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad); diff --git a/paddle/phi/kernels/put_along_axis_kernel.h b/paddle/phi/kernels/put_along_axis_kernel.h index 797d0e364b48d4..c1cb13e607dd6e 100644 --- a/paddle/phi/kernels/put_along_axis_kernel.h +++ b/paddle/phi/kernels/put_along_axis_kernel.h @@ -27,6 +27,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/set_value_grad_kernel.h b/paddle/phi/kernels/set_value_grad_kernel.h index e4dad683e40a9d..04592cd2002d19 100644 --- a/paddle/phi/kernels/set_value_grad_kernel.h +++ b/paddle/phi/kernels/set_value_grad_kernel.h @@ -32,4 +32,14 @@ void SetValueGradKernel(const Context& dev_ctx, DenseTensor* x_grad, DenseTensor* value_grad); +template +void SetValueWithScalarGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc index d1ad332cd626c5..c5d33ae4ac8d06 100644 --- a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc @@ -397,6 +397,28 @@ void SetValueGradKernel(const Context& dev_ctx, } } +template +void SetValueWithScalarGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad) { + SetValueGradKernel(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + nullptr); +} + } // namespace phi PD_REGISTER_KERNEL(set_value_grad, @@ -407,3 +429,12 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::float16, int, int64_t) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + XPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + 
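The grid-size fix above replaces floor division with the usual ceiling division so that a partially filled final block is still launched; the difference matters whenever the element count is not a multiple of the block size:

#include <cassert>
#include <cstdio>

int main() {
  const int block = 64;
  // 130 elements need 3 blocks; floor division would launch only 2 and
  // silently skip the last two elements.
  int n = 130;
  int floor_blocks = n / block;
  int ceil_blocks = (n + block - 1) / block;
  printf("floor=%d ceil=%d\n", floor_blocks, ceil_blocks);  // floor=2 ceil=3
  assert(ceil_blocks * block >= n);
  return 0;
}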
phi::dtype::float16, + int, + int64_t) {} diff --git a/python/cinn/compiler/expr_executor.py b/python/cinn/compiler/expr_executor.py index cff9a9d62d7c43..c888be369c3d6e 100644 --- a/python/cinn/compiler/expr_executor.py +++ b/python/cinn/compiler/expr_executor.py @@ -81,14 +81,15 @@ def visit(self, node): value = exec_func(cls_fields) else: new_node = node.__class__(**cls_fields) - ast.copy_location(new_node, node) - new_node = ast.Expression(new_node) value = self.exec_expr(new_node) return self.save_temp_value(value) def exec_expr(self, node): - if isinstance(node, ast.expr): - node = ast.Expression(body=node) + assert isinstance(node, ast.expr) + if type(node).__name__ == "Constant": + return node.value + + node = ast.Expression(node) node = ast.fix_missing_locations(node) exec = compile(node, filename="", mode="eval") return eval(exec, self.var_table) diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index cc12d50a6069f2..7b4c81cfa323d0 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -876,7 +876,7 @@ def __array__(self, dtype=None): array = array.astype(dtype) return array - def pre_deal_index_and_value(self, item, value=None): + def pre_deal_index(self, item): # since in pybind there is no effiency way to transfer Py_Tuple/Py_List/Py_Range to Tensor # we call this function in python level. item = list(item) if isinstance(item, tuple) else [item] @@ -886,17 +886,14 @@ def pre_deal_index_and_value(self, item, value=None): elif isinstance(slice_item, range): item[i] = paddle.to_tensor(list(slice_item)) - if value is not None and not isinstance(value, Variable): - value = paddle.to_tensor(value, dtype=self.dtype) - - return tuple(item), value + return tuple(item) def __getitem__(self, item): - item, _ = pre_deal_index_and_value(self, item) + item = pre_deal_index(self, item) return self._getitem_dygraph(item) def __setitem__(self, item, value): - item, value = pre_deal_index_and_value(self, item, value) + item = pre_deal_index(self, item) return self._setitem_dygraph(item, value) @framework.dygraph_only diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index fff2771da14c2f..bb8a4bc7b10ab0 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -682,7 +682,7 @@ def _get_varname_from_block(block): ) -def _get_program_cache_key(feed, fetch_list): +def _get_feed_fetch_var_names(feed, fetch_list): feed_var_names = [] if isinstance(feed, dict): feed_var_names = list(feed.keys()) @@ -690,7 +690,11 @@ def _get_program_cache_key(feed, fetch_list): for i, each in enumerate(feed): feed_var_names += list(each.keys()) fetch_var_names = list(map(_to_name_str, fetch_list)) - return str(feed_var_names + fetch_var_names) + return feed_var_names + fetch_var_names + + +def _get_program_cache_key(feed, fetch_list): + return str(_get_feed_fetch_var_names(feed, fetch_list)) def _as_lodtensor(data, place, dtype=None): @@ -1026,7 +1030,7 @@ def _get_program_and_executor(self, cached_data): if enable_inplace or enable_addto: # inplace should skip feed and fetch var - skip_var_names = eval(_get_program_cache_key(feed, fetch_list)) + skip_var_names = _get_feed_fetch_var_names(feed, fetch_list) _apply_inplace_addto_pass( program, enable_inplace, enable_addto, skip_var_names ) @@ -2476,7 +2480,7 @@ def _run_from_dataset( reused_trainer = program._heter_pipeline_opt is not None or ( program._fleet_opt is not None - and 
program._fleet_opt.get("use_ps_gpu", True) + and program._fleet_opt.get("use_ps_gpu", False) ) if reused_trainer is False: diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index c0d128d4fbbb31..c3a65971ffd983 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -5582,8 +5582,7 @@ def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True): def _convert_to_pdf(dot_file_path): pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' exited_code = subprocess.call( - 'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path, - shell=True, + ['dot', '-Tpdf', dot_file_path, '-o', pdf_save_path] ) if exited_code != 0: print('The dot command is needed for creating pdf files.') diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index bf1d737970327d..f3ba8aa5e197e8 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -538,8 +538,10 @@ def __impl__(self, other_var): op_type, lhs_dtype, rhs_dtype ) warnings.warn( - f"The input dtypes of OP {op_type} are {lhs_dtype} and {rhs_dtype}, " - "the output will be auto-promoted to {common_dtype}" + f"The input dtypes of OP {op_type} are {lhs_dtype} and {rhs_dtype}, the output will be auto-promoted to {common_dtype}" + ) + warnings.filterwarnings( + "ignore", message="The input dtypes of OP" ) if rhs_dtype != common_dtype: other_var = astype(other_var, common_dtype) diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index f3a04076ef3fbd..533d0360764a8a 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import warnings import numpy as np @@ -136,7 +137,6 @@ def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): attrs[attr_name] = attr -# the item is a tensor of bool def get_value_for_bool_tensor(var, item): if len(item.shape) > len(var.shape): raise IndexError( @@ -191,7 +191,9 @@ def _setitem_for_tensor_array(var, item, value): ) -def deal_advanced_index(ori_tensor, indices, is_for_setitem): +def deal_advanced_index( + ori_tensor, indices, is_for_setitem, values, out_is_view=True +): """ Transpose origin Tensor and advanced indices to the front. @@ -201,6 +203,7 @@ def deal_advanced_index(ori_tensor, indices, is_for_setitem): trans_back_dim (List): order of axes to transpose back to original order. Only used in __setitem__. pos_of_new_dim (int): axis of new dim in the result. Only used in __getitem__. rank_of_new_dim (int): rank of new dim in the result. Only used in __getitem__. + transed_value_tensor (Tensor): value tensor transed to the front. Only used in __setitem__. """ transed_dim = [] transed_index = [] @@ -212,24 +215,38 @@ def deal_advanced_index(ori_tensor, indices, is_for_setitem): for i, indice in enumerate(indices): if indice is not None: - if not is_for_setitem: - if i == 0: - # case 1: advanced indices at axis 0, the new dim will be at first. - pos_of_new_dim = 0 - if i > 0 and len(transed_dim) > 0 and transed_dim[-1] != i - 1: - # case 2: there are not adjacent advanced indices, the new dim will be at first. - pos_of_new_dim = 0 - else: - pos_of_new_dim = min(pos_of_new_dim, i) - rank_of_new_dim = max(rank_of_new_dim, indice[1].ndim) + if i == 0: + # case 1: advanced indices at axis 0, the new dim will be at first. 
+ pos_of_new_dim = 0 + if i > 0 and len(transed_dim) > 0 and transed_dim[-1] != i - 1: + # case 2: there are not adjacent advanced indices, the new dim will be at first. + pos_of_new_dim = 0 + else: + pos_of_new_dim = min(pos_of_new_dim, i) + rank_of_new_dim = max(rank_of_new_dim, indice[1].ndim) transed_dim.append(i) transed_index.append(indice[1]) for i in range(ori_tensor.ndim): if indices[i] is None: transed_dim.append(i) - transed_tensor = ori_tensor.transpose(transed_dim) trans_back_dim = np.argsort(transed_dim).tolist() if is_for_setitem else [] + transed_value_tensor = None + + if transed_dim == list(range(ori_tensor.ndim)): + transed_tensor = ori_tensor + if is_for_setitem: + transed_value_tensor = values + else: + out_is_view = True + transed_tensor = ori_tensor.transpose(transed_dim) + if is_for_setitem: + if values.ndim > 1 and pos_of_new_dim != 0: + # If the value tensor is not a scalar / 1-D Tensor, and the src tensor was + # transposed at 1st dim, the value tensor should be transposed too. + transed_value_tensor = values.transpose(transed_dim) + else: + transed_value_tensor = values return ( transed_tensor, @@ -237,11 +254,25 @@ def deal_advanced_index(ori_tensor, indices, is_for_setitem): trans_back_dim, pos_of_new_dim, rank_of_new_dim, + transed_value_tensor, + out_is_view, ) def parse_index(x, indices): - advanced_index = [None] * 2 * len(x.shape) # content is (dim, index) + from .framework import in_pir_mode + + if in_pir_mode(): + is_tensor_array = x.is_dense_tensor_array_type() + else: + is_tensor_array = ( + hasattr(x, "desc") + and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ) + + advanced_index = ( + [] if is_tensor_array else [None] * 2 * len(x.shape) + ) # content is (dim, index) # for set_value / slice / strided_slice OP decrease_axes = [] axes = [] @@ -258,11 +289,6 @@ def parse_index(x, indices): indices = replace_ellipsis(x, indices) indices, none_axes = replace_none(indices) - is_tensor_array = ( - hasattr(x, "desc") - and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY - ) - estimated_dim = 0 dim = 0 for i, slice_item in enumerate(indices): @@ -550,7 +576,12 @@ def _setitem_static(x, indices, values): # 3. assign values to the sliced result by index_put OP; # 4. transpose back and assign the result to original tensor by set_value OP. - sub_tensor = get_tensor_with_basic_indexing( + if not isinstance( + values, (Variable, paddle.pir.Value, paddle.pir.OpResult) + ): + values = paddle.assign(values).astype(x.dtype) + + sub_tensor, is_view = get_tensor_with_basic_indexing( x, axes, starts, @@ -566,18 +597,41 @@ def _setitem_static(x, indices, values): transback_dim, _, _, - ) = deal_advanced_index(sub_tensor, advanced_index, True) - if not isinstance(values, (Variable, paddle.pir.Value)): - values = paddle.assign(values).astype(transed_sub_tensor.dtype) + values, + is_view, + ) = deal_advanced_index( + sub_tensor, advanced_index, True, values, is_view + ) if values.dtype != transed_sub_tensor.dtype: values = values.astype(transed_sub_tensor.dtype) - if in_dynamic_or_pir_mode(): - # NOTE(zoooo0820): directly return result instead of another set_value, after backward bug fixed. 
- transed_sub_tensor = transed_sub_tensor.index_put_( - adjusted_advanced_index, values - ) + if paddle.in_dynamic_mode(): + if ( + len(adjusted_advanced_index) == 1 + and adjusted_advanced_index[0].dtype + in (paddle.bool, paddle.base.libpaddle.BOOL) + and len( + adjusted_advanced_index[0].shape + == len(transed_sub_tensor.shape) + ) + ): + if values.shape != transed_sub_tensor.shape: + values = values.expand(transed_sub_tensor.shape) + transed_sub_tensor = paddle._C_ops.where_( + paddle.logical_not(adjusted_advanced_index[0]), + transed_sub_tensor, + values, + ) + if not is_view: + return x + else: + # NOTE(zoooo0820): directly return result instead of another set_value, after backward bug fixed. + transed_sub_tensor = transed_sub_tensor.index_put_( + adjusted_advanced_index, values + ) + if not is_view: + return x else: transed_sub_tensor = transed_sub_tensor.index_put( adjusted_advanced_index, values @@ -624,12 +678,14 @@ def get_tensor_with_basic_indexing( ): from .dygraph.base import in_to_static_mode + out_is_view = False if in_to_static_mode() and hasattr(x, "is_view_var"): x.is_view_var = True if len(axes) == 0: out = x else: + out_is_view = True op_type = "strided_slice" if use_strided_slice else "slice" inputs = {'Input': [x]} attrs = { @@ -677,6 +733,8 @@ def get_tensor_with_basic_indexing( if isinstance(end, (list, tuple)): if paddle.utils._contain_var(end): end = paddle.utils.get_int_tensor_list(end) + if x.is_dense_tensor_array_type(): + return paddle._pir_ops.slice_array_dense(x, st), False out = paddle._C_ops.slice( x, axes, @@ -703,17 +761,9 @@ def get_tensor_with_basic_indexing( attrs=attrs, ) out = slice_out_var - # NOTE(zoooo0820): When all axes are decreased, the output will be 1-D - # with FLAGS_set_to_1d=True. In this case, one `None` should be pop out, - # otherwise the output shape will be not correct. - set_to_1d = paddle.get_flags('FLAGS_set_to_1d')['FLAGS_set_to_1d'] - if set_to_1d and len(decrease_axes) == len(x.shape): - warnings.warn( - "Warning: In Tensor '__getitem__', if the number of scalar elements in the index is equal to the rank of the Tensor, the output should be 0-D. In order to be consistent with the behavior of previous versions, it will be processed to 1-D. But it is not correct and will be removed in release 2.6. If 1-D is still wanted, please modify the index element from scalar to slice (e.g. 'x[i]' => 'x[i:i+1]')." 
- ) - none_axes = none_axes[1:] if len(none_axes) > 0: + out_is_view = True # Deal with cases that decrease_axes is not empty # For example: # # x.shape: (2,3,4) @@ -727,7 +777,7 @@ def get_tensor_with_basic_indexing( if in_to_static_mode() and hasattr(out, "is_view_var"): out.is_view_var = True - return out + return out, out_is_view def _getitem_static(x, indices): @@ -750,7 +800,7 @@ def _getitem_static(x, indices): ) = parse_index(x, indices) # step2: Dealing with basic indexing - out = get_tensor_with_basic_indexing( + out, _ = get_tensor_with_basic_indexing( x, axes, starts, @@ -769,13 +819,14 @@ def _getitem_static(x, indices): _, pos_of_new_dim, rank_of_new_dim, - ) = deal_advanced_index(out, advanced_index, False) + _, + _, + ) = deal_advanced_index(out, advanced_index, False, None) # TODO(zooooo0820): Replacing gather_nd to another advanded OP for handling of mixed indexes more efficiently - if ( - len(adjusted_advanced_index) == 1 - and adjusted_advanced_index[0].dtype == paddle.bool - ): + if len(adjusted_advanced_index) == 1 and adjusted_advanced_index[ + 0 + ].dtype in (paddle.bool, paddle.base.libpaddle.BOOL): # Note: now slice not support 0-size Tensor, so only one bool tensor can return empty 0-size. out = get_value_for_bool_tensor( transed_tensor, adjusted_advanced_index[0] @@ -797,8 +848,8 @@ def _getitem_static(x, indices): if pos_of_new_dim != 0: perm = ( - list(range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim)) - + list(range(0, pos_of_new_dim)) + list(range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim)) + + list(range(0, rank_of_new_dim)) + list(range(pos_of_new_dim + rank_of_new_dim, out.ndim)) ) out = out.transpose(perm) diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 4695b633ffa0fd..35155a2de2d226 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -18,6 +18,7 @@ import importlib import os import pickle +import re import shutil import sys import tempfile @@ -71,6 +72,11 @@ def md5file(fname): def download(url, module_name, md5sum, save_name=None): + module_name = re.match("^[a-zA-Z0-9_/\\-]+$", module_name).group() + if isinstance(save_name, str): + save_name = re.match( + "^(?:(?!\\.\\.)[a-zA-Z0-9_/\\.-])+$", save_name + ).group() dirname = os.path.join(DATA_HOME, module_name) if not os.path.exists(dirname): os.makedirs(dirname) diff --git a/python/paddle/distributed/auto_tuner/prune.py b/python/paddle/distributed/auto_tuner/prune.py index 94f0a67a21debc..a86ce0f31dd367 100644 --- a/python/paddle/distributed/auto_tuner/prune.py +++ b/python/paddle/distributed/auto_tuner/prune.py @@ -510,17 +510,42 @@ def prune_by_memory_estimation(tuner_cfg, cur_cfg, history_cfgs=[]): "max_mem_usage should be set when using memory estimation tool" ) - memory_estimation_cmd = f"python {memory_estimation_tool} --dp_degree {cur_cfg['dp_degree']} --mp_degree {cur_cfg['mp_degree']} \ - --pp_degree {cur_cfg['pp_degree']} --vpp_degree {cur_cfg['vpp_degree']} \ - --sharding_degree {cur_cfg['sharding_degree']} --sharding_stage {cur_cfg['sharding_stage']} \ - --use_recompute {cur_cfg['use_recompute']} --micro_batch_size {cur_cfg['micro_batch_size']} \ - --recompute_granularity {cur_cfg['recompute_granularity']} \ - --hidden_size {model_cfg['hidden_size']} --num_attention_heads {model_cfg['num_attention_heads']} \ - --num_layers {model_cfg['num_layers']} --max_sequence_length {model_cfg['max_sequence_length']} \ - --vocab_size {model_cfg['vocab_size']} --intermediate_size {model_cfg['intermediate_size']} " + 
memory_estimation_cmd = [ + "python", + memory_estimation_tool, + "--dp_degree", + str(cur_cfg['dp_degree']), + "--mp_degree", + str(cur_cfg['mp_degree']), + "--pp_degree", + str(cur_cfg['pp_degree']), + "--vpp_degree", + str(cur_cfg['vpp_degree']), + "--sharding_degree", + str(cur_cfg['sharding_degree']), + "--sharding_stage", + str(cur_cfg['sharding_stage']), + "--use_recompute", + str(cur_cfg['use_recompute']), + "--micro_batch_size", + str(cur_cfg['micro_batch_size']), + "--recompute_granularity", + str(cur_cfg['recompute_granularity']), + "--hidden_size", + str(model_cfg['hidden_size']), + "--num_attention_heads", + str(model_cfg['num_attention_heads']), + "--num_layers", + str(model_cfg['num_layers']), + "--max_sequence_length", + str(model_cfg['max_sequence_length']), + "--vocab_size", + str(model_cfg['vocab_size']), + "--intermediate_size", + str(model_cfg['intermediate_size']), + ] result = subprocess.run( memory_estimation_cmd, - shell=True, capture_output=True, text=True, ) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index ddc6c411598c34..4859a438a930a7 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -13,6 +13,7 @@ # limitations under the License. """Definition of Role Makers.""" import os +import re import time import warnings from multiprocessing import Manager, Process @@ -988,7 +989,9 @@ def _ps_env(self): # each role will execute it raise ValueError( "Can not find PADDLE_STAGE_TRAINERS_NUM, please check your environment." ) - self._stage_trainers = eval(self._stage_trainers) + self._stage_trainers = tuple( + [int(x) for x in re.findall(r'\d+', self._stage_trainers)] + ) cur_port = os.getenv("PADDLE_PORT", None) if cur_port is None: raise ValueError( @@ -1040,7 +1043,9 @@ def _ps_env(self): # each role will execute it raise ValueError( "Can not find PADDLE_STAGE_TRAINERS_NUM, please check your environment." ) - self._stage_trainers = eval(self._stage_trainers) + self._stage_trainers = tuple( + [int(x) for x in re.findall(r'\d+', self._stage_trainers)] + ) self._heter_trainer_device_type = os.getenv( "HETER_DEVICE_TYPE", None diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 743ceac3e296cc..51aeeb6840d0a4 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -18,6 +18,7 @@ import os import re import shutil +import subprocess import time # (TODO: GhostScreaming) It will be removed later. 
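The hunks above replace shell-interpolated command strings with argument lists handed directly to `subprocess.run`, so configuration values are never re-parsed by a shell. A minimal standalone sketch of that pattern, assuming a hypothetical `tool.py` and two illustrative flags (not the actual Paddle call sites):

```python
import subprocess


def run_tool(tool_path: str, dp_degree: int, mp_degree: int):
    # Each argument is a separate list element; nothing goes through a shell,
    # so a value like "1; rm -rf /" stays an inert string instead of a command.
    cmd = [
        "python", tool_path,
        "--dp_degree", str(dp_degree),
        "--mp_degree", str(mp_degree),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.returncode, result.stdout
```

Besides closing the injection path, the list form also sidesteps quoting problems for paths that contain spaces.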
@@ -513,6 +514,34 @@ def _run_cmd(self, cmd, redirect_stderr=False, retry_times=5): return ret, output.splitlines() + def _run_safe_cmd(self, cmd, redirect_stderr=False, retry_times=5): + exe_cmd = [self._base_cmd] + cmd.split() + ret = 0 + output = "" + retry_sleep_second = 3 + for x in range(retry_times + 1): + try: + process = subprocess.run( + exe_cmd, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT + if redirect_stderr + else subprocess.PIPE, + text=True, + ) + output = process.stdout + break + except subprocess.CalledProcessError as e: + ret = e.returncode + output = e.output + time.sleep(retry_sleep_second) + except Exception as e: + break + + if ret == 134: + raise FSShellCmdAborted(cmd) + @_handle_errors() def list_dirs(self, fs_path): """ @@ -582,8 +611,8 @@ def ls_dir(self, fs_path): return self._ls_dir(fs_path) def _ls_dir(self, fs_path): - cmd = f"ls {fs_path}" - ret, lines = self._run_cmd(cmd) + cmd = ["-ls", fs_path] + ret, lines = self._run_safe_cmd(cmd) if ret != 0: raise ExecuteError(cmd) diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 13d8ef403504ab..9e0f46584653c8 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -45,7 +45,10 @@ def build_pod(self): ): return self._build_pod_with_args() else: - return self._build_pod_with_master() + if self.ctx.args.auto_parallel_config is None: + skip_run = True + # only when skip_run is Flase, should not reset pod + return self._build_pod_with_master(skip_run) def _build_pod_with_tuner(self): auto_parallel_config = self.ctx.args.auto_parallel_config @@ -148,7 +151,7 @@ def _build_pod_with_args(self): return True - def _build_pod_with_master(self): + def _build_pod_with_master(self, reset_pod=True): self.pod.replicas = self.pod_replicas() # rank will be reset when restart @@ -203,7 +206,8 @@ def _build_pod_with_master(self): job_endpoints = [i['endpoints'] for i in peer_list] - # self.pod.reset() + if reset_pod: + self.pod.reset() selected_dev_key = self.ctx.node.device.get_selected_device_key() selected_dev_list = self.ctx.node.device.get_selected_devices( self.ctx.args.devices diff --git a/python/paddle/distributed/rpc/rpc.py b/python/paddle/distributed/rpc/rpc.py index 0d88c8fef1ce51..273fc1bcc0196b 100644 --- a/python/paddle/distributed/rpc/rpc.py +++ b/python/paddle/distributed/rpc/rpc.py @@ -142,7 +142,7 @@ def init_rpc(name, rank=None, world_size=None, master_endpoint=None): def rpc_sync(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): """ - Make a blocking RPC call to run function ``fn`` on worker ``to``. + Make a blocking RPC call to run function ``fn`` on worker ``to``. Attention: Users must use this API in a secure network environment. Args: to (str): name of the destination worker. @@ -182,7 +182,7 @@ def rpc_sync(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): def rpc_async(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): """ - Make a non-blocking RPC call to run function ``fn`` on worker ``to``. + Make a non-blocking RPC call to run function ``fn`` on worker ``to``. Attention: Users must use this API in a secure network environment. Args: to (str): name of the destination worker. 
diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py index c39fa57ad56816..f25640804fdbcc 100644 --- a/python/paddle/hapi/hub.py +++ b/python/paddle/hapi/hub.py @@ -117,7 +117,6 @@ def _get_cache_or_reload(repo, force_reload, verbose=True, source='github'): hub_dir, check_exist=not force_reload, decompress=False, - method=('wget' if source == 'gitee' else 'get'), ) shutil.move(fpath, cached_file) diff --git a/python/paddle/incubate/distributed/fleet/fleet_util.py b/python/paddle/incubate/distributed/fleet/fleet_util.py index 1777afffe9aaf4..9af91e4f5b148c 100644 --- a/python/paddle/incubate/distributed/fleet/fleet_util.py +++ b/python/paddle/incubate/distributed/fleet/fleet_util.py @@ -18,6 +18,7 @@ import logging import math import os +import re import sys import time @@ -1317,23 +1318,12 @@ def get_online_pass_interval( ... is_data_hourly_placed=False) """ - assert ( - "|" not in days - and ";" not in days - and "\\" not in days - and "/" not in days - and "(" not in days - and ")" not in days - ), r"days should not contain [|,;,\,/,(,)]" + pattern = r'^\d+|{[0-9]+}|{[0-9]+\.\.[0-9]+}$' + if not re.fullmatch(pattern, str(days)): + raise Exception("days format is not right") days = os.popen("echo -n " + days).read().split(" ") - assert ( - "|" not in hours - and ";" not in hours - and "\\" not in hours - and "/" not in hours - and "(" not in hours - and ")" not in days - ), r"hours should not contain [|,;,\,/,(,)]" + if not re.fullmatch(pattern, str(hours)): + raise Exception("hours format is not right") hours = os.popen("echo -n " + hours).read().split(" ") split_interval = int(split_interval) split_per_pass = int(split_per_pass) diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py index ed64f5da3d9e93..84260dc90ca562 100644 --- a/python/paddle/io/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -704,10 +704,11 @@ def _get_data(self): if len(failed_workers) > 0: self._exit_thread_unexpectedly() pids = ', '.join(str(w.pid) for w in failed_workers) - raise RuntimeError( - f"DataLoader {len(failed_workers)} workers exit unexpectedly, " - f"pids: {pids}" + logging.warning( + "DataLoader {} workers exit unexpectedly, " + "pids: {}".format(len(failed_workers), pids) ) + return # get(timeout) will call _poll(timeout) and may raise IOError if isinstance(e, (IOError, queue.Empty)): diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index a6cfb4cd8c3993..bf44c2a47dcbb0 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -726,7 +726,12 @@ def convert_var_dtype(var, dtype): } return paddle.cast(var, dtype=cast_map[dtype]) else: - return eval(f'{dtype}(var)') + assert dtype in [ + 'bool', + 'int', + 'float', + ], f"The casted target dtype is {dtype}, which is not supported in type casting." 
+ return eval(dtype)(var) def convert_assert(cond, message=""): diff --git a/python/paddle/nn/quant/format.py b/python/paddle/nn/quant/format.py index 2e58f1ef2b8b62..0074389b3bc554 100644 --- a/python/paddle/nn/quant/format.py +++ b/python/paddle/nn/quant/format.py @@ -46,7 +46,14 @@ def from_quanter(quanter): class LinearQuanter(Layer): - def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): + def __init__( + self, + scales, + zero_point=None, + quant_axis=None, + bit_length=8, + group_size=128, + ): super().__init__() scales = paddle.to_tensor(scales, dtype="float32") scale_attr = paddle.framework.ParamAttr( @@ -65,9 +72,21 @@ def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): ) self._quant_axis = -1 if quant_axis is None else quant_axis self._bit_length = bit_length + self._group_size = group_size def forward(self, input): if in_dynamic_mode(): + if len(self._scales.shape) > 1: + bnt = (1 << (self._bit_length - 1)) - 1 + new_s = paddle.repeat_interleave( + self._scales, self._group_size, 0 + ) + quant_weight = paddle.clip( + paddle.round(input.cast('float32') / new_s * bnt), + -bnt - 1, + bnt, + ) + return quant_weight.cast(input.dtype) return _C_ops.quantize_linear( input.cast('float32'), self._scales, @@ -105,7 +124,14 @@ def from_quanter(quanter): class LinearDequanter(Layer): - def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): + def __init__( + self, + scales, + zero_point=None, + quant_axis=None, + bit_length=8, + group_size=128, + ): super().__init__() scales = paddle.to_tensor(scales, dtype="float32") scale_attr = paddle.framework.ParamAttr( @@ -124,9 +150,18 @@ def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8): ) self._quant_axis = -1 if quant_axis is None else quant_axis self._bit_length = bit_length + self._group_size = group_size def forward(self, input): if in_dynamic_mode(): + if len(self._scales.shape) > 1: + bnt = (1 << (self._bit_length - 1)) - 1 + new_s = paddle.repeat_interleave( + self._scales, self._group_size, 0 + ) + quant_dequant_weight = input.cast('float32') / bnt * new_s + return quant_dequant_weight.cast(input.dtype) + return _C_ops.dequantize_linear( input.cast('float32'), self._scales, diff --git a/python/paddle/quantization/observers/__init__.py b/python/paddle/quantization/observers/__init__.py index 733b3e7dbb9812..9bb662b53626ea 100644 --- a/python/paddle/quantization/observers/__init__.py +++ b/python/paddle/quantization/observers/__init__.py @@ -14,5 +14,6 @@ # limitations under the License. from .abs_max import AbsmaxObserver +from .groupwise import GroupWiseWeightObserver -__all__ = ["AbsmaxObserver"] +__all__ = ["AbsmaxObserver", "GroupWiseWeightObserver"] diff --git a/python/paddle/quantization/observers/groupwise.py b/python/paddle/quantization/observers/groupwise.py new file mode 100644 index 00000000000000..9d30a7101c1128 --- /dev/null +++ b/python/paddle/quantization/observers/groupwise.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle + +from ..base_observer import BaseObserver +from ..factory import ObserverFactory + + +class GroupWiseWeightObserver(ObserverFactory): + r""" + It collects channel-wise maximum absolute values of target weights. + Args: + bit_length(int, optional): Number of bits to represent an quantized integer in binary. + dtype(str, optional): The data type of input tensor. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + Examples: + .. code-block:: python + from paddle.quantization import QuantConfig + from paddle.quantization.quanters import AbsMaxChannelWiseWeightObserver + quanter = AbsMaxChannelWiseWeightObserver() + q_config = QuantConfig(activation=None, weight=quanter) + """ + + def __init__(self, quant_bits=8, group_size=128): + super().__init__(quant_bits=quant_bits) + + def _get_class(self): + return GroupWiseWeightObserverLayer + + +class GroupWiseWeightObserverLayer(BaseObserver): + def __init__(self, layer, quant_bits=8, group_size=128): + super().__init__() + self.quant_bits = quant_bits + self.group_size = group_size + self._layer = layer + self._max = None + self._scale = None + self._zero_point = None + + def forward(self, inputs): + self._max = self._cal_abs_max(inputs) + return inputs + + def _cal_abs_max(self, inputs): + """Use group_size to group the input, then use the + absmax method to calculate the scale + """ + input_shape = inputs.shape + assert ( + self.group_size == 64 or self.group_size == 128 + ), "group_size only support 64 or 128" + assert ( + inputs.shape[0] % self.group_size == 0 + ), "group_size must be a factor of input channels" + assert len(inputs.shape) == 2, "Currently only support 2D tensor" + input_processed = inputs.transpose([1, 0]).reshape( + [input_shape[1], input_shape[0] // self.group_size, self.group_size] + ) + + abs_max_values = paddle.max(paddle.abs(input_processed), axis=2).cast( + "float32" + ) + abs_max_values = paddle.where( + abs_max_values == np.float32(0), np.float32(1e-8), abs_max_values + ) + abs_max_values = abs_max_values.transpose([1, 0]) + return abs_max_values + + def min_value(self) -> float: + return 0.0 + + def max_value(self) -> float: + return self._max + + def bit_length(self): + return self._quant_bits + + def quant_axis(self): + return -1 + + def cal_thresholds(self): + """Compute thresholds for MAX function.""" + if self._scale is None: + self._scale = self._max + self._zero_point = paddle.zeros_like(self._scale) + + def scales(self): + """Return output scales.""" + if self._scale is None: + self.cal_thresholds() + return self._scale + + def zero_points(self): + """Return output zero points.""" + if self._zero_point is None: + self.cal_thresholds() + return self._zero_point diff --git a/python/paddle/quantization/quantize.py b/python/paddle/quantization/quantize.py index b7887ffc46e1c4..7606c4bb3e1827 100644 --- a/python/paddle/quantization/quantize.py +++ b/python/paddle/quantization/quantize.py @@ -28,8 +28,9 @@ class Quantization(metaclass=abc.ABCMeta): r""" Abstract class used to prepares a copy of the model for quantization calibration or quantization-aware training. 
+ Args: - config(QuantConfig) - Quantization configuration + config(QuantConfig): Quantization configuration """ def __init__(self, config: QuantConfig): @@ -43,10 +44,11 @@ def quantize(self, model: Layer, inplace=False): def convert(self, model: Layer, inplace=False, remain_weight=False): r"""Convert the quantization model to ONNX style. And the converted model can be saved as inference model by calling paddle.jit.save. + Args: - model(Layer) - The quantized model to be converted. - inplace(bool, optional) - Whether to modify the model in-place, default is False. - remain_weight(bool, optional) - Whether to remain weights in floats, default is False. + model(Layer): The quantized model to be converted. + inplace(bool, optional): Whether to modify the model in-place, default is False. + remain_weight(bool, optional): Whether to remain weights in floats, default is False. Return: The converted model @@ -72,7 +74,12 @@ def convert(self, model: Layer, inplace=False, remain_weight=False): for name, child in _model.named_children(): quant_dequant = None if isinstance(child, ConvertibleQuantedLayer): - if child.weight_quanter.scales() is None: + if child.converted: + continue + if ( + child.weight_quanter is None + or child.weight_quanter.scales() is None + ): continue child._convert(remain_weight=remain_weight) elif isinstance(child, BaseQuanter): diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 87cb258952f9e4..d8ee8698e2d70e 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -5308,11 +5308,12 @@ def put_along_axis( Args: arr (Tensor) : The Destination Tensor. Supported data types are float32 and float64. indices (Tensor) : Indices to put along each 1d slice of arr. This must match the dimension of arr, - and need to broadcast against arr. Supported data type are int and int64. + and need to broadcast against arr if broadcast is 'True'. Supported data type are int and int64. + values (Tensor) : The value element(s) to put. The data types should be same as arr. axis (int) : The axis to put 1d slices along. - reduce (str, optional): The reduce operation, default is 'assign', support 'add', 'assign', 'mul' and 'multiply'. - include_self (bool, optional): whether to reduce with the elements of arr. (Only support True now) - broadcast (bool, optional): whether to broadcast indices. + reduce (str, optional): The reduce operation, default is 'assign', support 'add', 'assign', 'mul', 'multiply', "mean", "amin" and "amax". + include_self (bool, optional): whether to reduce with the elements of arr, default is 'True'. + broadcast (bool, optional): whether to broadcast indices, default is 'True'. 
Returns: Tensor, The indexed element, same dtype with arr @@ -5332,9 +5333,45 @@ def put_along_axis( [[99, 99, 99], [60, 40, 50]]) + >>> index = paddle.zeros((2,2)).astype("int32") + >>> value=paddle.to_tensor([[1,2],[3,4]]).astype(x.dtype) + >>> result = paddle.put_along_axis(x, index, value, 0, "add", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[14, 36, 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "mul", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[30 , 240, 20 ], + [60 , 40 , 50 ]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "mean", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4 , 12, 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "amin", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1 , 2 , 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "amax", True, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[10, 30, 20], + [60, 40, 50]]) + + >>> result = paddle.put_along_axis(x, index, value, 0, "add", False, False) + >>> print(result) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4 , 6 , 20], + [60, 40, 50]]) + """ - if not include_self: - raise ValueError("`include_self` is only support True now.") if len(arr.shape) != len(indices.shape): raise ValueError( "`indices` and `arr` must have the same number of dimensions!" @@ -5381,7 +5418,15 @@ def put_along_axis( ) ) if in_dynamic_or_pir_mode(): - return _C_ops.put_along_axis(arr, indices, values, axis, reduce) + if convert_dtype(indices.dtype) not in ['int32', 'int64']: + raise TypeError( + "The data type of indices should be one of ['int32', 'int64'], but got {}".format( + str(convert_dtype(indices.dtype)) + ) + ) + return _C_ops.put_along_axis( + arr, indices, values, axis, reduce, include_self + ) else: check_variable_and_dtype( arr, @@ -5400,20 +5445,27 @@ def put_along_axis( check_variable_and_dtype( indices, 'index', ['int32', 'int64'], 'put_along_axis' ) + check_type(include_self, 'include_self', bool, 'put_along_axis') helper = LayerHelper('put_along_axis', **locals()) dtype = helper.input_dtype() result = helper.create_variable_for_type_inference(dtype) helper.append_op( type="put_along_axis", inputs={"Input": arr, "Index": indices, "Value": values}, - attrs={"Axis": axis, "Reduce": reduce}, + attrs={ + "Axis": axis, + "Reduce": reduce, + "Include_self": include_self, + }, outputs={"Result": result}, ) return result @inplace_apis_in_dygraph_only -def put_along_axis_(arr, indices, values, axis, reduce='assign'): +def put_along_axis_( + arr, indices, values, axis, reduce='assign', include_self=True +): r""" Inplace version of ``put_along_axis`` API, the output Tensor will be inplaced with input ``arr``. Please refer to :ref:`api_paddle_put_along_axis`. 
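A short usage sketch of the in-place variant with the new `include_self` flag; the shapes and values below are illustrative only and the exact output is not asserted here:

```python
import paddle

x = paddle.to_tensor([[10., 30., 20.], [60., 40., 50.]])
index = paddle.to_tensor([[0, 0, 0], [1, 1, 1]], dtype='int64')
values = paddle.ones([2, 3])

# With include_self=False, each written slot accumulates only the put values;
# slots that receive no value keep their original entries.
paddle.put_along_axis_(x, index, values, 0, 'add', include_self=False)
print(x)
```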
@@ -5432,7 +5484,9 @@ def put_along_axis_(arr, indices, values, axis, reduce='assign'): if broadcast_shape: indices = paddle.broadcast_to(indices, broadcast_shape) values = paddle.broadcast_to(values, indices.shape) - return _C_ops.put_along_axis_(arr, indices, values, axis, reduce) + return _C_ops.put_along_axis_( + arr, indices, values, axis, reduce, include_self + ) def index_add(x, index, axis, value, name=None): diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index e64f5e6a25b3f6..2ebff3f5cf25de 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -1326,6 +1326,7 @@ def _jit_compile(file_path, verbose=False): """ Build shared library in subprocess """ + assert os.path.exists(file_path) ext_dir = os.path.dirname(file_path) setup_file = os.path.basename(file_path) diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index b9ca1f35976c63..de1c36bdddfab1 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -16,12 +16,10 @@ import os import os.path as osp import shutil -import subprocess import sys import tarfile import time import zipfile -from urllib.parse import urlparse import httpx @@ -197,39 +195,7 @@ def _get_download(url, fullname): return False -def _wget_download(url: str, fullname: str): - try: - assert urlparse(url).scheme in ( - 'http', - 'https', - ), 'Only support https and http url' - # using wget to download url - tmp_fullname = fullname + "_tmp" - # –user-agent - command = f'wget -O {tmp_fullname} -t {DOWNLOAD_RETRY_LIMIT} {url}' - subprc = subprocess.Popen( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _ = subprc.communicate() - - if subprc.returncode != 0: - raise RuntimeError( - f'{command} failed. Please make sure `wget` is installed or {url} exists' - ) - - shutil.move(tmp_fullname, fullname) - - except Exception as e: # requests.exceptions.ConnectionError - logger.info(f"Downloading {url} failed with exception {str(e)}") - return False - - return fullname - - -_download_methods = { - 'get': _get_download, - 'wget': _wget_download, -} +_download_methods = {'get': _get_download} def _download(url, path, md5sum=None, method='get'): @@ -311,7 +277,10 @@ def _decompress(fname): def _uncompress_file_zip(filepath): with zipfile.ZipFile(filepath, 'r') as files: - file_list = files.namelist() + file_list_tmp = files.namelist() + file_list = [] + for file in file_list_tmp: + file_list.append(file.replace("../", "")) file_dir = os.path.dirname(filepath) @@ -340,7 +309,13 @@ def _uncompress_file_zip(filepath): def _uncompress_file_tar(filepath, mode="r:*"): with tarfile.open(filepath, mode) as files: - file_list = files.getnames() + file_list_tmp = files.getnames() + file_list = [] + for file in file_list_tmp: + assert ( + file[0] != "/" + ), f"uncompress file path {file} should not start with /" + file_list.append(file.replace("../", "")) file_dir = os.path.dirname(filepath) diff --git a/security/README.md b/security/README.md index 01559632d7dd45..7a1c6df5a5f7a5 100644 --- a/security/README.md +++ b/security/README.md @@ -7,12 +7,30 @@ We regularly publish security advisories about using PaddlePaddle. *Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](../SECURITY.md). 
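The `_uncompress_file_zip` / `_uncompress_file_tar` changes above sanitize archive member names before extraction. A minimal standalone sketch of the same idea, combining the zip `../` stripping with the tar absolute-path check (an illustration, not the Paddle implementation itself):

```python
import zipfile


def sanitized_member_names(archive_path: str):
    """Return zip member names with path-traversal components removed."""
    with zipfile.ZipFile(archive_path, "r") as zf:
        names = zf.namelist()
    cleaned = []
    for name in names:
        if name.startswith("/"):
            raise ValueError(f"absolute member path not allowed: {name}")
        # Drop "../" so a member such as "../../etc/crontab" cannot be written
        # outside the intended extraction directory.
        cleaned.append(name.replace("../", ""))
    return cleaned
```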
-| Advisory Number | Type | Versions affected | Reported by | Additional Information | -|----------------------------------------------|------------------------------------------------------|:-----------------:|------------------------------------------------------------------|------------------------| -| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | -| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| Advisory Number | Type | Versions affected | Reported by | Additional Information | +|----------------------------------------------|------------------------------------------------------|:-----------------:|-----------------------------------------------------------------|------------------------| +| [PDSA-2023-023](./advisory/pdsa-2023-023.md) | Command injection in convert_shape_compare | < 2.6.0 | leeya_bug | | +| [PDSA-2023-022](./advisory/pdsa-2023-022.md) | FPE in paddle.argmin and paddle.argmax | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-021](./advisory/pdsa-2023-021.md) | Null pointer dereference in paddle.crop | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-020](./advisory/pdsa-2023-020.md) | Command injection in _wget_download | < 2.6.0 | huntr.com | | +| [PDSA-2023-019](./advisory/pdsa-2023-019.md) | Command injection in get_online_pass_interval | < 2.6.0 | huntr.com and leeya_bug | | +| [PDSA-2023-018](./advisory/pdsa-2023-018.md) | Heap buffer overflow in paddle.repeat_interleave | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-017](./advisory/pdsa-2023-017.md) | FPE in paddle.amin | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-016](./advisory/pdsa-2023-016.md) | Stack overflow in paddle.linalg.lu_unpack | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-015](./advisory/pdsa-2023-015.md) | FPE in paddle.lerp | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-014](./advisory/pdsa-2023-014.md) | FPE in paddle.topk | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-013](./advisory/pdsa-2023-013.md) | Stack overflow in paddle.searchsorted | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-012](./advisory/pdsa-2023-012.md) | Segfault in paddle.put_along_axis | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-011](./advisory/pdsa-2023-011.md) | Null pointer dereference in paddle.nextafter | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-010](./advisory/pdsa-2023-010.md) | Segfault in paddle.mode | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-009](./advisory/pdsa-2023-009.md) | FPE in paddle.linalg.eig | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-008](./advisory/pdsa-2023-008.md) | Segfault in paddle.dot | < 2.6.0 | Tong Liu of CAS-IIE | | 
+| [PDSA-2023-007](./advisory/pdsa-2023-007.md) | FPE in paddle.linalg.matrix_rank | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-006](./advisory/pdsa-2023-006.md) | FPE in paddle.nanmedian | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | +| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | diff --git a/security/README_cn.md b/security/README_cn.md index 49223df8844f39..7022221643a429 100644 --- a/security/README_cn.md +++ b/security/README_cn.md @@ -4,15 +4,33 @@ -注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](../SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 +*注*:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](../SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 -| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | -|-------------------------------------------------|------------------------------------------------------|:------------:|-----------------------------------------------------------------|----| -| [PDSA-2023-005](./advisory/pdsa-2023-005_cn.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | -| [PDSA-2023-004](./advisory/pdsa-2023-004_cn.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-003](./advisory/pdsa-2023-003_cn.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-002](./advisory/pdsa-2023-002_cn.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-001](./advisory/pdsa-2023-001_cn.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-002](./advisory/pdsa-2022-002_cn.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-001](./advisory/pdsa-2022-001_cn.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | +|-------------------------------------------------|------------------------------------------------------|:-----------:|-----------------------------------------------------------------|----| +| [PDSA-2023-023](./advisory/pdsa-2023-023_cn.md) | Command injection in convert_shape_compare | < 2.6.0 | leeya_bug | | +| [PDSA-2023-022](./advisory/pdsa-2023-022_cn.md) | FPE in paddle.argmin and paddle.argmax | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-021](./advisory/pdsa-2023-021_cn.md) | Null pointer dereference in paddle.crop | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-020](./advisory/pdsa-2023-020_cn.md) | 
Command injection in _wget_download | < 2.6.0 | huntr.com | | +| [PDSA-2023-019](./advisory/pdsa-2023-019_cn.md) | Command injection in get_online_pass_interval | < 2.6.0 | huntr.com and leeya_bug | | +| [PDSA-2023-018](./advisory/pdsa-2023-018_cn.md) | Heap buffer overflow in paddle.repeat_interleave | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-017](./advisory/pdsa-2023-017_cn.md) | FPE in paddle.amin | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-016](./advisory/pdsa-2023-016_cn.md) | Stack overflow in paddle.linalg.lu_unpack | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-015](./advisory/pdsa-2023-015_cn.md) | FPE in paddle.lerp | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-014](./advisory/pdsa-2023-014_cn.md) | FPE in paddle.topk | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-013](./advisory/pdsa-2023-013_cn.md) | Stack overflow in paddle.searchsorted | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-012](./advisory/pdsa-2023-012_cn.md) | Segfault in paddle.put_along_axis | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-011](./advisory/pdsa-2023-011_cn.md) | Null pointer dereference in paddle.nextafter | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-010](./advisory/pdsa-2023-010_cn.md) | Segfault in paddle.mode | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-009](./advisory/pdsa-2023-009_cn.md) | FPE in paddle.linalg.eig | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-008](./advisory/pdsa-2023-008_cn.md) | Segfault in paddle.dot | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-007](./advisory/pdsa-2023-007_cn.md) | FPE in paddle.linalg.matrix_rank | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-006](./advisory/pdsa-2023-006_cn.md) | FPE in paddle.nanmedian | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-005](./advisory/pdsa-2023-005_cn.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | +| [PDSA-2023-004](./advisory/pdsa-2023-004_cn.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-003](./advisory/pdsa-2023-003_cn.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-002](./advisory/pdsa-2023-002_cn.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-001](./advisory/pdsa-2023-001_cn.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-002](./advisory/pdsa-2022-002_cn.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-001](./advisory/pdsa-2022-001_cn.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | diff --git a/security/README_ja.md b/security/README_ja.md index 4bd0b984c5834c..2711a91396b5e5 100644 --- a/security/README_ja.md +++ b/security/README_ja.md @@ -7,12 +7,30 @@ PaddlePaddle の使用に関するセキュリティ勧告を定期的に発表 *注*: これらのセキュリティ勧告と併せ、PaddlePaddle ユーザーには [SECURITY.md](../SECURITY_ja.md) に記載されている PaddlePaddle のセキュリティモデルを読み、理解することを強くお勧めします。 -| アドバイザリー番号 | タイプ | 対象バージョン | 報告者 | 追加情報 | -|----------------------------------------------|------------------------------------------------------|:-----------------:|------------------------------------------------------------------|------------------------| -| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology 
| | -| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | -| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| アドバイザリー番号 | タイプ | 対象バージョン | 報告者 | 追加情報 | +|----------------------------------------------|------------------------------------------------------|:-----------:|-----------------------------------------------------------------|------| +| [PDSA-2023-023](./advisory/pdsa-2023-023.md) | Command injection in convert_shape_compare | < 2.6.0 | leeya_bug | | +| [PDSA-2023-022](./advisory/pdsa-2023-022.md) | FPE in paddle.argmin and paddle.argmax | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-021](./advisory/pdsa-2023-021.md) | Null pointer dereference in paddle.crop | < 2.6.0 | Peng Zhou (zpbrent) from Shanghai University | | +| [PDSA-2023-020](./advisory/pdsa-2023-020.md) | Command injection in _wget_download | < 2.6.0 | huntr.com | | +| [PDSA-2023-019](./advisory/pdsa-2023-019.md) | Command injection in get_online_pass_interval | < 2.6.0 | huntr.com and leeya_bug | | +| [PDSA-2023-018](./advisory/pdsa-2023-018.md) | Heap buffer overflow in paddle.repeat_interleave | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-017](./advisory/pdsa-2023-017.md) | FPE in paddle.amin | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-016](./advisory/pdsa-2023-016.md) | Stack overflow in paddle.linalg.lu_unpack | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-015](./advisory/pdsa-2023-015.md) | FPE in paddle.lerp | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-014](./advisory/pdsa-2023-014.md) | FPE in paddle.topk | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-013](./advisory/pdsa-2023-013.md) | Stack overflow in paddle.searchsorted | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-012](./advisory/pdsa-2023-012.md) | Segfault in paddle.put_along_axis | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-011](./advisory/pdsa-2023-011.md) | Null pointer dereference in paddle.nextafter | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-010](./advisory/pdsa-2023-010.md) | Segfault in paddle.mode | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-009](./advisory/pdsa-2023-009.md) | FPE in paddle.linalg.eig | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-008](./advisory/pdsa-2023-008.md) | Segfault in paddle.dot | < 2.6.0 | Tong Liu of CAS-IIE | | +| [PDSA-2023-007](./advisory/pdsa-2023-007.md) | FPE in paddle.linalg.matrix_rank | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-006](./advisory/pdsa-2023-006.md) | FPE in paddle.nanmedian | < 2.6.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-005](./advisory/pdsa-2023-005.md) | Command injection in fs.py | < 2.5.0 | Xiaochen Guo from Huazhong University of Science and Technology | | +| [PDSA-2023-004](./advisory/pdsa-2023-004.md) | FPE in paddle.linalg.matrix_power | < 2.5.0 | Tong Liu of 
ShanghaiTech University | | +| [PDSA-2023-003](./advisory/pdsa-2023-003.md) | Heap buffer overflow in paddle.trace | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-002](./advisory/pdsa-2023-002.md) | Null pointer dereference in paddle.flip | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2023-001](./advisory/pdsa-2023-001.md) | Use after free in paddle.diagonal | < 2.5.0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | +| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | diff --git a/security/advisory/pdsa-2023-004_cn.md b/security/advisory/pdsa-2023-004_cn.md index c31c4da4f8728f..11f22a45aca11c 100644 --- a/security/advisory/pdsa-2023-004_cn.md +++ b/security/advisory/pdsa-2023-004_cn.md @@ -6,7 +6,7 @@ CVE-2023-38672 ### 影响 -当张量包含纬度值为0的情况,`paddle.linalg.matrix_power`会触发除0异常,导致程序运行时崩溃,PoC代码如下: +当张量包含维度值为0的情况,`paddle.linalg.matrix_power`会触发除0异常,导致程序运行时崩溃,PoC代码如下: ```python import paddle diff --git a/security/advisory/pdsa-2023-006.md b/security/advisory/pdsa-2023-006.md new file mode 100644 index 00000000000000..4997760cd5000a --- /dev/null +++ b/security/advisory/pdsa-2023-006.md @@ -0,0 +1,31 @@ +## PDSA-2023-006: FPE in paddle.nanmedian + +### CVE Number + +CVE-2023-38674 + +### Impact + +When `x` dim calculates `stride` to 0, `paddle.nanmedian` triggers FPE by `numel / stride`. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.nanmedian(x) +``` + +### Patches + +We have patched the issue in commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of ShanghaiTech University. 
diff --git a/security/advisory/pdsa-2023-006_cn.md b/security/advisory/pdsa-2023-006_cn.md new file mode 100644 index 00000000000000..e8ac803c033d6a --- /dev/null +++ b/security/advisory/pdsa-2023-006_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-006: FPE in paddle.nanmedian + +### CVE编号 + +CVE-2023-38674 + +### 影响 + +当由`x`的dim计算的`stride`为0时,`paddle.nanmedian`会由`numel / stride`触发除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.nanmedian(x) +``` + +### 补丁 + +我们在commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of ShanghaiTech University 提交。 diff --git a/security/advisory/pdsa-2023-007.md b/security/advisory/pdsa-2023-007.md new file mode 100644 index 00000000000000..f61223193cabfe --- /dev/null +++ b/security/advisory/pdsa-2023-007.md @@ -0,0 +1,31 @@ +## PDSA-2023-007: FPE in paddle.linalg.matrix_rank + +### CVE Number + +CVE-2023-38675 + +### Impact + +When `x` dim calculates `rows` or `cols` to 0, `paddle.linalg.matrix_rank` triggers FPE by `numel / (rows * cols)`. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.linalg.matrix_rank(x) +``` + +### Patches + +We have patched the issue in commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of ShanghaiTech University. diff --git a/security/advisory/pdsa-2023-007_cn.md b/security/advisory/pdsa-2023-007_cn.md new file mode 100644 index 00000000000000..0572aa1767b36d --- /dev/null +++ b/security/advisory/pdsa-2023-007_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-007: FPE in paddle.linalg.matrix_rank + +### CVE编号 + +CVE-2023-38675 + +### 影响 + +当由`x`的dim计算的`rows`或者`cols`为0时,`paddle.linalg.matrix_rank`会由`numel / (rows * cols)`触发除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = np.random.uniform(0,0,[0,0,0,0,0]).astype(np.float32) +x = paddle.to_tensor(x) +paddle.linalg.matrix_rank(x) +``` + +### 补丁 + +我们在commit [9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1](https://github.com/PaddlePaddle/Paddle/pull/55644/commits/9bb6c669206c4bcc3ce3f6daf8a55650e190c1a1)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of ShanghaiTech University 提交。 diff --git a/security/advisory/pdsa-2023-008.md b/security/advisory/pdsa-2023-008.md new file mode 100644 index 00000000000000..8994abd90fc23e --- /dev/null +++ b/security/advisory/pdsa-2023-008.md @@ -0,0 +1,31 @@ +## PDSA-2023-008: Segfault in paddle.dot + +### CVE Number + +CVE-2023-38676 + +### Impact + +Segfault occurs when `x` and `y` shape is 0 in `paddle.dot`. 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +paddle.dot(x, y) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-008_cn.md b/security/advisory/pdsa-2023-008_cn.md new file mode 100644 index 00000000000000..92052de2f38090 --- /dev/null +++ b/security/advisory/pdsa-2023-008_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-008: Segfault in paddle.dot + +### CVE编号 + +CVE-2023-38676 + +### 影响 + +在`paddle.dot`中当`x`和`y`的shape为0时,将造成segfault,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0]).astype(np.float32)) +paddle.dot(x, y) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-009.md b/security/advisory/pdsa-2023-009.md new file mode 100644 index 00000000000000..2f0450f9eb4e32 --- /dev/null +++ b/security/advisory/pdsa-2023-009.md @@ -0,0 +1,31 @@ +## PDSA-2023-009: FPE in paddle.linalg.eig + +### CVE Number + +CVE-2023-38677 + +### Impact + +When tensor dims contain 0, `paddle.linalg.eig` will trigger a float point exception. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [3, 6, 0, 2, 2]).astype(np.float32)) + +paddle.linalg.eig(x) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. 
diff --git a/security/advisory/pdsa-2023-009_cn.md b/security/advisory/pdsa-2023-009_cn.md new file mode 100644 index 00000000000000..a212a2320c8902 --- /dev/null +++ b/security/advisory/pdsa-2023-009_cn.md @@ -0,0 +1,31 @@ +## PDSA-2023-009: FPE in paddle.linalg.eig + +### CVE编号 + +CVE-2023-38677 + +### 影响 + +当张量包含维度值为0的情况,`paddle.linalg.eig`会触发除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [3, 6, 0, 2, 2]).astype(np.float32)) + +paddle.linalg.eig(x) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-010.md b/security/advisory/pdsa-2023-010.md new file mode 100644 index 00000000000000..3f1c65f6c91c4f --- /dev/null +++ b/security/advisory/pdsa-2023-010.md @@ -0,0 +1,33 @@ +## PDSA-2023-010: Segfault in paddle.mode + +### CVE Number + +CVE-2023-38678 + +### Impact + +Invalid `axis` and `dim_size` may cause `paddle.mode` segfault . The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.mode( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)), + axis=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + keepdim=True +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-010_cn.md b/security/advisory/pdsa-2023-010_cn.md new file mode 100644 index 00000000000000..f72cd8af856360 --- /dev/null +++ b/security/advisory/pdsa-2023-010_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-010: Segfault in paddle.mode + +### CVE编号 + +CVE-2023-38678 + +### 影响 + +接收异常的`axis`和`dim_size`可能会造成`paddle.mode`发生segfault,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.mode( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)), + axis=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + keepdim=True +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-011.md b/security/advisory/pdsa-2023-011.md new file mode 100644 index 00000000000000..da7985dede7d00 --- /dev/null +++ b/security/advisory/pdsa-2023-011.md @@ -0,0 +1,32 @@ +## PDSA-2023-011: Null pointer dereference in paddle.nextafter + +### CVE Number + +CVE-2023-52302 + +### Impact + +Null pointer dereference in `paddle.nextafter` when tensor dims are invalid . 
The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.nextafter( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 2]).astype(np.float32)), + y=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 0, 0, 0]).astype(np.float32)) +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-011_cn.md b/security/advisory/pdsa-2023-011_cn.md new file mode 100644 index 00000000000000..71440ac2c5d9a2 --- /dev/null +++ b/security/advisory/pdsa-2023-011_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-011: Null pointer dereference in paddle.nextafter + +### CVE编号 + +CVE-2023-52302 + +### 影响 + +输入张量的维度异常时,`paddle.nextafter`会引发空指针解引用,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.nextafter( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 2]).astype(np.float32)), + y=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 0, 0, 0]).astype(np.float32)) +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-012.md b/security/advisory/pdsa-2023-012.md new file mode 100644 index 00000000000000..f659d356154474 --- /dev/null +++ b/security/advisory/pdsa-2023-012.md @@ -0,0 +1,35 @@ +## PDSA-2023-012: Segfault in paddle.put_along_axis + +### CVE Number + +CVE-2023-52303 + +### Impact + +Segfault in `paddle.put_along_axis` when tensor dims are invalid . The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.put_along_axis( + arr=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [1]).astype(np.int32)), + indices=paddle.to_tensor(np.random.uniform(-9223372036854775808, 9223372036854775807, [1]).astype(np.int64)), + values=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + axis=0, + reduce="assign" +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. 
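Note: in the `paddle.put_along_axis` PoC above, the crash is driven by `indices` values far outside the target dimension rather than by the `axis` itself. A defensive wrapper can validate both before the kernel runs; the snippet below is a sketch of that validation only (`checked_put_along_axis` is a hypothetical helper, not Paddle's API and not the actual patch).

```python
import paddle

def checked_put_along_axis(arr, indices, values, axis, reduce="assign"):
    ndim = arr.ndim
    if not -ndim <= axis < ndim:
        raise ValueError(f"axis={axis} is out of range for a {ndim}-D tensor")
    axis = axis % ndim
    dim_size = arr.shape[axis]
    # every index must address an existing position along `axis`
    if dim_size == 0 or int(indices.max()) >= dim_size or int(indices.min()) < -dim_size:
        raise ValueError("indices fall outside the target dimension")
    return paddle.put_along_axis(arr, indices, values, axis, reduce=reduce)

arr = paddle.to_tensor([[10, 30, 20], [60, 40, 50]], dtype="float32")
idx = paddle.to_tensor([[0]], dtype="int64")
print(checked_put_along_axis(arr, idx, 99.0, axis=0))  # valid call passes through unchanged
```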
diff --git a/security/advisory/pdsa-2023-012_cn.md b/security/advisory/pdsa-2023-012_cn.md new file mode 100644 index 00000000000000..234961cded2359 --- /dev/null +++ b/security/advisory/pdsa-2023-012_cn.md @@ -0,0 +1,35 @@ +## PDSA-2023-012: Segfault in paddle.put_along_axis + +### CVE编号 + +CVE-2023-52303 + +### 影响 + +输入张量的维度异常时,`paddle.put_along_axis`会引发segfault,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.put_along_axis( + arr=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [1]).astype(np.int32)), + indices=paddle.to_tensor(np.random.uniform(-9223372036854775808, 9223372036854775807, [1]).astype(np.int64)), + values=paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)), + axis=0, + reduce="assign" +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-013.md b/security/advisory/pdsa-2023-013.md new file mode 100644 index 00000000000000..53deab6f3c346a --- /dev/null +++ b/security/advisory/pdsa-2023-013.md @@ -0,0 +1,32 @@ +## PDSA-2023-013: Stack overflow in paddle.searchsorted + +### CVE Number + +CVE-2023-52304 + +### Impact + +Invalid shapes cause stack buffer overflow in `paddle.searchsorted`. The PoC is as follows: + +```python +import paddle +import numpy as np + +sorted_sequence = paddle.to_tensor(np.array(0)) +values = paddle.to_tensor(np.random.uniform(-10, 10, []).astype(np.float64)) + +paddle.searchsorted(sorted_sequence, values, out_int32=True, right=True) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE.
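Note: the `paddle.searchsorted` overflow above is triggered by a 0-D `sorted_sequence`. Since the op searches along the last axis, a rank check on the caller side is a cheap mitigation while waiting for the fixed release; this is only an illustrative sketch under that assumption, with a hypothetical wrapper name.

```python
import numpy as np
import paddle

def checked_searchsorted(sorted_sequence, values, **kwargs):
    # A 0-D sequence has no axis to search along; insist on rank >= 1.
    if sorted_sequence.ndim == 0:
        raise ValueError("sorted_sequence must be at least 1-D")
    return paddle.searchsorted(sorted_sequence, values, **kwargs)

seq = paddle.to_tensor(np.array([1.0, 3.0, 5.0, 7.0]))
vals = paddle.to_tensor(np.array([2.0, 6.0]))
print(checked_searchsorted(seq, vals, right=True))  # [1, 3]
```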
diff --git a/security/advisory/pdsa-2023-013_cn.md b/security/advisory/pdsa-2023-013_cn.md new file mode 100644 index 00000000000000..c5210242f651fd --- /dev/null +++ b/security/advisory/pdsa-2023-013_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-013: Stack overflow in paddle.searchsorted + +### CVE编号 + +CVE-2023-52304 + +### 影响 + +不正确的shapes会引发`paddle.searchsorted`栈溢出,PoC代码如下: + +```python +import paddle +import numpy as np + +sorted_sequence = paddle.to_tensor(np.array(0)) +values = paddle.to_tensor(np.random.uniform(-10, 10, []).astype(np.float64)) + +paddle.searchsorted(sorted_sequence, values, out_int32=True, right=True) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-014.md b/security/advisory/pdsa-2023-014.md new file mode 100644 index 00000000000000..1792f3b21e8fac --- /dev/null +++ b/security/advisory/pdsa-2023-014.md @@ -0,0 +1,32 @@ +## PDSA-2023-014: FPE in paddle.topk + +### CVE Number + +CVE-2023-52305 + +### Impact + +FPE in `paddle.topk` when `x` and `k` dims not correct. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [6, 2, 1, 4, 2, 0]).astype(np.float64)) +k = paddle.to_tensor(np.array(1).astype(np.int32)) + +paddle.topk(x, k, axis=2,largest=False, sorted=True) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-014_cn.md b/security/advisory/pdsa-2023-014_cn.md new file mode 100644 index 00000000000000..d1be63be148d21 --- /dev/null +++ b/security/advisory/pdsa-2023-014_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-014: FPE in paddle.topk + +### CVE编号 + +CVE-2023-52305 + +### 影响 + +当`x`和`k`的dims不符合要求时,可能导致`paddle.topk`除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [6, 2, 1, 4, 2, 0]).astype(np.float64)) +k = paddle.to_tensor(np.array(1).astype(np.int32)) + +paddle.topk(x, k, axis=2,largest=False, sorted=True) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-015.md b/security/advisory/pdsa-2023-015.md new file mode 100644 index 00000000000000..6830516e0505b6 --- /dev/null +++ b/security/advisory/pdsa-2023-015.md @@ -0,0 +1,33 @@ +## PDSA-2023-015: FPE in paddle.lerp + +### CVE Number + +CVE-2023-52306 + +### Impact + +FPE in `paddle.lerp` when tensor shape is invalid. 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 0, 0, 2, 6]).astype(np.float64)) +weight = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) + +paddle.lerp(x, y, weight) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-015_cn.md b/security/advisory/pdsa-2023-015_cn.md new file mode 100644 index 00000000000000..7daa17bfff490b --- /dev/null +++ b/security/advisory/pdsa-2023-015_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-015: FPE in paddle.lerp + +### CVE编号 + +CVE-2023-52306 + +### 影响 + +不合法的张量shape可能导致`paddle.lerp`除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) +y = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 0, 0, 2, 6]).astype(np.float64)) +weight = paddle.to_tensor(np.random.uniform(-6666666, 100000000, []).astype(np.float64)) + +paddle.lerp(x, y, weight) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-016.md b/security/advisory/pdsa-2023-016.md new file mode 100644 index 00000000000000..2c6e93e3f87717 --- /dev/null +++ b/security/advisory/pdsa-2023-016.md @@ -0,0 +1,32 @@ +## PDSA-2023-016: Stack overflow in paddle.linalg.lu_unpack + +### CVE Number + +CVE-2023-52307 + +### Impact + +Invalid shapes cause stack buffer overflow in `paddle.linalg.lu_unpack`. The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 6, 4, 8, 2]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)) + +paddle.linalg.lu_unpack(x, y, True, True) +``` + +### Patches + +We have patched the issue in commit [10093636a10f29f73f13729b33570d8cafd58fb6](https://github.com/PaddlePaddle/Paddle/pull/56311/commits/10093636a10f29f73f13729b33570d8cafd58fb6). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE.
diff --git a/security/advisory/pdsa-2023-016_cn.md b/security/advisory/pdsa-2023-016_cn.md new file mode 100644 index 00000000000000..cdad03e02dce4a --- /dev/null +++ b/security/advisory/pdsa-2023-016_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-016: Stack overflow in paddle.linalg.lu_unpack + +### CVE编号 + +CVE-2023-52307 + +### 影响 + +不正确的shapes会引发`paddle.linalg.lu_unpack`栈溢出,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [1, 6, 4, 8, 2]).astype(np.float32)) +y = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, []).astype(np.int32)) + +paddle.linalg.lu_unpack(x, y, True, True) +``` + +### 补丁 + +我们在commit [10093636a10f29f73f13729b33570d8cafd58fb6](https://github.com/PaddlePaddle/Paddle/pull/56311/commits/10093636a10f29f73f13729b33570d8cafd58fb6)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-017.md b/security/advisory/pdsa-2023-017.md new file mode 100644 index 00000000000000..2d65947f7be858 --- /dev/null +++ b/security/advisory/pdsa-2023-017.md @@ -0,0 +1,33 @@ +## PDSA-2023-017: FPE in paddle.amin + +### CVE Number + +CVE-2023-52308 + +### Impact + +FPE in `paddle.amin` when `x` has invalid dims. The PoC is as follows: + +```python +import paddle +import numpy as np + +paddle.amin( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 6, 3]).astype(np.float32)), + axis=-1, + keepdim=True +) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-017_cn.md b/security/advisory/pdsa-2023-017_cn.md new file mode 100644 index 00000000000000..ac04896e1ffeb4 --- /dev/null +++ b/security/advisory/pdsa-2023-017_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-017: FPE in paddle.amin + +### CVE编号 + +CVE-2023-52308 + +### 影响 + +当`x` dims不符合要求时,可能导致`paddle.amin`除0异常,PoC代码如下: + +```python +import paddle +import numpy as np + +paddle.amin( + x=paddle.to_tensor(np.random.uniform(-6666666, 100000000, [0, 0, 6, 3]).astype(np.float32)), + axis=-1, + keepdim=True +) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-018.md b/security/advisory/pdsa-2023-018.md new file mode 100644 index 00000000000000..6dbec29738b2f8 --- /dev/null +++ b/security/advisory/pdsa-2023-018.md @@ -0,0 +1,32 @@ +## PDSA-2023-018: Heap buffer overflow in paddle.repeat_interleave + +### CVE Number + +CVE-2023-52309 + +### Impact + +Heap buffer overflow in `paddle.repeat_interleave` by using invalid params. 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 4, 8, 3, 2, 4]).astype(np.float64)) +repeats = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [2, 1]).astype(np.int32)) + +paddle.repeat_interleave(x, repeats, axis=-2) +``` + +### Patches + +We have patched the issue in commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of CAS-IIE. diff --git a/security/advisory/pdsa-2023-018_cn.md b/security/advisory/pdsa-2023-018_cn.md new file mode 100644 index 00000000000000..9680099b47d83c --- /dev/null +++ b/security/advisory/pdsa-2023-018_cn.md @@ -0,0 +1,32 @@ +## PDSA-2023-018: Heap buffer overflow in paddle.repeat_interleave + +### CVE编号 + +CVE-2023-52309 + +### 影响 + +非法的参数可能导致`paddle.repeat_interleave`堆溢出,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(-6666666, 100000000, [4, 4, 8, 3, 2, 4]).astype(np.float64)) +repeats = paddle.to_tensor(np.random.uniform(-2147483648, 2147483647, [2, 1]).astype(np.int32)) + +paddle.repeat_interleave(x, repeats, axis=-2) +``` + +### 补丁 + +我们在commit [19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc](https://github.com/PaddlePaddle/Paddle/commit/19da5c0c4d8c5e4dfef2a92e24141c3f51884dcc)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of CAS-IIE 提交。 diff --git a/security/advisory/pdsa-2023-019.md b/security/advisory/pdsa-2023-019.md new file mode 100644 index 00000000000000..78a7b6b3230f5a --- /dev/null +++ b/security/advisory/pdsa-2023-019.md @@ -0,0 +1,35 @@ +## PDSA-2023-019: Command injection in get_online_pass_interval + +### CVE Number + +CVE-2023-52310 + +### Impact + +Command injection in `get_online_pass_interval` which could lead to execute arbitrary commands. The PoC is as follows: + +```python +from paddle.incubate.distributed.fleet.fleet_util import FleetUtil + +fleet_util = FleetUtil() +online_pass_interval = fleet_util.get_online_pass_interval( + days="{20190720..20190729}", + hours="9;touch /home/test/aaaa", + split_interval=5, + split_per_pass=2, + is_data_hourly_placed=False +) +``` + +### Patches + +We have patched the issue in commits [1aae481dfd7d2055c801563e254f1484b974b68e](https://github.com/PaddlePaddle/Paddle/pull/60023/commits/1aae481dfd7d2055c801563e254f1484b974b68e), [c62d87eb91c84154af40946f17205d86f608866b](https://github.com/PaddlePaddle/Paddle/pull/60544/commits/c62d87eb91c84154af40946f17205d86f608866b) and [f8560c903c80450e37b8f304a9cd8207678f2f83](https://github.com/PaddlePaddle/Paddle/pull/60615/commits/f8560c903c80450e37b8f304a9cd8207678f2f83). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by huntr.com and leeya_bug. 
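Note: PDSA-2023-019 here and PDSA-2023-020 below are both shell-injection issues: attacker-controlled strings such as `hours="9;touch /home/test/aaaa"` end up on a shell command line. The usual remediation pattern is to allow-list the expected token format and to launch any subprocess with an argument list so that nothing is parsed by a shell. The snippet below illustrates that pattern only; the regular expression and the helper name are assumptions, not the code of the actual patches.

```python
import re
import subprocess

# Assumed allow-list: day/hour tokens consist of digits plus a few range characters.
SAFE_TOKEN = re.compile(r"^[0-9{}.;,\-]+$")

def run_with_untrusted_tokens(days: str, hours: str) -> None:
    for value in (days, hours):
        if not SAFE_TOKEN.fullmatch(value):
            raise ValueError(f"rejecting suspicious token: {value!r}")
    # Argument-list form: even a stray ';' is passed as literal text and is
    # never interpreted by a shell (no shell=True, no string interpolation).
    subprocess.run(["/bin/echo", days, hours], check=True)

run_with_untrusted_tokens("{20190720..20190729}", "0;6;12;18")
# run_with_untrusted_tokens("{20190720..20190729}", "9;touch /home/test/aaaa")  -> ValueError
```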
diff --git a/security/advisory/pdsa-2023-019_cn.md b/security/advisory/pdsa-2023-019_cn.md new file mode 100644 index 00000000000000..096d4c191ebc2b --- /dev/null +++ b/security/advisory/pdsa-2023-019_cn.md @@ -0,0 +1,35 @@ +## PDSA-2023-019: Command injection in get_online_pass_interval + +### CVE编号 + +CVE-2023-52310 + +### 影响 + +`get_online_pass_interval`存在命令注入漏洞,可造成任意命令执行,PoC代码如下: + +```python +from paddle.incubate.distributed.fleet.fleet_util import FleetUtil + +fleet_util = FleetUtil() +online_pass_interval = fleet_util.get_online_pass_interval( + days="{20190720..20190729}", + hours="9;touch /home/test/aaaa", + split_interval=5, + split_per_pass=2, + is_data_hourly_placed=False +) +``` + +### 补丁 + +我们在commits [1aae481dfd7d2055c801563e254f1484b974b68e](https://github.com/PaddlePaddle/Paddle/pull/60023/commits/1aae481dfd7d2055c801563e254f1484b974b68e)、[c62d87eb91c84154af40946f17205d86f608866b](https://github.com/PaddlePaddle/Paddle/pull/60544/commits/c62d87eb91c84154af40946f17205d86f608866b) 和 [f8560c903c80450e37b8f304a9cd8207678f2f83](https://github.com/PaddlePaddle/Paddle/pull/60615/commits/f8560c903c80450e37b8f304a9cd8207678f2f83) 中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 huntr.com 和 leeya_bug 提交。 diff --git a/security/advisory/pdsa-2023-020.md b/security/advisory/pdsa-2023-020.md new file mode 100644 index 00000000000000..ed3a5966d6ca60 --- /dev/null +++ b/security/advisory/pdsa-2023-020.md @@ -0,0 +1,28 @@ +## PDSA-2023-020: Command injection in _wget_download + +### CVE Number + +CVE-2023-52311 + +### Impact + +Command injection in `_wget_download` which could lead to execute arbitrary commands. The PoC is as follows: + +```python +from paddle import utils + +utils.download._wget_download("aa; touch codexecution", "bb") +``` + +### Patches + +We have patched the issue in commit [d5550d3f2f5bab48c783b4986ba1cd8e061ce542](https://github.com/PaddlePaddle/Paddle/pull/59957/commits/d5550d3f2f5bab48c783b4986ba1cd8e061ce542). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by huntr.com. diff --git a/security/advisory/pdsa-2023-020_cn.md b/security/advisory/pdsa-2023-020_cn.md new file mode 100644 index 00000000000000..a6bd1321592e62 --- /dev/null +++ b/security/advisory/pdsa-2023-020_cn.md @@ -0,0 +1,28 @@ +## PDSA-2023-020: Command injection in _wget_download + +### CVE编号 + +CVE-2023-52311 + +### 影响 + +`_wget_download`存在命令注入漏洞,可造成任意命令执行,PoC代码如下: + +```python +from paddle import utils + +utils.download._wget_download("aa; touch codexecution", "bb") +``` + +### 补丁 + +我们在commit [d5550d3f2f5bab48c783b4986ba1cd8e061ce542](https://github.com/PaddlePaddle/Paddle/pull/59957/commits/d5550d3f2f5bab48c783b4986ba1cd8e061ce542)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 huntr.com 提交。 diff --git a/security/advisory/pdsa-2023-021.md b/security/advisory/pdsa-2023-021.md new file mode 100644 index 00000000000000..6a8ec45b33e23c --- /dev/null +++ b/security/advisory/pdsa-2023-021.md @@ -0,0 +1,33 @@ +## PDSA-2023-021: Null pointer dereference in paddle.crop + +### CVE Number + +CVE-2023-52312 + +### Impact + +Null pointer dereference in `paddle.crop` when tensor dims are invalid . 
The PoC is as follows: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(0, 10, [2, 2]).astype(np.int32)) +shape = paddle.to_tensor([-1, 0], dtype='int32') +offsets = paddle.to_tensor([], dtype='int32') + +out = paddle.crop(x, shape, offsets) +``` + +### Patches + +We have patched the issue in commit [c074de6911944d5d30d28cc7ce2c7099f1c87bce](https://github.com/PaddlePaddle/Paddle/pull/59967/commits/c074de6911944d5d30d28cc7ce2c7099f1c87bce). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Peng Zhou (zpbrent) from Shanghai University. diff --git a/security/advisory/pdsa-2023-021_cn.md b/security/advisory/pdsa-2023-021_cn.md new file mode 100644 index 00000000000000..eff0b0c2225aac --- /dev/null +++ b/security/advisory/pdsa-2023-021_cn.md @@ -0,0 +1,33 @@ +## PDSA-2023-021: Null pointer dereference in paddle.crop + +### CVE编号 + +CVE-2023-52312 + +### 影响 + +输入张量的维度异常时,`paddle.crop`会引发空指针解引用,PoC代码如下: + +```python +import paddle +import numpy as np + +x = paddle.to_tensor(np.random.uniform(0, 10, [2, 2]).astype(np.int32)) +shape = paddle.to_tensor([-1, 0], dtype='int32') +offsets = paddle.to_tensor([], dtype='int32') + +out = paddle.crop(x, shape, offsets) +``` + +### 补丁 + +我们在commit [c074de6911944d5d30d28cc7ce2c7099f1c87bce](https://github.com/PaddlePaddle/Paddle/pull/59967/commits/c074de6911944d5d30d28cc7ce2c7099f1c87bce)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Peng Zhou (zpbrent) from Shanghai University 提交。 diff --git a/security/advisory/pdsa-2023-022.md b/security/advisory/pdsa-2023-022.md new file mode 100644 index 00000000000000..b5b3b3519c9c0e --- /dev/null +++ b/security/advisory/pdsa-2023-022.md @@ -0,0 +1,30 @@ +## PDSA-2023-022: FPE in paddle.argmin and paddle.argmax + +### CVE Number + +CVE-2023-52313 + +### Impact + +FPE in `paddle.argmin` and `paddle.argmax` when input `x.numel()` is 0. The PoC is as follows: + +```python +import paddle + +data = paddle.to_tensor([], dtype="int32") + +paddle.argmax(data, axis=0) +``` + +### Patches + +We have patched the issue in commit [41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794](https://github.com/PaddlePaddle/Paddle/pull/59976/commits/41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Peng Zhou (zpbrent) from Shanghai University. 
diff --git a/security/advisory/pdsa-2023-022_cn.md b/security/advisory/pdsa-2023-022_cn.md new file mode 100644 index 00000000000000..d7c57f94394955 --- /dev/null +++ b/security/advisory/pdsa-2023-022_cn.md @@ -0,0 +1,30 @@ +## PDSA-2023-022: FPE in paddle.argmin and paddle.argmax + +### CVE编号 + +CVE-2023-52313 + +### 影响 + +输入`x.numel()`为0时`paddle.argmin`和`paddle.argmax`会引发除0异常,PoC代码如下: + +```python +import paddle + +data = paddle.to_tensor([], dtype="int32") + +paddle.argmax(data, axis=0) +``` + +### 补丁 + +我们在commit [41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794](https://github.com/PaddlePaddle/Paddle/pull/59976/commits/41eda9080b12e6f1b3a49cdc8439a1b9f1ed6794)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Peng Zhou (zpbrent) from Shanghai University 提交。 diff --git a/security/advisory/pdsa-2023-023.md b/security/advisory/pdsa-2023-023.md new file mode 100644 index 00000000000000..c2671f7f87adca --- /dev/null +++ b/security/advisory/pdsa-2023-023.md @@ -0,0 +1,28 @@ +## PDSA-2023-023: Command injection in convert_shape_compare + +### CVE Number + +CVE-2023-52314 + +### Impact + +Command injection in `convert_shape_compare` which could lead to execute arbitrary commands. The PoC is as follows: + +```python +import paddle + +paddle.jit.dy2static.convert_operators.convert_shape_compare('prefix','+ str(__import__("os").system("cat /etc/passwd")) +','1') +``` + +### Patches + +We have patched the issue in commit [c3b6414eb313480f1417abe92d410dfe89723097](https://github.com/PaddlePaddle/Paddle/pull/60097/commits/c3b6414eb313480f1417abe92d410dfe89723097). +The fix will be included in PaddlePaddle 2.6.0. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by leeya_bug. 
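Note: the PDSA-2023-023 PoC works because the comparison string handed to `convert_shape_compare` is spliced into dynamically evaluated code. Independent of how the upstream patch handles it, the robust pattern for this kind of API is to dispatch through a fixed table of comparison operators instead of building and evaluating source text. The snippet below is a generic sketch of that idea, not Paddle's implementation.

```python
import operator

ALLOWED_OPS = {
    "<": operator.lt, "<=": operator.le,
    ">": operator.gt, ">=": operator.ge,
    "==": operator.eq, "!=": operator.ne,
}

def compare(lhs, op: str, rhs):
    try:
        fn = ALLOWED_OPS[op]
    except KeyError:
        raise ValueError(f"unsupported comparison operator: {op!r}") from None
    return fn(lhs, rhs)

print(compare(3, "<", 5))  # True
# compare(3, '+ str(__import__("os").system("id")) +', 5) raises ValueError; nothing is executed
```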
diff --git a/security/advisory/pdsa-2023-023_cn.md b/security/advisory/pdsa-2023-023_cn.md new file mode 100644 index 00000000000000..3de87a4d707674 --- /dev/null +++ b/security/advisory/pdsa-2023-023_cn.md @@ -0,0 +1,28 @@ +## PDSA-2023-023: Command injection in convert_shape_compare + +### CVE编号 + +CVE-2023-52314 + +### 影响 + +`convert_shape_compare`存在命令注入漏洞,可造成任意命令执行,PoC代码如下: + +```python +import paddle + +paddle.jit.dy2static.convert_operators.convert_shape_compare('prefix','+ str(__import__("os").system("cat /etc/passwd")) +','1') +``` + +### 补丁 + +我们在commit [c3b6414eb313480f1417abe92d410dfe89723097](https://github.com/PaddlePaddle/Paddle/pull/60097/commits/c3b6414eb313480f1417abe92d410dfe89723097)中对此问题进行了补丁。 +修复将包含在飞桨2.6.0版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 leeya_bug 提交。 diff --git a/test/auto_parallel/hybrid_strategy/CMakeLists.txt b/test/auto_parallel/hybrid_strategy/CMakeLists.txt index 8d4c34745d8238..4d15ae4b30d922 100644 --- a/test/auto_parallel/hybrid_strategy/CMakeLists.txt +++ b/test/auto_parallel/hybrid_strategy/CMakeLists.txt @@ -10,7 +10,7 @@ if((WITH_GPU) AND (LINUX)) test_semi_auto_parallel_hybrid_strategy ENVS "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_semi_auto_parallel_hybrid_strategy - PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=HYBRID") + PROPERTIES TIMEOUT "600" LABELS "RUN_TYPE=HYBRID") endif() if((WITH_GPU) AND (LINUX)) py_test_modules( diff --git a/test/collective/fleet/run_server_for_communicator_half_async.py b/test/collective/fleet/run_server_for_communicator_half_async.py new file mode 100644 index 00000000000000..14d8fd80331b35 --- /dev/null +++ b/test/collective/fleet/run_server_for_communicator_half_async.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End + +import paddle + +paddle.enable_static() + +pipe_name = os.getenv("PIPE_FILE") + + +class RunServer(TestCommunicatorHalfAsyncEnd2End): + def runTest(self): + pass + + +os.environ["TRAINING_ROLE"] = "PSERVER" +os.environ["http_proxy"] = "" +os.environ["https_proxy"] = "" +half_run_server = RunServer() +with open(pipe_name, 'w') as pipe: + pipe.write('done') + +half_run_server.run_ut() diff --git a/test/collective/fleet/test_communicator_half_async.py b/test/collective/fleet/test_communicator_half_async.py index 25e5302fb444fd..687337f25ab2ae 100644 --- a/test/collective/fleet/test_communicator_half_async.py +++ b/test/collective/fleet/test_communicator_half_async.py @@ -15,6 +15,7 @@ import os import subprocess import sys +import tempfile import unittest import numpy @@ -23,6 +24,7 @@ from paddle import base from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker +from paddle.distributed.utils.launch_utils import find_free_ports paddle.enable_static() @@ -30,25 +32,44 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase): def net(self): x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') + x1 = paddle.static.data( + name='x1', shape=[-1, 1], dtype='int64', lod_level=1 + ) + emb = paddle.static.nn.embedding( + input=x1, + size=[10000, 10], + param_attr=base.ParamAttr( + name="embedding", + initializer=paddle.nn.initializer.Constant(value=0.01), + ), + is_sparse=True, + ) + + pool = paddle.static.nn.sequence_lod.sequence_pool( + input=emb.squeeze(-2), pool_type="sum" + ) + z = paddle.concat([x, pool], axis=1) + + y_predict = paddle.static.nn.fc(x=z, size=1) + y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - return avg_cost, x, y + return avg_cost, x, x1, y def fake_reader(self): def reader(): for i in range(10000): x = numpy.random.random((1, 13)).astype('float32') + z = numpy.random.randint(0, 9999, (1, 1)).astype('int64') y = numpy.random.randint(0, 2, (1, 1)).astype('int64') - yield x, y + yield x, z, y return reader def run_pserver(self, role, strategy): fleet.init(role) - avg_cost, x, y = self.net() + avg_cost, x, z, y = self.net() optimizer = paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) @@ -61,20 +82,20 @@ def run_trainer(self, role, strategy): exe = base.Executor(place) fleet.init(role) - avg_cost, x, y = self.net() + avg_cost, x, z, y = self.net() optimizer = paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) - exe.run(paddle.static.default_startup_program()) + exe.run(base.default_startup_program()) fleet.init_worker() train_reader = paddle.batch(self.fake_reader(), batch_size=24) - feeder = base.DataFeeder(place=place, feed_list=[x, y]) + feeder = base.DataFeeder(place=place, feed_list=[x, z, y]) for batch_id, data in enumerate(train_reader()): exe.run( - paddle.static.default_main_program(), + base.default_main_program(), feed=feeder.feed(data), fetch_list=[], ) @@ -82,19 +103,18 @@ def run_trainer(self, role, strategy): fleet.stop_worker() def run_ut(self): - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.a_sync = True - training_role = 
os.getenv("TRAINING_ROLE", "TRAINER") - role = role_maker.UserDefinedRoleMaker( - current_id=0, - role=role_maker.Role.WORKER - if training_role == "TRAINER" - else role_maker.Role.SERVER, - worker_num=1, - server_endpoints=["127.0.0.1:6002"], - ) + os.environ["PADDLE_PSERVER_NUMS"] = "1" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["POD_IP"] = "127.0.0.1" + + role = role_maker.PaddleCloudRoleMaker() + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = True if training_role == "TRAINER": self.run_trainer(role, strategy) @@ -102,61 +122,39 @@ def run_ut(self): self.run_pserver(role, strategy) def test_communicator(self): - run_server_cmd = """ + temp_dir = tempfile.TemporaryDirectory() + pipe_name = os.path.join(temp_dir.name, 'mypipe') + try: + os.mkfifo(pipe_name) + except OSError as oe: + print(f"Failed to create pipe: {oe}") -import sys -import os + port = find_free_ports(1).pop() -import time -import threading -import subprocess -import unittest -import numpy - -from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End - -import paddle -import paddle.base as base -import paddle.distributed.fleet as fleet -import paddle.distributed.fleet.base.role_maker as role_maker - -paddle.enable_static() - -class RunServer(TestCommunicatorHalfAsyncEnd2End): - def runTest(self): - pass - -os.environ["http_proxy"] = "" -os.environ["https_proxy"] = "" -os.environ["TRAINING_ROLE"] = "PSERVER" -half_run_server = RunServer() -half_run_server.run_ut() -""" - - server_file = "run_server_for_communicator_haflaysnc.py" - with open(server_file, "w") as wb: - wb.write(run_server_cmd) os.environ["TRAINING_ROLE"] = "PSERVER" - _python = sys.executable + os.environ["PADDLE_PORT"] = str(port) + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = f"127.0.0.1:{port}" + os.environ["PIPE_FILE"] = pipe_name + _python = sys.executable + server_file = "run_server_for_communicator_half_async.py" ps_cmd = f"{_python} {server_file}" + ps_proc = subprocess.Popen( ps_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - os.environ["http_proxy"] = "" - os.environ["https_proxy"] = "" + with open(pipe_name, 'r') as pipe: + start_command = pipe.read() + os.environ["TRAINING_ROLE"] = "TRAINER" - os.environ["FLAGS_communicator_send_queue_size"] = "1" - os.environ["FLAGS_communicator_max_merge_var_num"] = "1" self.run_ut() ps_proc.kill() - - if os.path.exists(server_file): - os.remove(server_file) + ps_proc.wait() + outs, errs = ps_proc.communicate() if __name__ == '__main__': diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 59ed51f7681685..e811e547511a84 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -1,8 +1,6 @@ add_subdirectory(benchmark) add_subdirectory(framework) -add_subdirectory(inference) - if(WITH_CINN) add_subdirectory(cinn) endif() diff --git a/test/cpp/fluid/inference/CMakeLists.txt b/test/cpp/fluid/inference/CMakeLists.txt deleted file mode 100644 index 512d2b1553c8c9..00000000000000 --- a/test/cpp/fluid/inference/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(utils) diff --git a/test/cpp/fluid/inference/utils/CMakeLists.txt b/test/cpp/fluid/inference/utils/CMakeLists.txt deleted file mode 100644 index 3ea72839b19243..00000000000000 --- a/test/cpp/fluid/inference/utils/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -if(WITH_TESTING) - if(NOT APPLE) - inference_base_test( - 
infer_io_utils_tester SRCS io_utils_tester.cc - DEPS - paddle_inference_shared - common - ) - endif() -endif() - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(infer_io_utils_tester) -endif() diff --git a/test/cpp/fluid/inference/utils/io_utils_tester.cc b/test/cpp/fluid/inference/utils/io_utils_tester.cc deleted file mode 100644 index 756027fb6cb9bd..00000000000000 --- a/test/cpp/fluid/inference/utils/io_utils_tester.cc +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include - -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/utils/io_utils.h" - -namespace paddle { -namespace inference { -namespace { - -bool pd_tensor_equal(const paddle::PaddleTensor& ref, - const paddle::PaddleTensor& t) { - bool is_equal = true; - VLOG(3) << "ref.name: " << ref.name << ", t.name: " << t.name; - VLOG(3) << "ref.dtype: " << ref.dtype << ", t.dtype: " << t.dtype; - VLOG(3) << "ref.lod_level: " << ref.lod.size() - << ", t.dtype: " << t.lod.size(); - VLOG(3) << "ref.data_len: " << ref.data.length() - << ", t.data_len: " << t.data.length(); - return is_equal && (ref.name == t.name) && (ref.lod == t.lod) && - (ref.dtype == t.dtype) && - (std::memcmp(ref.data.data(), t.data.data(), ref.data.length()) == 0); -} - -template -void test_io_utils() { - std::vector input({6, 8}); - paddle::PaddleTensor in; - in.name = "Hello"; - in.shape = {1, 2}; - in.lod = std::vector>{{0, 1}}; - in.data = paddle::PaddleBuf(static_cast(input.data()), - input.size() * sizeof(T)); - in.dtype = paddle::inference::PaddleTensorGetDType(); - std::stringstream ss; - paddle::inference::SerializePDTensorToStream(&ss, in); - paddle::PaddleTensor out; - paddle::inference::DeserializePDTensorToStream(ss, &out); - ASSERT_TRUE(pd_tensor_equal(in, out)); -} -} // namespace -} // namespace inference -} // namespace paddle - -TEST(infer_io_utils, float32) { paddle::inference::test_io_utils(); } - -TEST(infer_io_utils, tensors) { - // Create a float32 tensor. - std::vector input_fp32({1.1f, 3.2f, 5.0f, 8.2f}); - paddle::PaddleTensor in_fp32; - in_fp32.name = "Tensor.fp32_0"; - in_fp32.shape = {2, 2}; - in_fp32.data = paddle::PaddleBuf(static_cast(input_fp32.data()), - input_fp32.size() * sizeof(float)); - in_fp32.dtype = paddle::inference::PaddleTensorGetDType(); - - // Create a int64 tensor. - std::vector input_int64({5, 8}); - paddle::PaddleTensor in_int64; - in_int64.name = "Tensor.int64_0"; - in_int64.shape = {1, 2}; - in_int64.lod = std::vector>{{0, 1}}; - in_int64.data = paddle::PaddleBuf(static_cast(input_int64.data()), - input_int64.size() * sizeof(int64_t)); - in_int64.dtype = paddle::inference::PaddleTensorGetDType(); - - // Serialize tensors. 
- std::vector tensors_in({in_fp32}); - std::string file_path = "./io_utils_tensors"; - paddle::inference::SerializePDTensorsToFile(file_path, tensors_in); - - // Deserialize tensors. - std::vector tensors_out; - paddle::inference::DeserializePDTensorsToFile(file_path, &tensors_out); - - // Check results. - ASSERT_EQ(tensors_in.size(), tensors_out.size()); - for (size_t i = 0; i < tensors_in.size(); ++i) { - ASSERT_TRUE( - paddle::inference::pd_tensor_equal(tensors_in[i], tensors_out[i])); - } -} - -TEST(shape_info_io, read_and_write) { - const std::string path = "test_shape_info_io"; - std::map> min_shape, max_shape, opt_shape; - std::map> min_value, max_value, opt_value; - min_shape.insert( - std::make_pair("test1", std::vector{1, 3, 112, 112})); - max_shape.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - opt_shape.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - min_value.insert( - std::make_pair("test1", std::vector{1, 3, 112, 112})); - max_value.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - opt_value.insert( - std::make_pair("test1", std::vector{1, 3, 224, 224})); - paddle::inference::SerializeShapeRangeInfo( - path, min_shape, max_shape, opt_shape, min_value, max_value, opt_value); - min_shape.clear(); - max_shape.clear(); - opt_shape.clear(); - min_value.clear(); - max_value.clear(); - opt_value.clear(); - opt_shape.insert( - std::make_pair("test2", std::vector{1, 3, 224, 224})); - paddle::inference::DeserializeShapeRangeInfo(path, - &min_shape, - &max_shape, - &opt_shape, - &min_value, - &max_value, - &opt_value); - - min_shape.insert(std::make_pair("test1", std::vector{1, 3, 56, 56})); - std::vector names{"test1"}; - paddle::inference::UpdateShapeRangeInfo(path, - min_shape, - max_shape, - opt_shape, - min_value, - max_value, - opt_value, - names, - names); - - ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo("no_exists_file", - &min_shape, - &max_shape, - &opt_shape, - &min_value, - &max_value, - &opt_value); - , paddle::platform::EnforceNotMet); -} diff --git a/test/cpp/inference/api/tester_helper.h b/test/cpp/inference/api/tester_helper.h index a410df859fe450..a5d60ca6eec974 100644 --- a/test/cpp/inference/api/tester_helper.h +++ b/test/cpp/inference/api/tester_helper.h @@ -34,7 +34,6 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/utils/benchmark.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "test/cpp/inference/api/config_printer.h" #include "test/cpp/inference/test_helper.h" @@ -69,9 +68,6 @@ PD_DEFINE_int32(num_threads, PD_DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); -PD_DEFINE_bool(record_benchmark, - false, - "Record benchmark after profiling the model"); PD_DEFINE_double(accuracy, 1e-3, "Result Accuracy."); PD_DEFINE_double(quantized_accuracy, 2e-2, "Result Quantized Accuracy."); PD_DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); @@ -594,14 +590,6 @@ void PredictionRun(PaddlePredictor *predictor, if (sample_latency != nullptr) *sample_latency = batch_latency / FLAGS_batch_size; - - if (FLAGS_record_benchmark) { - Benchmark benchmark; - benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(FLAGS_batch_size); - benchmark.SetLatency(batch_latency); - benchmark.PersistToFile("benchmark_record.txt"); - } } void TestOneThreadPrediction( diff --git 
a/test/cpp/inference/api/trt_dynamic_shape_test.cc b/test/cpp/inference/api/trt_dynamic_shape_test.cc index 80929f10447b83..52336e7e8a5412 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_test.cc @@ -191,6 +191,7 @@ void TestTunedDynamic() { output_t->copy_to_cpu(out_data.data()); }; check_func(predictor_tuned.get()); + predictor_tuned.reset(nullptr); // check tuned_dynamic_shape AnalysisConfig config; diff --git a/test/cpp/inference/test.cmake b/test/cpp/inference/test.cmake index 33961a949369c5..7d3fb889e0e727 100644 --- a/test/cpp/inference/test.cmake +++ b/test/cpp/inference/test.cmake @@ -111,10 +111,9 @@ function(inference_base_test_build TARGET) add_executable(${TARGET} ${base_test_SRCS}) if("${base_test_DEPS};" MATCHES "paddle_inference_shared;") list(REMOVE_ITEM base_test_DEPS paddle_inference_shared) - target_link_libraries( - ${TARGET} $ - $) - add_dependencies(${TARGET} paddle_inference_shared benchmark) + target_link_libraries(${TARGET} + $) + add_dependencies(${TARGET} paddle_inference_shared) elseif("${base_test_DEPS};" MATCHES "paddle_inference_c_shared;") list(REMOVE_ITEM base_test_DEPS paddle_inference_c_shared) target_link_libraries(${TARGET} diff --git a/test/custom_runtime/CMakeLists.txt b/test/custom_runtime/CMakeLists.txt index e8b14445278be8..cf11b5555c3860 100644 --- a/test/custom_runtime/CMakeLists.txt +++ b/test/custom_runtime/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) set(PLUGIN_URL https://github.com/PaddlePaddle/PaddleCustomDevice.git) - set(PLUGIN_TAG develop) + set(PLUGIN_TAG release/2.6) file( GLOB TEST_OPS diff --git a/test/custom_runtime/test_collective_process_group_xccl.py b/test/custom_runtime/test_collective_process_group_xccl.py index 3c04a59ebfa742..83690a8ac11348 100644 --- a/test/custom_runtime/test_collective_process_group_xccl.py +++ b/test/custom_runtime/test_collective_process_group_xccl.py @@ -150,7 +150,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -159,6 +159,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_cpu_plugin.py b/test/custom_runtime/test_custom_cpu_plugin.py index b92df8def9dd30..5478b7ecfad64c 100755 --- a/test/custom_runtime/test_custom_cpu_plugin.py +++ b/test/custom_runtime/test_custom_cpu_plugin.py @@ -26,7 +26,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -35,6 +35,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_cpu_profiler_plugin.py b/test/custom_runtime/test_custom_cpu_profiler_plugin.py index 220c9a0a21aeb1..aeebec9e342c32 100644 --- a/test/custom_runtime/test_custom_cpu_profiler_plugin.py +++ b/test/custom_runtime/test_custom_cpu_profiler_plugin.py @@ -24,7 +24,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() 
cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -33,6 +33,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_cpu_to_static.py b/test/custom_runtime/test_custom_cpu_to_static.py index 60ba27004afbdd..55181cc017440f 100644 --- a/test/custom_runtime/test_custom_cpu_to_static.py +++ b/test/custom_runtime/test_custom_cpu_to_static.py @@ -106,7 +106,7 @@ def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -115,6 +115,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, ) os.system(cmd) diff --git a/test/custom_runtime/test_custom_op_setup.py b/test/custom_runtime/test_custom_op_setup.py index 47c7d9821d6b8e..2086b3ac6f2ed1 100644 --- a/test/custom_runtime/test_custom_op_setup.py +++ b/test/custom_runtime/test_custom_op_setup.py @@ -104,7 +104,7 @@ def setUp(self): self.cur_dir = os.path.dirname(os.path.abspath(__file__)) self.temp_dir = tempfile.TemporaryDirectory() cmd = 'cd {} \ - && git clone --depth 1 {} \ + && git clone --depth 1 {} -b {} \ && cd PaddleCustomDevice \ && git fetch origin \ && git checkout {} -b dev \ @@ -114,6 +114,7 @@ def setUp(self): self.temp_dir.name, os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG'), + os.getenv('PLUGIN_TAG'), sys.executable, self.cur_dir, ) diff --git a/test/custom_runtime/test_fleet_launch_custom_device.sh b/test/custom_runtime/test_fleet_launch_custom_device.sh index cc851558462399..5cbb3a11d14220 100644 --- a/test/custom_runtime/test_fleet_launch_custom_device.sh +++ b/test/custom_runtime/test_fleet_launch_custom_device.sh @@ -18,7 +18,7 @@ set -e temp_dir=$(mktemp --directory) pushd ${temp_dir} \ -&& git clone --depth 1 ${PLUGIN_URL} \ +&& git clone --depth 1 ${PLUGIN_URL} -b ${PLUGIN_TAG} \ && pushd PaddleCustomDevice/ \ && git fetch origin \ && git checkout ${PLUGIN_TAG} -b dev \ diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index e2ce58b7cf58c2..bdb9f182e46ada 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -8,6 +8,9 @@ set(SOT_ENVS SOT_LOG_LEVEL=0 COST_MODEL=False MIN_GRAPH_SIZE=0 set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) list(REMOVE_ITEM TEST_OPS test_lac) +list(REMOVE_ITEM TEST_OPS test_grad) # disable test_grad on release/2.6 +list(REMOVE_ITEM TEST_OPS test_sentiment +)# disable test_sentiment on release/2.6 # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope # will be removed and will cause some random failed in multi-thread. 
if(WITH_PYTHON) @@ -28,6 +31,9 @@ if(NOT WITH_GPU) # disable some model test on CPU to avoid timeout list(REMOVE_ITEM TEST_OPS test_resnet) list(REMOVE_ITEM TEST_OPS test_build_strategy) + list(REMOVE_ITEM TEST_OPS test_bert) + list(REMOVE_ITEM TEST_OPS test_transformer) + list(REMOVE_ITEM TEST_OPS test_mobile_net) endif() foreach(TEST_OP ${TEST_OPS}) @@ -37,15 +43,11 @@ endforeach() set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS "RUN_TYPE=EXCLUSIVE") -set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 120) set_tests_properties(test_seq2seq PROPERTIES TIMEOUT 420) set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 150) -set_tests_properties(test_bert PROPERTIES TIMEOUT 180) set_tests_properties(test_basic_api_transformation PROPERTIES TIMEOUT 240) set_tests_properties(test_reinforcement_learning PROPERTIES TIMEOUT 120) -set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 300) -set_tests_properties(test_bert PROPERTIES TIMEOUT 240) if(NOT WIN32) set_tests_properties(test_tsm PROPERTIES TIMEOUT 900) @@ -53,12 +55,14 @@ endif() if(APPLE) set_tests_properties(test_bmn PROPERTIES TIMEOUT 300) - set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300) endif() if(WITH_GPU) set_tests_properties(test_train_step_resnet18_sgd PROPERTIES TIMEOUT 240) set_tests_properties(test_train_step_resnet18_adam PROPERTIES TIMEOUT 240) + set_tests_properties(test_bert PROPERTIES TIMEOUT 240) + set_tests_properties(test_transformer PROPERTIES TIMEOUT 240) + set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 240) endif() # Legacy IR only tests for dygraph_to_static diff --git a/test/dygraph_to_static/test_list.py b/test/dygraph_to_static/test_list.py index 52db0e53eb6255..ef3d195d90805d 100644 --- a/test/dygraph_to_static/test_list.py +++ b/test/dygraph_to_static/test_list.py @@ -292,6 +292,7 @@ def init_dygraph_func(self): test_list_pop_in_while_loop, ] + # TODO(zhangbo): Refine BuildOpFrom for op with sub_block def train(self, to_static=False): with base.dygraph.guard(): if to_static: diff --git a/test/dygraph_to_static/test_mobile_net.py b/test/dygraph_to_static/test_mobile_net.py index 599d863d12c795..44cf791191a8de 100644 --- a/test/dygraph_to_static/test_mobile_net.py +++ b/test/dygraph_to_static/test_mobile_net.py @@ -19,7 +19,10 @@ import unittest import numpy as np -from dygraph_to_static_utils import Dy2StTestBase, test_pt_only +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_pt_only, +) from predictor_utils import PredictorTools import paddle @@ -735,12 +738,6 @@ def assert_same_predict(self, model_name): ) @test_pt_only - def test_mobile_net_pir(self): - # MobileNet-V1 - self.assert_same_loss("MobileNetV1") - # MobileNet-V2 - self.assert_same_loss("MobileNetV2") - def test_mobile_net(self): # MobileNet-V1 self.assert_same_loss("MobileNetV1") diff --git a/test/indexing/test_getitem.py b/test/indexing/test_getitem.py index f3a2374ecbe1d0..3959bde43d1528 100644 --- a/test/indexing/test_getitem.py +++ b/test/indexing/test_getitem.py @@ -233,6 +233,26 @@ def test_combined_index_11(self): np.testing.assert_allclose(y.numpy(), np_res) + def test_combined_index_12(self): + np_data = ( + np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)).astype(self.ndtype) + ) + + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * 
np_data + + np_res = np_data[:, :, [2, 4], :] + + x = paddle.to_tensor(np_data, dtype=self.dtype) + y = x[:, :, [2, 4], :] + + if self.dtype == 'bfloat16': + y = paddle.cast(y, dtype='float32') + + np.testing.assert_allclose(y.numpy(), np_res) + def test_index_has_range(self): np_data = ( np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)).astype(self.ndtype) @@ -970,6 +990,20 @@ def test_combined_index_11(self): np.testing.assert_allclose(res[0], np_res) + def test_combined_index_12(self): + np_data = np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)) + np_res = np_data[:, :, [2, 4], :] + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.to_tensor(np_data) + y = _getitem_static( + x, (slice(None), slice(None), [2, 4], slice(None)) + ) + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_res) + def test_index_has_range(self): # only one bool tensor with all False np_data = np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)) diff --git a/test/indexing/test_setitem.py b/test/indexing/test_setitem.py index b8d7e3361efc45..0f0bdf3d08b8da 100644 --- a/test/indexing/test_setitem.py +++ b/test/indexing/test_setitem.py @@ -28,6 +28,21 @@ def setUp(self): self.ndtype = np.float64 self.dtype = 'float64' + def test_advanced_index(self): + np_data = np.zeros((3, 4, 5, 6), dtype='float32').astype(self.ndtype) + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * np_data + + x = paddle.to_tensor(np_data, dtype=self.dtype) + np_data[[0, 1], [1, 2], [1]] = 10.0 + x[[0, 1], [1, 2], [1]] = 10.0 + + if self.dtype == 'bfloat16': + x = paddle.cast(x, dtype='float32') + np.testing.assert_allclose(x.numpy(), np_data) + def test_combined_index_1(self): np_data = np.zeros((3, 4, 5, 6), dtype='float32').astype(self.ndtype) if self.dtype == 'bfloat16': @@ -228,6 +243,54 @@ def test_indexing_is_boolean_false(self): np.testing.assert_allclose(x.numpy(), np_data) + def test_combined_indexing_and_value_is_tensor_2(self): + # value is tensor needed to broadcast and index will be adjusted + np_data = np.ones((3, 4, 5, 6)).astype(self.ndtype) + value_data = np.arange(3 * 4 * 2 * 1).reshape((3, 4, 2, 1)) + + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + value_data = convert_uint16_to_float( + convert_float_to_uint16(value_data) + ) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * np_data + value_data = value_data + 1j * value_data + + x = paddle.to_tensor(np_data, dtype=self.dtype) + v = paddle.to_tensor(value_data, dtype=self.dtype) + x[..., [1, 4], ::2] = v + + np_data[..., [1, 4], ::2] = value_data + if self.dtype == 'bfloat16': + x = paddle.cast(x, dtype='float32') + np.testing.assert_allclose(x.numpy(), np_data) + + def test_combined_indexing_and_value_is_tensor_3(self): + # value is tensor and index will be adjusted + # and the value rank is less than original tensor + np_data = np.ones((3, 4, 5, 6)).astype(self.ndtype) + value_data = np.arange(2 * 3 * 5).reshape((2, 3, 5)) + + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + value_data = convert_uint16_to_float( + convert_float_to_uint16(value_data) + ) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * np_data + value_data = value_data + 1j * value_data + + x = paddle.to_tensor(np_data, 
dtype=self.dtype) + v = paddle.to_tensor(value_data, dtype=self.dtype) + x[:, [1, 3], :, [3, 4]] = v + + np_data[:, [1, 3], :, [3, 4]] = value_data + + if self.dtype == 'bfloat16': + x = paddle.cast(x, dtype='float32') + np.testing.assert_allclose(x.numpy(), np_data) + def test_inplace_with_stride(self): np_v = np.random.randn(3, 1).astype(self.ndtype) if self.dtype == 'bfloat16': @@ -242,12 +305,12 @@ def test_inplace_with_stride(self): zero.stop_gradient = False zero1 = zero * 1 - zero1[paddle.to_tensor([0, 1])] = vv + zero1[1, paddle.to_tensor([2, 0, 1])] = vv loss = zero1.sum() loss.backward() - expected_v_grad = np.ones((3, 1)) * 10.0 + expected_v_grad = np.ones((3, 1)) * 5.0 if self.dtype == 'bfloat16': np.testing.assert_allclose( v.grad.cast('float32').numpy(), expected_v_grad @@ -574,6 +637,69 @@ def test_indexing_is_boolean_false(self): np.testing.assert_allclose(res[0], np_data) + def test_combined_indexing_and_value_is_tensor_1(self): + # value is tensor with same shape to getitem and index will be adjusted + np_data = np.ones((3, 3), dtype='int32') + value_data = np.array([-1, -1, -1]) + np_data[:, [0, 2]] = np_data[:, [0, 2]] * np.expand_dims(value_data, -1) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.ones((3, 3), dtype='int32') + v = paddle.to_tensor([-1, -1, -1]) + y = _setitem_static( + x, + (slice(None), [0, 2]), + x[:, [0, 2]] * v.unsqueeze(-1), + ) + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_data) + + def test_combined_indexing_and_value_is_tensor_2(self): + # value is tensor needed to broadcast and index will be adjusted + np_data = np.ones((3, 4, 5, 6), dtype='int32') + value_data = np.arange(3 * 4 * 2 * 1).reshape((3, 4, 2, 1)) + np_data[..., [1, 4], ::2] = value_data + + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.ones((3, 4, 5, 6), dtype='int32') + v = paddle.arange(3 * 4 * 2 * 1).reshape((3, 4, 2, 1)) + + y = _setitem_static( + x, + (..., [1, 4], slice(None, None, 2)), + v, + ) + + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_data) + + def test_combined_indexing_and_value_is_tensor_3(self): + # value is tensor and index will be adjusted + # and the value rank is less than original tensor + np_data = np.ones((3, 4, 5, 6), dtype='int32') + value_data = np.arange(2 * 3 * 5).reshape((2, 3, 5)) + np_data[:, [1, 3], :, [3, 4]] = value_data + + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.ones((3, 4, 5, 6), dtype='int32') + v = paddle.arange(2 * 3 * 5).reshape((2, 3, 5)) + y = _setitem_static( + x, + (slice(None), [1, 3], slice(None), [3, 4]), + v, + ) + + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_data) + if __name__ == '__main__': unittest.main() diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index a2f36617de8e6e..f9424502484ccb 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -275,6 +275,7 @@ def generate_weight(): self.outputs = outputs self.input_type = input_type self.no_cast_list = [] if no_cast_list is None else no_cast_list + self.supported_cast_type = [np.float32, np.float16] def __repr__(self): log_str = '' @@ -292,11 +293,9 @@ def __repr__(self): return log_str def set_input_type(self, _type: np.dtype) -> None: - assert _type in [ - np.float32, - np.float16, - None, - ], "PaddleTRT only supports FP32 / 
FP16 IO" + assert ( + _type in self.supported_cast_type or _type is None + ), "PaddleTRT only supports FP32 / FP16 IO" ver = paddle.inference.get_trt_compile_version() trt_version = ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 @@ -309,15 +308,14 @@ def set_input_type(self, _type: np.dtype) -> None: def get_feed_data(self) -> Dict[str, Dict[str, Any]]: feed_data = {} for name, tensor_config in self.inputs.items(): - do_casting = ( - self.input_type is not None and name not in self.no_cast_list - ) + data = tensor_config.data # Cast to target input_type - data = ( - tensor_config.data.astype(self.input_type) - if do_casting - else tensor_config.data - ) + if ( + self.input_type is not None + and name not in self.no_cast_list + and data.dtype in self.supported_cast_type + ): + data = data.astype(self.input_type) # Truncate FP32 tensors to FP16 precision for FP16 test stability if data.dtype == np.float32 and name not in self.no_cast_list: data = data.astype(np.float16).astype(np.float32) @@ -334,10 +332,14 @@ def _cast(self) -> None: for name, inp in self.inputs.items(): if name in self.no_cast_list: continue + if inp.dtype not in self.supported_cast_type: + continue inp.convert_type_inplace(self.input_type) for name, weight in self.weights.items(): if name in self.no_cast_list: continue + if weight.dtype not in self.supported_cast_type: + continue weight.convert_type_inplace(self.input_type) return self diff --git a/test/ir/inference/test_trt_convert_assign.py b/test/ir/inference/test_trt_convert_assign.py index 55939982d5ee0d..99b027877bc9cb 100644 --- a/test/ir/inference/test_trt_convert_assign.py +++ b/test/ir/inference/test_trt_convert_assign.py @@ -120,9 +120,8 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if not dynamic_shape and ( - self.has_bool_dtype or self.dims == 1 or self.dims == 0 - ): + # Static shape does not support 0 or 1 dim's input + if not dynamic_shape and (self.dims == 1 or self.dims == 0): return 0, 4 return 1, 2 diff --git a/test/ir/inference/test_trt_convert_cast.py b/test/ir/inference/test_trt_convert_cast.py index 026abc571050a2..0b5f2186429e3e 100644 --- a/test/ir/inference/test_trt_convert_cast.py +++ b/test/ir/inference/test_trt_convert_cast.py @@ -118,6 +118,7 @@ def generate_input(type): ) }, outputs=["cast_output_data1"], + no_cast_list=["input_data"], ) yield program_config diff --git a/test/ir/inference/test_trt_convert_lookup_table.py b/test/ir/inference/test_trt_convert_lookup_table.py index e1fb64bcdf545f..b7cf7d657d7a02 100644 --- a/test/ir/inference/test_trt_convert_lookup_table.py +++ b/test/ir/inference/test_trt_convert_lookup_table.py @@ -80,6 +80,7 @@ def generate_input2(dims, attrs: List[Dict[str, Any]]): ) }, outputs=["out_data"], + no_cast_list=["indices"], ) yield program_config diff --git a/test/ir/inference/test_trt_convert_solve.py b/test/ir/inference/test_trt_convert_solve.py index c3f9b51d0d05c2..de70cfacc4e071 100644 --- a/test/ir/inference/test_trt_convert_solve.py +++ b/test/ir/inference/test_trt_convert_solve.py @@ -87,11 +87,10 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), (1, 3), 1e-3 + yield self.create_inference_config(), (1, 
3), (1e-3, 1e-3) def test(self): self.run_test() diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 9c875f6755187a..0d54fa7ea37400 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -550,9 +550,9 @@ if((NOT WITH_GPU) endif() list(REMOVE_ITEM TEST_OPS "test_stride") +list(REMOVE_ITEM TEST_OPS "test_graph_reindex") if(WITH_COVERAGE) list(REMOVE_ITEM TEST_OPS test_weight_decay) - list(REMOVE_ITEM TEST_OPS test_graph_reindex) list(REMOVE_ITEM TEST_OPS test_cuda_graphed_layer) list(REMOVE_ITEM TEST_OPS test_cuda_graph_partial_graph_static_run) list(REMOVE_ITEM DIST_TEST_OPS test_dist_fleet_geo) diff --git a/test/legacy_test/c_embedding_op_base.py b/test/legacy_test/c_embedding_op_base.py index 83758b6bb0bc98..cfb9df8e69d22d 100644 --- a/test/legacy_test/c_embedding_op_base.py +++ b/test/legacy_test/c_embedding_op_base.py @@ -34,10 +34,8 @@ def get_c_embedding(start, end, table, ids): return output -def c_embedding_wrapper(table, index, start_index=0): - return paddle._legacy_C_ops.c_embedding( - table, index, "start_index", start_index - ) +def c_embedding_wrapper(table, index, start_index=0, vocab_size=-1): + return paddle._C_ops.c_embedding(table, index, start_index, vocab_size) class TestCEmbeddingCPU(OpTest): @@ -58,11 +56,15 @@ def initcase(self): ) self.start_index = 10 self.end_index = self.start_index + 17 + self.vocab_size = 34 self.inputs = {'W': table, 'Ids': ids} np_out = get_c_embedding(self.start_index, self.end_index, table, ids) self.outputs = {'Out': np_out.reshape((2, 4, 64))} - self.attrs = {'start_index': self.start_index} + self.attrs = { + 'start_index': self.start_index, + 'vocab_size': self.vocab_size, + } if core.is_compiled_with_xpu(): self.__class__.use_xpu = True @@ -87,12 +89,20 @@ def test_check_output(self): self.check_output_with_place(core.CUDAPlace(0)) elif core.is_compiled_with_xpu(): self.check_output_with_place(core.XPUPlace(0)) + else: + current_place = paddle.framework._current_expected_place() + if isinstance(current_place, paddle.CustomPlace): + self.check_output_with_place(current_place) def test_check_grad(self): if core.is_compiled_with_cuda(): self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out') elif core.is_compiled_with_xpu(): self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out') + else: + current_place = paddle.framework._current_expected_place() + if isinstance(current_place, paddle.CustomPlace): + self.check_grad_with_place(current_place, ['W'], 'Out') def init_dtype(self): if core.is_compiled_with_cuda(): @@ -101,6 +111,11 @@ def init_dtype(self): elif core.is_compiled_with_xpu(): self.dtype = "float32" self.ids_dtype = "int64" + else: + current_place = paddle.framework._current_expected_place() + if isinstance(current_place, paddle.CustomPlace): + self.dtype = "float32" + self.ids_dtype = "int64" class TestCEmbeddingOpFP32(TestCEmbeddingOpBase): diff --git a/test/legacy_test/test_download.py b/test/legacy_test/test_download.py index 742c4b2a651902..da25a3021a31e0 100644 --- a/test/legacy_test/test_download.py +++ b/test/legacy_test/test_download.py @@ -120,14 +120,6 @@ def test_retry_exception( './test', ) - def test_wget_download_error( - self, - ): - with self.assertRaises(RuntimeError): - from paddle.utils.download import _download - - _download('www.baidu', './test', method='wget') - def test_download_methods( self, ): @@ -136,14 +128,9 @@ def test_download_methods( "https://paddle-hapi.bj.bcebos.com/unittest/files.zip", ] - import sys - from 
paddle.utils.download import _download - if sys.platform == 'linux': - methods = ['wget', 'get'] - else: - methods = ['get'] + methods = ['get'] for url in urls: for method in methods: diff --git a/test/legacy_test/test_put_along_axis_op.py b/test/legacy_test/test_put_along_axis_op.py index 43d2e80c25e24a..47cfc65d617136 100644 --- a/test/legacy_test/test_put_along_axis_op.py +++ b/test/legacy_test/test_put_along_axis_op.py @@ -120,6 +120,470 @@ def init_data(self): self.axis_type = "int64" +class TestPutAlongAxisOpMul(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mul" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMulNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mul" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + self.nums = np.zeros_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + if self.nums[i, self.index[i, j, k], k] == 0: + self.target[i, self.index[i, j, k], k] = self.value[ + i, j, k + ] + else: + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpAdd(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "add" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 100, (5, 5, 5)).astype( + self.value_type + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpAddNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "add" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + self.nums = np.zeros_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + if self.nums[i, self.index[i, j, k], k] == 0: + self.target[i, self.index[i, j, k], k] = self.value[ + i, j, k + ] + else: + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMean(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mean" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
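# The reference output for reduce="mean" with Include_self=True is built below by
# accumulating every scattered value into `target` while tracking the number of
# contributions per element in `nums` (initialized to ones so the original value
# counts once), then dividing element-wise -- a plain-NumPy restatement of the
# semantics the put_along_axis kernel is expected to implement.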
+ self.target = copy.deepcopy(self.xnp) + self.nums = np.ones_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + for i in range(10): + for j in range(10): + for k in range(10): + self.target[i, j, k] /= self.nums[i, j, k] + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMeanNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "mean" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + self.nums = np.zeros_like(self.target) + for i in range(5): + for j in range(5): + for k in range(5): + if self.nums[i, self.index[i, j, k], k] == 0: + self.target[i, self.index[i, j, k], k] = self.value[ + i, j, k + ] + else: + self.target[i, self.index[i, j, k], k] += self.value[ + i, j, k + ] + self.nums[i, self.index[i, j, k], k] += 1 + for i in range(10): + for j in range(10): + for k in range(10): + if self.nums[i, j, k] > 0: + self.target[i, j, k] = ( + self.target[i, j, k] / self.nums[i, j, k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMin(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amin" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + < self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMinNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amin" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = self.value[i, j, k] + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + < self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMax(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amax" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + > self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'include_self': True, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisOpMaxNotIncludeSelf(TestPutAlongAxisOp): + def setUp(self): + self.init_data() + self.reduce_op = "amax" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = self.value[i, j, k] + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] = ( + self.value[i, j, k] + if self.value[i, j, k] + > self.target[i, self.index[i, j, k], k] + else self.target[i, self.index[i, j, k], k] + ) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index, + 'Value': self.value, + } + self.attrs = { + 'Axis': self.axis, + 'Reduce': self.reduce_op, + 'Include_self': False, + 'broadcast': False, + } + self.outputs = {'Result': self.target} + + def init_data(self): + self.dtype = 'float64' + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = ( + np.arange(1, 126).reshape((5, 5, 5)).astype(self.value_type) + ) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + @unittest.skipIf( not core.is_compiled_with_cuda() or not core.is_bfloat16_supported(core.CUDAPlace(0)), @@ -274,6 +738,45 @@ def run(place): run(place) +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPILargeCase(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [64, 1327104] + self.index_shape = [64, 1327104] + self.index_np = np.zeros(self.index_shape).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + self.axis = 1 + self.value_np = np.ones(self.index_shape).astype(np.float32) + self.x_feed = copy.deepcopy(self.x_np) + self.place = [paddle.CUDAPlace(0)] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + out = paddle.put_along_axis( + x_tensor, index_tensor, value_tensor, self.axis + ) + np.array( + np.put_along_axis( + self.x_np, self.index_np, self.value_np, self.axis + ) + ) + out_ref = self.x_np + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + paddle.enable_static() + + for place in self.place: + run(place) + + class TestPutAlongAxisAPICase2(TestPutAlongAxisAPI): def setUp(self): 
np.random.seed(0) @@ -468,13 +971,262 @@ def test_error(self): except Exception as error: self.assertIsInstance(error, RuntimeError) - # use includ_self=False - try: + def test_index_type_error(self): + tensorx = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]).astype("float32") + indices = paddle.to_tensor([[1]]).astype("float32") + values = paddle.to_tensor([[2]]) + with self.assertRaises(TypeError): res = paddle.put_along_axis( - tensorx, indices, 1.0, 0, 'assign', False + tensorx, indices, values, 0, 'mul', True, False ) - except Exception as error: - self.assertIsInstance(error, ValueError) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPIMulFloat32(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'float32' + self.x_type = "float32" + self.x_shape = (10, 10, 10) + self.value_type = "float32" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.random.randint(0, 5, (5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestPutAlongAxisAPIMulBF16(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'float32' + self.x_type = "float32" + self.x_shape = (10, 10, 10) + self.value_type = "float32" + self.value = np.random.randint(1, 3, (3, 3, 3)).astype(self.value_type) + self.index_type = "int64" + self.index = np.random.randint(0, 3, (3, 3, 3)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + self.target = copy.deepcopy(self.xnp) + for i in range(3): + for j in range(3): + for k in range(3): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + self.xnp = convert_float_to_uint16(self.xnp) + self.value = convert_float_to_uint16(self.value) + self.target = convert_float_to_uint16(self.target) + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", 
+) +class TestPutAlongAxisAPIMulInt32(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'int32' + self.x_type = "int32" + self.x_shape = (10, 10, 10) + self.value_type = "int32" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int32" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.randint(1, 5, self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPIMulInt64(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'int64' + self.x_type = "int64" + self.x_shape = (10, 10, 10) + self.value_type = "int64" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.randint(1, 5, self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. + self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), + "core is not complied with CUDA", +) +class TestPutAlongAxisAPIMulUint8(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.dtype = 'uint8' + self.x_type = "uint8" + self.x_shape = (10, 10, 10) + self.value_type = "uint8" + self.value = np.random.randint(1, 5, (5, 5, 5)).astype(self.value_type) + self.index_type = "int64" + self.index = np.zeros((5, 5, 5)).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.randint(1, 5, self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
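# As in the float32/bfloat16/int32/int64 cases above, this uint8 case exercises the
# positional call pattern used throughout these API tests,
#
#   out = paddle.put_along_axis(
#       x_tensor, index_tensor, value_tensor, self.axis, "mul", True, False
#   )
#
# which, judging from the matching OpTest attrs in this file, corresponds to
# reduce="mul" with include_self=True and broadcast=False; the loop below builds
# the multiplicative reference result accordingly.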
+ self.target = copy.deepcopy(self.xnp) + for i in range(5): + for j in range(5): + for k in range(5): + self.target[i, self.index[i, j, k], k] *= self.value[ + i, j, k + ] + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.xnp) + index_tensor = paddle.to_tensor(self.index) + value_tensor = paddle.to_tensor(self.value) + out = paddle.put_along_axis( + x_tensor, + index_tensor, + value_tensor, + self.axis, + "mul", + True, + False, + ) + out_ref = self.target + np.testing.assert_allclose(out.numpy(), out_ref, rtol=0.001) + + run(paddle.CUDAPlace(0)) if __name__ == "__main__": diff --git a/test/legacy_test/test_repeat_interleave_op.py b/test/legacy_test/test_repeat_interleave_op.py index b2d0a12c6e260d..60d11a813263e5 100644 --- a/test/legacy_test/test_repeat_interleave_op.py +++ b/test/legacy_test/test_repeat_interleave_op.py @@ -252,6 +252,25 @@ def test_dygraph_api(self): expect_out = np.repeat(input_x, index, axis=None) np.testing.assert_allclose(expect_out, np_z, rtol=1e-05) + # case input dtype is bfloat16 + input_x = np.array([[1, 2, 1], [1, 2, 3]]).astype('uint16') + + with base.dygraph.guard(): + x = paddle.to_tensor(input_x) + index = paddle.to_tensor(index_x) + z = paddle.repeat_interleave(x, index, None) + np_z = z.numpy() + expect_out = np.repeat(input_x, index_x, axis=None) + np.testing.assert_allclose(expect_out, np_z, rtol=1e-05) + + with base.dygraph.guard(): + x = paddle.to_tensor(input_x) + index = 2 + z = paddle.repeat_interleave(x, index, None) + np_z = z.numpy() + expect_out = np.repeat(input_x, index, axis=None) + np.testing.assert_allclose(expect_out, np_z, rtol=1e-05) + # case 1: with base.dygraph.guard(): x = base.dygraph.to_variable(self.data_x) diff --git a/test/legacy_test/test_set_value_op.py b/test/legacy_test/test_set_value_op.py index 65c9f69765d116..c42026fb9caee1 100644 --- a/test/legacy_test/test_set_value_op.py +++ b/test/legacy_test/test_set_value_op.py @@ -1978,5 +1978,87 @@ def test_check_grad(self): self.check_grad_with_place(place, ['Input'], 'Out', check_dygraph=False) +class TestSetValueWithScalarInStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shape = (10, 2) + self.exe = paddle.static.Executor() + self.train_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + + def test_value_input_is_scalar(self): + with paddle.static.program_guard( + self.train_program, self.startup_program + ): + x = paddle.ones(self.shape) + x.stop_gradient = False + y = x * 1 + + # mock test case x[0, 0] = 10 with no ValueTensor input + inputs = { + 'Input': y, + } + attrs = { + 'axes': [0, 1], + 'starts': [0, 0], + 'ends': [1, 1], + 'steps': [1, 1], + 'values': [10], + 'shape': [1], + } + + helper = LayerHelper("set_value") + out = helper.create_variable_for_type_inference(dtype=y.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': out}, + attrs=attrs, + ) + + np_data = np.ones(self.shape).astype('float32') + + paddle.static.append_backward(out.sum()) + res = self.exe.run( + self.train_program, fetch_list=[out, x.grad_name] + ) + + np_data[0, 0] = 10 + expected_x_grad = np.ones(self.shape) + expected_x_grad[0, 0] = 0 + + np.testing.assert_array_equal(res[0], np_data) + np.testing.assert_array_equal(res[1], expected_x_grad) + + +class TestSetValueWithScalarInDygraph(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.shape = (10, 2) + + def test_value_input_is_scalar(self): + x = 
paddle.ones(self.shape) + x.stop_gradient = False + y = x * 1 + + # mock test case x[0, 0] = 10 with no ValueTensor input + out = paddle._C_ops.set_value( + y, [0, 0], [1, 1], [1, 1], [0, 1], [], [], [1], [10.0] + ) + + loss = out.sum() + loss.backward() + + np_data = np.ones(self.shape).astype('float32') + np_data[0, 0] = 10 + + expected_x_grad = np.ones(self.shape) + expected_x_grad[0, 0] = 0 + + np.testing.assert_array_equal(out, np_data) + np.testing.assert_array_equal(x.grad, expected_x_grad) + + if __name__ == '__main__': unittest.main() diff --git a/test/legacy_test/test_sparse_fused_attention_op.py b/test/legacy_test/test_sparse_fused_attention_op.py index 68cdd16d4bd12c..098f4815b85f38 100644 --- a/test/legacy_test/test_sparse_fused_attention_op.py +++ b/test/legacy_test/test_sparse_fused_attention_op.py @@ -42,6 +42,7 @@ def get_cuda_version(): ) class TestSparseAttentionAPI1(unittest.TestCase): def setUp(self): + paddle.seed(0) self.batch_size = 16 self.num_heads = 16 self.seq_len = 128 @@ -134,6 +135,7 @@ def test_dygraph(self): class TestSparseAttentionAPI2(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 128 @@ -144,6 +146,7 @@ def setUp(self): class TestSparseAttentionAPI3(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 @@ -154,6 +157,7 @@ def setUp(self): class TestSparseAttentionAPI4(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 @@ -164,6 +168,7 @@ def setUp(self): class TestSparseAttentionAPI5(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 diff --git a/test/quantization/test_groupwise.py b/test/quantization/test_groupwise.py new file mode 100644 index 00000000000000..aef864fd2713bd --- /dev/null +++ b/test/quantization/test_groupwise.py @@ -0,0 +1,69 @@ +# copyright (c) 2023 paddlepaddle authors. all rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import paddle +from paddle.nn import Linear, Sequential +from paddle.quantization import PTQ, QuantConfig +from paddle.quantization.observers import GroupWiseWeightObserver + + +class LinearDygraph(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.fc = Sequential( + Linear(128, 128), Linear(128, 128), Linear(128, 128) + ) + + def forward(self, inputs): + out = self.fc(inputs) + return out + + +class TestPTQGroupWise(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, 'ptq') + + def tearDown(self): + self.temp_dir.cleanup() + + def _get_model_for_ptq(self): + observer = GroupWiseWeightObserver(quant_bits=4, group_size=128) + model = LinearDygraph() + model.eval() + q_config = QuantConfig(activation=None, weight=observer) + ptq = PTQ(q_config) + quant_model = ptq.quantize(model) + return quant_model, ptq + + def _count_layers(self, model, layer_type): + count = 0 + for _layer in model.sublayers(True): + if isinstance(_layer, layer_type): + count += 1 + return count + + def test_quantize(self): + ptq_model, _ = self._get_model_for_ptq() + inputs = paddle.rand([128, 128], dtype="float32") + out = ptq_model(inputs) + self.assertIsNotNone(out) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/quantization/test_llm_int8_linear.py b/test/quantization/test_llm_int8_linear.py index 5a35b0d5124616..e4920f198f2c6f 100644 --- a/test/quantization/test_llm_int8_linear.py +++ b/test/quantization/test_llm_int8_linear.py @@ -15,12 +15,11 @@ import unittest import numpy as np -from test_weight_only_linear import convert_uint16_to_float, get_cuda_version +from test_weight_only_linear import convert_uint16_to_float import paddle import paddle.nn.quant as Q from paddle import base -from paddle.base import core from paddle.base.framework import default_main_program from paddle.framework import set_default_dtype from paddle.pir_utils import test_with_pir_api @@ -30,12 +29,7 @@ default_main_program().random_seed = 42 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase(unittest.TestCase): def config(self): self.dtype = 'float16' @@ -149,12 +143,7 @@ def test_llm_int8_linear(self): ) -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase1(LLMInt8LinearTestCase): def config(self): super().config() @@ -162,12 +151,7 @@ def config(self): self.weight_dtype = "int8" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase2(LLMInt8LinearTestCase): def config(self): super().config() @@ -176,12 +160,7 @@ def config(self): self.weight_dtype = "int8" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and 
CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase3(LLMInt8LinearTestCase): def config(self): super().config() @@ -189,13 +168,7 @@ def config(self): self.weight_dtype = "int8" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8 - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8 or core is not support bfloat16", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase4(LLMInt8LinearTestCase): def config(self): super().config() @@ -203,12 +176,7 @@ def config(self): self.weight_dtype = "int4" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase5(LLMInt8LinearTestCase): def config(self): super().config() @@ -217,13 +185,7 @@ def config(self): self.weight_dtype = "int4" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8 - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8 or core is not support bfloat16", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase6(LLMInt8LinearTestCase): def config(self): super().config() @@ -231,12 +193,7 @@ def config(self): self.weight_dtype = "int4" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase7(LLMInt8LinearTestCase): def config(self): super().config() @@ -246,12 +203,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase8(LLMInt8LinearTestCase): def config(self): super().config() @@ -262,12 +214,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase9(LLMInt8LinearTestCase): def config(self): super().config() @@ -277,12 +224,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCase10(LLMInt8LinearTestCase): def config(self): super().config() @@ -293,13 +235,7 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul 
requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) +@unittest.skipIf(True, "Disable this unit test in release/2.6") class LLMInt8LinearTestCaseStatic(LLMInt8LinearTestCase): def config(self): super().config() diff --git a/test/quantization/test_post_training_quantization_mobilenetv1.py b/test/quantization/test_post_training_quantization_mobilenetv1.py index 4500f61ca13dc6..113b2cb066b915 100644 --- a/test/quantization/test_post_training_quantization_mobilenetv1.py +++ b/test/quantization/test_post_training_quantization_mobilenetv1.py @@ -25,6 +25,7 @@ import paddle from paddle.dataset.common import download +from paddle.io import Dataset from paddle.static.log_helper import get_logger from paddle.static.quantization import PostTrainingQuantization @@ -116,6 +117,33 @@ def val(data_dir=DATA_DIR): return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) +class ImageNetDataset(Dataset): + def __init__(self, data_dir=DATA_DIR, shuffle=False, need_label=False): + super().__init__() + self.need_label = need_label + self.data_dir = data_dir + val_file_list = os.path.join(data_dir, 'val_list.txt') + with open(val_file_list) as flist: + lines = [line.strip() for line in flist] + if shuffle: + np.random.shuffle(lines) + self.data = [line.split() for line in lines] + + def __getitem__(self, index): + sample = self.data[index] + data_path = os.path.join(self.data_dir, sample[0]) + data, label = process_image( + [data_path, sample[1]], mode='val', color_jitter=False, rotate=False + ) + if self.need_label: + return data, np.array([label]).astype('int64') + else: + return data + + def __len__(self): + return len(self.data) + + class TestPostTrainingQuantization(unittest.TestCase): def setUp(self): self.int8_download = 'int8/download' @@ -267,7 +295,7 @@ def run_program( throughput = cnt / np.sum(periods) latency = np.average(periods) acc1 = np.sum(test_info) / cnt - return (throughput, latency, acc1) + return (throughput, latency, acc1, feed_dict) def generate_quantized_model( self, @@ -284,6 +312,7 @@ def generate_quantized_model( batch_nums=1, onnx_format=False, deploy_backend=None, + feed_name="inputs", ): try: os.system("mkdir " + self.int8_model) @@ -293,11 +322,30 @@ def generate_quantized_model( place = paddle.CPUPlace() exe = paddle.static.Executor(place) - val_reader = val() + image = paddle.static.data( + name=feed_name[0], shape=[None, 3, 224, 224], dtype='float32' + ) + feed_list = [image] + if len(feed_name) == 2: + label = paddle.static.data( + name='label', shape=[None, 1], dtype='int64' + ) + feed_list.append(label) + + val_dataset = ImageNetDataset(need_label=len(feed_list) == 2) + data_loader = paddle.io.DataLoader( + val_dataset, + places=place, + feed_list=feed_list, + drop_last=False, + return_list=False, + batch_size=2, + shuffle=False, + ) ptq = PostTrainingQuantization( executor=exe, - sample_generator=val_reader, + data_loader=data_loader, model_dir=model_path, model_filename=model_filename, params_filename=params_filename, @@ -348,7 +396,12 @@ def run_test( model, infer_iterations * batch_size ) ) - (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( + ( + fp32_throughput, + fp32_latency, + fp32_acc1, + feed_name, + ) = self.run_program( model_path, model_filename, params_filename, @@ -370,6 +423,7 @@ def run_test( batch_nums, onnx_format, deploy_backend, + feed_name, ) _logger.info( @@ -377,7 +431,7 @@ def run_test( model, infer_iterations * batch_size ) ) - (int8_throughput, int8_latency, int8_acc1) = self.run_program( + (int8_throughput, int8_latency, 
int8_acc1, _) = self.run_program( self.int8_model, model_filename, params_filename, @@ -421,7 +475,7 @@ def test_post_training_kl_mobilenetv1(self): is_use_cache_file = False is_optimize_model = True diff_threshold = 0.025 - batch_nums = 1 + batch_nums = 2 self.run_test( model, 'inference.pdmodel', @@ -607,7 +661,7 @@ def test_post_training_onnx_format_mobilenetv1_tensorrt(self): is_optimize_model = False onnx_format = True diff_threshold = 0.05 - batch_nums = 2 + batch_nums = 12 deploy_backend = "tensorrt" self.run_test( model, @@ -650,7 +704,7 @@ def test_post_training_onnx_format_mobilenetv1_mkldnn(self): is_optimize_model = False onnx_format = True diff_threshold = 0.05 - batch_nums = 1 + batch_nums = 12 deploy_backend = "mkldnn" self.run_test( model, diff --git a/test/quantization/test_post_training_quantization_resnet50.py b/test/quantization/test_post_training_quantization_resnet50.py index ca87f17572a4c3..895b2f170084dc 100644 --- a/test/quantization/test_post_training_quantization_resnet50.py +++ b/test/quantization/test_post_training_quantization_resnet50.py @@ -113,7 +113,7 @@ def run_program( throughput = cnt / np.sum(periods) latency = np.average(periods) acc1 = np.sum(test_info) / cnt - return (throughput, latency, acc1) + return (throughput, latency, acc1, feed_dict) class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingForResnet50): diff --git a/test/quantization/test_ptq.py b/test/quantization/test_ptq.py index 29ef308bd0b54e..2c6c21d472665f 100644 --- a/test/quantization/test_ptq.py +++ b/test/quantization/test_ptq.py @@ -128,6 +128,48 @@ def test_convert(self): self.assertIsNotNone(results) paddle.disable_static() + def test_convert_2times(self): + quant_model, ptq = self._get_model_for_ptq() + + image = paddle.rand([1, 1, 32, 32], dtype="float32") + converted_model = ptq.convert(quant_model) + converted_model = ptq.convert(converted_model) + out = converted_model(image) + self.assertIsNotNone(out) + + observer_count = self._count_layers( + converted_model, AbsmaxObserverLayer + ) + quanter_count = self._count_layers(converted_model, LinearQuanter) + dequanter_count = self._count_layers(converted_model, LinearDequanter) + self.assertEqual(observer_count, 0) + self.assertEqual(dequanter_count, 14) + self.assertEqual(quanter_count, 9) + + save_path = os.path.join(self.temp_dir.name, 'int8_infer') + paddle.jit.save(converted_model, save_path, [image]) + + paddle.enable_static() + exe = paddle.static.Executor(paddle.CPUPlace()) + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + [ + inference_program, + feed_target_names, + fetch_targets, + ] = paddle.static.load_inference_model(save_path, exe) + tensor_img = np.array( + np.random.random((1, 1, 32, 32)), dtype=np.float32 + ) + results = exe.run( + inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets, + ) + self.assertIsNotNone(results) + paddle.disable_static() + if __name__ == '__main__': unittest.main() diff --git a/test/quantization/test_weight_only_linear.py b/test/quantization/test_weight_only_linear.py index 81f84f138e70b8..c7bbc1c6582676 100644 --- a/test/quantization/test_weight_only_linear.py +++ b/test/quantization/test_weight_only_linear.py @@ -399,5 +399,47 @@ def test_weightonly_linear_backward(self): np.testing.assert_allclose(quant_x.grad, x.grad, rtol=1e-3, atol=1e-3) +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + 
"quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", +) +class WeightOnlyLinearTestCase11(WeightOnlyLinearTestCase): + def config(self): + super().config() + self.dtype = 'float16' + self.weight_dtype = "int8" + self.in_features = 128 + self.out_features = 288 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", +) +class WeightOnlyLinearTestCase12(WeightOnlyLinearTestCase): + def config(self): + super().config() + self.dtype = 'float16' + self.bias = False + self.weight_dtype = "int8" + self.in_features = 128 + self.out_features = 288 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or get_cuda_version() < 11020 + or paddle.device.cuda.get_device_capability()[0] < 8, + "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", +) +class WeightOnlyLinearTestCase13(WeightOnlyLinearTestCase): + def config(self): + super().config() + self.dtype = 'bfloat16' + self.weight_dtype = "int8" + self.in_features = 128 + self.out_features = 288 + + if __name__ == '__main__': unittest.main() diff --git a/third_party/cryptopp b/third_party/cryptopp new file mode 160000 index 00000000000000..9dcc26c58213ab --- /dev/null +++ b/third_party/cryptopp @@ -0,0 +1 @@ +Subproject commit 9dcc26c58213abb8351fbb1b2a7a1d2c667366e4 diff --git a/third_party/cryptopp-cmake b/third_party/cryptopp-cmake new file mode 160000 index 00000000000000..6d0666c457fbbf --- /dev/null +++ b/third_party/cryptopp-cmake @@ -0,0 +1 @@ +Subproject commit 6d0666c457fbbf6f81819fd2b80f0cb5b6646593 diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 464fb9cc1cfe46..cbc97375fd869d 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -28,7 +28,6 @@ 'test_fc_gru_fuse_pass_cc', 'device_worker_test', 'test_custom_conj', - 'infer_io_utils_tester', 'test_transpose_bf16_mkldnn_op', 'test_container', 'cpu_helper_test', @@ -73,7 +72,6 @@ 'test_pybind_interface', 'test_io_save_load', 'test_fusion_lstm_int8_mkldnn_op', - 'test_benchmark', 'test_protobuf', 'test_tdm_sampler_op', 'test_teacher_student_sigmoid_loss_op', @@ -482,7 +480,6 @@ 'test_communicator_half_async', 'test_dynrnn_gradient_check', 'test_pool2d_bf16_mkldnn_op', - 'test_table_printer', 'test_framework_debug_str', 'test_dist_fleet_ps2', 'test_collective_scatter_api', @@ -1926,7 +1923,6 @@ 'test_bpr_loss_op', 'test_boxps', 'test_bipartite_match_op', - 'test_benchmark', 'test_beam_search_op', 'test_batch_sampler', 'test_batch_norm_act_fuse_pass', @@ -1970,7 +1966,6 @@ 'lodtensor_printer_test', 'test_dispatch_jit', 'inlined_vector_test', - 'infer_io_utils_tester', 'graph_to_program_pass_test', 'graph_test', 'graph_helper_test', @@ -2176,7 +2171,6 @@ 'test_auto_parallel_api', 'test_tensor_copy_from', 'test_analyzer_capi_exp_xpu', - 'test_table_printer', 'test_egr_task_autocodegen', 'test_static_save_load_bf16', 'test_parallel_executor_run_cinn',