From 70cf5ccc4a93f7824fc62e715ef8fa8c063f1491 Mon Sep 17 00:00:00 2001 From: "haowen.han" Date: Mon, 20 May 2024 11:48:06 +0000 Subject: [PATCH 1/6] Revert "paddle_musa v2.6.0 release initialization (#64265)" This reverts commit 6caf5d5cfdae84480dbd31673045355c01b7b3da. --- .gitmodules | 8 + CMakeLists.txt | 71 +- README.md | 2 +- README_cn.md | 4 +- README_ja.md | 2 +- cmake/configure.cmake | 13 - cmake/cupti.cmake | 6 +- cmake/external/cryptopp.cmake | 18 +- cmake/external/eigen.cmake | 70 - cmake/flags.cmake | 5 - cmake/generic.cmake | 143 +- cmake/inference_lib.cmake | 15 +- cmake/mccl.cmake | 51 - cmake/mudnn.cmake | 92 - cmake/musa.cmake | 128 -- cmake/operators.cmake | 94 +- cmake/phi.cmake | 2 +- paddle/cinn/ir/ir_base.h | 9 +- paddle/cinn/ir/utils/ir_nodes_collector.cc | 67 +- paddle/common/array.h | 8 +- paddle/common/hostdevice.h | 6 +- paddle/common/macros.h | 2 +- .../distributed/collective/CMakeLists.txt | 4 +- .../collective/process_group_nccl.cc | 30 +- .../collective/process_group_nccl.h | 2 +- .../collective/processgroup_comm_utils.cc | 6 +- .../fluid/distributed/collective/reducer.cc | 6 +- .../distributed/common/chunk_allocator.h | 14 +- .../distributed/fleet_executor/carrier.cc | 2 +- .../fleet_executor/cond_interceptor.cc | 2 +- .../distributed/fleet_executor/dist_model.cc | 2 +- .../distributed/fleet_executor/message_bus.cc | 2 +- .../forwards/multiply_fwd_func.cc | 10 +- .../eager/auto_code_generator/CMakeLists.txt | 4 - .../generator/eager_gen.py | 2 +- .../generator/python_c_gen.py | 2 +- paddle/fluid/eager/nan_inf_utils.cc | 2 +- paddle/fluid/framework/CMakeLists.txt | 7 +- paddle/fluid/framework/conv_search_cache.h | 18 - paddle/fluid/framework/custom_operator.cc | 4 +- paddle/fluid/framework/data_feed.cc | 4 +- paddle/fluid/framework/data_feed.cu | 40 +- paddle/fluid/framework/data_feed.h | 2 +- paddle/fluid/framework/data_feed_factory.cc | 2 +- paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 71 +- .../framework/details/all_reduce_op_handle.cc | 12 +- .../framework/details/all_reduce_op_handle.h | 8 +- .../framework/details/broadcast_op_handle.cc | 6 +- .../framework/details/broadcast_op_handle.h | 8 +- .../fluid/framework/details/build_strategy.cc | 12 +- .../fluid/framework/details/build_strategy.h | 2 +- .../details/eager_deletion_op_handle.cc | 21 +- .../details/eager_deletion_op_handle.h | 2 +- .../details/fetch_async_op_handle.cc | 2 +- .../framework/details/fetch_op_handle.cc | 2 +- .../details/fused_all_reduce_op_handle.cc | 21 +- .../details/fused_all_reduce_op_handle.h | 6 +- .../details/fused_broadcast_op_handle.h | 4 +- .../grad_merge_all_reduce_op_handle.cc | 6 +- .../details/grad_merge_all_reduce_op_handle.h | 6 +- .../framework/details/nan_inf_utils_detail.cc | 2 +- .../fluid/framework/details/nccl_op_handle.h | 61 +- .../fluid/framework/details/op_handle_base.cc | 34 +- .../fluid/framework/details/op_handle_base.h | 2 +- .../framework/details/reduce_op_handle.cc | 8 +- .../framework/details/reduce_op_handle.h | 6 +- .../details/scale_loss_grad_op_handle.cc | 4 +- .../details/share_tensor_buffer_op_handle.cc | 2 +- .../details/sparse_all_reduce_op_handle.cc | 6 +- paddle/fluid/framework/details/var_handle.h | 4 +- paddle/fluid/framework/device_worker.h | 20 +- .../fluid/framework/device_worker_factory.cc | 4 +- paddle/fluid/framework/dlpack_tensor.cc | 4 +- paddle/fluid/framework/fleet/CMakeLists.txt | 14 +- paddle/fluid/framework/fleet/box_wrapper.cu | 22 - 
paddle/fluid/framework/fleet/box_wrapper.h | 3 - .../fluid/framework/fleet/box_wrapper_impl.h | 13 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 5 +- paddle/fluid/framework/fleet/fleet_wrapper.h | 2 +- .../framework/fleet/heter_ps/CMakeLists.txt | 15 - .../fleet/heter_ps/graph_gpu_wrapper.cu | 6 +- .../fleet/heter_ps/graph_gpu_wrapper.h | 8 +- .../framework/fleet/heter_ps/heter_comm.h | 8 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 16 +- .../framework/fleet/heter_ps/heter_ps.cu | 4 +- .../fluid/framework/fleet/heter_ps/heter_ps.h | 4 +- .../framework/fleet/heter_ps/heter_ps_base.h | 4 +- paddle/fluid/framework/fleet/heter_wrapper.cc | 6 +- paddle/fluid/framework/fleet/heter_wrapper.h | 2 +- paddle/fluid/framework/fleet/nccl_wrapper.cc | 22 +- paddle/fluid/framework/fleet/nccl_wrapper.h | 10 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 12 +- paddle/fluid/framework/garbage_collector.cc | 8 +- paddle/fluid/framework/garbage_collector.h | 2 +- paddle/fluid/framework/hogwild_worker.cc | 18 +- paddle/fluid/framework/ir/CMakeLists.txt | 8 +- paddle/fluid/framework/ir/cost_model.cc | 4 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 4 +- .../framework/ir/fuse_bn_add_act_pass.cc | 4 +- .../framework/ir/fusion_group/CMakeLists.txt | 2 +- .../ir/fusion_group/code_generator_tester.cc | 2 +- .../ir/fusion_group/cuda_resources.h | 2 +- paddle/fluid/framework/ir/graph_helper.cc | 6 +- ...est_reference_count_pass_last_lived_ops.cc | 2 +- .../all_reduce_deps_pass.cc | 2 +- .../fuse_all_reduce_op_pass.cc | 16 +- .../multi_devices_graph_pass.cc | 18 +- .../multi_devices_graph_pass.h | 4 +- .../instruction/instruction_util.cc | 6 +- .../interpreter/execution_config.cc | 2 +- .../interpreter/interpreter_util.cc | 2 +- .../interpreter/stream_analyzer.cc | 4 +- .../new_executor/interpreter_base_impl.h | 4 +- .../new_executor/new_executor_defs.cc | 4 +- .../new_executor/new_executor_defs.h | 4 +- .../framework/new_executor/pir_interpreter.cc | 10 +- .../fluid/framework/new_executor/profiler.h | 2 +- .../new_executor/program_interpreter.cc | 22 +- .../new_executor/program_interpreter.h | 4 +- paddle/fluid/framework/op_registry.h | 4 +- paddle/fluid/framework/operator.cc | 18 +- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 60 +- paddle/fluid/framework/parallel_executor.h | 2 +- paddle/fluid/framework/phi_utils.cc | 2 +- paddle/fluid/framework/phi_utils.h | 2 +- paddle/fluid/framework/pipeline_trainer.cc | 4 +- paddle/fluid/framework/ps_gpu_trainer.cc | 2 +- paddle/fluid/framework/ps_gpu_worker.cc | 6 +- paddle/fluid/framework/pull_dense_worker.cc | 14 +- paddle/fluid/framework/section_worker.cc | 4 +- paddle/fluid/framework/tensor_util.cc | 14 +- paddle/fluid/framework/tensor_util.h | 8 +- paddle/fluid/framework/trainer.h | 12 +- paddle/fluid/framework/trainer_factory.cc | 6 +- paddle/fluid/framework/var_type_traits.cc | 7 - paddle/fluid/framework/var_type_traits.h | 20 +- paddle/fluid/imperative/CMakeLists.txt | 11 +- paddle/fluid/imperative/all_reduce.cc | 37 +- paddle/fluid/imperative/all_reduce.h | 2 +- paddle/fluid/imperative/amp_auto_cast.cc | 2 +- paddle/fluid/imperative/gloo_context.cc | 2 +- .../fluid/imperative/gradient_accumulator.cc | 18 +- paddle/fluid/imperative/nccl_context.cc | 22 +- paddle/fluid/imperative/nccl_context.h | 10 +- paddle/fluid/imperative/prepared_operator.cc | 6 +- paddle/fluid/imperative/reducer.cc | 10 +- paddle/fluid/imperative/reducer.cu | 2 +- paddle/fluid/imperative/reducer.h | 2 +- paddle/fluid/imperative/tracer.cc 
| 6 +- paddle/fluid/inference/CMakeLists.txt | 2 +- .../ir_params_sync_among_devices_pass.cc | 4 +- .../ir_params_sync_among_devices_pass.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 15 +- .../fluid/inference/api/analysis_predictor.cc | 35 +- .../fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- .../inference/api/details/zero_copy_tensor.cc | 21 +- paddle/fluid/inference/api/infer_context.cc | 2 +- paddle/fluid/inference/api/infer_context.h | 4 +- .../inference/api/paddle_analysis_config.h | 3 +- paddle/fluid/inference/api/paddle_api.h | 3 - .../inference/api/paddle_pass_builder.cc | 5 +- .../fluid/inference/api/resource_manager.cc | 79 +- paddle/fluid/inference/api/resource_manager.h | 26 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +- .../tensorrt/plugin/c_allreduce_op_plugin.cu | 28 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 3 - paddle/fluid/inference/utils/CMakeLists.txt | 13 - paddle/fluid/inference/utils/benchmark.cc | 54 - paddle/fluid/inference/utils/benchmark.h | 56 - .../fluid/inference/utils/benchmark_tester.cc | 40 - .../inference/utils/table_printer_tester.cc | 82 - paddle/fluid/memory/CMakeLists.txt | 11 - paddle/fluid/memory/allocation/CMakeLists.txt | 14 +- paddle/fluid/memory/allocation/allocator.h | 14 +- .../memory/allocation/allocator_facade.cc | 32 +- .../memory/allocation/allocator_facade.h | 2 +- .../memory/allocation/buddy_allocator.cc | 6 +- .../fluid/memory/allocation/cuda_allocator.cc | 4 - .../cuda_device_context_allocator.h | 9 +- .../allocation/cuda_managed_allocator.cc | 5 - .../allocation/naive_best_fit_allocator.cc | 28 +- .../memory/allocation/pinned_allocator.cc | 4 - .../allocation/stream_safe_cuda_allocator.cc | 16 - .../allocation/stream_safe_cuda_allocator.h | 3 - .../memory/allocation/system_allocator.cc | 22 +- .../memory/allocation/system_allocator.h | 2 +- paddle/fluid/memory/malloc.cc | 2 +- paddle/fluid/memory/malloc.h | 2 +- paddle/fluid/memory/memcpy.cc | 83 +- paddle/fluid/operators/CMakeLists.txt | 12 +- paddle/fluid/operators/affine_channel_op.cu | 2 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- paddle/fluid/operators/batch_norm_op.cu | 2 +- .../fluid/operators/class_center_sample_op.cu | 23 +- .../fluid/operators/collective/CMakeLists.txt | 2 +- .../operators/collective/alltoall_op.cu.cc | 10 +- .../operators/collective/barrier_op.cu.cc | 12 +- .../operators/collective/c_allgather_op.cu.cc | 10 +- .../collective/c_allreduce_max_op.cu.cc | 4 +- .../operators/collective/c_allreduce_op.h | 20 +- .../collective/c_allreduce_sum_op.cu.cc | 4 +- .../operators/collective/c_broadcast_op.cu.cc | 14 +- .../collective/c_comm_init_all_op.cc | 4 +- .../collective/c_comm_init_multitrainer_op.cc | 10 +- .../operators/collective/c_comm_init_op.cc | 17 +- .../operators/collective/c_concat_op.cu.cc | 14 +- .../operators/collective/c_gen_nccl_id_op.cc | 14 +- .../fluid/operators/collective/c_reduce_op.h | 20 +- .../collective/c_reducescatter_op.cu.cc | 16 +- .../operators/collective/c_scatter_op.cu.cc | 10 +- .../c_softmax_with_cross_entropy_op.cu | 20 +- .../collective/c_sync_calc_stream_op.h | 2 +- .../collective/c_sync_comm_stream_op.h | 6 +- .../operators/collective/c_wait_comm_op.cc | 7 +- .../operators/collective/c_wait_compute_op.cc | 7 +- .../operators/collective/gen_nccl_id_op.cc | 14 +- .../collective/global_gather_op.cu.cc | 38 +- .../collective/global_scatter_op.cu.cc | 38 +- .../collective/mp_allreduce_sum_op.cu.cc | 
4 +- .../collective/partial_allgather_op.cu.cc | 14 +- .../collective/partial_recv_op.cu.cc | 14 +- .../collective/partial_send_op.cu.cc | 14 +- .../operators/collective/recv_v2_op.cu.cc | 24 +- .../operators/collective/send_v2_op.cu.cc | 26 +- .../controlflow/conditional_block_op.h | 2 +- paddle/fluid/operators/controlflow/feed_op.cc | 2 +- .../operators/controlflow/get_places_op.cc | 4 +- .../operators/controlflow/while_op_helper.cc | 2 +- paddle/fluid/operators/data_norm_op.cu | 28 +- .../fluid/operators/detection/CMakeLists.txt | 4 +- .../fluid/operators/detection/bbox_util.cu.h | 2 +- .../detection/collect_fpn_proposals_op.cu | 2 +- paddle/fluid/operators/dgc_clip_by_norm_op.h | 76 +- .../elementwise/elementwise_op_function.h | 19 +- paddle/fluid/operators/expand_op.cc | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 2 - paddle/fluid/operators/fused/CMakeLists.txt | 12 +- .../fluid/operators/fused/attn_bias_add.cu.h | 2 +- .../operators/fused/fused_attention_utils.h | 10 +- .../operators/fused/fused_dropout_common.h | 4 +- .../fused/fused_multi_transformer_op.cu.h | 12 +- .../operators/fused/fused_seqpool_cvm_op.cu | 59 - .../fluid/operators/fused/yolo_box_post_op.cu | 39 - .../get_tensor_from_selected_rows_op.cc | 2 +- .../fluid/operators/graph_khop_sampler_op.cu | 11 - .../operators/grid_sampler_cudnn_op.cu.cc | 2 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/im2sequence_op.cc | 2 +- paddle/fluid/operators/isfinite_op.h | 8 +- paddle/fluid/operators/l1_norm_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 3 - .../operators/margin_cross_entropy_op.cu | 38 +- .../operators/math/bert_encoder_functor.h | 8 +- paddle/fluid/operators/math/gru_compute.cc | 8 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/math/prelu.h | 2 +- paddle/fluid/operators/math/sample_prob.cu | 5 - paddle/fluid/operators/math/sample_prob.h | 2 +- paddle/fluid/operators/matmul_op.cc | 12 +- paddle/fluid/operators/memcpy_h2d_op.h | 2 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 2 +- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/nccl/CMakeLists.txt | 11 +- .../fluid/operators/nccl/nccl_gpu_common.cc | 10 +- paddle/fluid/operators/nccl/nccl_gpu_common.h | 4 +- paddle/fluid/operators/nccl/nccl_op.cc | 24 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 34 +- .../optimizers/distributed_fused_lamb_op.cu | 188 +- .../operators/optimizers/sparse_momentum_op.h | 5 +- .../operators/pscore/send_and_recv_op.cc | 2 +- paddle/fluid/operators/rank_loss_op.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 9 +- .../fluid/operators/reader/buffered_reader.h | 4 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 4 +- paddle/fluid/operators/reshape_op.cc | 8 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/select_op_helper.h | 2 +- .../sequence_ops/sequence_reverse_op.h | 4 +- .../sequence_softmax_cudnn_op.cu.cc | 69 +- .../sequence_ops/sequence_softmax_op.cc | 2 +- .../sequence_ops/sequence_softmax_op.cu | 4 - paddle/fluid/operators/set_value_op.cc | 44 +- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 2 +- paddle/fluid/operators/sync_batch_norm_op.cu | 94 +- .../fluid/operators/sync_batch_norm_utils.h | 15 +- paddle/fluid/operators/top_k_op.cu | 3 +- paddle/fluid/operators/uniform_random_op.h | 4 +- paddle/fluid/platform/CMakeLists.txt | 66 +- 
paddle/fluid/platform/collective_helper.cc | 32 +- paddle/fluid/platform/collective_helper.h | 14 +- paddle/fluid/platform/device/CMakeLists.txt | 2 +- paddle/fluid/platform/device/device_wrapper.h | 2 +- .../fluid/platform/device/gpu/CMakeLists.txt | 12 - paddle/fluid/platform/device/gpu/gpu_helper.h | 4 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 13 - paddle/fluid/platform/device/gpu/gpu_info.h | 2 +- .../platform/device/gpu/gpu_launch_config.h | 4 +- .../platform/device/gpu/gpu_resource_pool.cc | 12 +- .../platform/device/gpu/gpu_resource_pool.h | 7 +- paddle/fluid/platform/device/gpu/gpu_types.h | 124 +- .../platform/device/gpu/musa/musa_helper.h | 104 -- .../fluid/platform/device/gpu/nccl_helper.h | 86 +- paddle/fluid/platform/device_context.cc | 10 +- paddle/fluid/platform/device_context.h | 16 +- paddle/fluid/platform/device_event.h | 2 +- paddle/fluid/platform/device_event_base.cc | 8 - paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/dynload/CMakeLists.txt | 22 - .../fluid/platform/dynload/dynamic_loader.h | 1 - paddle/fluid/platform/dynload/mccl.cc | 43 - paddle/fluid/platform/dynload/mccl.h | 51 - paddle/fluid/platform/dynload/mublas.cc | 38 - paddle/fluid/platform/dynload/mublas.h | 55 - paddle/fluid/platform/dynload/mudnn.cc | 30 - paddle/fluid/platform/dynload/mudnn.h | 39 - paddle/fluid/platform/dynload/mufft.cc | 30 - paddle/fluid/platform/dynload/mufft.h | 93 - paddle/fluid/platform/dynload/murand.cc | 27 - paddle/fluid/platform/dynload/murand.h | 43 - paddle/fluid/platform/dynload/musa_driver.cc | 31 - paddle/fluid/platform/dynload/musa_driver.h | 58 - paddle/fluid/platform/dynload/musartc.cc | 31 - paddle/fluid/platform/dynload/musartc.h | 51 - paddle/fluid/platform/dynload/musparse.cc | 30 - paddle/fluid/platform/dynload/musparse.h | 41 - paddle/fluid/platform/dynload/nccl.cc | 16 +- paddle/fluid/platform/dynload/nccl.h | 30 +- paddle/fluid/platform/dynload/rccl.cc | 16 +- paddle/fluid/platform/dynload/rccl.h | 14 +- paddle/fluid/platform/enforce.h | 26 +- paddle/fluid/platform/enforce_test.cc | 4 +- paddle/fluid/platform/event.h | 5 - paddle/fluid/platform/gen_comm_id_helper.cc | 6 +- paddle/fluid/platform/gen_comm_id_helper.h | 2 +- paddle/fluid/platform/init.cc | 20 +- paddle/fluid/platform/place.h | 4 +- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler.cu | 19 - paddle/fluid/platform/profiler.h | 4 +- .../platform/profiler/chrometracing_logger.cc | 40 +- .../platform/profiler/chrometracing_logger.h | 2 +- .../profiler/dump/deserialization_reader.cc | 4 +- .../profiler/dump/deserialization_reader.h | 2 +- .../profiler/dump/serialization_logger.cc | 2 +- .../profiler/dump/serialization_logger.h | 2 +- .../fluid/platform/profiler/event_python.cc | 6 +- paddle/fluid/platform/profiler/event_python.h | 6 +- paddle/fluid/platform/profiler/profiler.cc | 13 +- .../fluid/platform/profiler/profiler_test.cc | 8 - paddle/fluid/platform/profiler_helper.h | 19 +- .../fluid/platform/stream_callback_manager.cc | 15 +- .../fluid/platform/stream_callback_manager.h | 5 - paddle/fluid/primitive/composite/composite.h | 45 +- paddle/fluid/pybind/CMakeLists.txt | 30 +- paddle/fluid/pybind/communication.cc | 2 +- paddle/fluid/pybind/cuda_streams_py.cc | 22 +- paddle/fluid/pybind/cuda_streams_py.h | 4 +- paddle/fluid/pybind/distributed_py.cc | 4 +- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_math_op_patch.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 285 +-- 
paddle/fluid/pybind/generator_py.cc | 2 +- paddle/fluid/pybind/imperative.cc | 6 +- paddle/fluid/pybind/inference_api.cc | 14 +- paddle/fluid/pybind/parallel_executor.cc | 8 +- paddle/fluid/pybind/place.cc | 20 +- paddle/fluid/pybind/process_group_utils.h | 4 +- paddle/fluid/pybind/pybind.cc | 59 +- paddle/fluid/pybind/slice_utils.h | 151 +- paddle/fluid/pybind/tensor.cc | 10 +- paddle/fluid/pybind/tensor_py.h | 17 +- paddle/phi/CMakeLists.txt | 17 +- paddle/phi/api/include/context_pool.h | 2 +- paddle/phi/api/include/tensor.h | 7 +- paddle/phi/api/lib/api_gen_utils.cc | 6 +- paddle/phi/api/lib/context_pool.cc | 4 +- paddle/phi/api/lib/data_transform.cc | 8 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/api/lib/tensor_utils.cc | 40 +- paddle/phi/api/profiler/event.h | 32 +- paddle/phi/api/yaml/backward.yaml | 6 +- .../phi/api/yaml/generator/dist_bw_api_gen.py | 1 + paddle/phi/api/yaml/legacy_backward.yaml | 6 +- paddle/phi/api/yaml/op_compat.yaml | 2 +- paddle/phi/api/yaml/ops.yaml | 2 +- paddle/phi/backends/CMakeLists.txt | 6 +- paddle/phi/backends/context_pool.cc | 2 +- paddle/phi/backends/context_pool.h | 4 +- paddle/phi/backends/custom/custom_device.cc | 2 +- paddle/phi/backends/device_code.cc | 144 +- paddle/phi/backends/device_code.h | 16 +- paddle/phi/backends/device_memory_aligment.h | 2 +- paddle/phi/backends/dynload/CMakeLists.txt | 22 - paddle/phi/backends/dynload/dynamic_loader.cc | 48 - paddle/phi/backends/dynload/dynamic_loader.h | 1 - paddle/phi/backends/dynload/mccl.cc | 36 - paddle/phi/backends/dynload/mccl.h | 80 - paddle/phi/backends/dynload/mublas.cc | 38 - paddle/phi/backends/dynload/mublas.h | 128 -- paddle/phi/backends/dynload/mudnn.cc | 41 - paddle/phi/backends/dynload/mudnn.h | 41 - paddle/phi/backends/dynload/mufft.cc | 43 - paddle/phi/backends/dynload/mufft.h | 155 -- paddle/phi/backends/dynload/murand.cc | 28 - paddle/phi/backends/dynload/murand.h | 54 - paddle/phi/backends/dynload/musa_driver.cc | 33 - paddle/phi/backends/dynload/musa_driver.h | 69 - paddle/phi/backends/dynload/musartc.cc | 34 - paddle/phi/backends/dynload/musartc.h | 147 -- paddle/phi/backends/dynload/musparse.cc | 29 - paddle/phi/backends/dynload/musparse.h | 76 - paddle/phi/backends/dynload/nccl.h | 14 +- paddle/phi/backends/dynload/rccl.h | 14 +- paddle/phi/backends/gpu/forwards.h | 19 - paddle/phi/backends/gpu/gpu_context.cc | 176 +- paddle/phi/backends/gpu/gpu_context.h | 22 +- paddle/phi/backends/gpu/gpu_decls.h | 81 +- paddle/phi/backends/gpu/gpu_device_function.h | 4 +- paddle/phi/backends/gpu/gpu_dnn.h | 5 +- paddle/phi/backends/gpu/gpu_helper.h | 4 +- paddle/phi/backends/gpu/gpu_info.h | 2 +- paddle/phi/backends/gpu/gpu_launch_config.h | 4 +- paddle/phi/backends/gpu/gpu_primitives.h | 186 +- paddle/phi/backends/gpu/gpu_resources.cc | 175 +- paddle/phi/backends/gpu/gpu_resources.h | 8 +- paddle/phi/backends/gpu/gpu_types.h | 70 +- paddle/phi/backends/gpu/musa/mudnn_desc.h | 202 --- paddle/phi/backends/gpu/musa/mudnn_helper.h | 323 ---- .../backends/gpu/musa/musa_device_function.h | 193 -- paddle/phi/backends/gpu/musa/musa_helper.h | 74 - paddle/phi/backends/gpu/musa/musa_info.cc | 334 ---- paddle/phi/capi/include/c_meta_tensor.h | 12 + paddle/phi/capi/include/c_tensor.h | 17 + paddle/phi/capi/include/wrapper_base.h | 66 + paddle/phi/capi/lib/c_device_context.cc | 2 +- paddle/phi/capi/lib/c_kernel_context.cc | 2 +- paddle/phi/capi/lib/c_meta_tensor.cc | 46 + paddle/phi/capi/lib/c_tensor.cc | 72 + paddle/phi/common/backend.h | 2 +- paddle/phi/common/bfloat16.h | 40 +- 
paddle/phi/common/complex.h | 19 +- paddle/phi/common/cpstring_impl.h | 6 +- paddle/phi/common/float16.h | 53 +- paddle/phi/common/memory_utils.cc | 6 +- paddle/phi/common/memory_utils.h | 23 +- paddle/phi/common/place.cc | 4 +- paddle/phi/common/transform.h | 17 +- paddle/phi/core/compat/convert_utils.cc | 6 +- paddle/phi/core/cuda_stream.h | 22 - paddle/phi/core/distributed/CMakeLists.txt | 2 +- .../auto_parallel/reshard/reshard_utils.cc | 4 +- .../auto_parallel/reshard/reshard_utils.h | 4 +- .../phi/core/distributed/check/CMakeLists.txt | 2 +- .../distributed/check/nccl_dynamic_check.cc | 38 +- .../distributed/check/nccl_dynamic_check.h | 10 +- .../core/distributed/comm_context_manager.cc | 16 +- .../core/distributed/comm_context_manager.h | 8 +- paddle/phi/core/distributed/comm_task.h | 9 +- .../phi/core/distributed/comm_task_manager.cc | 2 +- .../phi/core/distributed/nccl_comm_context.cc | 50 +- .../phi/core/distributed/nccl_comm_context.h | 31 +- paddle/phi/core/distributed/nccl_comm_task.cc | 55 +- paddle/phi/core/distributed/nccl_comm_task.h | 6 +- paddle/phi/core/distributed/nccl_tools.cc | 76 +- paddle/phi/core/distributed/nccl_tools.h | 36 +- paddle/phi/core/enforce.h | 272 +-- paddle/phi/core/flags.cc | 22 +- paddle/phi/core/generator.cc | 5 +- paddle/phi/core/hostdevice.h | 6 +- paddle/phi/core/kernel_factory.cc | 4 +- paddle/phi/core/kernel_registry.cc | 2 +- paddle/phi/core/kernel_registry.h | 2 +- paddle/phi/core/kernel_utils.h | 2 +- paddle/phi/core/mixed_vector.cc | 4 +- paddle/phi/core/string_tensor.cc | 4 +- paddle/phi/core/tensor_utils.cc | 16 +- paddle/phi/core/utils/data_type.h | 29 +- paddle/phi/core/utils/type_info.cc | 4 +- paddle/phi/core/utils/visit_place.h | 4 +- paddle/phi/core/visit_type.h | 4 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/kernels/CMakeLists.txt | 76 +- paddle/phi/kernels/array_kernel.cc | 8 +- paddle/phi/kernels/assign_kernel.cc | 2 +- paddle/phi/kernels/autotune/gpu_timer.h | 39 +- paddle/phi/kernels/batch_norm_kernel.cc | 2 +- .../kernels/check_memory_continue_kernel.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 14 - .../phi/kernels/cpu/cum_maxmin_grad_kernel.cc | 8 +- paddle/phi/kernels/cpu/decode_jpeg_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_kernel.cc | 2 +- .../kernels/cpu/put_along_axis_grad_kernel.cc | 149 +- .../phi/kernels/cpu/put_along_axis_kernel.cc | 40 +- .../cpu/repeat_interleave_grad_kernel.cc | 6 +- .../kernels/cpu/repeat_interleave_kernel.cc | 6 +- .../phi/kernels/cpu/set_value_grad_kernel.cc | 17 + .../cpu/take_along_axis_grad_kernel.cc | 3 +- .../phi/kernels/cpu/take_along_axis_kernel.cc | 6 +- .../kernels/custom/c_embedding_grad_kernel.cc | 93 + .../phi/kernels/custom/c_embedding_kernel.cc | 84 + paddle/phi/kernels/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 2 +- paddle/phi/kernels/full_kernel.cc | 2 +- paddle/phi/kernels/funcs/CMakeLists.txt | 8 +- paddle/phi/kernels/funcs/activation_functor.h | 2 +- paddle/phi/kernels/funcs/algorithm.h | 4 +- paddle/phi/kernels/funcs/blas/blas.h | 14 +- paddle/phi/kernels/funcs/blas/blas_impl.h | 4 +- paddle/phi/kernels/funcs/blas/blas_impl.mu.h | 1602 ----------------- paddle/phi/kernels/funcs/broadcast_function.h | 4 +- .../phi/kernels/funcs/check_numerics_utils.h | 2 +- .../kernels/funcs/concat_and_split_functor.cu | 2 +- .../phi/kernels/funcs/detail/gru_cpu_kernel.h | 2 +- 
.../phi/kernels/funcs/detail/gru_gpu_kernel.h | 4 +- paddle/phi/kernels/funcs/detail/gru_kernel.h | 10 +- .../kernels/funcs/detail/lstm_cpu_kernel.h | 2 +- paddle/phi/kernels/funcs/detail/lstm_kernel.h | 4 +- .../phi/kernels/funcs/detail/strided_memcpy.h | 6 +- paddle/phi/kernels/funcs/diagonal.h | 6 +- .../phi/kernels/funcs/distribution_helper.h | 48 +- paddle/phi/kernels/funcs/dropout_impl.cu.h | 23 +- paddle/phi/kernels/funcs/elementwise_base.h | 6 +- .../phi/kernels/funcs/elementwise_functor.h | 2 +- .../phi/kernels/funcs/elementwise_grad_base.h | 4 +- .../funcs/emb_eltwise_layer_norm_functor.cu | 7 +- paddle/phi/kernels/funcs/fc_functor.cu | 6 +- paddle/phi/kernels/funcs/fft.cu | 7 +- paddle/phi/kernels/funcs/fft_cache.h | 2 - paddle/phi/kernels/funcs/fft_fill_conj.h | 4 +- paddle/phi/kernels/funcs/for_range.h | 2 +- .../kernels/funcs/gather_scatter_functor.cc | 456 ++++- .../kernels/funcs/gather_scatter_functor.cu | 951 +++++++++- .../kernels/funcs/gather_scatter_functor.h | 183 ++ paddle/phi/kernels/funcs/gru_compute.cc | 8 +- paddle/phi/kernels/funcs/inclusive_scan.h | 2 +- paddle/phi/kernels/funcs/index_calculator.h | 2 +- paddle/phi/kernels/funcs/index_put_utils.h | 163 +- .../phi/kernels/funcs/interpolate_function.h | 4 +- paddle/phi/kernels/funcs/isfinite_functor.h | 6 +- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 2 +- paddle/phi/kernels/funcs/layer_norm_util.h | 4 +- paddle/phi/kernels/funcs/load_store_util.h | 2 +- paddle/phi/kernels/funcs/math_cuda_utils.h | 17 +- paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/funcs/math_function.h | 2 +- paddle/phi/kernels/funcs/matrix_inverse.cu | 2 +- paddle/phi/kernels/funcs/matrix_solve.cu | 2 +- paddle/phi/kernels/funcs/mode.h | 4 +- paddle/phi/kernels/funcs/mufft_util.h | 130 -- .../kernels/funcs/multihead_matmul_functor.cu | 10 +- paddle/phi/kernels/funcs/norm_utils.cu.h | 2 +- paddle/phi/kernels/funcs/pooling.h | 6 +- paddle/phi/kernels/funcs/reduce_function.h | 6 +- paddle/phi/kernels/funcs/segmented_array.h | 2 +- paddle/phi/kernels/funcs/select_impl.cu.h | 4 +- .../kernels/funcs/skip_layernorm_functor.cu | 8 +- .../kernels/funcs/skip_layernorm_functor.h | 6 - paddle/phi/kernels/funcs/softmax.cu | 36 +- paddle/phi/kernels/funcs/softmax.h | 2 +- paddle/phi/kernels/funcs/sparse/softmax.cu.h | 4 - paddle/phi/kernels/funcs/sparse/sparse_blas.h | 4 - paddle/phi/kernels/funcs/squared_l2_norm.h | 6 +- paddle/phi/kernels/funcs/strided_memcpy.h | 2 +- .../phi/kernels/funcs/top_k_function_cuda.h | 29 +- .../cutlass/fused_conv2d_add_act_kernel.cu | 1 - paddle/phi/kernels/fusion/gpu/block_attn.h | 1 + .../fusion/gpu/fused_bias_act_kernel.cu | 4 +- .../kernels/fusion/gpu/fused_bias_act_utils.h | 4 +- ...dropout_residual_layer_norm_grad_kernel.cu | 6 +- ...bias_dropout_residual_layer_norm_kernel.cu | 4 +- .../gpu/fused_bn_activation_grad_kernel.cu | 2 +- .../fusion/gpu/fused_bn_activation_kernel.cu | 2 +- .../fused_bn_add_activation_grad_kernel.cu | 2 +- .../gpu/fused_bn_add_activation_kernel.cu | 2 +- .../gpu/fused_dropout_add_grad_kernel.cu | 6 +- .../fusion/gpu/fused_dropout_add_kernel.cu | 6 +- .../fused_fc_elementwise_layernorm_kernel.cu | 4 +- .../fusion/gpu/fused_layernorm_kernel.cu | 11 +- .../phi/kernels/fusion/gpu/fused_rope_utils.h | 16 +- .../fused_softmax_mask_upper_triangle_utils.h | 8 +- .../fusion/gpu/fused_softmax_mask_utils.h | 10 +- .../gpu/masked_multihead_attention_kernel.cu | 4 +- paddle/phi/kernels/fusion/gpu/mmha_util.cu.h | 2 +- .../fusion/gpu/multihead_matmul_kernel.cu | 2 - 
.../phi/kernels/gpu/activation_grad_kernel.cu | 12 +- paddle/phi/kernels/gpu/activation_kernel.cu | 12 +- paddle/phi/kernels/gpu/all_gather_kernel.cu | 4 +- paddle/phi/kernels/gpu/all_reduce_kernel.cu | 21 +- paddle/phi/kernels/gpu/all_to_all_kernel.cu | 41 +- paddle/phi/kernels/gpu/allclose_kernel.cu | 2 - paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 4 +- paddle/phi/kernels/gpu/argsort_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/argsort_kernel.cu | 2 +- paddle/phi/kernels/gpu/auc_kernel.cu | 10 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 16 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 18 +- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 10 +- paddle/phi/kernels/gpu/broadcast_kernel.cu | 4 +- .../phi/kernels/gpu/check_numerics_kernel.cu | 6 - paddle/phi/kernels/gpu/cholesky_kernel.cu | 2 +- .../kernels/gpu/cholesky_solve_grad_kernel.cu | 2 +- .../phi/kernels/gpu/cholesky_solve_kernel.cu | 2 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 4 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 34 +- paddle/phi/kernels/gpu/cum_kernel.cu | 2 - .../phi/kernels/gpu/cum_maxmin_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/cumprod_grad_kernel.cu | 2 - paddle/phi/kernels/gpu/decode_jpeg_kernel.cu | 2 +- paddle/phi/kernels/gpu/dgc_kernel.cu | 2 +- paddle/phi/kernels/gpu/dirichlet_kernel.cu | 12 - paddle/phi/kernels/gpu/dist_concat_kernel.cu | 4 +- paddle/phi/kernels/gpu/dist_kernel.cu | 2 +- .../gpu/distribute_fpn_proposals_kernel.cu | 2 +- paddle/phi/kernels/gpu/eigh_kernel.cu | 2 +- paddle/phi/kernels/gpu/eigvalsh_kernel.cu | 2 +- .../phi/kernels/gpu/embedding_grad_kernel.cu | 3 - .../kernels/gpu/generate_proposals_kernel.cu | 2 +- .../phi/kernels/gpu/graph_reindex_kernel.cu | 12 - .../gpu/graph_sample_neighbors_kernel.cu | 17 - .../kernels/gpu/graph_send_ue_recv_funcs.h | 9 - paddle/phi/kernels/gpu/group_norm_kernel.cu | 13 +- paddle/phi/kernels/gpu/group_norm_utils.h | 2 +- .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 5 +- paddle/phi/kernels/gpu/instance_norm_utils.h | 2 +- .../kernels/gpu/interpolate_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/kthvalue_kernel.cu | 12 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 2 +- .../phi/kernels/gpu/logsumexp_function.cu.h | 58 - paddle/phi/kernels/gpu/lstsq_kernel.cu | 2 +- paddle/phi/kernels/gpu/lu_kernel.cu | 2 +- paddle/phi/kernels/gpu/matrix_rank_kernel.cu | 2 +- .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 2 +- .../phi/kernels/gpu/multiclass_nms3_kernel.cu | 2 +- paddle/phi/kernels/gpu/multinomial_kernel.cu | 12 +- .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 2 - paddle/phi/kernels/gpu/nll_loss_kernel.cu | 2 - paddle/phi/kernels/gpu/nonzero_kernel.cu | 2 +- paddle/phi/kernels/gpu/nop_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/p_recv_kernel.cu | 17 +- paddle/phi/kernels/gpu/p_send_kernel.cu | 19 +- paddle/phi/kernels/gpu/poisson_kernel.cu | 20 +- .../kernels/gpu/put_along_axis_grad_kernel.cu | 122 +- .../phi/kernels/gpu/put_along_axis_kernel.cu | 40 +- paddle/phi/kernels/gpu/qr_kernel.cu | 2 +- paddle/phi/kernels/gpu/randperm_kernel.cu | 15 +- paddle/phi/kernels/gpu/reduce.h | 2 +- paddle/phi/kernels/gpu/reduce_grad.h | 2 +- paddle/phi/kernels/gpu/reduce_kernel.cu | 17 +- .../phi/kernels/gpu/reduce_scatter_kernel.cu | 6 +- .../gpu/repeat_interleave_grad_kernel.cu | 6 +- .../kernels/gpu/repeat_interleave_kernel.cu | 6 +- paddle/phi/kernels/gpu/rms_norm_kernel.cu | 9 +- paddle/phi/kernels/gpu/rnn_functor.h | 55 - 
paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 2 +- .../kernels/gpu/send_u_recv_grad_kernel.cu | 2 - paddle/phi/kernels/gpu/send_u_recv_kernel.cu | 2 - .../kernels/gpu/send_ue_recv_grad_kernel.cu | 27 - paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 3 - paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 15 - .../phi/kernels/gpu/set_value_grad_kernel.cu | 17 + paddle/phi/kernels/gpu/sgd_kernel.cu | 16 - .../kernels/gpu/shuffle_batch_grad_kernel.cu | 2 +- .../phi/kernels/gpu/shuffle_batch_kernel.cu | 4 +- paddle/phi/kernels/gpu/shuffle_batch_utils.h | 2 +- .../gpu/sigmoid_cross_entropy_with_logits.h | 3 +- paddle/phi/kernels/gpu/strided_copy_kernel.cu | 11 +- paddle/phi/kernels/gpu/svd_kernel.cu | 2 +- .../gpu/take_along_axis_grad_kernel.cu | 3 +- .../phi/kernels/gpu/take_along_axis_kernel.cu | 6 +- .../phi/kernels/gpu/top_p_sampling_kernel.cu | 34 +- paddle/phi/kernels/gpu/unique_kernel.cu | 34 +- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- .../gpu/weighted_sample_neighbors_kernel.cu | 6 - .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- .../phi/kernels/gpudnn/affine_grid_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 12 - .../phi/kernels/gpudnn/softmax_grad_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_kernel.cu | 2 +- paddle/phi/kernels/group_norm_kernel.h | 2 +- .../phi/kernels/impl/clip_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/clip_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 2 +- paddle/phi/kernels/impl/diag_embed_impl.h | 4 +- .../phi/kernels/impl/dot_grad_kernel_impl.h | 12 +- .../impl/elementwise_grad_kernel_impl.h | 2 +- .../kernels/impl/elementwise_kernel_impl.h | 2 +- .../phi/kernels/impl/fft_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/isclose_kernel_impl.h | 4 +- .../phi/kernels/impl/kron_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/kron_kernel_impl.h | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/polygamma_kernel_impl.h | 4 +- paddle/phi/kernels/impl/pool_kernel_impl.h | 4 +- .../kernels/impl/quant_linear_kernel_impl.h | 2 +- paddle/phi/kernels/impl/renorm_impl.h | 6 +- .../impl/repeat_interleave_grad_kernel_impl.h | 10 +- .../impl/repeat_interleave_kernel_impl.h | 8 +- .../kernels/impl/segment_pool_kernel_impl.h | 7 +- .../kernels/impl/sequence_mask_kernel_impl.h | 4 +- .../kernels/impl/set_value_grad_kernel_impl.h | 22 + .../phi/kernels/impl/solve_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/trace_grad_kernel_impl.h | 4 +- .../kernels/impl/unstack_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/unstack_kernel_impl.h | 6 +- .../phi/kernels/impl/warprnnt_kernel_impl.h | 2 +- .../impl/weight_quantize_kernel_gpu_impl.h | 11 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/kps/elementwise_kernel.cu | 2 +- paddle/phi/kernels/layer_norm_kernel.h | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 4 +- paddle/phi/kernels/npu_identity_kernel.cc | 2 +- .../kernels/primitive/compute_primitives.h | 6 - .../kernels/primitive/datamover_primitives.h | 5 - paddle/phi/kernels/prod_kernel.cc | 2 +- .../phi/kernels/put_along_axis_grad_kernel.h | 3 + paddle/phi/kernels/put_along_axis_kernel.h | 1 + paddle/phi/kernels/reduce_all_kernel.cc | 2 +- paddle/phi/kernels/reduce_amax_kernel.cc | 2 +- paddle/phi/kernels/reduce_amin_kernel.cc | 2 +- paddle/phi/kernels/reduce_any_kernel.cc | 2 +- paddle/phi/kernels/reduce_mean_kernel.cc | 2 +- paddle/phi/kernels/reduce_min_kernel.cc | 2 +- paddle/phi/kernels/reduce_sum_kernel.cc | 2 +- paddle/phi/kernels/reverse_kernel.cc | 2 +- 
.../selected_rows/activation_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../phi/kernels/selected_rows/full_kernel.cc | 6 +- .../kernels/selected_rows/isfinite_kernel.cc | 4 +- .../merge_selected_rows_kernel.cc | 2 +- .../phi/kernels/selected_rows/scale_kernel.cc | 2 +- .../phi/kernels/selected_rows/shape_kernel.cc | 2 +- .../kernels/selected_rows/uniform_kernel.cc | 2 +- paddle/phi/kernels/set_value_grad_kernel.h | 10 + paddle/phi/kernels/shape_kernel.cc | 2 +- .../kernels/sparse/gpu/softmax_grad_kernel.cu | 3 - .../kernels/sparse/gpu/sparse_utils_kernel.cu | 7 - paddle/phi/kernels/squeeze_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_kernel.cc | 2 +- .../phi/kernels/stride/as_complex_kernel.cc | 2 +- paddle/phi/kernels/stride/as_real_kernel.cc | 2 +- .../phi/kernels/stride/complex_grad_kernel.cc | 2 +- paddle/phi/kernels/stride/complex_kernel.cc | 2 +- .../phi/kernels/strided_slice_grad_kernel.cc | 2 +- paddle/phi/kernels/strided_slice_kernel.cc | 2 +- paddle/phi/kernels/strings/case_utils.h | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 10 +- .../kernels/strings/strings_empty_kernel.cc | 2 +- paddle/phi/kernels/strings/unicode.cc | 10 +- paddle/phi/kernels/strings/unicode.h | 2 +- paddle/phi/kernels/transfer_layout_kernel.cc | 4 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- .../phi/kernels/xpu/set_value_grad_kernel.cc | 31 + paddle/phi/tools/CMakeLists.txt | 4 - patches/eigen/Complex.h.patch | 33 +- patches/eigen/Eigen_CORE.patch | 13 - ...c_Core_util_ConfigureVectorization.h.patch | 21 - .../eigen/Eigen_src_Core_util_Macros.h.patch | 51 - .../eigen/Eigen_src_Core_util_Meta.h.patch | 58 - patches/eigen/TensorReductionGpu.h | 2 +- .../unsupported_Eigen_CXX11_Tensor.patch | 13 - ...11_src_Tensor_TensorContractionGpu.h.patch | 22 - ...X11_src_Tensor_TensorDeviceDefault.h.patch | 15 - ...n_CXX11_src_Tensor_TensorDeviceGpu.h.patch | 15 - ...src_Tensor_TensorGpuHipCudaDefines.h.patch | 40 - ...n_CXX11_src_Tensor_TensorReduction.h.patch | 13 - python/CMakeLists.txt | 2 - python/cinn/compiler/expr_executor.py | 9 +- python/env_dict.py.in | 1 - python/paddle/__init__.py | 1 - python/paddle/base/__init__.py | 1 - .../base/dygraph/tensor_patch_methods.py | 11 +- python/paddle/base/executor.py | 12 +- python/paddle/base/framework.py | 17 +- python/paddle/base/layers/math_op_patch.py | 6 +- python/paddle/base/variable_index.py | 143 +- python/paddle/dataset/common.py | 6 + python/paddle/device/__init__.py | 2 - python/paddle/device/cuda/graphs.py | 3 +- python/paddle/distributed/auto_tuner/prune.py | 43 +- .../distributed/fleet/base/role_maker.py | 9 +- .../paddle/distributed/fleet/launch_utils.py | 2 +- .../distributed/fleet/layers/mpu/mp_layers.py | 2 +- python/paddle/distributed/fleet/utils/fs.py | 33 +- .../fleet/utils/sequence_parallel_utils.py | 1 - .../launch/controllers/collective.py | 10 +- .../paddle/distributed/launch/utils/nvsmi.py | 2 - python/paddle/distributed/rpc/rpc.py | 4 +- .../paddle/distributed/utils/launch_utils.py | 2 +- python/paddle/hapi/hub.py | 1 - .../incubate/distributed/fleet/fleet_util.py | 22 +- .../paddle/io/dataloader/dataloader_iter.py | 7 +- .../paddle/jit/dy2static/convert_operators.py | 7 +- python/paddle/nn/functional/conv.py | 1 - python/paddle/nn/functional/vision.py | 2 +- python/paddle/nn/quant/format.py | 39 +- .../paddle/quantization/observers/__init__.py | 3 +- .../quantization/observers/groupwise.py | 113 ++ 
python/paddle/quantization/quantize.py | 17 +- python/paddle/tensor/manipulation.py | 74 +- .../utils/cpp_extension/extension_utils.py | 1 + python/paddle/utils/download.py | 49 +- python/setup.py.in | 2 +- security/README.md | 36 +- security/README_cn.md | 38 +- security/README_ja.md | 36 +- security/advisory/pdsa-2023-004_cn.md | 2 +- security/advisory/pdsa-2023-006.md | 31 + security/advisory/pdsa-2023-006_cn.md | 31 + security/advisory/pdsa-2023-007.md | 31 + security/advisory/pdsa-2023-007_cn.md | 31 + security/advisory/pdsa-2023-008.md | 31 + security/advisory/pdsa-2023-008_cn.md | 31 + security/advisory/pdsa-2023-009.md | 31 + security/advisory/pdsa-2023-009_cn.md | 31 + security/advisory/pdsa-2023-010.md | 33 + security/advisory/pdsa-2023-010_cn.md | 33 + security/advisory/pdsa-2023-011.md | 32 + security/advisory/pdsa-2023-011_cn.md | 32 + security/advisory/pdsa-2023-012.md | 35 + security/advisory/pdsa-2023-012_cn.md | 35 + security/advisory/pdsa-2023-013.md | 32 + security/advisory/pdsa-2023-013_cn.md | 32 + security/advisory/pdsa-2023-014.md | 32 + security/advisory/pdsa-2023-014_cn.md | 32 + security/advisory/pdsa-2023-015.md | 33 + security/advisory/pdsa-2023-015_cn.md | 33 + security/advisory/pdsa-2023-016.md | 32 + security/advisory/pdsa-2023-016_cn.md | 32 + security/advisory/pdsa-2023-017.md | 33 + security/advisory/pdsa-2023-017_cn.md | 33 + security/advisory/pdsa-2023-018.md | 32 + security/advisory/pdsa-2023-018_cn.md | 32 + security/advisory/pdsa-2023-019.md | 35 + security/advisory/pdsa-2023-019_cn.md | 35 + security/advisory/pdsa-2023-020.md | 28 + security/advisory/pdsa-2023-020_cn.md | 28 + security/advisory/pdsa-2023-021.md | 33 + security/advisory/pdsa-2023-021_cn.md | 33 + security/advisory/pdsa-2023-022.md | 30 + security/advisory/pdsa-2023-022_cn.md | 30 + security/advisory/pdsa-2023-023.md | 28 + security/advisory/pdsa-2023-023_cn.md | 28 + .../hybrid_strategy/CMakeLists.txt | 2 +- test/collective/fleet/CMakeLists.txt | 4 +- .../run_server_for_communicator_half_async.py | 38 + .../fleet/test_communicator_half_async.py | 118 +- .../fleet/test_dygraph_sharding_stage2.py | 9 +- .../fleet/test_parallel_dygraph_mp_layers.py | 5 +- .../fleet/test_parallel_dygraph_qat.py | 2 +- test/cpp/fluid/CMakeLists.txt | 2 - test/cpp/fluid/inference/CMakeLists.txt | 1 - test/cpp/fluid/inference/utils/CMakeLists.txt | 16 - .../fluid/inference/utils/io_utils_tester.cc | 154 -- test/cpp/fluid/nccl/CMakeLists.txt | 2 +- test/cpp/fluid/nccl/nccl_op_test.cu.cc | 12 +- test/cpp/imperative/CMakeLists.txt | 3 +- test/cpp/imperative/nccl_context_test.cc | 10 +- test/cpp/inference/api/tester_helper.h | 12 - .../inference/api/trt_dynamic_shape_test.cc | 1 + test/cpp/inference/test.cmake | 7 +- test/custom_runtime/CMakeLists.txt | 2 +- .../test_collective_process_group_xccl.py | 5 +- test/custom_runtime/test_custom_cpu_plugin.py | 3 +- .../test_custom_cpu_profiler_plugin.py | 3 +- .../test_custom_cpu_to_static.py | 3 +- test/custom_runtime/test_custom_op_setup.py | 3 +- .../test_fleet_launch_custom_device.sh | 2 +- test/dygraph_to_static/CMakeLists.txt | 14 +- test/dygraph_to_static/test_list.py | 1 + test/dygraph_to_static/test_mobile_net.py | 11 +- test/indexing/test_getitem.py | 34 + test/indexing/test_setitem.py | 130 +- test/ir/inference/program_config.py | 28 +- test/ir/inference/test_trt_convert_assign.py | 5 +- test/ir/inference/test_trt_convert_cast.py | 1 + .../test_trt_convert_lookup_table.py | 1 + test/ir/inference/test_trt_convert_solve.py | 5 +- 
test/legacy_test/CMakeLists.txt | 4 +- test/legacy_test/c_embedding_op_base.py | 25 +- test/legacy_test/test_adaptive_avg_pool1d.py | 1 - test/legacy_test/test_dist_hapi_model.py | 2 +- test/legacy_test/test_download.py | 15 +- .../test_parallel_dygraph_dataparallel.py | 2 +- ...t_parallel_dygraph_dataparallel_cpuonly.py | 2 +- test/legacy_test/test_put_along_axis_op.py | 762 +++++++- test/legacy_test/test_repeat_interleave_op.py | 19 + test/legacy_test/test_set_value_op.py | 82 + .../test_sparse_fused_attention_op.py | 5 + test/legacy_test/test_yolov3_loss_op.py | 3 +- test/quantization/test_groupwise.py | 69 + test/quantization/test_llm_int8_linear.py | 90 +- ..._post_training_quantization_mobilenetv1.py | 70 +- ...est_post_training_quantization_resnet50.py | 2 +- test/quantization/test_ptq.py | 42 + test/quantization/test_weight_only_linear.py | 42 + .../xpu/test_parallel_dygraph_dataparallel.py | 2 +- third_party/cryptopp | 1 + third_party/cryptopp-cmake | 1 + tools/enforce/grep_invalid_enforce.sh | 2 +- tools/parallel_UT_rule.py | 6 - 915 files changed, 8842 insertions(+), 11920 deletions(-) delete mode 100644 cmake/mccl.cmake delete mode 100644 cmake/mudnn.cmake delete mode 100644 cmake/musa.cmake mode change 100755 => 100644 paddle/fluid/inference/tensorrt/op_teller.cc delete mode 100644 paddle/fluid/inference/utils/benchmark.cc delete mode 100644 paddle/fluid/inference/utils/benchmark.h delete mode 100644 paddle/fluid/inference/utils/benchmark_tester.cc delete mode 100644 paddle/fluid/inference/utils/table_printer_tester.cc delete mode 100644 paddle/fluid/platform/device/gpu/musa/musa_helper.h delete mode 100644 paddle/fluid/platform/dynload/mccl.cc delete mode 100644 paddle/fluid/platform/dynload/mccl.h delete mode 100644 paddle/fluid/platform/dynload/mublas.cc delete mode 100644 paddle/fluid/platform/dynload/mublas.h delete mode 100644 paddle/fluid/platform/dynload/mudnn.cc delete mode 100644 paddle/fluid/platform/dynload/mudnn.h delete mode 100644 paddle/fluid/platform/dynload/mufft.cc delete mode 100644 paddle/fluid/platform/dynload/mufft.h delete mode 100644 paddle/fluid/platform/dynload/murand.cc delete mode 100644 paddle/fluid/platform/dynload/murand.h delete mode 100644 paddle/fluid/platform/dynload/musa_driver.cc delete mode 100644 paddle/fluid/platform/dynload/musa_driver.h delete mode 100644 paddle/fluid/platform/dynload/musartc.cc delete mode 100644 paddle/fluid/platform/dynload/musartc.h delete mode 100644 paddle/fluid/platform/dynload/musparse.cc delete mode 100644 paddle/fluid/platform/dynload/musparse.h delete mode 100644 paddle/phi/backends/dynload/mccl.cc delete mode 100644 paddle/phi/backends/dynload/mccl.h delete mode 100644 paddle/phi/backends/dynload/mublas.cc delete mode 100644 paddle/phi/backends/dynload/mublas.h delete mode 100644 paddle/phi/backends/dynload/mudnn.cc delete mode 100644 paddle/phi/backends/dynload/mudnn.h delete mode 100644 paddle/phi/backends/dynload/mufft.cc delete mode 100644 paddle/phi/backends/dynload/mufft.h delete mode 100644 paddle/phi/backends/dynload/murand.cc delete mode 100644 paddle/phi/backends/dynload/murand.h delete mode 100644 paddle/phi/backends/dynload/musa_driver.cc delete mode 100644 paddle/phi/backends/dynload/musa_driver.h delete mode 100644 paddle/phi/backends/dynload/musartc.cc delete mode 100644 paddle/phi/backends/dynload/musartc.h delete mode 100644 paddle/phi/backends/dynload/musparse.cc delete mode 100644 paddle/phi/backends/dynload/musparse.h delete mode 100644 paddle/phi/backends/gpu/musa/mudnn_desc.h delete 
mode 100644 paddle/phi/backends/gpu/musa/mudnn_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/musa_device_function.h delete mode 100644 paddle/phi/backends/gpu/musa/musa_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/musa_info.cc create mode 100644 paddle/phi/kernels/custom/c_embedding_grad_kernel.cc create mode 100644 paddle/phi/kernels/custom/c_embedding_kernel.cc delete mode 100644 paddle/phi/kernels/funcs/blas/blas_impl.mu.h delete mode 100644 paddle/phi/kernels/funcs/mufft_util.h delete mode 100644 patches/eigen/Eigen_CORE.patch delete mode 100644 patches/eigen/Eigen_src_Core_util_ConfigureVectorization.h.patch delete mode 100644 patches/eigen/Eigen_src_Core_util_Macros.h.patch delete mode 100644 patches/eigen/Eigen_src_Core_util_Meta.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_Tensor.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorContractionGpu.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceDefault.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceGpu.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorGpuHipCudaDefines.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorReduction.h.patch create mode 100644 python/paddle/quantization/observers/groupwise.py create mode 100644 security/advisory/pdsa-2023-006.md create mode 100644 security/advisory/pdsa-2023-006_cn.md create mode 100644 security/advisory/pdsa-2023-007.md create mode 100644 security/advisory/pdsa-2023-007_cn.md create mode 100644 security/advisory/pdsa-2023-008.md create mode 100644 security/advisory/pdsa-2023-008_cn.md create mode 100644 security/advisory/pdsa-2023-009.md create mode 100644 security/advisory/pdsa-2023-009_cn.md create mode 100644 security/advisory/pdsa-2023-010.md create mode 100644 security/advisory/pdsa-2023-010_cn.md create mode 100644 security/advisory/pdsa-2023-011.md create mode 100644 security/advisory/pdsa-2023-011_cn.md create mode 100644 security/advisory/pdsa-2023-012.md create mode 100644 security/advisory/pdsa-2023-012_cn.md create mode 100644 security/advisory/pdsa-2023-013.md create mode 100644 security/advisory/pdsa-2023-013_cn.md create mode 100644 security/advisory/pdsa-2023-014.md create mode 100644 security/advisory/pdsa-2023-014_cn.md create mode 100644 security/advisory/pdsa-2023-015.md create mode 100644 security/advisory/pdsa-2023-015_cn.md create mode 100644 security/advisory/pdsa-2023-016.md create mode 100644 security/advisory/pdsa-2023-016_cn.md create mode 100644 security/advisory/pdsa-2023-017.md create mode 100644 security/advisory/pdsa-2023-017_cn.md create mode 100644 security/advisory/pdsa-2023-018.md create mode 100644 security/advisory/pdsa-2023-018_cn.md create mode 100644 security/advisory/pdsa-2023-019.md create mode 100644 security/advisory/pdsa-2023-019_cn.md create mode 100644 security/advisory/pdsa-2023-020.md create mode 100644 security/advisory/pdsa-2023-020_cn.md create mode 100644 security/advisory/pdsa-2023-021.md create mode 100644 security/advisory/pdsa-2023-021_cn.md create mode 100644 security/advisory/pdsa-2023-022.md create mode 100644 security/advisory/pdsa-2023-022_cn.md create mode 100644 security/advisory/pdsa-2023-023.md create mode 100644 security/advisory/pdsa-2023-023_cn.md create mode 100644 test/collective/fleet/run_server_for_communicator_half_async.py delete mode 100644 test/cpp/fluid/inference/CMakeLists.txt delete mode 100644 
test/cpp/fluid/inference/utils/CMakeLists.txt delete mode 100644 test/cpp/fluid/inference/utils/io_utils_tester.cc create mode 100644 test/quantization/test_groupwise.py create mode 160000 third_party/cryptopp create mode 160000 third_party/cryptopp-cmake

diff --git a/.gitmodules b/.gitmodules
index 8b06f4fb771cbb..0c41450793fc2a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -110,3 +110,11 @@
   path = third_party/cccl
   url = https://github.com/NVIDIA/cccl.git
   ignore = dirty
+[submodule "third_party/cryptopp"]
+  path = third_party/cryptopp
+  url = https://github.com/weidai11/cryptopp.git
+  ignore = dirty
+[submodule "third_party/cryptopp-cmake"]
+  path = third_party/cryptopp-cmake
+  url = https://github.com/noloader/cryptopp-cmake.git
+  ignore = dirty
diff --git a/CMakeLists.txt b/CMakeLists.txt
index da58f0095ae09d..e9f3fafe8d22ad 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,14 +41,13 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()

 project(paddle CXX C)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
+
 # enable language CUDA
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 find_package(MKL CONFIG QUIET)
 option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF)
-option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" OFF)
-option(WITH_MUSA "Compile PaddlePaddle with MUSA" ON)
+option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
 option(WITH_MPI "Compile PaddlePaddle with MPI" OFF)
 option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
 option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
@@ -90,9 +89,6 @@ endif()
 if(WITH_GPU AND WITH_ROCM)
   message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
 endif()
-if(WITH_GPU AND WITH_MUSA)
-  message(FATAL_ERROR "Error when compile CUDA and MUSA at the same time")
-endif()

 if(WITH_GPU AND NOT APPLE)
   enable_language(CUDA)
@@ -256,7 +252,7 @@ option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_MULTINODE_TESTING "Test multinode apis and ops" OFF)
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
-option(WITH_DISTRIBUTE "Compile with distributed support" ON)
+option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(ON_INFER "Turn on inference optimization and inference-lib generation"
        ON)
@@ -289,7 +285,6 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
 option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
 option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
-option(WITH_MCCL "Compile PaddlePaddle with MCCL support" ON)
 option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
 option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
 option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
@@ -357,7 +352,6 @@ endif()
 if(LINUX
    AND NOT WITH_CUSTOM_DEVICE
    AND NOT WITH_GPU
-   AND NOT WITH_MUSA
    AND NOT WITH_ROCM
    AND NOT WITH_XPU
    AND NOT WITH_XPU_KP
@@ -410,14 +404,6 @@ if(NOT WITH_GPU AND WITH_NCCL)
       CACHE STRING "Disable NCCL when compiling without GPU" FORCE)
 endif()

-if(NOT WITH_MUSA AND WITH_MCCL)
-  message(
-    WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.")
-  set(WITH_MCCL
-      OFF
-      CACHE STRING "Disable MCCL when compiling without MUSA" FORCE)
-endif()
-
 if(NOT WITH_GPU AND WITH_CUDNN_DSO)
   message(
     WARNING
@@ -475,19 +461,6 @@ else()
   endif()
 endif()

-if(WITH_MCCL)
-  add_definitions("-DPADDLE_WITH_MCCL")
-  include(mccl)
-else()
-  if(WITH_MUSA)
-    message(
-      WARNING
-        "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used."
-    )
-  endif()
-endif()
-
-
 if(WITH_BRPC_RDMA)
   message(STATUS "Use brpc with rdma.")
   if(NOT WITH_DISTRIBUTE)
@@ -513,11 +486,6 @@ if(WITH_ROCM)
   include(cupti)
 endif()

-if(WITH_MUSA)
-  include(musa)
-  include(mudnn)
-endif()
-
 if(WITH_XPU_KP)
   include(xpu_kp)
 endif()
@@ -530,14 +498,6 @@ if(NOT WITH_ROCM AND WITH_RCCL)
       CACHE STRING "Disable RCCL when compiling without ROCM" FORCE)
 endif()

-if(NOT WITH_MUSA AND WITH_MCCL)
-  message(
-    WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.")
-  set(WITH_MCCL
-      OFF
-      CACHE STRING "Disable MCCL when compiling without MUSA" FORCE)
-endif()
-
 if(WITH_RCCL)
   add_definitions("-DPADDLE_WITH_RCCL")
   include(rccl)
@@ -550,18 +510,6 @@ else()
   endif()
 endif()

-if(WITH_MCCL)
-  add_definitions("-DPADDLE_WITH_MCCL")
-  include(mccl)
-else()
-  if(WITH_MUSA)
-    message(
-      WARNING
-        "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used."
-    )
-  endif()
-endif()
-
 if(WITH_HETERPS AND WITH_PSLIB)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
 endif()
@@ -612,13 +560,6 @@ if(WITH_RPC)
         OFF
         CACHE BOOL "Disable WITH_RPC when compiling with ROCM" FORCE)
   endif()
-  if(WITH_MUSA AND WITH_RPC)
-    message(
-      WARNING "Disable WITH_RPC when compiling with MUSA. Force WITH_RPC=OFF.")
-    set(WITH_RPC
-        OFF
-        CACHE BOOL "Disable WITH_RPC when compiling with MUSA" FORCE)
-  endif()
   if(WITH_XPU AND WITH_RPC)
     message(
       WARNING "Disable WITH_RPC when compiling with XPU. Force WITH_RPC=OFF.")
@@ -690,12 +631,6 @@ include(configure) # add paddle env configuration

 include_directories("${PADDLE_SOURCE_DIR}")

-# distribute need openssl
-# openssl install tutorial: https://www.howtoforge.com/tutorial/how-to-install-openssl-from-source-on-linux/
-include_directories("/usr/local/ssl/include")
-link_directories("/usr/local/ssl/lib64")
-
-
 if(WITH_NV_JETSON)
   set(WITH_ARM
       ON
diff --git a/README.md b/README.md
index 8f708334ed28f1..001352ea45fc4d 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm

 ## Installation

-### Latest PaddlePaddle Release: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
+### Latest PaddlePaddle Release: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
diff --git a/README_cn.md b/README_cn.md
index a13fa5ba214503..cd45e4e3ecd2b7 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -18,9 +18,9 @@

 ## 安装

-### PaddlePaddle最新版本: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
+### PaddlePaddle 最新版本: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

-跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
+跟进 PaddlePaddle 最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)

 ### 安装最新稳定版本:
 ```
diff --git a/README_ja.md b/README_ja.md
index 22c78a1a79bbd9..dad60eb7ffcf87 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -20,7 +20,7 @@ PaddlePaddle は、工業化に対するコミットメントを持つ工業的

 ## インストール

-### PaddlePaddle の最新リリース: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
+### PaddlePaddle の最新リリース: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

 私たちのビジョンは、PaddlePaddle を通じて、誰もが深層学習を行えるようにすることです。
 PaddlePaddle の最新機能を追跡するために、私たちの[リリースのお知らせ](https://github.com/PaddlePaddle/Paddle/releases)を参照してください。
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 29cca57db65891..dc661fce388fe1 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -175,19 +175,6 @@ elseif(WITH_ROCM)
   if(${MIOPEN_VERSION} VERSION_LESS 2090)
     message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile")
   endif()
-elseif(WITH_MUSA)
-  add_definitions(-DPADDLE_WITH_MUSA)
-  add_definitions(-DEIGEN_USE_GPU)
-  add_definitions(-DEIGEN_USE_MUSA)
-  if(MUPTI_FOUND)
-    include_directories(${CUPTI_INCLUDE_DIR})
-    add_definitions(-DPADDLE_WITH_MUPTI)
-  else()
-    message(STATUS "Cannot find MUPTI, GPU Profiling is incorrect.")
-  endif()
-  if(NOT MUDNN_FOUND)
-    message(FATAL_ERROR "Paddle needs mudnn to compile")
-  endif()
 else()
   add_definitions(-DHPPL_STUB_FUNC)
   list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
index 5967b468d65ce5..eb7ad44af2313f 100644
--- a/cmake/cupti.cmake
+++ b/cmake/cupti.cmake
@@ -1,4 +1,4 @@
-if(NOT WITH_GPU AND NOT WITH_ROCM AND NOT WITH_MUSA)
+if(NOT WITH_GPU AND NOT WITH_ROCM)
   return()
 endif()

@@ -6,10 +6,6 @@ if(WITH_ROCM)
   set(CUPTI_ROOT
       "${ROCM_PATH}/cuda/extras/CUPTI"
       CACHE PATH "CUPTI ROOT")
-elseif(WITH_MUSA)
-  set(CUPTI_ROOT
-      "/usr/local/musa"
-      CACHE PATH "CUPTI ROOT")
 else()
   set(CUPTI_ROOT
       "/usr"
diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake
index 9daa4be7468e42..b3ec8f622923fd 100644
--- a/cmake/external/cryptopp.cmake
+++ b/cmake/external/cryptopp.cmake
@@ -14,12 +14,13 @@

 include(ExternalProject)

+set(CRYPTOPP_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp)
+set(CRYPTOPP_CMAKE_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp-cmake)
 set(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp)
 set(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp)
 set(CRYPTOPP_INCLUDE_DIR
     "${CRYPTOPP_INSTALL_DIR}/include"
     CACHE PATH "cryptopp include directory." FORCE)
-set(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git)
 set(CRYPTOPP_TAG CRYPTOPP_8_2_0)

 if(WIN32)
@@ -63,17 +64,16 @@ include_directories(${CRYPTOPP_INCLUDE_DIR})
 ExternalProject_Add(
   extern_cryptopp
   ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-  GIT_REPOSITORY ${CRYPTOPP_REPOSITORY}
-  GIT_TAG ${CRYPTOPP_TAG}
   PREFIX ${CRYPTOPP_PREFIX_DIR}
+  SOURCE_DIR ${CRYPTOPP_SOURCE_DIR}
   UPDATE_COMMAND ""
   PATCH_COMMAND
-  COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
-  COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"
-  COMMAND cd "<SOURCE_DIR>/cmake" && git checkout tags/${CRYPTOPP_TAG} -b
-          ${CRYPTOPP_TAG}
-  COMMAND ${CMAKE_COMMAND} -E copy_directory "<SOURCE_DIR>/cmake/"
-          "<SOURCE_DIR>/"
+  COMMAND ${CMAKE_COMMAND} -E copy "${CRYPTOPP_CMAKE_SOURCE_DIR}/CMakeLists.txt"
+          "<SOURCE_DIR>/CMakeLists.txt"
+  COMMAND
+    ${CMAKE_COMMAND} -E copy
+    "${CRYPTOPP_CMAKE_SOURCE_DIR}/cryptopp-config.cmake"
+    "<SOURCE_DIR>/cryptopp-config.cmake"
   COMMAND ${CRYPTOPP_PATCH_COMMAND}
   INSTALL_DIR ${CRYPTOPP_INSTALL_DIR}
   CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS}
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 4051a09d767f6b..06e37b3c8a6028 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -60,76 +60,6 @@ if(CMAKE_COMPILER_IS_GNUCC)
         ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/arch/SSE/
         < ${complex_header})
   endif()
-  if(WITH_MUSA)
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_ConfigureVectorization.h.patch
-      configure_vectorization_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/util/
-        < ${configure_vectorization_header})
-    file(TO_NATIVE_PATH
-         ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_Macros.h.patch
-         util_macros_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/util/
-        < ${util_macros_header})
-    file(TO_NATIVE_PATH
-         ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_Meta.h.patch
-         meta_header)
-    set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && patch -Nd
-                            ${SOURCE_DIR}/Eigen/src/Core/util/ < ${meta_header})
-    file(TO_NATIVE_PATH
-         ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_Tensor.patch
-         cxx11_tensor)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/ < ${cxx11_tensor})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorContractionGpu.h.patch
-      tensor_contraction_gpu_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
-        ${tensor_contraction_gpu_header})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceDefault.h.patch
-      tensor_device_default_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
-        ${tensor_device_default_header})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorGpuHipCudaDefines.h.patch
-      tensor_gpu_hip_cuda_defines_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
-        ${tensor_gpu_hip_cuda_defines_header})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorReduction.h.patch
-      tensor_reduction_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
- ${tensor_reduction_header}) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_CORE.patch - eigen_core) - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && patch -Nd - ${SOURCE_DIR}/Eigen/ < ${eigen_core}) - file( - TO_NATIVE_PATH - ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceGpu.h.patch - tensor_device_gpu_header) - set(EIGEN_PATCH_COMMAND - ${EIGEN_PATCH_COMMAND} && patch -Nd - ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ < - ${tensor_device_gpu_header}) - endif() endif() set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 8d6384d2f0a141..7a4956e6e15567 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -246,11 +246,6 @@ if(WITH_GPU) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") endif() -if(WITH_MUSA) - set(CMAKE_MUSA_FLAGS "${CMAKE_MUSA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") -endif() - - if(WITH_ROCM) set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 788237cc4699b4..c463dbc6064e12 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -453,9 +453,6 @@ function(cc_binary TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() - if(WITH_MUSA) - target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) - endif() check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS}) @@ -484,12 +481,6 @@ function(cc_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() - if(WITH_MUSA) - target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) - # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/' - target_link_options(${TARGET_NAME} PRIVATE - -Wl,-rpath,/usr/lib/x86_64-linux-gnu/) - endif(()) check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) endif() endfunction() @@ -628,12 +619,6 @@ function(paddle_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() - if(WITH_MUSA) - target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) - # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/' - target_link_options(${TARGET_NAME} PRIVATE - -Wl,-rpath,/usr/lib/x86_64-linux-gnu/) - endif() if(APPLE) target_link_libraries( ${TARGET_NAME} @@ -765,115 +750,6 @@ function(nv_test TARGET_NAME) endif() endfunction() - - -function(musa_library TARGET_NAME) - if(WITH_MUSA) - set(options STATIC static SHARED shared) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(musa_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - if(musa_library_SRCS) - if(musa_library_SHARED OR musa_library_shared) # build *.so - musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) - else() - musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) - find_fluid_modules(${TARGET_NAME}) - find_phi_modules(${TARGET_NAME}) - endif() - if(musa_library_DEPS) - add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) - endif() - # cpplint code style - foreach(source_file ${musa_library_SRCS}) - string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND musa_library_HEADERS - ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - endif() - endforeach() - else() - if(musa_library_DEPS) - list(REMOVE_DUPLICATES musa_library_DEPS) - generate_dummy_static_lib( - LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} 
GENERATOR - "generic.cmake:musa_library") - - target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) - add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) - else() - message(FATAL "Please specify source file or library in musa_library.") - endif() - endif() - endif() -endfunction() - -function(musa_binary TARGET_NAME) - if(WITH_MUSA) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - add_executable(${TARGET_NAME} ${musa_binary_SRCS}) - if(musa_binary_DEPS) - target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) - add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) - common_link(${TARGET_NAME}) - endif() - endif() -endfunction() - -function(musa_test TARGET_NAME) - if(WITH_MUSA AND WITH_TESTING) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(musa_test "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - musa_add_executable(${TARGET_NAME} ${musa_test_SRCS}) - # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE - target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries( - ${TARGET_NAME} - ${musa_test_DEPS} - paddle_gtest_main - lod_tensor - memory - gtest - glog - phi - ${os_dependency_modules}) - add_dependencies( - ${TARGET_NAME} - ${musa_test_DEPS} - paddle_gtest_main - lod_tensor - memory - gtest - phi - glog) - common_link(${TARGET_NAME}) - add_test(${TARGET_NAME} ${TARGET_NAME}) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_cudnn_deterministic=true) - set_property( - TEST ${TARGET_NAME} - PROPERTY - ENVIRONMENT - "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH" - ) - endif() -endfunction() - - - function(hip_library TARGET_NAME) if(WITH_ROCM) set(options STATIC static SHARED shared) @@ -882,12 +758,6 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) - # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" - OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) - set_source_files_properties(${hip_library_SRCS} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - endif() if(hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) else() @@ -901,6 +771,10 @@ function(hip_library TARGET_NAME) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) + if(NOT ${source_file} MATCHES "\\.cu$") + set_source_files_properties(${source_file} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + endif() string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) list(APPEND hip_library_HEADERS @@ -1501,15 +1375,6 @@ function(math_library TARGET) ${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) - elseif(WITH_MUSA) - musa_library( - ${TARGET} - SRCS - ${cc_srcs} - ${cu_srcs} - DEPS - ${math_library_DEPS} - ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library( ${TARGET} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 
06dc5d6173794a..517ac24cccc72e 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -237,6 +237,16 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") +if(WIN32) + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/common.*) +else() + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) +endif() +copy( + inference_lib_dist + SRCS ${paddle_common_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) + if(WIN32) if(WITH_STATIC_LIB) set(paddle_inference_lib @@ -268,11 +278,6 @@ else() SRCS ${paddle_phi_lib} DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() - set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) - copy( - inference_lib_dist - SRCS ${paddle_common_lib} - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() copy( diff --git a/cmake/mccl.cmake b/cmake/mccl.cmake deleted file mode 100644 index 5ce4ea9c25fec0..00000000000000 --- a/cmake/mccl.cmake +++ /dev/null @@ -1,51 +0,0 @@ -if(NOT WITH_MUSA) - return() -endif() - -# Now we don't support MCCL on windows -if(WIN32) - return() -endif() - -if(WITH_MCCL) - set(MCCL_ROOT - "/usr/local/musa/" - CACHE PATH "MCCL ROOT") - find_path( - MCCL_INCLUDE_DIR mccl.h - PATHS ${MCCL_ROOT} ${MCCL_ROOT}/include ${MCCL_ROOT}/local/include - $ENV{MCCL_ROOT} $ENV{MCCL_ROOT}/include $ENV{MCCL_ROOT}/local/include - NO_DEFAULT_PATH) - - if(MCCL_INCLUDE_DIR) - file(READ ${MCCL_INCLUDE_DIR}/mccl.h MCCL_VERSION_FILE_CONTENTS) - - string(REGEX MATCH "define MCCL_MAJOR +([0-9]+)" MCCL_MAJOR_VERSION - "${MCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MCCL_MAJOR +([0-9]+)" "\\1" MCCL_MAJOR_VERSION - "${MCCL_MAJOR_VERSION}") - string(REGEX MATCH "define MCCL_MINOR +([0-9]+)" MCCL_MINOR_VERSION - "${MCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MCCL_MINOR +([0-9]+)" "\\1" MCCL_MINOR_VERSION - "${MCCL_MINOR_VERSION}") - string(REGEX MATCH "define MCCL_PATCH +([0-9]+)" MCCL_PATCH_VERSION - "${MCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MCCL_PATCH +([0-9]+)" "\\1" MCCL_PATCH_VERSION - "${MCCL_PATCH_VERSION}") - if(NOT MCCL_MAJOR_VERSION) - set(MCCL_VERSION "???") - else() - math(EXPR MCCL_VERSION "${MCCL_MAJOR_VERSION} * 1000 + - ${MCCL_MINOR_VERSION} * 100 + ${MCCL_PATCH_VERSION}") - endif() - include_directories(${MCCL_INCLUDE_DIR}) - - message(STATUS "Current MCCL header is ${MCCL_INCLUDE_DIR}/mccl.h. 
") - message( - STATUS - "Current MCCL version is " - "v${MCCL_MAJOR_VERSION}.${MCCL_MINOR_VERSION}.${MCCL_PATCH_VERSION} ") - else() - message(FATAL_ERROR "WITH_MCCL is enabled but mccl.h file is not found!") - endif() -endif() diff --git a/cmake/mudnn.cmake b/cmake/mudnn.cmake deleted file mode 100644 index 81027890d144e3..00000000000000 --- a/cmake/mudnn.cmake +++ /dev/null @@ -1,92 +0,0 @@ -if(NOT WITH_MUSA) - return() -endif() - -if(WIN32) - return() -else() - set(MUDNN_ROOT - "/usr/local/musa" - CACHE PATH "MUDNN ROOT") -endif() - -find_path( - MUDNN_INCLUDE_DIR mudnn.h - PATHS ${MUDNN_ROOT} ${MUDNN_ROOT}/include $ENV{MUDNN_ROOT} - $ENV{MUDNN_ROOT}/include ${MUSA_TOOLKIT_INCLUDE} - NO_DEFAULT_PATH) - -set(TARGET_ARCH "x86_64") -if(NOT ${CMAKE_SYSTEM_PROCESSOR}) - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -endif() - -list( - APPEND - MUDNN_CHECK_LIBRARY_DIRS - ${MUDNN_ROOT} - ${MUDNN_ROOT}/lib64 - ${MUDNN_ROOT}/lib - ${MUDNN_ROOT}/lib/x64 - ${MUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - $ENV{MUDNN_ROOT} - $ENV{MUDNN_ROOT}/lib64 - $ENV{MUDNN_ROOT}/lib - $ENV{MUDNN_ROOT}/lib/x64 - /usr/lib - ${MUSA_TOOLKIT_ROOT_DIR} - ${MUSA_TOOLKIT_ROOT_DIR}/lib/x64) -set(MUDNN_LIB_NAME "") - -if(LINUX) - set(MUDNN_LIB_NAME "libmudnn.so") -endif() - -find_library( - MUDNN_LIBRARY - NAMES ${MUDNN_LIB_NAME} - PATHS ${MUDNN_CHECK_LIBRARY_DIRS} ${MUDNN_INCLUDE_DIR} - NO_DEFAULT_PATH - DOC "Path to muDNN library.") - -if(MUDNN_INCLUDE_DIR AND MUDNN_LIBRARY) - set(MUDNN_FOUND ON) -else() - set(MUDNN_FOUND OFF) -endif() - -macro(find_mudnn_version mudnn_version_file) - file(READ ${mudnn_version_file} MUDNN_VERSION_FILE_CONTENTS) - get_filename_component(MUDNN_LIB_PATH ${MUDNN_LIBRARY} DIRECTORY) - - string(REGEX MATCH "define MUDNN_VERSION_MAJOR +([0-9]+)" MUDNN_MAJOR_VERSION - "${MUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MUDNN_VERSION_MAJOR +([0-9]+)" "\\1" - MUDNN_MAJOR_VERSION "${MUDNN_MAJOR_VERSION}") - string(REGEX MATCH "define MUDNN_VERSION_MINOR +([0-9]+)" MUDNN_MINOR_VERSION - "${MUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MUDNN_VERSION_MINOR +([0-9]+)" "\\1" - MUDNN_MINOR_VERSION "${MUDNN_MINOR_VERSION}") - string(REGEX MATCH "define MUDNN_VERSION_PATCH +([0-9]+)" MUDNN_PATCH_VERSION - "${MUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MUDNN_VERSION_PATCH +([0-9]+)" "\\1" - MUDNN_PATCH_VERSION "${MUDNN_PATCH_VERSION}") - - if(NOT MUDNN_MAJOR_VERSION) - set(MUDNN_VERSION "???") - else() - add_definitions("-DMUDNN_MAJOR_VERSION=\"${MUDNN_MAJOR_VERSION}\"") - math(EXPR MUDNN_VERSION "${MUDNN_MAJOR_VERSION} * 1000 + - ${MUDNN_MINOR_VERSION} * 100 + ${MUDNN_PATCH_VERSION}") - message(STATUS "Current muDNN version file is ${mudnn_version_file} ") - message( - STATUS - "Current muDNN version is v${MUDNN_MAJOR_VERSION}.${MUDNN_MINOR_VERSION}.${MUDNN_PATCH_VERSION}. 
" - ) - endif() -endmacro() - -if(MUDNN_FOUND) - find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn_version.h) - include_directories(${MUDNN_INCLUDE_DIR}) -endif() diff --git a/cmake/musa.cmake b/cmake/musa.cmake deleted file mode 100644 index 63a85e827061cf..00000000000000 --- a/cmake/musa.cmake +++ /dev/null @@ -1,128 +0,0 @@ -if(NOT WITH_MUSA) - return() -endif() - -if(NOT DEFINED ENV{MUSA_PATH}) - set(MUSA_PATH - "/usr/local/musa" - CACHE PATH "Path to which ROCm has been installed") -else() - set(MUSA_PATH - $ENV{MUSA_PATH} - CACHE PATH "Path to which ROCm has been installed") -endif() -set(CMAKE_MODULE_PATH "${MUSA_PATH}/cmake" ${CMAKE_MODULE_PATH}) - -find_package(MUSA REQUIRED) -include_directories(${MUSA_PATH}/include) - -# set openmp include directory -set(llvm_openmp_search_list) -foreach(item RANGE 6 20 1) - list(APPEND llvm_openmp_search_list /usr/lib/llvm-${item}/include/openmp/) -endforeach() - -find_path( - OPENMP_INCLUDE_DIR omp.h - PATHS ${llvm_openmp_search_list} REQUIRED - NO_DEFAULT_PATH) -include_directories(${OPENMP_INCLUDE_DIR}) - -macro(find_musa_version musa_version_file) - set(python_file ${PROJECT_BINARY_DIR}/get_version.py) - set(MUSA_VERSION - "None" - CACHE STRING "musa version" FORCE) - file( - WRITE ${python_file} - "" - "import json\n" - "import sys\n" - "with open(sys.argv[1], 'r') as f:\n" - " data = json.load(f)\n" - " print(data[\"musa_runtime\"][\"version\"])" - "") - - execute_process( - COMMAND "python" "${python_file}" ${musa_version_file} - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE python_res - OUTPUT_VARIABLE python_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(python_res EQUAL 0) - set(MUSA_VERSION ${python_out}) - endif() - string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\1" MUSA_MAJOR_VERSION - "${MUSA_VERSION}") - string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\2" MUSA_MINOR_VERSION - "${MUSA_VERSION}") - string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\3" MUSA_PATCH_VERSION - "${MUSA_VERSION}") - - if(NOT MUSA_MAJOR_VERSION) - set(MUSA_VERSION "???") - message(WARNING "Cannot find MUSA version in ${MUSA_PATH}/version.json") - else() - math( - EXPR - MUSA_VERSION - "${MUSA_MAJOR_VERSION} * 10000 + ${MUSA_MINOR_VERSION} * 100 + ${MUSA_PATCH_VERSION}" - ) - message(STATUS "Current MUSA version file is ${MUSA_PATH}/version.json.") - message( - STATUS - "Current MUSA version is v${MUSA_MAJOR_VERSION}.${MUSA_MINOR_VERSION}.${MUSA_PATCH_VERSION} " - ) - endif() -endmacro() -find_musa_version(${MUSA_PATH}/version.json) - -list(APPEND MUSA_MCC_FLAGS -Wno-macro-redefined) -list(APPEND MUSA_MCC_FLAGS -Wno-deprecated-copy-with-user-provided-copy) -list(APPEND MUSA_MCC_FLAGS -Wno-pragma-once-outside-header) -list(APPEND MUSA_MCC_FLAGS -Wno-return-type) -list(APPEND MUSA_MCC_FLAGS -Wno-sign-compare) -list(APPEND MUSA_MCC_FLAGS -Wno-overloaded-virtual) -list(APPEND MUSA_MCC_FLAGS -Wno-mismatched-tags) -list(APPEND MUSA_MCC_FLAGS -Wno-pessimizing-move) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-but-set-variable) -list(APPEND MUSA_MCC_FLAGS -Wno-bitwise-instead-of-logical) -list(APPEND MUSA_MCC_FLAGS -Wno-format) -list(APPEND MUSA_MCC_FLAGS -Wno-self-assign) -list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion) -list(APPEND MUSA_MCC_FLAGS -Wno-literal-range) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-private-field) -list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-value) -list(APPEND MUSA_MCC_FLAGS 
-Wno-unused-local-typedef) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture) -list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor) -list(APPEND MUSA_MCC_FLAGS -Wno-braced-scalar-init) -list(APPEND MUSA_MCC_FLAGS -Wno-pass-failed) -list(APPEND MUSA_MCC_FLAGS -Wno-missing-braces) -list(APPEND MUSA_MCC_FLAGS -Wno-dangling-gsl) - -if(WITH_CINN) - list(APPEND MUSA_MCC_FLAGS -std=c++14) -else() - list(APPEND MUSA_MCC_FLAGS -std=c++17) -endif() - -list(APPEND MUSA_MCC_FLAGS --cuda-gpu-arch=mp_22) -list(APPEND MUSA_MCC_FLAGS -U__CUDA__) -# MUSA has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer -list(APPEND MUSA_MCC_FLAGS -D__MUSA_NO_HALF_CONVERSIONS__) - -#set(MUSA_VERBOSE_BUILD ON) -if(CMAKE_BUILD_TYPE MATCHES Debug) - list(APPEND MUSA_MCC_FLAGS -g2) - list(APPEND MUSA_MCC_FLAGS -O0) -else() - list(APPEND MUSA_MCC_FLAGS -O2) -endif() - -set(musa_runtime_library_name musart) -find_library(MUSARTC_LIB ${musa_runtime_library_name} HINTS ${MUSA_PATH}/lib) -message(STATUS "MUSARTC_LIB: ${MUSARTC_LIB}") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 60966c41e95b93..95273118c25057 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,11 +84,6 @@ function(register_cu_kernel TARGET) ${TARGET} SRCS ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) - elseif(WITH_MUSA) - musa_library( - ${TARGET} - SRCS ${cu_srcs} - DEPS ${op_library_DEPS} ${op_common_deps}) endif() set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} @@ -156,18 +151,14 @@ function(op_library TARGET) set(cc_srcs) set(cu_srcs) set(hip_srcs) - set(mu_srcs) set(cu_cc_srcs) set(hip_cc_srcs) - set(mu_cc_srcs) set(xpu_cc_srcs) set(xpu_kp_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) - set(mudnn_cu_cc_srcs) set(cudnn_cu_srcs) set(miopen_cu_srcs) - set(mudnn_cu_srcs) set(CUDNN_FILE) set(MIOPEN_FILE) set(mkldnn_cc_srcs) @@ -246,35 +237,6 @@ function(op_library TARGET) list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu) endif() endif() - if(WITH_MUSA) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) - list(APPEND mu_cc_srcs ${TARGET}.cu.cc) - endif() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND mu_srcs ${TARGET}.cu) - endif() - # rename in KP: .kps -> .cu - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) - file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) - file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps - ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - list(APPEND mu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - endif() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - set(PART_CUDA_KERNEL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu - ${PART_CUDA_KERNEL_FILES} - PARENT_SCOPE) - list(APPEND mu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - endif() - string(REPLACE "_op" "_cudnn_op" MUDNN_FILE "${TARGET}") - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MUDNN_FILE}.cu.cc) - list(APPEND mudnn_cu_cc_srcs ${MUDNN_FILE}.cu.cc) - endif() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MUDNN_FILE}.cu) - list(APPEND mudnn_cu_srcs ${MUDNN_FILE}.cu) - endif() - endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) @@ -305,14 +267,6 @@ function(op_library TARGET) list(APPEND miopen_cu_cc_srcs ${src}) elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$") list(APPEND hip_cc_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES ".*_cudnn_op.cu$") - list(APPEND mudnn_cu_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES 
".*\\.cu$") - list(APPEND mu_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES ".*_cudnn_op.cu.cc$") - list(APPEND mudnn_cu_cc_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES ".*\\.cu.cc$") - list(APPEND mu_cc_srcs ${src}) elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$") list(APPEND cudnn_cu_srcs ${src}) elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu$") @@ -331,15 +285,13 @@ function(op_library TARGET) list(APPEND xpu_kp_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) - elseif((WITH_ROCM OR WITH_GPU OR WITH_MUSA) AND ${src} MATCHES ".*\\.kps$") + elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$") string(REPLACE ".kps" ".cu" src_cu ${src}) file(COPY ${src} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${src} ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) if(WITH_ROCM) list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) - elseif(WITH_MUSA) - list(APPEND mu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) else() list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) endif() @@ -439,26 +391,6 @@ function(op_library TARGET) SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) - elseif(WITH_MUSA) - list(REMOVE_ITEM mudnn_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") - list(REMOVE_ITEM mudnn_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") - list(REMOVE_ITEM mu_srcs "cholesky_op.cu") - list(REMOVE_ITEM mu_srcs "cholesky_solve_op.cu") - list(REMOVE_ITEM mu_srcs "lu_op.cu") - list(REMOVE_ITEM mu_srcs "matrix_rank_op.cu") - list(REMOVE_ITEM mu_srcs "svd_op.cu") - list(REMOVE_ITEM mu_srcs "eigvalsh_op.cu") - list(REMOVE_ITEM mu_srcs "qr_op.cu") - list(REMOVE_ITEM mu_srcs "eigh_op.cu") - list(REMOVE_ITEM mu_srcs "lstsq_op.cu") - list(REMOVE_ITEM mu_srcs "multinomial_op.cu") - list(REMOVE_ITEM mu_srcs "multiclass_nms3_op.cu") - message(STATUS "mu_cc_srcs: ${mu_cc_srcs}, cc_srcs: ${cc_srcs}") - musa_library( - ${TARGET} - SRCS ${cc_srcs} ${mu_cc_srcs} ${mudnn_cu_cc_srcs} ${mudnn_cu_srcs} - ${mkldnn_cc_srcs} ${mu_srcs} - DEPS ${op_library_DEPS} ${op_common_deps}) elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) xpu_library( ${TARGET} @@ -492,10 +424,8 @@ function(op_library TARGET) list(LENGTH cu_srcs cu_srcs_len) list(LENGTH hip_srcs hip_srcs_len) - list(LENGTH mu_srcs mu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH hip_cc_srcs hip_cc_srcs_len) - list(LENGTH mu_cc_srcs mu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) @@ -606,30 +536,12 @@ function(op_library TARGET) endif() endforeach() - # pybind USE_OP_DEVICE_KERNEL for MUSA - list(APPEND mu_srcs ${mu_cc_srcs}) - message("mu_srcs ${mu_srcs}") - foreach(mu_src ${mu_srcs}) - set(op_name "") - find_register(${mu_src} "REGISTER_OP_CUDA_KERNEL" op_name) - find_phi_register(${mu_src} ${pybind_file} "PD_REGISTER_KERNEL") - find_phi_register(${mu_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") - find_phi_register(${mu_src} ${pybind_file} - "PD_REGISTER_KERNEL_FOR_ALL_DTYPE") - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") - set(pybind_flag 1) - endif() - endforeach() - - # pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs}) list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs}) list(APPEND cudnn_cu_srcs ${miopen_cu_srcs}) - list(APPEND cudnn_cu_srcs ${mudnn_cu_cc_srcs}) - list(APPEND 
cudnn_cu_srcs ${mudnn_cu_srcs}) list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) + #message("cudnn_cu_srcs ${cudnn_cu_srcs}") if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL "activation_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") @@ -744,7 +656,7 @@ function(register_operators) string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) - message(STATUS "OPS in register_operators:${OPS}") + foreach(src ${OPS}) list(FIND register_operators_EXCLUDES ${src} _index) if(${_index} EQUAL -1) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 499cc4c591bbfc..ead66697ef68cb 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -104,7 +104,7 @@ function(kernel_declare TARGET_LIST) endif() endif() # some gpu kernel only can run on cuda, not support rocm, so we add this branch - if(WITH_ROCM OR WITH_MUSA) + if(WITH_ROCM) string(FIND "${first_registry}" "cuda_only" pos) if(pos GREATER 1) set(first_registry "") diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index c333448d029ae0..0047100ebcfdfc 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -110,16 +110,23 @@ class Dim; macro__(Product) \ macro__(Sum) \ macro__(PrimitiveNode) \ - macro__(IntrinsicOp) \ macro__(_BufferRange_) \ macro__(ScheduleBlock) \ macro__(ScheduleBlockRealize) \ macro__(_Dim_) \ +#define NODETY_CONTROL_OP_FOR_INTRINSIC(macro__) \ + macro__(IntrinsicOp) \ #define NODETY_FORALL(__m) \ NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ NODETY_OP_FOR_EACH(__m) \ + NODETY_CONTROL_OP_FOR_INTRINSIC(__m) \ + NODETY_CONTROL_OP_FOR_EACH(__m) + +#define NODETY_FORALL_EXCEPT_INTRINSIC(__m) \ + NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ + NODETY_OP_FOR_EACH(__m) \ NODETY_CONTROL_OP_FOR_EACH(__m) // clang-format on diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index ac2f0317e9213f..e4ebaca653bae9 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -15,6 +15,8 @@ #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include +#include "paddle/cinn/ir/intrinsic_ops.h" +#include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" @@ -71,8 +73,71 @@ struct IrNodesCollector : public IRVisitorRequireReImpl { } \ } - NODETY_FORALL(__m) + NODETY_FORALL_EXCEPT_INTRINSIC(__m) #undef __m + + void Visit(const ir::IntrinsicOp* op) { + switch (op->getKind()) { +#define __(x) \ + case ir::IntrinsicKind::k##x: \ + Visit(llvm::dyn_cast(op)); \ + break; + + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + } + } + + void Visit(const ir::intrinsics::GetAddr* x) { + if (x->data.defined()) { + Visit(&(x->data)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataConstHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::PodValueToX* x) { + if (x->pod_value_ptr.defined()) { + Visit(&(x->pod_value_ptr)); + } + } + + void Visit(const ir::intrinsics::BufferCreate* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::ArgsConstruct* x) { + if (x->var.defined()) { + Expr convert = Expr(x->var); + Visit(&convert); + } + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + + void Visit(const 
ir::intrinsics::BuiltinIntrin* x) { + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + std::set visited_; }; diff --git a/paddle/common/array.h b/paddle/common/array.h index 20f7904fc3bd19..11457a1eaa756b 100644 --- a/paddle/common/array.h +++ b/paddle/common/array.h @@ -54,7 +54,7 @@ class Array { } HOSTDEVICE inline T &at(size_t i) { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)&& !defined(__MUSACC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) COMMON_ENFORCE_LT( i, N, common::errors::OutOfRange("Array index out of bounds.")); #endif @@ -62,7 +62,7 @@ class Array { } HOSTDEVICE inline const T &at(size_t i) const { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)&& !defined(__MUSACC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) COMMON_ENFORCE_LT( i, N, common::errors::OutOfRange("Array index out of bounds.")); #endif @@ -103,7 +103,7 @@ class Array { HOSTDEVICE inline T *GetMutable() { return nullptr; } HOSTDEVICE inline T &operator[](size_t) { -#if defined(__HIPCC__) || defined(__MUSACC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static T obj{}; @@ -114,7 +114,7 @@ class Array { } HOSTDEVICE inline const T &operator[](size_t) const { -#if defined(__HIPCC__) || defined(__MUSACC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static const T obj{}; diff --git a/paddle/common/hostdevice.h b/paddle/common/hostdevice.h index f7070893d83b58..7f8cf135634341 100644 --- a/paddle/common/hostdevice.h +++ b/paddle/common/hostdevice.h @@ -18,10 +18,6 @@ #include #endif -#ifdef __MUSACC__ -#include -#endif - #if defined(__xpu__) #include @@ -30,7 +26,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/common/macros.h b/paddle/common/macros.h index 8189b3147db8cc..2d476c58cb6ae1 100644 --- a/paddle/common/macros.h +++ b/paddle/common/macros.h @@ -72,7 +72,7 @@ namespace common { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #define PADDLE_RESTRICT __restrict__ #else #define PADDLE_RESTRICT diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index dd6309f7da3608..d42b810972dc85 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -15,7 +15,7 @@ if(WITH_DISTRIBUTE) DEPS phi common eager_api gloo_wrapper) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) cc_library( process_group_nccl SRCS process_group_nccl.cc common.cc @@ -63,7 +63,7 @@ if(WITH_CUSTOM_DEVICE) endif() set(COMM_UTILS_DEPS process_group) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl) endif() if(WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc 
b/paddle/fluid/distributed/collective/process_group_nccl.cc index dd3e1f410ee0d2..6732ea375d500e 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -106,8 +106,6 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // If we use the work to do barrier, we should block cpu #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -139,20 +137,18 @@ ProcessGroupNCCL::~ProcessGroupNCCL() { } void ProcessGroupNCCL::GroupStart() { - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); ++s_group_call_counter; } void ProcessGroupNCCL::GroupEnd() { - MCCL_CHECK(phi::dynload::mcclGroupEnd()); + NCCL_CHECK(phi::dynload::ncclGroupEnd()); --s_group_call_counter; // NOTE: This is to sync the calc stream and comm stream for debug using // batch_isend_irecv if (FLAGS_benchmark || FLAGS_benchmark_nccl) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -183,7 +179,7 @@ phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( } } -mcclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { +ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { const std::string& key = GetKeyFromPlace(place); const auto& iter = place_to_comm_ctx_.find(key); PADDLE_ENFORCE_NE( @@ -208,7 +204,7 @@ std::shared_ptr ProcessGroupNCCL::AllGather( numel > 0 ? GetPartialTensor(tensor_tmp, offset, numel) : tensor_tmp; return Collective( [&](phi::distributed::NCCLCommContext* comm_context, gpuStream_t stream) { - VLOG(3) << "[mcclAllGather] " + VLOG(3) << "[ncclAllGather] " << "sendbuff: " << in_tensor_maybe_partial.data() << ", recvbuff: " << out_tensor->data() << ", count: " << in_tensor_maybe_partial.numel() @@ -239,7 +235,7 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( paddle::experimental::CheckAndTrans2NewContiguousTensor(in_tensor); return Collective( [&](phi::distributed::NCCLCommContext* comm_context, gpuStream_t stream) { - VLOG(3) << "[mcclAllReduce] " + VLOG(3) << "[ncclAllReduce] " << "sendbuff: " << tensor_tmp.data() << ", recvbuff: " << out_tensor->data() << ", count: " << tensor_tmp.numel() << ", datatype: " @@ -708,7 +704,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, << ", store_key: " << store_key; for (size_t i = 0; i < s_group_call_counter; ++i) { - MCCL_CHECK(phi::dynload::mcclGroupEnd()); + NCCL_CHECK(phi::dynload::ncclGroupEnd()); } bool is_batch_p2p = s_group_call_counter > 0; @@ -717,13 +713,13 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, int num_ranks = is_p2p_op ? 2 : GetSize(); int rank = is_p2p_op ? 
p2p_rank : GetRank(); - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); phi::distributed::P2POption p2p_opts({is_p2p_op, p2p_rank, num_ranks, rank}); phi::distributed::CommContextManager::CreateNCCLCommContext( store_, store_key, rank_, size_, "", &p2p_opts); - MCCL_CHECK(phi::dynload::mcclGroupEnd()); + NCCL_CHECK(phi::dynload::ncclGroupEnd()); auto nccl_comm_ctx = this->GetCommContext(&store_key); VLOG(3) << "Get nccl comm: " << nccl_comm_ctx->GetNcclComm() @@ -751,10 +747,10 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()), gpu_global_ranks_size); - MCCL_CHECK(phi::dynload::mcclAllGather(gpu_global_rank->ptr(), + NCCL_CHECK(phi::dynload::ncclAllGather(gpu_global_rank->ptr(), gpu_global_ranks->ptr(), 1, - mcclInt, + ncclInt, nccl_comm_ctx->GetNcclComm(), comm_ctx->stream())); @@ -787,7 +783,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, place_to_comm_ctx_.emplace(place_key, std::move(comm_ctx)); for (size_t i = 0; i < s_group_call_counter; ++i) { - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); } } @@ -882,8 +878,6 @@ std::shared_ptr ProcessGroupNCCL::Collective( if (FLAGS_benchmark || FLAGS_benchmark_nccl) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -999,8 +993,6 @@ std::shared_ptr ProcessGroupNCCL::Point2Point( if (!is_batch_p2p && (FLAGS_benchmark || FLAGS_benchmark_nccl)) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif diff --git a/paddle/fluid/distributed/collective/process_group_nccl.h b/paddle/fluid/distributed/collective/process_group_nccl.h index 8a626d701b3245..22d90370f16afc 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.h +++ b/paddle/fluid/distributed/collective/process_group_nccl.h @@ -175,7 +175,7 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream { static void GroupEnd(); - mcclComm_t NCCLComm(const Place& place) const; + ncclComm_t NCCLComm(const Place& place) const; private: std::shared_ptr CreateTask(const Place& place, diff --git a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc index 9061ce7aeaa068..eec697f5239450 100644 --- a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc +++ b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/phi/backends/c_comm_lib.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -33,7 +33,7 @@ namespace detail { // In principle, the PHI Kernel cannot use the global singleton internally, // and the required members need to be passed in from the eucalyptus tree. 
ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) paddle::distributed::ProcessGroup* pg = nullptr; if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( @@ -45,7 +45,7 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { } #endif if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) return static_cast(pg)->NCCLComm( place); #else diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 591e083d005a44..6165dfc27e38ef 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -372,7 +372,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { paddle::experimental::empty(IntArray({all_length_}), dtype_, place); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( @@ -419,7 +419,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { void EagerGroup::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto &gpu_context = static_cast(context); SplitTensorsWithType( gpu_context, &dense_contents_, &dense_tensors_, dtype_); @@ -1112,7 +1112,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); // NOLINT if (platform::is_gpu_place(inner_place_)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); #else diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h index 17f7bb14224d35..7b19b3a1098398 100644 --- a/paddle/fluid/distributed/common/chunk_allocator.h +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -14,6 +14,7 @@ #pragma once #include +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace distributed { @@ -77,9 +78,16 @@ class ChunkAllocator { void create_new_chunk() { Chunk* chunk; - posix_memalign(reinterpret_cast(&chunk), - std::max(sizeof(void*), alignof(Chunk)), - sizeof(Chunk) + sizeof(Node) * _chunk_size); + size_t alloc_size = sizeof(Chunk) + sizeof(Node) * _chunk_size; + int error = posix_memalign(reinterpret_cast(&chunk), + std::max(sizeof(void*), alignof(Chunk)), + alloc_size); + PADDLE_ENFORCE_EQ(error, + 0, + paddle::platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size, error code is %d.", + alloc_size, + error)); chunk->next = _chunks; _chunks = chunk; diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index c896786c657f61..82a3514f2791f9 100644 --- 
a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -277,7 +277,7 @@ static std::shared_ptr GetGC( int64_t max_memory_size = framework::GetEagerDeletionThreshold(); std::shared_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 61e0732f89f5bc..704dd16400065c 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -71,7 +71,7 @@ bool CondInterceptor::GetCondResult() { const auto& cond_tensor = cond_var->Get(); bool res = false; if (platform::is_gpu_place(cond_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::DenseTensor cpu_tensor; framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 0117a472ef06d3..a1fd38295319ed 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -76,7 +76,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_data.data.length()); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Loading data for GPU."; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 6dc9cff9d9120b..b5786e23933930 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -51,7 +51,7 @@ void MessageBus::Init( addr_)); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) // NOTE: To make the brpc is compatible with collective, // need release the handler holding the ip address. 
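
Aside: the chunk_allocator.h hunk above replaces an unchecked posix_memalign call with one whose return code is validated (via PADDLE_ENFORCE_EQ in Paddle). Below is a minimal, standalone sketch of the same pattern using a plain runtime check instead of Paddle's enforce macros; the Chunk/Node layout and sizes are simplified placeholders, not Paddle's real allocator types, and posix_memalign assumes a POSIX platform.

#include <algorithm>
#include <cstdio>
#include <cstdlib>

struct Node { int value; };
struct Chunk { Chunk* next; };  // node storage lives in the bytes allocated past the header

int main() {
  const size_t chunk_size = 1024;  // illustrative capacity
  const size_t alloc_size = sizeof(Chunk) + sizeof(Node) * chunk_size;
  void* raw = nullptr;
  // posix_memalign returns 0 on success; a nonzero code (e.g. ENOMEM) must not be ignored,
  // because raw is left unspecified on failure.
  int error = posix_memalign(&raw, std::max(sizeof(void*), alignof(Chunk)), alloc_size);
  if (error != 0) {
    std::fprintf(stderr, "Fail to alloc memory of %zu size, error code is %d.\n",
                 alloc_size, error);
    return 1;
  }
  Chunk* chunk = static_cast<Chunk*>(raw);
  chunk->next = nullptr;
  std::free(chunk);
  return 0;
}

The alignment argument mirrors the patch: std::max(sizeof(void*), alignof(Chunk)) keeps posix_memalign's requirement that the alignment be a power of two and a multiple of sizeof(void*).
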
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index 2bd9213cae610d..47509d025722d8 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -61,8 +61,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); @@ -407,8 +408,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index bef2878e706f55..a6bb716e6b7ade 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -27,10 +27,6 @@ if(WITH_ROCM) target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB}) endif() -if(WITH_MUSA) - target_link_libraries(eager_generator ${MUSARTC_LIB}) -endif() - if(WITH_CINN) target_link_libraries(eager_generator ${PYTHON_LIBRARIES}) endif() diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 2a96fddccbce70..75d6cb94c6b5f2 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -528,7 +528,7 @@ class {} : public egr::GradNodeBase {{ TYPE_PROMOTION_LOGIC_TEMPLATE = """ if (phi::NeedTypePromotion({x}.dtype(), {y}.dtype())) {{ VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) << "got different data type, run type protmotion automatically, this may cause data type been changed."; {op_name} auto promotion_type = phi::GetPromoteDtype(op_name, {x}.dtype(), {y}.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index f93f41a21553a3..daf16f446ab12c 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -146,7 +146,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} SetPythonStack(); if (paddle::platform::is_gpu_place(place)) {{ -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 2da9994b7671ce..a1e62ea6ba519b 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -103,7 +103,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8aab6bf2a201ab..8aa03e98809fb2 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -213,11 +213,6 @@ elseif(WITH_ROCM) data_type_transform SRCS data_type_transform.cu DEPS tensor) -elseif(WITH_MUSA) - musa_library( - data_type_transform - SRCS data_type_transform.cu - DEPS tensor) elseif(WITH_XPU) cc_library( data_type_transform @@ -466,7 +461,7 @@ if(WITH_PYTHON) ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if(WITH_GPU) + if(NOT WITH_ROCM) add_custom_target( fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 6621b74740f250..1620c99ce8560d 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -45,19 +45,6 @@ class ConvSearchCache { AlgorithmsCache* GetConvFusion() { return &fusion_forward_cache_; } -#elif defined(PADDLE_WITH_MUSA) - // AlgorithmsCache* GetForward() { - // return &forward_cache_; - // } - // AlgorithmsCache* GetBackwardData() { - // return &backward_data_cache_; - // } - // AlgorithmsCache* GetBackwardFilter() { - // return &backward_filter_cache_; - // } - // AlgorithmsCache* GetConvFusion() { - // return &fusion_forward_cache_; - // } #else AlgorithmsCache* GetForward() { return &forward_cache_; @@ -85,11 +72,6 @@ class ConvSearchCache { AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; AlgorithmsCache fusion_forward_cache_; -#elif defined(PADDLE_WITH_MUSA) - // AlgorithmsCache forward_cache_; - // AlgorithmsCache backward_data_cache_; - // AlgorithmsCache backward_filter_cache_; - // AlgorithmsCache fusion_forward_cache_; #else AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 4d2236ed1e66f7..bf2f9e4379b693 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -124,7 +124,7 @@ static void RunKernelFunc( "Input tensor (%s) is not initialized.", in_name)); paddle::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if 
(custom_in.is_gpu_pinned()) { VLOG(3) << "Custom Operator: custom input is gpu pinned tensor"; auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); @@ -936,7 +936,7 @@ static void RegisterOperatorKernel( } RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CUDAPlace()); #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index d3525c80d56db2..4a72f339a85cbc 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -271,8 +271,6 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); #elif defined(PADDLE_WITH_HIP) hipMemcpy(dst, src, size, hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(dst, src, size, musaMemcpyHostToDevice); #elif defined(PADDLE_WITH_XPU_KP) xpu_memcpy(dst, src, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE); #else @@ -1531,7 +1529,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( #endif } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) template void PrivateInstantDataFeed::PutToFeedVec() { for (size_t i = 0; i < use_slots_.size(); ++i) { diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 57cf488d2a3014..156c70b9825382 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -2982,7 +2982,7 @@ std::shared_ptr GetNodeDegree( } int multi_node_sync_sample(int flag, - const mcclRedOp_t &op, + const ncclRedOp_t &op, const paddle::platform::Place &place, const int gpu_id, phi::DenseTensor *multi_node_sync_stat_ptr) { @@ -2998,8 +2998,8 @@ int multi_node_sync_sample(int flag, int *stat_ptr = multi_node_sync_stat_ptr->data(); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); auto stream = comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( - &stat_ptr[flag], &stat_ptr[3], 1, mcclInt, op, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + &stat_ptr[flag], &stat_ptr[3], 1, ncclInt, op, comm->comm(), stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output &stat_ptr[3], sizeof(int), @@ -3011,7 +3011,7 @@ int multi_node_sync_sample(int flag, } int get_multi_node_global_flag(int local_flag, - const mcclRedOp_t &op, + const ncclRedOp_t &op, const paddle::platform::Place &place, const int gpu_id, cudaStream_t stream) { @@ -3025,10 +3025,10 @@ int get_multi_node_global_flag(int local_flag, send_buff_ptr, &local_flag, sizeof(int), cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - mcclInt, + ncclInt, op, comm->comm(), stream)); @@ -3177,7 +3177,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, // to decide whether to continue sampling if (FLAGS_enable_graph_multi_node_sampling) { switch_command = multi_node_sync_sample( - 
switch_flag, mcclProd, place, conf.gpuid, multi_node_sync_stat_ptr); + switch_flag, ncclProd, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " switch_flag:" << switch_flag << "," << switch_command; if (switch_command) { @@ -3187,7 +3187,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, } sample_command = multi_node_sync_sample( - sample_flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + sample_flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " sample_flag:" << sample_flag << "," << sample_command; if (sample_command == EVENT_FINISH_EPOCH) { @@ -3280,7 +3280,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, if (FLAGS_enable_graph_multi_node_sampling) { int flag = *jump_rows_ptr > 0 ? 1 : 0; int command = multi_node_sync_sample( - flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; if (command <= 0) { @@ -3326,7 +3326,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, // Step synchronization for multi-step sampling in multi node int flag = sample_res.total_sample_size > 0 ? 1 : 0; int command = multi_node_sync_sample( - flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; @@ -3846,7 +3846,7 @@ void GraphDataGenerator::DoWalkandSage() { } else { if (conf_.sage_mode) { global_train_flag_ = get_multi_node_global_flag( - local_train_flag, mcclProd, place_, conf_.gpuid, sample_stream_); + local_train_flag, ncclProd, place_, conf_.gpuid, sample_stream_); VLOG(1) << "gpu_id: " << conf_.gpuid << ", local_train_flag: " << local_train_flag << ", global_train_flag: " << global_train_flag_; @@ -4010,7 +4010,7 @@ void GraphDataGenerator::DoSageForTrain() { // check whether reach sage pass end if (conf_.is_multi_node) { int res = multi_node_sync_sample(sage_pass_end, - mcclProd, + ncclProd, place_, conf_.gpuid, &multi_node_sync_stat_); @@ -4165,7 +4165,7 @@ void GraphDataGenerator::DoSageForInfer() { int local_pass_end = total_instance == 0; if (conf_.is_multi_node) { global_pass_end = get_multi_node_global_flag( - local_pass_end, mcclProd, place_, conf_.gpuid, sample_stream_); + local_pass_end, ncclProd, place_, conf_.gpuid, sample_stream_); } else { global_pass_end = local_pass_end; } @@ -4261,11 +4261,11 @@ int dynamic_adjust_total_row_for_infer(int local_reach_end, stream); cudaStreamSynchronize(stream); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - mcclInt, - mcclProd, + ncclInt, + ncclProd, comm->comm(), stream)); int global_reach_end = 0; @@ -4356,7 +4356,7 @@ bool FillInferBuf( global_infer_node_type_start[infer_cursor] + conf.buf_size >= device_key_size; int global_reach_end = get_multi_node_global_flag( - local_reach_end, mcclProd, place, conf.gpuid, stream); + local_reach_end, ncclProd, place, conf.gpuid, stream); int remain = device_key_size - global_infer_node_type_start[infer_cursor]; if (global_reach_end) { 
*total_row_ptr = remain; @@ -5005,11 +5005,11 @@ int GraphDataGenerator::dynamic_adjust_batch_num_for_sage() { cudaStreamSynchronize(sample_stream_); auto comm = platform::NCCLCommContext::Instance().Get(0, place_.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - mcclInt, - mcclMax, + ncclInt, + ncclMax, comm->comm(), sample_stream_)); int thread_max_batch_num = 0; diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 492c7629abf9eb..243c5c818f5887 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -2023,7 +2023,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { int pv_batch_size_; }; -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) template class PrivateInstantDataFeed : public DataFeed { public: diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index 010661fef6e8ab..88afa021b7c1b9 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -70,7 +70,7 @@ REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif } // namespace framework diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index b2fb089f535749..9d114fcf563963 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -101,7 +101,7 @@ struct CastDataType { in_end, out_begin, CastDataTypeFunctor()); -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) } else if (platform::is_gpu_place(in_.place())) { phi::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index f43c20a0d3a94c..f0c2b60f41b69d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -221,75 +221,6 @@ elseif(WITH_ROCM) fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) -elseif(WITH_MUSA) - musa_library( - nan_inf_utils - SRCS nan_inf_utils_detail.cc - DEPS framework_proto scope place phi common) - musa_library( - all_reduce_op_handle - SRCS all_reduce_op_handle.cc - DEPS op_handle_base - scope - lod_tensor - phi - common - memory - dynload_cuda - variable_visitor) - musa_library( - fused_all_reduce_op_handle - SRCS fused_all_reduce_op_handle.cc - DEPS all_reduce_op_handle - op_handle_base - variable_visitor - scope - lod_tensor - phi - common - memory - dynload_cuda - place) - musa_library( - grad_merge_all_reduce_op_handle - SRCS grad_merge_all_reduce_op_handle.cc - DEPS fused_all_reduce_op_handle - op_handle_base - scope - lod_tensor - phi - common - memory - dynload_cuda - variable_visitor - place - all_reduce_op_handle) - - 
if(WITH_DISTRIBUTE) - musa_library( - reduce_op_handle - SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi common dynload_cuda) - else() - musa_library( - reduce_op_handle - SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi common dynload_cuda) - endif() - musa_library( - broadcast_op_handle - SRCS broadcast_op_handle.cc - DEPS op_handle_base - scope - phi - common - memory - variable_visitor - dynload_cuda) - musa_library( - fused_broadcast_op_handle - SRCS fused_broadcast_op_handle.cc - DEPS broadcast_op_handle) else() cc_library( nan_inf_utils @@ -489,7 +420,7 @@ endif() if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) + AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() cc_library( diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 087a629d493444..b064a2aded0bcb 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -28,7 +28,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -207,17 +207,17 @@ void AllReduceOpHandle::AllReduceFunc( const std::vector &places, const std::vector &out_var_names) { if (platform::is_gpu_place(places[0])) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::InvalidArgument( "The nccl context should not be NULL.")); - mcclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); + ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); std::vector> all_reduce_calls; for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { auto &p = places[i]; void *buffer = const_cast(lod_tensor_data.at(i)); all_reduce_calls.emplace_back([=] { - NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, mcclSum); + NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum); }); } NCCLAllReduceFunc(all_reduce_calls); @@ -300,7 +300,7 @@ void AllReduceOpHandle::SyncBKCLAllReduce() { } #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void AllReduceOpHandle::NCCLAllReduceFunc( const std::vector> &all_reduce_calls) { this->RunAndRecordEvent([&] { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index 0e2c06311bf385..685ab0b957a448 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -31,7 +31,7 @@ namespace platform { class NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include 
"paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -43,7 +43,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class AllReduceOpHandle : public NCCLOpHandleBase { public: AllReduceOpHandle(ir::Node *node, @@ -77,14 +77,14 @@ class AllReduceOpHandle : public OpHandleBase { std::vector local_scopes_; -#if !defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL) && \ +#if !defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && \ !defined(PADDLE_WITH_XPU_BKCL) // NCCLOpHandleBase and BKCLOpHandleBase already have these attributes. // Will polish it by class inheritance framework. std::vector places_; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void NCCLAllReduceFunc( const std::vector> &all_reduce_calls); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 98672d09a2452e..b79eff24ee87d7 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -88,7 +88,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else if (platform::is_gpu_place(in_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) VarHandle *out_handle = nullptr; int root_id = in_tensor.place().device; // NOLINT std::vector> broadcast_calls; @@ -118,9 +118,9 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBcast(send_recv_buffer, + platform::dynload::ncclBcast(send_recv_buffer, numel, - static_cast(type), + static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 3300c48b165853..9fbe2764913b55 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -34,7 +34,7 @@ class Node; } // namespace ir } // namespace framework namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) struct NCCLContextMap; #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -43,7 +43,7 @@ struct BKCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" @@ -55,7 +55,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -109,7 +109,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector 
local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLContextMap *nccl_ctxs_; #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLContextMap *bkcl_ctxs_; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 5b8857977c9fab..5a6f4e6e70d4c1 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -186,7 +186,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MCCL)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif @@ -348,7 +348,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const { #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) @@ -380,7 +380,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kNRanks); pass->Set(kNRanks, new size_t(nranks)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -400,7 +400,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kLocalScopes); pass->SetNotOwned>(kLocalScopes, &local_scopes); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -428,7 +428,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; } else if (pass->Type() == "all_reduce_deps_pass") { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? 
nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -545,7 +545,7 @@ USE_PASS(fused_feedforward_pass); #ifdef PADDLE_WITH_DNNL USE_PASS(mkldnn_placement_pass); #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MCCL)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 90cf7fe82ebfd2..203525d5a74821 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -217,7 +217,7 @@ struct BuildStrategy { const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const; #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 89d72a1b8213a5..4012263f688cb5 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include @@ -44,7 +44,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( place_(place), var_infos_(vars.begin(), vars.end()), gc_(gc) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -53,9 +53,6 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -78,14 +75,12 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } EagerDeletionOpHandle::~EagerDeletionOpHandle() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (event_) { auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -94,7 +89,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { } void EagerDeletionOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif @@ -182,7 +177,7 @@ void EagerDeletionOpHandle::RunImpl() { void 
EagerDeletionOpHandle::ClearGarbages( std::deque> *garbages) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = @@ -192,10 +187,6 @@ void EagerDeletionOpHandle::ClearGarbages( PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(callback_stream, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -206,7 +197,7 @@ void EagerDeletionOpHandle::ClearGarbages( } else { #endif gc_->Add(std::move(*garbages)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 049b0c2ec478b4..0a92269c50ad2d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -80,7 +80,7 @@ class EagerDeletionOpHandle : public OpHandleBase { std::vector var_infos_; // not own GarbageCollector *gc_; // not own std::vector vars_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index be3b196c3ca6ca..ee78d366711075 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -135,7 +135,7 @@ static void TransData(const phi::DenseTensor *src_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 0ab7767aca0bac..27be4b77176350 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -121,7 +121,7 @@ static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { if (platform::is_gpu_place(src_item.place())) { // NOLINT -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index b1db6b334013d3..53746482d58a80 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -32,7 +32,7 @@ typedef std::vector< 
std::vector>> GradientAndLoDTensor; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -61,13 +61,11 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( #endif FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif @@ -82,7 +80,7 @@ void FusedAllReduceOpHandle::RunImpl() { Name(), platform::TracerEventType::Communication, 1); VLOG(4) << this->DebugString(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (FLAGS_allreduce_record_one_event && start_event_ == nullptr) { VLOG(10) << "FLAGS_allreduce_record_one_event=true"; PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, @@ -105,9 +103,6 @@ void FusedAllReduceOpHandle::RunImpl() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); @@ -131,10 +126,6 @@ void FusedAllReduceOpHandle::RunImpl() { PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(nccl_stream, start_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -194,16 +185,12 @@ void FusedAllReduceOpHandle::RunImpl() { FusedAllReduceFunc(in_var_handles, out_var_handles); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(compute_stream, end_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index a5c6c431f1742e..533d1d0860a553 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -33,7 +33,7 @@ namespace platform { class NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -44,7 +44,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) struct FusedAllReduceOpHandle : public AllReduceOpHandle { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -75,7 +75,7 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { private: size_t num_of_all_reduce_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) gpuEvent_t start_event_{nullptr}; gpuEvent_t end_event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 198fb8b6eb07e6..6ba6df7011ade6 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -36,7 +36,7 @@ struct NCCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -46,7 +46,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index 2ebaa31f53bd89..15648aa058f073 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -24,7 +24,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -77,7 +77,7 @@ std::string GradMergeAllReduceOpHandle::Name() const { return "grad_merge_all_reduce"; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h index 5e8d061762cbc8..ce01f85eaba52a 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h @@ -33,7 +33,7 @@ namespace platform { class 
NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -44,7 +44,7 @@ namespace details { class GradMergeAllReduceOpHandle : public AllReduceOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) GradMergeAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -75,7 +75,7 @@ class GradMergeAllReduceOpHandle : public AllReduceOpHandle { class FusedGradMergeAllReduceOpHandle : public FusedAllReduceOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedGradMergeAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 91cb342594a635..6c3f5356ac1f15 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -183,7 +183,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, << ", place:" << tensor->place() << ", numel:" << tensor->numel(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index ab7c4ecd884683..e4472e8d989dd2 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -27,9 +27,6 @@ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MUSA -#include "paddle/fluid/platform/dynload/mccl.h" -#endif #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/flags.h" @@ -58,8 +55,6 @@ class NCCLOpHandleBase : public OpHandleBase { for (auto& ev : inter_events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -67,8 +62,6 @@ class NCCLOpHandleBase : public OpHandleBase { for (auto& ev : exter_events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -79,7 +72,7 @@ class NCCLOpHandleBase : public OpHandleBase { return nccl_ctxs_; } - mcclComm_t GetComm() const { + ncclComm_t GetComm() const { PADDLE_ENFORCE_EQ( places_.size(), 1, @@ -150,11 +143,6 @@ class NCCLOpHandleBase : public OpHandleBase { &inter_events_[dev_id], hipEventDisableTiming)); PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &exter_events_[dev_id], hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventCreateWithFlags( - 
&inter_events_[dev_id], musaEventDisableTiming)); - PADDLE_ENFORCE_GPU_SUCCESS(musaEventCreateWithFlags( - &exter_events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &inter_events_[dev_id], cudaEventDisableTiming)); @@ -171,8 +159,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -188,7 +176,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } @@ -196,8 +184,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -215,8 +203,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -236,8 +224,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op UNUSED) { + ncclDataType_t datatype, + ncclRedOp_t op UNUSED) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); @@ -250,13 +238,11 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce( - sendbuff, recvbuff, count, datatype, mcclSum, 0, comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); #ifdef PADDLE_WITH_HIP hipEventRecord(inter_events_.at(dev_id), stream); -#elif defined(PADDLE_WITH_MUSA) - musaEventRecord(inter_events_.at(dev_id), stream); #else cudaEventRecord(inter_events_.at(dev_id), stream); #endif @@ -270,8 +256,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); PADDLE_ENFORCE_NOT_NULL( nccl_ctxs_, @@ -290,21 +276,14 @@ class NCCLOpHandleBase : public OpHandleBase { #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); hipEventRecord(exter_events_.at(dev_id), stream); -#elif defined(PADDLE_WITH_MUSA) - musaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( - sendbuff, recvbuff, count, datatype, op, comm, stream)); - - musaEventRecord(exter_events_.at(dev_id), stream); #else cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); 
cudaEventRecord(exter_events_.at(dev_id), stream); @@ -317,8 +296,8 @@ class NCCLOpHandleBase : public OpHandleBase { void InterBroadCast(platform::Place place, void* sendbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op UNUSED) { + ncclDataType_t datatype, + ncclRedOp_t op UNUSED) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); @@ -331,12 +310,10 @@ class NCCLOpHandleBase : public OpHandleBase { << ", stream:" << stream; #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0); -#elif defined(PADDLE_WITH_MUSA) - musaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #else cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #endif - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( sendbuff, count, datatype, 0, comm, stream)); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 896b251571fc96..ee87141a9d5414 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,13 +31,11 @@ std::string OpHandleBase::DebugString() const { } OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { // NOLINT -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -47,16 +45,13 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { // NOLINT } void OpHandleBase::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (auto &p : dev_ctxes_) { int dev_id = p.first.device; // NOLINT platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); @@ -141,7 +136,7 @@ void OpHandleBase::InitXPU() { } void OpHandleBase::Run(DeviceType use_device) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (events_.empty() && use_device == p::kCUDA && !dev_ctxes_.empty()) { InitCUDA(); } @@ -177,7 +172,7 @@ void OpHandleBase::Run(DeviceType use_device) { } void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_NOT_NULL( waited_ctx, platform::errors::InvalidArgument("Argument waited_ctx is NULL.")); @@ -193,8 +188,6 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream, ev.second, 0)); #else 
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif @@ -228,15 +221,12 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -258,7 +248,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = @@ -283,16 +273,13 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -324,7 +311,7 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { callback(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; @@ -333,9 +320,6 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -347,7 +331,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 4bd385ff5099cb..9afe56e4babd45 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -161,7 +161,7 @@ class OpHandleBase { // See 
https://github.com/PaddlePaddle/Paddle/pull/32283 bool is_variant_scope_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unordered_map events_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index d7d0a3e2863638..fe43126ca8abe4 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -182,7 +182,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( @@ -210,12 +210,12 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( buffer, recvbuffer, numel, - static_cast(type), - mcclSum, + static_cast(type), + ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index eb0e319cce3b50..2eb0ad29232119 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -39,7 +39,7 @@ namespace platform { struct NCCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" @@ -79,7 +79,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -129,7 +129,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector GetLocalScopes() override { return local_scopes_; } -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA) && \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \ defined PADDLE_WITH_DISTRIBUTE template void GatherSelectedRows( diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index f37ea73a477b66..8b486be9cc686a 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -76,7 +76,7 @@ struct ScaleLossGradFunctor { "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, @@ -110,7 +110,7 @@ void 
ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); tensor->Resize(common::make_ddim({1})); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ScaleLossGradFunctor func( coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); if (record_event) { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index cb16915316ecfe..02a68fb697efbb 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -95,7 +95,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( } void ShareTensorBufferOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 5c266946144fe0..ba678bbe2e26be 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -196,7 +196,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { auto comm = nccl_ctx.comm_; int encode_size = 2 * k * sizeof(int); - // dgc use mcclAllGather to get all the encoded data + // dgc use ncclAllGather to get all the encoded data // so the buffer need nranks. int buf_size = nranks_ * encode_size; void *gather_buff = gathers[i]->data(); @@ -207,10 +207,10 @@ void SparseAllReduceOpHandle::RunImplEncoded() { all_gather_calls.emplace_back([=] { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(in_tensor_buf, + platform::dynload::ncclAllGather(in_tensor_buf, gather_buff, 2 * k, - static_cast(dtype), + static_cast(dtype), comm, stream)); }); diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 9a130bea0d3a27..a6314220d5c264 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -129,7 +129,7 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasEvent() { return has_event_; } const gpuEvent_t& GetEvent() { @@ -154,7 +154,7 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Only when this event is triggered, var is generated. 
gpuEvent_t event_; bool has_event_{false}; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index e448f80ae39388..d7714808ff08ac 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -53,7 +53,7 @@ class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -85,12 +85,12 @@ class PullDenseWorker { public: virtual ~PullDenseWorker() {} virtual void Initialize(const TrainerDesc& param); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); } @@ -155,7 +155,7 @@ class PullDenseWorker { float total_batch_num_ = 0; std::unordered_map scope_to_thread_id_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::vector copy_streams_; #endif std::vector places_; @@ -186,7 +186,7 @@ class DeviceWorker { virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} virtual void Schedule(int taskid UNUSED) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) virtual void SetStream(const gpuStream_t stream UNUSED) {} virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif @@ -588,7 +588,7 @@ class HeterCpuWorker : public HogwildWorker { }; #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUWorker : public HogwildWorker { @@ -604,7 +604,7 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } #endif @@ -672,7 +672,7 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuEvent_t event_; gpuStream_t copy_stream_; #endif @@ -718,7 +718,7 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} @@ -845,7 +845,7 @@ class HeterSectionWorker : public DeviceWorker { Scope* GetThreadScope() override { return minibatch_scope_; } // multi-stream - // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) + // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // void SetStream(const gpuStream_t stream) override {} // void SetEvent(const gpuEvent_t event) override {} // #endif diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index c4ef22ebfe82cb..5c920fa3e318f9 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -77,13 +77,13 @@ REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker); REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL|| \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 4c6e19fd964bb1..1e1a02f944f65b 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -96,7 +96,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; device.device_type = kDLGPU; device.device_id = place.device; // NOLINT @@ -108,7 +108,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; device.device_type = kDLCPUPinned; device.device_id = 0; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 659bdcaaf95164..5dee8b04e78b7b 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -50,12 +50,6 @@ if(WITH_HETERPS) SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) - elseif(WITH_MCCL) - musa_library( - ps_gpu_wrapper - SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) - add_subdirectory(heter_ps) endif() else() cc_library( @@ -64,7 +58,7 @@ else() DEPS gloo_wrapper) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) cc_library( nccl_wrapper SRCS nccl_wrapper.cc @@ -83,12 +77,6 @@ if(WITH_BOX_PS) SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps) endif() - if(WITH_MUSA) - musa_library( - box_wrapper - SRCS box_wrapper.cc box_wrapper.cu - DEPS framework_proto lod_tensor box_ps) - endif() else() cc_library( box_wrapper diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 0d1c4aba87dc57..5f46906cf8e823 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -161,11 +161,6 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, values.data(), values.size() * sizeof(float*), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(gpu_values, - values.data(), - values.size() 
* sizeof(float*), - musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, values.data(), @@ -255,10 +250,6 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, slot_num, total_len); hipStreamSynchronize(stream); -#elif defined(PADDLE_WITH_MUSA) - CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>( - origin_keys, total_keys, gpu_len, slot_num, total_len); - musaStreamSynchronize(stream); #else CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>( origin_keys, total_keys, gpu_len, slot_num, total_len); @@ -304,19 +295,6 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, slot_vector_.data(), slot_lengths_lod.size() * sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - musaMemcpyHostToDevice); - musaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - musaMemcpyHostToDevice); - musaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, grad_values.data(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index b3432277805a7e..9853c328cd14e9 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -595,9 +595,6 @@ class BoxWrapper { data->resize(len); #ifdef PADDLE_WITH_HIP hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - data->data(), gpu_data, sizeof(T) * len, musaMemcpyDeviceToHost); #else cudaMemcpy( data->data(), gpu_data, sizeof(T) * len, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 9eb4360e7dd08d..d72e418aadd3ef 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -44,7 +44,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); phi::DenseTensor& total_keys_tensor = keys_tensor[device_id]; @@ -70,15 +70,6 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(gpu_keys, - keys.data(), - keys.size() * sizeof(uint64_t*), - musaMemcpyHostToDevice); - musaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - musaMemcpyHostToDevice); #else cudaMemcpy(gpu_keys, keys.data(), @@ -162,7 +153,7 @@ void BoxWrapper::PushSparseGradCase( PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) int device_id = place.GetDeviceId(); phi::DenseTensor& cached_total_keys_tensor = 
keys_tensor[device_id]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 7ac9e4f7302a66..05433c1014656f 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -784,7 +784,7 @@ void FleetWrapper::PushDenseVarsSync( const uint64_t table_id, const std::vector& var_names) {} -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ (defined PADDLE_WITH_PSLIB) void FleetWrapper::PushDenseVarsAsync( const Scope& scope, @@ -816,9 +816,6 @@ void FleetWrapper::PushDenseVarsAsync( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream)); - musaEventSynchronize(event); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 1284b379c9f20b..fb5cf917292566 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -175,7 +175,7 @@ class FleetWrapper { // Push dense variables to server in async mode // Param: scope, table_id, var_names, scale_datanorm, batch_size // Param: push_sparse_status -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 1dbd675073dd7a..0af67107f0cbc6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -96,18 +96,3 @@ if(WITH_ROCM) SRCS heter_ps.cu DEPS heter_comm) endif() -if(WITH_MUSA) - musa_library( - heter_comm - SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h - hashtable.h - DEPS cub device_context) - musa_test( - test_heter_comm - SRCS feature_value.h - DEPS heter_comm) - musa_library( - heter_ps - SRCS heter_ps.cu - DEPS heter_comm) -endif() \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index b5d788840ee547..3bf395071df274 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -846,7 +846,7 @@ void GraphGpuWrapper::init_service() { inter_comms_.resize(dev_size); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { - platform::dynload::mcclGetUniqueId(&inter_ncclids_[i]); + platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); } } @@ -860,13 +860,13 @@ void GraphGpuWrapper::init_service() { opts.setRoot(0); gloo::broadcast(opts); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (int i = 0; i < dev_size; ++i) { platform::CUDADeviceGuard guard(device_id_mapping[i]); platform::dynload::ncclCommInitRank( &inter_comms_[i], gloo->Size(), inter_ncclids_[i], gloo->Rank()); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); rank_id_ = 
gloo->Rank(); node_size_ = gloo->Size(); diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 4045c615a27cb3..315a9860ed67a2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #ifdef PADDLE_WITH_HETERPS -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include #include @@ -302,9 +302,9 @@ class GraphGpuWrapper { int node_size_ = 1; int multi_node_ = 0; #ifdef PADDLE_WITH_CUDA - std::vector<mcclComm_t> inner_comms_; - std::vector<mcclComm_t> inter_comms_; - std::vector<mcclUniqueId> inter_ncclids_; + std::vector<ncclComm_t> inner_comms_; + std::vector<ncclComm_t> inter_comms_; + std::vector<ncclUniqueId> inter_ncclids_; #endif }; // class GraphGpuWrapper #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index b869ad1c235cb6..18e3966b220c0c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -166,8 +166,8 @@ class HeterComm { size_t len, Sgd& sgd); // NOLINT - void set_nccl_comm_and_size(const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + void set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) { nccl_inner_comms_ = inner_comms; @@ -791,8 +791,8 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) GpuRDMAChecker* rdma_checker_ = nullptr; - std::vector<mcclComm_t> nccl_inner_comms_; - std::vector<mcclComm_t> nccl_inter_comms_; + std::vector<ncclComm_t> nccl_inner_comms_; + std::vector<ncclComm_t> nccl_inter_comms_; int multi_mf_dim_{8}; int max_mf_dim_ = 8; std::vector> allocators_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 3df6e6e89861ff..36fe556bcf3fbd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -2870,7 +2870,7 @@ size_t HeterComm::send_data_by_all2all( auto &loc = storage_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); size_t total_fea_num = 0; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (int i = 0; i < nccl_node_size; i++) { if (i == nccl_rank_id) { continue; } @@ -2881,7 +2881,7 @@ size_t HeterComm::send_data_by_all2all( PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(&d_send_buff[send_offset], send_size * value_bytes, - mcclInt8, + ncclInt8, i, comm, nccl_stream)); @@ -2893,14 +2893,14 @@ size_t HeterComm::send_data_by_all2all( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( reinterpret_cast(&d_rev_buff[recv_offset]), recv_size * value_bytes, - mcclInt8, + ncclInt8, i, comm, nccl_stream)); total_fea_num += recv_size; } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); return total_fea_num; @@ -2959,11 +2959,11 @@ size_t HeterComm:: cache.node_barrier_.Resume(); auto &comm = nccl_inter_comms_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( &res.d_node_size_ptr[rank_offset],
reinterpret_cast(res.d_node_size_ptr), node_size_, - mcclInt, + ncclInt, comm, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); @@ -3780,11 +3780,11 @@ size_t HeterComm:: my_cache.node_barrier_.Resume(); auto &comm = nccl_inter_comms_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( &res.d_node_size_ptr[rank_id_ * node_size_], reinterpret_cast(res.d_node_size_ptr), node_size_, - mcclInt, + ncclInt, comm, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 017e3726357b9a..3fe05753e09a31 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -134,8 +134,8 @@ void HeterPs::push_sparse(int num, template class GPUOptimizer> void HeterPs::set_nccl_comm_and_size( - const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size, rank_id); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index d1c1d0c8b611bb..c472c2ed75a9d6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -49,8 +49,8 @@ class HeterPs : public HeterPsBase { size_t chunk_size, int stream_num) override; #if defined(PADDLE_WITH_CUDA) - void set_nccl_comm_and_size(const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + void set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index b729cdfcbb0f96..8624425d8bfbd2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -46,8 +46,8 @@ class HeterPsBase { virtual int get_index_by_devid(int devid) = 0; #if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( - const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 97b704b4f3d219..a8ce9be92bdf68 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -121,7 +121,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, tensor->numel() * SizeOfType(framework::TransToProtoVarType(tensor->dtype()))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), @@ -141,7 +141,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void
HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, @@ -169,7 +169,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 70cbce2acc24d7..77838fbec6d00e 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -92,7 +92,7 @@ class HeterWrapper { framework::proto::VarType::Type ToVarType(VariableMessage::Type type); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index 8be530c3170ba3..640f7dd08dc8d1 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -21,9 +21,9 @@ std::shared_ptr NCCLWrapper::s_instance_ = NULL; bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclCommInitRank(&(nccl_info_.comm_), + platform::dynload::ncclCommInitRank(&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); @@ -32,16 +32,16 @@ void NCCLWrapper::InitNCCL() { } void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_info_.nccl_id_ = nccl_info.nccl_id_; #endif return; } NCCLInfo NCCLWrapper::GetNCCLId() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclGetUniqueId(&(nccl_info_.nccl_id_))); + platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; } @@ -49,15 +49,13 @@ NCCLInfo NCCLWrapper::GetNCCLId() { void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, const int ranks) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_info_.local_rank_ = local_rank; nccl_info_.my_global_rank_ = global_rank; nccl_info_.global_ranks_ = ranks; platform::SetDeviceId(local_rank); #ifdef PADDLE_WITH_RCCL PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); -#elif defined(PADDLE_WITH_MCCL) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&(nccl_info_.stream_))); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif @@ -68,22 +66,20 @@ void NCCLWrapper::SetRankInfo(const int local_rank, void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, const std::vector& var_names) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if 
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) for (auto& name : var_names) { auto var = scope.FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, - mcclFloat, + ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); #ifdef PADDLE_WITH_RCCL hipStreamSynchronize(nccl_info_.stream_); -#elif defined(PADDLE_WITH_MCCL) - musaStreamSynchronize(nccl_info_.stream_); #else cudaStreamSynchronize(nccl_info_.stream_); #endif diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h index 46cdae20395e91..7e9cc0c56a6b46 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.h +++ b/paddle/fluid/framework/fleet/nccl_wrapper.h @@ -31,10 +31,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif - #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { @@ -55,9 +51,9 @@ class NCCLInfo { int local_rank_; int global_ranks_; int my_global_rank_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclUniqueId nccl_id_; - mcclComm_t comm_; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclUniqueId nccl_id_; + ncclComm_t comm_; gpuStream_t stream_; #endif }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 85fe092e963db2..edfa4048b55287 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -314,7 +314,7 @@ class PSGPUWrapper { inter_comms_.resize(dev_size); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { - platform::dynload::mcclGetUniqueId(&inter_ncclids_[i]); + platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); } } @@ -328,13 +328,13 @@ class PSGPUWrapper { opts.setRoot(0); gloo::broadcast(opts); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (int i = 0; i < dev_size; ++i) { platform::CUDADeviceGuard guard(dev_ids[i]); platform::dynload::ncclCommInitRank( &inter_comms_[i], gloo->Size(), inter_ncclids_[i], gloo->Rank()); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); rank_id_ = gloo->Rank(); node_size_ = gloo->Size(); @@ -979,9 +979,9 @@ class PSGPUWrapper { uint64_t table_id_; int gpu_graph_mode_ = 0; #ifdef PADDLE_WITH_CUDA - std::vector inner_comms_; - std::vector inter_comms_; - std::vector inter_ncclids_; + std::vector inner_comms_; + std::vector inter_comms_; + std::vector inter_ncclids_; #endif std::vector heter_devices_; std::unordered_set gpu_ps_config_keys_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 5f9db8c20d51ff..d0620381ae8e91 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/framework/garbage_collector.h" @@ -64,7 +64,7 @@ void IPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -93,8 +93,6 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_ = @@ -203,7 +201,7 @@ std::unique_ptr CreateGarbageCollector( const platform::Place &place, const size_t max_memory_size) { std::unique_ptr gc = nullptr; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place, max_memory_size); diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f9d94600a513d9..5376739624d6f3 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -85,7 +85,7 @@ class IPUGarbageCollector : public GarbageCollector { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 83dbe31d86a5a8..b98094ab74101c 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif @@ -1202,20 +1202,20 @@ bool HogwildWorker::CheckBatchNum(int flag) { // comm_ctx->AllReduce only support allreduce on the whole tensor, // single element is not supported now. 
PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllReduce(&stat_ptr[flag], + platform::dynload::ncclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - mcclProd, + ncclProd, comm_ctx->GetNcclComm(), stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&stat_ptr[flag], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - mcclProd, + ncclProd, comm->comm(), stream)); } @@ -1246,11 +1246,11 @@ bool HogwildWorker::GetPassEnd(int flag) { // auto stream = static_cast(dev_ctx_)->stream(); // PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto stream = comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&stat_ptr[flag], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - mcclProd, + ncclProd, comm->comm(), stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output @@ -1267,7 +1267,7 @@ bool HogwildWorker::GetPassEnd(int flag) { void HogwildWorker::TrainFilesWithProfiler() { platform::SetNumThreads(1); #if defined(PADDLE_WITH_HETERPS) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); @@ -1473,7 +1473,7 @@ void HogwildWorker::TrainFiles() { platform::Timer timeline; timeline.Start(); #if defined(PADDLE_WITH_HETERPS) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d0c11c3098ddb2..46183fd93e97fd 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) + AND (WITH_GPU OR WITH_ROCM)) add_subdirectory(fusion_group) endif() @@ -169,7 +169,7 @@ if(WITH_TENSORRT) pass_library(trt_remove_amp_strategy_op_pass inference) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) pass_library(cudnn_placement_pass base DEPS placement_pass_base) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() @@ -493,7 +493,7 @@ cc_test( SRCS relu6_fuse_pass_test.cc DEPS relu6_fuse_pass) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) cc_test( test_embedding_eltwise_layernorm_fuse_pass SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc @@ -543,7 +543,7 @@ if(WITH_MKLDNN) device_context phi common) - if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) + if(WITH_GPU OR WITH_ROCM) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) endif() cc_test( diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index a28930961efa0e..a54138060283bc 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -128,7 +128,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( main_thread_events[op_pop_index]); double gpu_time_ms = 0; -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( main_thread_events[op_pop_index]); #endif @@ -152,7 +152,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( main_thread_events[stop_profiler_idx]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( main_thread_events[stop_profiler_idx]); #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index e0a9502c685d25..048b33a649f94d 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -34,8 +34,8 @@ namespace framework { namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 36fa8a3331e7e1..2a24c5476a5010 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -25,8 +25,8 @@ namespace framework { namespace ir { void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAddAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index 390dd25b9cf5dd..570b081aae95ed 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library( code_generator SRCS operation.cc code_generator.cc code_generator_helper.cc DEPS graph subgraph_detector) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) cc_test( test_code_generator SRCS code_generator_tester.cc diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 92c1c1c6f02077..9749fb2bfa81c5 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -27,7 +27,7 @@ namespace phi { class DenseTensor; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h index 
232e9bbf43607f..195b29a9794a9a 100644 --- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h +++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h @@ -34,7 +34,7 @@ __device__ inline double Log(double x) { return log(x); } __device__ inline double Sqrt(double x) { return sqrt(x); } )"; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP static constexpr char predefined_cuda_functions_fp16[] = R"( __device__ inline __half Exp(const __half x) { return hexp(x); } __device__ inline __half Log(const __half x) { return hlog(x); } diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 17910d7dfae80b..30a001777bd587 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_utils.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -513,7 +513,7 @@ static OpDesc *ReplaceScaleLossGradOp(const Node &node, OpDesc *desc) { void ReplaceAllReduceOp(const Node &node, proto::BlockDesc *block, std::vector *ops) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) bool is_fused = (node.Name() == "fused_all_reduce"); details::OpHandleBase &op_handle = @@ -688,7 +688,7 @@ static void GetGraphOpDesc(const std::vector &nodes, ops->emplace_back(); auto &desc = ops->back(); ReplaceScaleLossGradOp(*n, &desc); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) } else if ((n->Name() == "allreduce" || n->Name() == "fused_all_reduce") && dynamic_cast( &(n->Wrapper())) != nullptr) { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index c2a8c1bc73e8ea..9c60a665de0021 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -204,7 +204,7 @@ TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { {}); std::vector use_cuda_list{false}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) use_cuda_list.push_back(true); #endif for (auto use_cuda : use_cuda_list) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc index 4579e172ef665e..0dcf316c33c696 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc @@ -30,7 +30,7 @@ class AllReduceDepsPass : public ir::Pass { std::vector all_reduce_op_handles = GetSortedAllReduceOps(*graph); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) auto use_hierarchical_allreduce = Get(details::kUseHierarchicalAllReduce); for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index a24fd784bb4088..dc18979260f928 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -37,7 +37,7 @@ class FuseAllReduceOpPass : public ir::Pass { auto &places = Get>(details::kPlaces); auto &local_scopes = Get>(details::kLocalScopes); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *multi_nccl_ctxs = &Get(details::kNCCLCtxs); #elif defined(PADDLE_WITH_XPU_BKCL) @@ -95,7 +95,7 @@ class FuseAllReduceOpPass : public ir::Pass { for (auto &p_g : group_p_g) { group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second)); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) InsertFusedAllReduce(places, local_scopes, group_size, @@ -177,7 +177,7 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, const size_t num_of_all_reduce, const std::vector &all_reduce_ops, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLCommunicator *multi_bkcl_ctxs, @@ -244,7 +244,7 @@ class FuseAllReduceOpPass : public ir::Pass { result->RemoveNode(op_handle.Node()); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, @@ -285,7 +285,7 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, bool is_grad_merge, const std::string &grad_merge_cond_name, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLCommunicator *multi_bkcl_ctxs, @@ -293,7 +293,7 @@ class FuseAllReduceOpPass : public ir::Pass { ir::Graph *result) const { details::FusedAllReduceOpHandle *op_handle = nullptr; if (is_grad_merge) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) op_handle = new details::FusedGradMergeAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -321,7 +321,7 @@ class FuseAllReduceOpPass : public ir::Pass { grad_merge_cond_name); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -355,7 +355,7 @@ class FuseAllReduceOpPass : public ir::Pass { op_handle->AddOutput(out); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) if (!multi_nccl_ctxs) { SetCommunicationContext(places, op_handle); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 9e7b22b8930cca..295ef57cfdfead 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -170,7 +170,7 @@ void MultiDevSSAGraphBuilderBase::Init() const { places_ = Get>(details::kPlaces); local_scopes_ = Get>(details::kLocalScopes); strategy_ = Get(kStrategy); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) multi_nccl_ctxs_ = &Get(details::kNCCLCtxs); nccl_ctxs_ = nullptr; if (multi_nccl_ctxs_) { @@ -338,7 +338,7 @@ std::vector MultiDevSSAGraphBuilderBase::SortOperations( bool MultiDevSSAGraphBuilderBase::UseGPU() const { bool use_gpu = false; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) use_gpu = nccl_ctxs_ != nullptr; #endif return use_gpu; @@ -389,7 +389,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, void MultiDevSSAGraphBuilderBase::SetCommunicationContext( details::OpHandleBase *op_handle, const platform::Place &p) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -408,7 +408,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext( void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *op_handle = new details::BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, @@ -453,7 +453,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *op_handle = new details::FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, @@ -534,7 +534,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, -> details::OpHandleBase * { if (is_encoded) { #if defined(PADDLE_WITH_DGC) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) result->Get(kGraphOps).emplace_back( new details::SparseAllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -553,7 +553,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, grad_merge_cond_name = PADDLE_GET_CONST( std::string, node->Op()->GetAttr(GRAD_MERGE_COND_NAME)); VLOG(10) << "og=" << og << " use grad_merge_allreduce"; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) result->Get(kGraphOps).emplace_back( new details::GradMergeAllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -578,7 +578,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, grad_merge_cond_name)); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -718,7 +718,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps( details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( ir::Graph *result, const std::string &og, size_t dst_dev_id) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index 397922ad4bc88a..9e8fb5202a2d57 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -39,7 +39,7 @@ class Graph; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLCommunicator; class NCCLContextMap; #elif defined(PADDLE_WITH_XPU_BKCL) @@ -126,7 +126,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { void CreateIsolatedVarNode(ir::Graph *result, ir::Node *var_node) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) mutable platform::NCCLContextMap *nccl_ctxs_{nullptr}; mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr}; #elif defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index 976cd32e8ae515..debc3be7a32e00 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -34,7 +34,7 @@ #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/pir/core/block_argument.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -105,7 +105,7 @@ platform::DeviceContext* ParseDeviceContext( return dev_ctx; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum // with use_cal_stream==false by returning a device context getting from the // global NCCLCommContext instance. 
Because when use_calc_stream==false, in @@ -338,7 +338,7 @@ bool GetCondData(const phi::DenseTensor& cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) paddle::framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index a7434ad9d41819..8383b1fdd1790c 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -53,7 +53,7 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, processor_count = static_cast(std::thread::hardware_concurrency()); if (processor_count) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) device_count = phi::backends::gpu::GetGPUDeviceCount(); #endif } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 491370d4198fbf..46b9247728d63e 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -749,7 +749,7 @@ void BuildOpFuncList(const platform::Place& place, *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); auto expected_kernel_key = framework::TransPhiKernelKeyToOpKernelType( op_with_kernel->GetExpectedKernelType(exec_ctx)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (op_with_kernel->CanCUDNNBeUsed(exec_ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index bc273000e626f5..5b60205fbc529f 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -229,7 +229,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( return dev_ctx; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum // with use_cal_stream==false by returning a device context getting from the // global NCCLCommContext instance. 
Because when use_calc_stream==false, in diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index f6a5ed407c3f34..ff5832ba8335e6 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -48,7 +48,7 @@ PD_DECLARE_bool(benchmark); PHI_DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_pir_in_executor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -121,7 +121,7 @@ class InterpreterBaseImpl { inline void SetDeviceId(const platform::Place& place) { // TODO(zhiqiu): reduce the cost if (platform::is_gpu_place(place)) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::Unavailable( "Cannot run operator on place %s, please recompile paddle or " "reinstall Paddle with CUDA support.", diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index ee7587140b9234..a336e2c377dfd1 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -314,7 +314,7 @@ void Instruction::AddInplace(Variable* in, Variable* out) { void Instruction::ClearInplace() { vec_inplace_in_to_out_.clear(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void Instruction::UpdataRecordStreamForGcInfo() { if (!IsInterpretercoreFastGCEnabled() || KernelType() != OpFuncType::kGpuAsync) { @@ -328,7 +328,7 @@ void Instruction::UpdataRecordStreamForGcInfo() { stream_ = reinterpret_cast(DeviceContext()).stream(); // TODO(lizhiyu): Only analyse the 'send_v2' for GPT pp strategy right now. // To support all the operators for communicating in the future. 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto operator_base_ptr = OpBase(); if ((operator_base_ptr->Type() == "send_v2") && (operator_base_ptr->Attr("use_calc_stream") == false)) { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 6e96c0e5c109fa..66773746deb274 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -26,7 +26,7 @@ #include "paddle/fluid/platform/event.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/utils/rw_lock.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -306,7 +306,7 @@ class Instruction { const OpFuncNode* OpFunc() const { return &op_func_node_; } // record stream for gc -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool need_record_stream_for_gc_ = false; gpuStream_t stream_{nullptr}; void UpdataRecordStreamForGcInfo(); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index fe64b51464214c..66de40585130b5 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -64,7 +64,7 @@ #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/dialect/control_flow/ir/cf_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -857,7 +857,7 @@ void PirInterpreter::RecordMemcpyD2H(InstructionBase* instr_node) { } void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::Unimplemented( "RecordStreamForGC is only implemented when compiled with GPU.")); #else @@ -876,7 +876,7 @@ void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { reinterpret_cast(instr->DeviceContext()).stream(); // TODO(lizhiyu): Only analyse the 'send_v2' for GPT pp strategy right now. // To support all the operators for communicating in the future. 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (instr->Name() == "pd_op.send_v2") { ::pir::Operation* op = instr->Operation(); if (op->HasAttribute("use_calc_stream") && @@ -998,7 +998,7 @@ void PirInterpreter::CheckGC(InstructionBase* instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RecordStreamForGC(instr); #endif @@ -1619,7 +1619,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { if (FLAGS_benchmark) { instr_node->DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << instr_node->Name() // NOLINT << "): context wait and get last error"; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index f2fa9fd50eedbb..95eee77d362883 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -42,7 +42,7 @@ class ProfilerGuard { private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index f0aefb94e6b691..d1ce9f55e46901 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -32,7 +32,7 @@ #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/device_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, PrepareForCUDAGraphCapture(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculate_stream_timer_ = std::make_unique(place); #endif } @@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time = 0, end_time = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculate_stream_timer_->StartTime(); end_time = calculate_stream_timer_->EndTime(); #endif @@ -701,7 +701,7 @@ void ProgramInterpreter::Convert( #endif vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
vec_instruction_.back().UpdataRecordStreamForGcInfo(); #endif } @@ -973,7 +973,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { 1, platform::EventRole::kInnerOp); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (is_in_op_profiling_mode_) { platform::GpuDeviceSync(); } @@ -1009,7 +1009,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { OperatorDistAttr* op_dist_attr = block_.Op(op->Id())->MutableDistAttr(); platform::Timer op_timer; op_timer.Start(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::GpuDeviceSync(); #endif op_timer.Pause(); @@ -1040,7 +1040,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() // NOLINT << "): context wait and get last error"; @@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" && !interpreter::IsCommunicationOp(instr_node)) { @@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { } instr_node.RecordEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { if (instr_node.Id() == last_calculate_instr_id_ && calculate_stream_timer_->IsStarted()) { @@ -1320,7 +1320,7 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { } void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::Unimplemented( "RecordStreamForGC is only implemented when compiled with GPU.")); #else @@ -1428,7 +1428,7 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { void ProgramInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (instr.need_record_stream_for_gc_) { RecordStreamForGC(instr); } diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 701da4f9473599..b19e3a06a42588 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include 
"paddle/phi/kernels/autotune/gpu_timer.h" #endif @@ -234,7 +234,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector output_hookfuncs_; std::vector input_hookfuncs_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unique_ptr calculate_stream_timer_; #endif size_t last_calculate_instr_id_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index f4a5f6d410eae0..84ee045918fcd7 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -359,7 +359,7 @@ struct OpKernelRegistrarFunctorExCanCUDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN; @@ -1567,12 +1567,12 @@ bool OperatorWithKernel::CanCUDNNBeUsed(const framework::ExecutionContext& ctx, bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn") && paddle::platform::is_gpu_place(ctx.GetPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (use_cudnn) { auto& dev_ctx = ctx.device_context(); use_cudnn &= (dev_ctx.cudnn_handle() != nullptr); } -#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP || defined(PADDLE_WITH_MUSA) +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP #if defined(PADDLE_WITH_CUDA) if (use_cudnn && data_type == phi::DataType::BFLOAT16) { @@ -1808,7 +1808,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (this->CanCUDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kCUDNN; } @@ -2071,7 +2071,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, /*For profiling/benchmark only*/ if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM) || defined(PADLDE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); #endif VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; @@ -2134,7 +2134,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (this->CanCUDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; } @@ -2157,7 +2157,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( // CPUKernel will be executed and a warning will be given at the same // time. 
expected_kernel_key.place_ = platform::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (SupportGPU()) { auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index f8943d53f15909..d51c0ce0f415d0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -584,7 +584,7 @@ class ExecutionContext : public phi::KernelContext { return device_context_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index cef7e14a2a1b89..e6c11df275b569 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -41,14 +41,14 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -69,7 +69,7 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::once_flag p2p_init_flag; #endif @@ -148,7 +148,7 @@ class ParallelExecutorPrivate { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ << ", num_trainers:" << bst.num_trainers_ @@ -162,7 +162,7 @@ class ParallelExecutorPrivate { << bst.hierarchical_allreduce_exter_nranks_; } - std::vector flat_nccl_ids; + std::vector flat_nccl_ids; if (nranks_ == 1) { // FIXME(gongwb): need not to create ncclid when nranks==1 nccl_ctxs_->InitFlatCtxs( @@ -173,18 +173,18 @@ class ParallelExecutorPrivate { if (bst.enable_parallel_graph_) { VLOG(1) << "use only one ncclid in pg model"; - mcclUniqueId *nccl_id = nullptr; + ncclUniqueId *nccl_id = nullptr; std::string var_name = platform::GetFlatNCCLVarName(0); auto nccl_id_var = scope->FindVar(var_name); if (nccl_id_var) { - nccl_id = nccl_id_var->GetMutable(); + nccl_id = nccl_id_var->GetMutable(); VLOG(10) << "find nccl_id_var:" << var_name << ", nccl_id:" << nccl_id; } else { - nccl_id = new mcclUniqueId(); + nccl_id = new ncclUniqueId(); PADDLE_ENFORCE_EQ( - platform::dynload::mcclGetUniqueId(nccl_id), - mcclSuccess, + platform::dynload::ncclGetUniqueId(nccl_id), + ncclSuccess, platform::errors::PreconditionNotMet( "PaddlePaddle failed to get NCCL unique ID. 
It may due to your " "system settings or NCCL library error, please debug on NCCL")); @@ -213,7 +213,7 @@ class ParallelExecutorPrivate { PADDLE_ENFORCE_NOT_NULL( nccl_id_var, platform::errors::NotFound("Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); + auto nccl_id = nccl_id_var->GetMutable(); flat_nccl_ids.push_back(nccl_id); } @@ -221,25 +221,25 @@ class ParallelExecutorPrivate { places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); if (bst.use_hierarchical_allreduce_) { - std::vector inter_nccl_ids; + std::vector inter_nccl_ids; for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { std::string var_name = platform::GetHierarchicalInterNCCLVarName(i); auto nccl_id_var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(nccl_id_var, platform::errors::NotFound( "Can't find nccl_id_var '%s'.", var_name)); - auto inter_nccl_id = nccl_id_var->GetMutable(); + auto inter_nccl_id = nccl_id_var->GetMutable(); inter_nccl_ids.push_back(inter_nccl_id); } - std::vector exter_nccl_ids; + std::vector exter_nccl_ids; for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { std::string var_name = platform::GetHierarchicalExterNCCLVarName(i); auto nccl_id_var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(nccl_id_var, platform::errors::NotFound( "Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); + auto nccl_id = nccl_id_var->GetMutable(); exter_nccl_ids.push_back(nccl_id); } @@ -400,7 +400,7 @@ class ParallelExecutorPrivate { std::unordered_map is_persistable_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nccl_ctxs_{nullptr}; #elif defined(PADDLE_WITH_XPU_BKCL) platform::BKCLCommunicator *bkcl_ctxs_{nullptr}; @@ -512,7 +512,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place, max_memory_size); @@ -623,7 +623,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -644,10 +644,6 @@ void InitP2P(const std::vector &places) { hipError_t ret = hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != hipSuccess || can_acess != 1) { -#elif defined(PADDLE_WITH_MUSA) - musaError_t ret = - musaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); - if (ret != musaSuccess || can_acess != 1) { #else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); @@ -659,8 +655,6 @@ void InitP2P(const std::vector &places) { platform::CUDADeviceGuard guard(devices[i]); #ifdef PADDLE_WITH_HIP hipDeviceEnablePeerAccess(devices[j], 0); -#elif defined(PADDLE_WITH_MUSA) - musaDeviceEnablePeerAccess(devices[j], 0); #else cudaDeviceEnablePeerAccess(devices[j], 0); #endif @@ -813,12 +807,12 @@ void ParallelExecutor::BCastParamsToDevices( } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || 
defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector buffers; buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); - mcclDataType_t data_type = platform::ToNCCLDataType(dtype); + ncclDataType_t data_type = platform::ToNCCLDataType(dtype); for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; @@ -846,7 +840,7 @@ void ParallelExecutor::BCastParamsToDevices( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); - platform::dynload::mcclBcast(buffers[i], + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, @@ -1288,7 +1282,7 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1297,8 +1291,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1456,7 +1450,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { } if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); // Initialize device context's nccl comm, will be used by normal @@ -1507,7 +1501,7 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( std::vector async_graphs(device_count); auto &graphs = *device_graphs; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (member_->build_strategy_.async_mode_) { PADDLE_ENFORCE_EQ(graphs.size(), device_count, @@ -1662,7 +1656,7 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( final_graphs = *async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 48cd609d798e3d..32514089763c6e 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 0b77e80a0b4658..cc5cf54724dabe 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -134,7 +134,7 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (kernel_key.backend() == phi::Backend::GPU || kernel_key.backend() == phi::Backend::GPUDNN) { PADDLE_THROW( diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index e37957918fe401..d1eb5558c54541 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> struct ConvertToPhiContext { using TYPE = phi::GPUContext; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 827e39c152640e..4566927e068ca6 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +34,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); -#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)|| (defined PADDLE_WITH_MCCL) +#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL) place_ = platform::CUDAPlace(place_id); #endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 472eb5ef9b42f8..4b629c24cf0e64 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL ||defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index f1cc62bbfd3041..85fc30978f16a4 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/lodtensor_printer.h" #include "paddle/fluid/string/string_helper.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA @@ -286,7 +286,7 @@ void PSGPUWorker::TrainFiles() { timeline.Start(); int total_ins_num = 0; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); @@ -511,7 +511,7 @@ void PSGPUWorker::TrainFilesWithProfiler() { int total_ins_num = 0; int cur_batch; timeline.Start(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 8b740ea6156e20..f295fa7106dd43 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -69,11 +69,11 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) copy_streams_.clear(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif @@ -81,7 +81,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { void PullDenseWorker::CreatePinVar() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -95,7 +95,7 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); phi::DenseTensor* pin_tensor = ptr->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -125,7 +125,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { } status_vec->resize(0); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -141,7 +141,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), @@ -177,7 +177,7 @@ void PullDenseWorker::PullDense(bool force_update) { 
dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 9f347ca4c01264..f88dbc409d1704 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include #include "paddle/fluid/framework/device_worker.h" @@ -228,7 +228,7 @@ void SectionWorker::TrainFiles() { int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place_, diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 01267fd059c1f7..27dc5902c75ba3 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -125,7 +125,7 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -379,7 +379,7 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -482,7 +482,7 @@ void TensorToStream(std::ostream& os, platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); @@ -616,7 +616,7 @@ void TensorFromStream(std::istream& is, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(common::make_ddim(shape)); @@ -690,7 +690,7 @@ void 
TensorFromStream(std::istream& is, platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(common::make_ddim(dims)); framework::VisitDataType( @@ -812,7 +812,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(dl_tensor.device.device_id); @@ -852,7 +852,7 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (src->dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(src->dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index c4d9b9c143009a..d9e3e384337366 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -129,7 +129,7 @@ void TensorFromArray(const T* src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -175,7 +175,7 @@ void TensorFromVector(const std::vector& src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -304,7 +304,7 @@ void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -346,7 +346,7 @@ inline void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 75268cb5aea275..af7fc63a2122a8 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -159,7 +159,7 @@ class DistMultiTrainer : public MultiTrainer { 
std::shared_ptr pull_dense_worker_; }; -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined(PADDLE_WITH_MUSA)|| \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS)) class HeterServiceContext { @@ -175,7 +175,7 @@ class HeterServiceContext { int place_num_; Scope* scope_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuEvent_t event_; #endif std::vector ops_; @@ -207,7 +207,7 @@ class HeterXpuTrainer : public TrainerBase { virtual std::string GetDumpPath(int tid) { return ""; } virtual void InitDumpEnv() {} template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void HeterMemCpy(phi::DenseTensor* tensor, phi::DenseTensor* root_tensor, const paddle::platform::Place& thread_place, @@ -245,7 +245,7 @@ class HeterXpuTrainer : public TrainerBase { std::vector place_scopes_; BtObjectPool object_pool_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::vector copy_streams_; std::vector events_; #endif @@ -253,7 +253,7 @@ class HeterXpuTrainer : public TrainerBase { #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUTrainer : public TrainerBase { @@ -305,7 +305,7 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index aeb033649509fd..ba5dac4830aa18 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -72,17 +72,17 @@ REGISTER_TRAINER_CLASS(DistMultiTrainer); REGISTER_TRAINER_CLASS(HeterPipelineTrainer); #endif -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA || \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS)) REGISTER_TRAINER_CLASS(HeterXpuTrainer); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 42471cceb30252..c1f192673a7022 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -37,13 +37,6 @@ #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif -#ifdef PADDLE_WITH_MUSA -#if defined(PADDLE_WITH_MCCL) -#include 
"paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT -#endif -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 61790dc36e912e..9bffd125a3f3da 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -34,14 +34,6 @@ #include #endif #endif - -#ifdef PADDLE_WITH_MUSA -#include -#if defined(PADDLE_WITH_MCCL) -#include -#endif -#endif - #ifdef PADDLE_WITH_HIP #include #ifdef PADDLE_WITH_RCCL @@ -68,8 +60,8 @@ class SparseCsrTensor; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class Communicator; class NCCLCommunicator; #endif @@ -198,13 +190,13 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclUniqueId, +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, #endif - // operators::CudnnRNNCache, + operators::CudnnRNNCache, #endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index ebf1fd4141ace0..b6d846e9a0c12d 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -97,7 +97,7 @@ cc_library( SRCS profiler.cc DEPS phi common) if(NOT WIN32) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) cc_library( imperative_all_reduce SRCS all_reduce.cc @@ -119,12 +119,6 @@ if(NOT WIN32) SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) endif() - if(WITH_MCCL) - musa_library( - reducer - SRCS reducer.cc reducer.cu - DEPS layer imperative_all_reduce) - endif() endif() if(WITH_XPU_BKCL) cc_library( @@ -144,7 +138,6 @@ if(NOT WIN32) if(NOT (WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL OR WITH_GLOO)) cc_library( @@ -155,7 +148,6 @@ if(NOT WIN32) endif() if(WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL OR WITH_CUSTOM_DEVICE) cc_library( @@ -177,7 +169,6 @@ if(WITH_GLOO) OR (NOT (WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL) )) cc_library( diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 5436364e56f7fd..c4bb42e4c22bb4 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" @@ -26,11 +26,6 @@ #include #endif -#ifdef PADDLE_WITH_MCCL -#include -#endif - - #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" @@ -74,16 +69,16 @@ static void AllReduce(const phi::DenseTensor &src, auto *dst_ptr = dst->mutable_data(src.place(), src.dtype()); auto nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(src.dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(src_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(src_ptr, dst_ptr, src.numel(), nccl_dtype, - mcclSum, + ncclSum, comm->comm(), stream)); } -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, const ParallelStrategy &strategy, @@ -106,7 +101,7 @@ static void AllReduce(const phi::SelectedRows &src, bool use_calc_stream = (dev_ctx->stream() == stream); VLOG(4) << "Is use calculate stream: " << use_calc_stream; - // 1. Gather rows number from all workers. Here use mcclAllGather to do this, + // 1. Gather rows number from all workers. Here use ncclAllGather to do this, // but we can use other ways to implement is in the future const auto &src_rows = src.rows(); phi::Vector rows_num_vector(strategy.nranks_); @@ -119,10 +114,10 @@ static void AllReduce(const phi::SelectedRows &src, dev_ctx->Wait(); } PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(gpu_rows_num_ptr + strategy.local_rank_, + platform::dynload::ncclAllGather(gpu_rows_num_ptr + strategy.local_rank_, gpu_rows_num_ptr, 1, - mcclInt64, + ncclInt64, comm->comm(), stream)); @@ -168,14 +163,14 @@ static void AllReduce(const phi::SelectedRows &src, // allgather is used to speed up the allreduce by replacing broadcast. auto row_sendcount = cpu_rows_num_ptr[0]; VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather(src_rows_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(src_rows_ptr, dst_rows_ptr, row_sendcount, - mcclInt64, + ncclInt64, comm->comm(), stream)); auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather(src_tensor_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, @@ -186,10 +181,10 @@ static void AllReduce(const phi::SelectedRows &src, if (cpu_rows_num_ptr[i] > 0) { // 2. 
Broadcast the rows of SelectedRows PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBroadcast(src_rows_ptr, + platform::dynload::ncclBroadcast(src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], - mcclInt64, + ncclInt64, i, comm->comm(), stream)); @@ -197,7 +192,7 @@ static void AllReduce(const phi::SelectedRows &src, auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + row_offset * feature_size * sizeof_dtype; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBroadcast(src_tensor_ptr, + platform::dynload::ncclBroadcast(src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, nccl_dtype, @@ -217,7 +212,7 @@ static void AllReduce(const phi::SelectedRows &src, VLOG(3) << "Result SelectedRows rows: " << string::join_strings(*dst_rows, ','); } -// #endif +#endif void AllReduce(const framework::Variable &src, framework::Variable *dst, @@ -239,7 +234,7 @@ void AllReduce(const framework::Variable &src, dst->GetMutable(), stream, comm); -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 } else if (src.IsType()) { if (&src != dst) { if (!dst->IsType()) { @@ -262,7 +257,7 @@ void AllReduce(const framework::Variable &src, platform::GpuStreamSync(stream); *dst = std::move(tmp_dst); } -// #endif +#endif } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported variable type %s for imperative allreduce, only " diff --git a/paddle/fluid/imperative/all_reduce.h b/paddle/fluid/imperative/all_reduce.h index 049345772de65a..49e30549242052 100644 --- a/paddle/fluid/imperative/all_reduce.h +++ b/paddle/fluid/imperative/all_reduce.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index dfb231ead927ee..0c16a950358706 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -129,7 +129,7 @@ AmpOperators::AmpOperators() block_ops_(new std::unordered_set()), unsupported_fp16_ops_(new std::unordered_set()), unsupported_bf16_ops_(new std::unordered_set()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto unsupported_ops_gpu_fp16 = std::get<2>( OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_gpu_fp16.begin(), diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 58ecec47cccf39..4e0df45e840f25 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -141,7 +141,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, const auto &src_tensor = src.value(); const auto &place = src_tensor.place(); auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); - // 1. Gather rows number from all workers. Here use mcclAllGather to do this, + // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, // but we can use other ways to implement is in the future auto &src_rows = src.rows(); auto gloo_wrapper = framework::GlooWrapper::GetInstance(); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 61bb0a1d7c14e8..267540f0807413 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -209,7 +209,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_TENSOR_ADD(float, phi::GPUContext); PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -326,7 +326,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -334,7 +334,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -381,7 +381,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -389,7 +389,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -447,7 +447,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -463,7 +463,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -734,7 +734,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -800,7 +800,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 13a3d356e61c5b..d70d40808f915d 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/imperative/nccl_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -41,10 +41,10 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void NCCLParallelContext::BcastNCCLId( - std::vector &nccl_ids, // NOLINT + std::vector &nccl_ids, // NOLINT int root, int server_fd) { if (strategy_.local_rank_ == root) { @@ -64,13 +64,13 @@ void NCCLParallelContext::BcastNCCLId( void NCCLParallelContext::Init() { int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker for (auto &nccl_id : nccl_ids) { - platform::dynload::mcclGetUniqueId(&nccl_id); + platform::dynload::ncclGetUniqueId(&nccl_id); } } else { // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server @@ -101,12 +101,12 @@ void NCCLParallelContext::Init() { void NCCLParallelContext::InitWithRingID(int ring_id) { int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(1); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker - platform::dynload::mcclGetUniqueId(&nccl_ids[0]); + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); } else { // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server // on rank0. 
@@ -152,7 +152,7 @@ void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { void *src_ptr = src_tensor->data(); auto nccl_dtype = platform::ToNCCLDataType( framework::TransToProtoVarType(src_tensor->dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); } @@ -188,9 +188,6 @@ void NCCLParallelContext::WaitCompute(int ring_id) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); @@ -221,9 +218,6 @@ void NCCLParallelContext::WaitComm(int ring_id) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index f71c57af3f4f6d..7db96b2ee3d486 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -17,7 +17,7 @@ #include #include -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -29,10 +29,6 @@ #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif - #include "paddle/fluid/imperative/parallel_context.h" namespace paddle { @@ -44,7 +40,7 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLParallelContext : public ParallelContext { public: explicit NCCLParallelContext(const ParallelStrategy& strategy, @@ -53,7 +49,7 @@ class NCCLParallelContext : public ParallelContext { ~NCCLParallelContext() override = default; - void BcastNCCLId(std::vector& nccl_ids, + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT int server_fd); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 1545eb0bd6e68d..d336488a42327c 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -205,7 +205,7 @@ PreparedOp PrepareImpl( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (op.CanCUDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { expected_kernel_key.set_backend(phi::Backend::GPUDNN); } @@ -555,7 +555,7 @@ static void PreparedOpRunImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif @@ -645,7 +645,7 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index ef63b4a1b62d32..4bbc52662fc96e 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -29,7 +29,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) // div the nranks @@ -40,7 +40,7 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { : dense_contents_.GetMutable(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DivNRanks(tensor, nranks, context); #endif } else if (platform::is_cpu_place(tensor->place())) { @@ -228,7 +228,7 @@ void SplitTensorsWithType( void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ConcatTensorsWithType(static_cast(context), dense_tensors_, &dense_contents_, @@ -264,7 +264,7 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) SplitTensorsWithType(static_cast(context), &dense_contents_, &dense_tensors_, @@ -1020,7 +1020,7 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_GLOO) ProcessUnusedDenseVars(); #endif diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index 5d89f487bc379f..59b7ecf9154230 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -17,7 +17,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void Group::DivNRanks(phi::DenseTensor *tensor, int64_t nranks, const platform::DeviceContext &context) { diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 9a6e1de71fe9d2..011c8871329a55 100644 --- a/paddle/fluid/imperative/reducer.h +++ 
b/paddle/fluid/imperative/reducer.h @@ -44,7 +44,7 @@ class VariableWrapper; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index d01fefc7795943..0f992c9b8be309 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -137,7 +137,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc = std::make_unique(place, 0); VLOG(10) << "Created GarbageCollector at " << place; @@ -147,7 +147,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc = std::make_unique(place, 0); VLOG(10) << "Created GarbageCollector at " << place; @@ -309,7 +309,7 @@ void Tracer::TraceOpImpl(const std::string& type, try { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 3f4e7a9344a30c..d2f834a5938e96 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) get_property(ir_targets GLOBAL PROPERTY IR_TARGETS) get_property(not_infer_modules GLOBAL PROPERTY NOT_INFER_MODULES) -set(utils_modules pretty_log string_helper benchmark utf8proc) +set(utils_modules pretty_log string_helper utf8proc) if(NOT WITH_GFLAGS) set(utils_modules ${utils_modules} paddle_flags) diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 302bc160c99387..221e6b7de1abfe 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -38,7 +38,7 @@ namespace paddle { namespace inference { namespace analysis { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. 
if (!argument->use_gpu()) return; @@ -215,7 +215,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index 6ab7d83b8922d2..ee29af1c13308b 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -32,7 +32,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index d9d7d5aa3659ad..94e71f1cfddf16 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -100,7 +100,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id, Precision precision_mode) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; @@ -180,6 +180,11 @@ void AnalysisConfig::EnableXpu(int l3_size, bool transformer_encoder_adaptive_seqlen, bool enable_multi_stream) { #if defined(PADDLE_WITH_XPU) || defined(LITE_SUBGRAPH_WITH_XPU) + LOG_FIRST_N(WARNING, 1) + << "Parameters in EnableXpu/enable_xpu is deprecated since version " + "2.6.1, and will be removed in version 3.0! 
Please use " + "EnableXpu/enable_xpu without parameters, and use " + "SetXpuConfig/set_xpu_config to set options."; use_xpu_ = true; xpu_config_.l3_size = l3_size; xpu_config_.conv_autotune_level = conv_autotune; @@ -636,7 +641,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { } void AnalysisConfig::EnableCUDNN() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) use_cudnn_ = use_gpu_; #else LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; @@ -991,7 +996,7 @@ void AnalysisConfig::Update() { } if (use_gpu() && use_cudnn_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; } else { @@ -1207,7 +1212,7 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads( } float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. size_t gpu_total, gpu_available; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b8d95d712bdd82..476c78638c47fc 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -120,7 +120,7 @@ PHI_DECLARE_bool(pir_apply_inplace_pass); namespace paddle { namespace { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, GPUContextResource *gpu_resource, Place place_) { @@ -152,7 +152,7 @@ void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, gpu_context->SetBlasTF32Handle( gpu_resource->GetBlasTF32TensorCoreHandleCreator()); gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator()); - // gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); + gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator()); gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice()); @@ -292,7 +292,7 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; @@ -424,7 +424,7 @@ bool AnalysisPredictor::Init( return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(inference): Now only gpu with external stream support private // device_context. 
if (config_.use_gpu_ && config_.use_external_stream_) { @@ -472,7 +472,7 @@ void AnalysisPredictor::InitPlace() { platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (config_.thread_local_stream_enabled()) { LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " "Please use config.SetExecStream instead."; @@ -543,14 +543,14 @@ void AnalysisPredictor::InitPlace() { } void AnalysisPredictor::InitResourceManager(void *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Init GPUContext. if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( @@ -598,7 +598,7 @@ void AnalysisPredictor::InitDeviceContexts() { } void *AnalysisPredictor::GetExecStream() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place_.GetType() == phi::AllocationType::GPU) { if (private_context_) { return predictor_stream_; @@ -2315,7 +2315,7 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (!private_context_) { PADDLE_THROW(platform::errors::Fatal( @@ -2326,8 +2326,6 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (stream != predictor_stream_) { #ifdef PADDLE_WITH_HIP hipStreamSynchronize(static_cast(predictor_stream_)); -#elif defined(PADDLE_WITH_MUSA) - musaStreamSynchronize(static_cast(predictor_stream_)); #else cudaStreamSynchronize(static_cast(predictor_stream_)); #endif @@ -2367,13 +2365,11 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(stream); -#elif defined(PADDLE_WITH_MUSA) - musaStreamSynchronize(stream); #else cudaStreamSynchronize(stream); #endif @@ -2768,7 +2764,7 @@ AnalysisPredictor::~AnalysisPredictor() { // NOLINT if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } @@ -3334,15 +3330,6 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, return false; } -bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, - musaStream_t stream) { -#ifdef PADDLE_WITH_MUSA - auto pred 
= dynamic_cast(p->predictor_.get()); - return pred->ExpRunWithExternalStream(stream); -#endif - return false; -} - bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p, void *config) { auto pred = dynamic_cast(p->predictor_.get()); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 6725915a2c00c3..4a5cfb229a459e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -208,7 +208,7 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Note: Can only be used under thread_local semantics. bool ExpRunWithExternalStream(const gpuStream_t stream); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3c26f329d4747d..d886885edb5ba5 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -250,7 +250,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 530bc6f8a3eda7..eee3a707a03b14 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -110,7 +110,7 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_castmutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_caststream()); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); -#elif defined(PADDLE_WITH_MUSA) - // async, return stream - if (nullptr != exec_stream) { - *(static_cast(exec_stream)) = dev_ctx->stream(); - // async with callback - } else if (cb) { - musaLaunchHostFunc(dev_ctx->stream(), cb, cb_params); - // sync - } else { - musaStreamSynchronize(dev_ctx->stream()); - } #else // async, return stream if (nullptr != exec_stream) { @@ -868,7 +857,7 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::CUDAPlace gpu_place(t->device_); auto *t_data = tensor->mutable_data(gpu_place); 
paddle::memory::Copy(gpu_place, @@ -938,7 +927,7 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #endif } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), t_place, diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index d0bad85bfdee13..7879adb57d86ef 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -22,7 +22,7 @@ namespace paddle { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index 518a85119ed792..216c7747f07065 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -26,7 +26,7 @@ class InferCPUContext : public phi::CPUContext { using phi::CPUContext::SetEigenDevice; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class InferGPUContext : public phi::GPUContext { public: explicit InferGPUContext(const phi::Place& place); @@ -35,7 +35,7 @@ class InferGPUContext : public phi::GPUContext { using phi::GPUContext::SetBlasTF32Handle; using phi::GPUContext::SetDnnHandle; using phi::GPUContext::SetEigenDevice; - // using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSolverHandle; using phi::GPUContext::SetSparseHandle; using phi::GPUContext::SetStream; // using phi::GPUContext::SetDnnWorkspaceHandle; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6a3e943dec7e9a..b5a26ff9225aa4 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -111,6 +111,7 @@ struct PD_INFER_DECL XpuConfig { bool conv_autotune_file_writeback{false}; // Fc autotune level. The Optional values are 0-9. Default 0 means no + // autotune. int fc_autotune_level{0}; // Base fc autotune info is read from fc_autotune_file. 
std::string fc_autotune_file; @@ -367,7 +368,7 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableXpu(int l3_size = 0xfffc00, bool l3_locked = false, - bool conv_autotune = true, + bool conv_autotune = false, const std::string& conv_autotune_file = "", const std::string& transformer_encoder_precision = "int16", bool transformer_encoder_adaptive_seqlen = false, diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 10e6d38e5a900d..3fefba9ef22be8 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -470,7 +470,6 @@ PD_INFER_DECL std::shared_ptr MakeCipher( // forward declation using cudaStream_t = struct CUstream_st*; using hipStream_t = struct ihipStream_t*; -using musaStream_t = struct MUstream_st*; namespace paddle_infer { class Predictor; @@ -508,8 +507,6 @@ class PD_INFER_DECL InternalUtils { cudaStream_t stream); static bool RunWithExternalStream(paddle_infer::Predictor* pred, hipStream_t stream); - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - musaStream_t stream); static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config); static void UpdateConfigInterleaved(paddle_infer::Config* c, diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 9aaa2184875dc7..4af87b029fd22f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -16,10 +16,7 @@ #ifdef PADDLE_WITH_CUDA #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif -#ifdef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_HIP #include #endif #ifdef PADDLE_WITH_TENSORRT diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 96676ff818c56c..2a8029555e94f5 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -44,7 +44,7 @@ namespace paddle { namespace internal { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class EigenGpuStreamDevice : public Eigen::StreamInterface { public: EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -102,9 +102,6 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); @@ -135,7 +132,7 @@ void CPUContextResource::InitCPUResource() { CPUContextResource::CPUContextResource() { InitCPUResource(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) : place_(place) { InitGPUResource(stream); @@ -161,8 +158,6 @@ void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif @@ -171,8 +166,8 @@ void GPUContextResource::DestroyGPUResource() { DestroyDnnHandle(); 
DestroyBlasHandle(); - // DestroyBlasLtHandle(); - // DestroySolverHandle(); + DestroyBlasLtHandle(); + DestroySolverHandle(); DestroySparseHandle(); } @@ -210,21 +205,21 @@ void GPUContextResource::DestroyBlasHandle() { phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); } -// void GPUContextResource::InitBlasLtHandle() { -// phi::InitBlasLtHandle(&blaslt_handle_); -// } +void GPUContextResource::InitBlasLtHandle() { + phi::InitBlasLtHandle(&blaslt_handle_); +} -// void GPUContextResource::DestroyBlasLtHandle() { -// phi::DestroyBlasLtHandle(blaslt_handle_); -// } +void GPUContextResource::DestroyBlasLtHandle() { + phi::DestroyBlasLtHandle(blaslt_handle_); +} -// void GPUContextResource::InitSolverHandle() { -// phi::InitSolverHandle(&solver_handle_, stream_); -// } +void GPUContextResource::InitSolverHandle() { + phi::InitSolverHandle(&solver_handle_, stream_); +} -// void GPUContextResource::DestroySolverHandle() { -// phi::DestroySolverHandle(solver_handle_); -// } +void GPUContextResource::DestroySolverHandle() { + phi::DestroySolverHandle(solver_handle_); +} void GPUContextResource::InitSparseHandle() { phi::InitSparseHandle(&sparse_handle_, stream_); @@ -292,29 +287,29 @@ GPUContextResource::GetBlasTF32TensorCoreHandleCreator() { }; } -// blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { -// return blaslt_handle_; -// } +blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { + return blaslt_handle_; +} -// std::function -// GPUContextResource::GetBlasLtHandleCreator() { -// return [&]() { -// InitBlasLtHandle(); -// return blaslt_handle_; -// }; -// } +std::function +GPUContextResource::GetBlasLtHandleCreator() { + return [&]() { + InitBlasLtHandle(); + return blaslt_handle_; + }; +} -// phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { -// return solver_handle_; -// } +phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { + return solver_handle_; +} -// std::function -// GPUContextResource::GetSolverDnHandleCreator() { -// return [&]() { -// InitSolverHandle(); -// return solver_handle_; -// }; -// } +std::function +GPUContextResource::GetSolverDnHandleCreator() { + return [&]() { + InitSolverHandle(); + return solver_handle_; + }; +} phi::sparseHandle_t GPUContextResource::GetSparseHandle() const { return sparse_handle_; @@ -385,7 +380,7 @@ CPUContextResource* ResourceManager::GetCPUResource() const { return cpu_resource_.get(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { std::lock_guard lock_gurad(gpu_mutex_); if (gpu_resources_.count(stream)) { diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 96d534e8cc9540..1f4d4ea420e1b6 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -26,7 +26,7 @@ #include "paddle/utils/test_macros.h" #include "unsupported/Eigen/CXX11/Tensor" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -50,7 +50,7 @@ class CPUContextResource { std::unique_ptr cpu_eigen_device_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); @@ -61,8 +61,8 @@ class GPUContextResource { std::function GetBlasHandleCreator(); std::function GetBlasTensorCoreHandleCreator(); std::function GetBlasTF32TensorCoreHandleCreator(); - // std::function GetBlasLtHandleCreator(); - // std::function GetSolverDnHandleCreator(); + std::function GetBlasLtHandleCreator(); + std::function GetSolverDnHandleCreator(); std::function GetSparseHandleCreator(); std::function GetGpuEigenDeviceCreator(); @@ -71,8 +71,8 @@ class GPUContextResource { blasHandle_t GetBlasHandle() const; blasHandle_t GetBlasTensorCoreHandle() const; blasHandle_t GetBlasTF32Handle() const; - // blasLtHandle_t GetBlasLtHandle() const; - // phi::solverHandle_t GetSolverDnHandle() const; + blasLtHandle_t GetBlasLtHandle() const; + phi::solverHandle_t GetSolverDnHandle() const; phi::sparseHandle_t GetSparseHandle() const; Eigen::GpuDevice* GetGpuEigenDevice() const; int GetGpuComputeCapability() const; @@ -91,10 +91,10 @@ class GPUContextResource { void InitDnnHanlde(); void DestroyDnnHandle(); void DestroyBlasHandle(); - // void InitBlasLtHandle(); - // void DestroyBlasLtHandle(); - // void InitSolverHandle(); - // void DestroySolverHandle(); + void InitBlasLtHandle(); + void DestroyBlasLtHandle(); + void InitSolverHandle(); + void DestroySolverHandle(); void InitSparseHandle(); void DestroySparseHandle(); @@ -117,9 +117,9 @@ class GPUContextResource { blasHandle_t blas_handle_{nullptr}; blasHandle_t blas_tensor_core_handle_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; - // blasLtHandle_t blaslt_handle_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; - // phi::solverHandle_t solver_handle_{nullptr}; + phi::solverHandle_t solver_handle_{nullptr}; phi::sparseHandle_t sparse_handle_{nullptr}; // DnnWorkspaceHandle }; @@ -139,7 +139,7 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // GPU Resource public: void* InitGPUResource(const phi::Place& place, void* stream); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index f3c953fb60a97e..9b36b6dc745e85 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -127,7 +127,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) { memory::Copy(cpu_place, dst_data, cpu_place, src_data, size); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cpu_place(dst_place) && platform::is_gpu_place(src_place)) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc old mode 100755 new mode 100644 index 8cf589541b1e04..10763eb911543a --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -47,6 +47,7 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); + int8_teller_set.insert("tile"); 
teller_set.insert("flatten_contiguous_range"); int8_teller_set.insert("flatten_contiguous_range"); teller_set.insert("rnn"); @@ -2302,15 +2303,20 @@ struct SimpleOpTypeSetTeller : public Teller { if (!with_dynamic_shape) { if (tile_inputs.find("repeat_times_tensor") != tile_inputs.end()) { if (!desc.Input("repeat_times_tensor").empty()) { + VLOG(3) << "Tile op: repeat_times_tensor is not empty."; return false; } } if (tile_inputs.find("RepeatTimes") != tile_inputs.end()) { if (!desc.Input("RepeatTimes").empty()) { + VLOG(3) << "Tile op: RepeatTimes is not empty."; return false; } } - if (!desc.HasAttr("repeat_times")) return false; + if (!desc.HasAttr("repeat_times")) { + VLOG(3) << "Tile op:`repeat_times` is not set."; + return false; + } } } #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu index 6da8e874adc813..b3b0cd35fb300b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu @@ -19,7 +19,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); @@ -30,13 +30,13 @@ namespace inference { namespace tensorrt { namespace plugin { #if defined(PADDLE_WITH_NCCL) -inline mcclDataType_t NvInferDtypeToNCCLDType(nvinfer1::DataType type) { +inline ncclDataType_t NvInferDtypeToNCCLDType(nvinfer1::DataType type) { if (type == nvinfer1::DataType::kFLOAT) { - return mcclFloat; + return ncclFloat; } else if (type == nvinfer1::DataType::kHALF) { - return mcclFloat16; + return ncclFloat16; } else if (type == nvinfer1::DataType::kINT8) { - return mcclInt8; + return ncclInt8; } else if (type == nvinfer1::DataType::kINT32) { return ncclInt32; } else { @@ -159,23 +159,23 @@ int CAllReducePluginDynamic::enqueue( auto input_type = input_desc[0].type; void* sendbuff = const_cast(inputs[0]); void* recvbuff = outputs[0]; - mcclDataType_t dtype = NvInferDtypeToNCCLDType(input_type); - mcclRedOp_t nccl_red_type = mcclSum; + ncclDataType_t dtype = NvInferDtypeToNCCLDType(input_type); + ncclRedOp_t nccl_red_type = ncclSum; switch (red_type_) { case kRedSum: - nccl_red_type = mcclSum; + nccl_red_type = ncclSum; break; case kRedMax: - nccl_red_type = mcclMax; + nccl_red_type = ncclMax; break; case kRedMin: - nccl_red_type = mcclMin; + nccl_red_type = ncclMin; break; case kRedProd: - nccl_red_type = mcclProd; + nccl_red_type = ncclProd; break; default: @@ -202,9 +202,9 @@ int CAllReducePluginDynamic::enqueue( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); auto stream = comm_ctx->GetStream(); - mcclRedOp_t nccl_red_type = mcclSum; + ncclRedOp_t nccl_red_type = ncclSum; // comm_ctx->AllReduce(&inputs[0], inputs[0], nccl_red_type, stream); - phi::dynload::mcclAllReduce(sendbuff, + phi::dynload::ncclAllReduce(sendbuff, recvbuff, numel, dtype, @@ -215,7 +215,7 @@ int CAllReducePluginDynamic::enqueue( } else { auto comm = platform::NCCLCommContext::Instance().Get(ring_id_); cudaStream_t custream = use_calc_stream_ ? 
stream : comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index fec0a927b20e8b..298f54de48e8f3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -218,9 +218,6 @@ void QkvToContextPluginDynamic::configurePlugin( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 3dbc06bfc11b7e..0ad2cb0e3f0c84 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,8 +1,3 @@ -cc_library( - benchmark - SRCS benchmark.cc - DEPS enforce common) -paddle_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_library( infer_io_utils SRCS io_utils.cc @@ -13,13 +8,5 @@ cc_library( DEPS proto_desc enforce common) cc_library(table_printer SRCS table_printer.cc) -paddle_test(test_table_printer SRCS table_printer_tester.cc) proto_library(shape_range_info_proto SRCS shape_range_info.proto) - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(test_benchmark) - copy_onnx(test_table_printer) -endif() diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc deleted file mode 100644 index 24bc99ed183fad..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/utils/benchmark.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { - -std::string Benchmark::SerializeToString() const { - std::stringstream ss; - ss << "-----------------------------------------------------\n"; - ss << "name\t"; - ss << "batch_size\t"; - ss << "num_threads\t"; - ss << "latency\t"; - ss << "qps"; - ss << '\n'; - - ss << name_ << "\t"; - ss << batch_size_ << "\t\t"; - ss << num_threads_ << "\t"; - ss << latency_ << "\t"; - ss << 1000.0 / latency_; - ss << '\n'; - return ss.str(); -} -void Benchmark::PersistToFile(const std::string &path) const { - std::ofstream file(path, std::ios::app); - PADDLE_ENFORCE_EQ( - file.is_open(), - true, - platform::errors::Unavailable("Can not open %s to add benchmark.", path)); - file << SerializeToString(); - file.flush(); - file.close(); -} - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h deleted file mode 100644 index 56789843c3728e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -#include "paddle/utils/test_macros.h" - -namespace paddle { -namespace inference { - -/* - * Helper class to calculate the performance. - */ -struct TEST_API Benchmark { - int batch_size() const { return batch_size_; } - void SetBatchSize(int x) { batch_size_ = x; } - - int num_threads() const { return num_threads_; } - void SetNumThreads(int x) { num_threads_ = x; } - - bool use_gpu() const { return use_gpu_; } - void SetUseGpu() { use_gpu_ = true; } - - float latency() const { return latency_; } - void SetLatency(float x) { latency_ = x; } - - const std::string& name() const { return name_; } - void SetName(const std::string& name) { name_ = name; } - - std::string SerializeToString() const; - void PersistToFile(const std::string& path) const; - - private: - bool use_gpu_{false}; - int batch_size_{0}; - float latency_; - int num_threads_{1}; - std::string name_; -}; - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc deleted file mode 100644 index 8f7614cb10a44e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/inference/utils/benchmark.h" - -using namespace paddle::inference; // NOLINT -TEST(Benchmark, basic) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString(); -} - -TEST(Benchmark, PersistToFile) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - - benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("2.log"); - benchmark.PersistToFile("3.log"); -} diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc deleted file mode 100644 index fc482807b2854c..00000000000000 --- a/paddle/fluid/inference/utils/table_printer_tester.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "paddle/fluid/inference/utils/table_printer.h" - -namespace paddle { -namespace inference {} // namespace inference -} // namespace paddle - -TEST(table_printer, output) { - std::vector header{"config", "value"}; - paddle::inference::TablePrinter table(header); - - // model_dir - table.InsertRow({"model_dir", "./model_dir"}); - // model - table.InsertRow({"model_file", "./model.pdmodel"}); - table.InsertRow({"params_file", "./model.pdiparams"}); - - table.InsetDivider(); - // gpu - table.InsertRow({"use_gpu", "true"}); - table.InsertRow({"gpu_device_id", "0"}); - table.InsertRow({"memory_pool_init_size", "100MB"}); - table.InsertRow({"thread_local_stream", "false"}); - table.InsetDivider(); - - // trt precision - table.InsertRow({"use_trt", "true"}); - table.InsertRow({"trt_precision", "fp32"}); - table.InsertRow({"enable_dynamic_shape", "true"}); - table.InsertRow({"DisableTensorRtOPs", "{}"}); - table.InsertRow({"EnableVarseqlen", "ON"}); - table.InsertRow({"tensorrt_dla_enabled", "ON"}); - table.InsetDivider(); - - // lite - table.InsertRow({"use_lite", "ON"}); - table.InsetDivider(); - - // xpu - table.InsertRow({"use_xpu", "true"}); - table.InsertRow({"xpu_device_id", "0"}); - table.InsetDivider(); - - // ir - table.InsertRow({"ir_optim", "true"}); - table.InsertRow({"ir_debug", "false"}); - table.InsertRow({"enable_memory_optim", "false"}); - table.InsertRow({"EnableProfile", "false"}); - table.InsertRow({"glog_info_disabled", "false"}); - table.InsetDivider(); - - // cpu - table.InsertRow({"CpuMathLibrary", "4"}); - // mkldnn - table.InsertRow({"enable_mkldnn", "false"}); - table.InsertRow({"mkldnn_cache_capacity", "10"}); - - // a long string - table.InsertRow( - {"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ a long string " - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", - "------------------------------------------ a long value " - "-----------------------------------------------------"}); - - LOG(INFO) << table.PrintTable(); -} diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index aed5d674e49ff5..5b49d927ae6762 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -62,17 +62,6 @@ if(WITH_ROCM) DEPS malloc gpu_info place) endif() -if(WITH_MUSA) - musa_test( - malloc_test - SRCS malloc_test.cu - DEPS device_context malloc) - musa_test( - cuda_managed_memory_test - SRCS cuda_managed_memory_test.cu - DEPS malloc gpu_info place) -endif() - if(WITH_TESTING AND TEST cuda_managed_memory_test) set_tests_properties( cuda_managed_memory_test diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index eae17991ff2fe5..ffce57d78f1642 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -19,7 +19,7 @@ set(ALLOCATOR_SRCS buddy_allocator.cc system_allocator.cc) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) list( APPEND ALLOCATOR_SRCS @@ -90,13 +90,6 @@ if(WITH_ROCM) SRCS thread_local_allocator_test.cc DEPS allocator) endif() -if(WITH_MUSA) - musa_test( - thread_local_allocator_test - SRCS thread_local_allocator_test.cc - DEPS allocator) -endif() - if(WITH_GPU) nv_test( @@ -108,11 +101,6 @@ elseif(WITH_ROCM) best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu DEPS allocator memcpy) -elseif(WITH_MUSA) - musa_test( - best_fit_allocator_test - SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu - DEPS allocator memcpy) else() 
cc_test_old(best_fit_allocator_test SRCS best_fit_allocator_test.cc DEPS allocator) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 17839ecf0caecc..dd86ba9855fbab 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -26,9 +26,9 @@ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" -#ifdef PADDLE_WITH_MCCL -#include -#include "paddle/fluid/platform/dynload/mccl.h" +#ifdef PADDLE_WITH_NCCL +#include +#include "paddle/fluid/platform/dynload/nccl.h" #endif PHI_DECLARE_string(allocator_strategy); @@ -144,22 +144,22 @@ using DecoratedAllocationPtr = template static T&& FillValue(T&& allocation) { -#if defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) if (allocation != nullptr) { if (FLAGS_sync_after_alloc || FLAGS_alloc_fill_value >= 0) { - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); if (FLAGS_alloc_fill_value >= 0) { VLOG(10) << "Set " << FLAGS_alloc_fill_value << " on " << allocation->ptr() << " " << allocation->place() << " " << allocation->size(); if (platform::is_gpu_place(allocation->place())) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemset( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size())); } else { std::memset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size()); } - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } } } diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index e7df0f7213363f..59ab4eaf154724 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -165,7 +165,7 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAAllocatorMap = std::map>>; @@ -193,7 +193,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -219,7 +219,7 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -294,7 +294,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for 
(int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -353,7 +353,7 @@ class AllocatorFacadePrivate { LIKELY(FLAGS_use_system_allocator == false); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { @@ -730,7 +730,7 @@ class AllocatorFacadePrivate { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void InitNaiveBestFitCUDAPinnedAllocator() { if (FLAGS_use_auto_growth_pinned_allocator) { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; @@ -804,7 +804,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) auto cuda_allocator = CreateCUDAAllocator(p); cuda_allocators_[p][stream] = std::make_shared( cuda_allocator, @@ -890,7 +890,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) auto cuda_allocator = CreateCUDAAllocator(p); allocators_[p] = std::make_shared( cuda_allocator, @@ -1252,7 +1252,7 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); int device_count = platform::GetGPUDeviceCount(); @@ -1276,7 +1276,7 @@ class AllocatorFacadePrivate { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); @@ -1322,7 +1322,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } @@ -1355,7 +1355,7 @@ class AllocatorFacadePrivate { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> default_stream_safe_cuda_allocators_; @@ -1489,7 +1489,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocatorFacadePrivate* m = GetPrivate(); if 
(!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1515,7 +1515,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1527,7 +1527,7 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 39819e0d66bdc9..acfd73a411932f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -81,7 +81,7 @@ class AllocatorFacade { bool IsStreamSafeCUDAAllocatorUsed(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 0f532d1fff4d78..4f08db4921f8ba 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,7 +19,7 @@ limitations under the License. 
 */
 #include "glog/logging.h"
 #include "paddle/phi/core/flags.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #define USE_DEVICE
 PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb);
 #endif
@@ -54,7 +54,7 @@ BuddyAllocator::BuddyAllocator(
     };
     use_custom_device_ = true;
   } else {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     init_allocate_size_func_ = &platform::GpuInitAllocSize;
     re_allocate_size_func_ = &platform::GpuReallocSize;
 #endif
@@ -279,7 +279,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     allocate_bytes = DeviceAllocateSize(
         init_allocate_size_func_, re_allocate_size_func_, request_bytes);
 #else
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allocate_bytes = DeviceAllocateSize(
         &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
 #endif
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 3f50fa9651ced2..781addd7dba60b 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -23,10 +23,6 @@
 #include
 #endif
-#ifdef PADDLE_WITH_MUSA
-#include
-#endif
-
 #include
 #include "paddle/fluid/platform/cuda_device_guard.h"
diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
index 139e2358d161c8..7286f84160c6ad 100644
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -82,9 +82,6 @@ class GPUContextAllocator : public Allocator {
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(
         hipEventCreateWithFlags(&event_, hipEventDisableTiming));
-#elif defined(PADDLE_WITH_MUSA)
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        musaEventCreateWithFlags(&event_, musaEventDisableTiming));
 #else
     PADDLE_ENFORCE_GPU_SUCCESS(
         cudaEventCreate(&event_, cudaEventDisableTiming));
@@ -95,9 +92,8 @@ class GPUContextAllocator : public Allocator {
     if (event_) {
       platform::CUDADeviceGuard guard(place_.device);
 #ifdef PADDLE_WITH_HIP
+      PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_));
-#elif defined(PADDLE_WITH_MUSA)
-      PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_));
 #else
       PADDLE_WARN_GPU_SUCCESS(cudaEventDestroy(event_));
 #endif
@@ -117,9 +113,6 @@ class GPUContextAllocator : public Allocator {
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0));
-#elif defined(PADDLE_WITH_MUSA)
-    PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, default_stream_));
-    PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(default_stream_, event_, 0));
 #else
     PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0));
diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
index 331fe723d32bb9..77ca495cacbc70 100644
--- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
@@ -19,11 +19,6 @@
 #include
 #endif
-#ifdef PADDLE_WITH_MUSA
-#include
-#include
- -#ifdef PADDLE_WITH_HIP
 #include
 #endif
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index c8ac552bf1b73a..d39cb285517f2c 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -26,7 +26,7 @@
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"
 #include "paddle/phi/common/place.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "paddle/fluid/platform/flags.h"
@@ -213,7 +213,7 @@ size_t Used(const platform::XPUPlace &place) {
 }
 // For CUDA
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class GPUBuddyAllocatorList {
  private:
  GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) {
@@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
 template <>
 size_t Used(const platform::CUDAPlace &place) {
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP)
   return GetGPUBuddyAllocator(place.device)->Used();
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -294,7 +294,7 @@ size_t Used(const platform::CUDAPlace &place) {
 template <>
 void *Alloc(const platform::CUDAPlace &place, size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto *ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -315,8 +315,6 @@ void *Alloc(const platform::CUDAPlace &place,
   if (FLAGS_init_allocated_mem) {
 #ifdef PADDLE_WITH_HIP
     hipMemset(ptr, 0xEF, size);
-#elif defined(PADDLE_WITH_MUSA)
-    musaMemset(ptr, 0xEF, size);
 #else
     cudaMemset(ptr, 0xEF, size);
 #endif
@@ -333,7 +331,7 @@ template <>
 void Free(const platform::CUDAPlace &place,
           void *p,
           size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   GetGPUBuddyAllocator(place.device)->Free(p);
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -343,7 +341,7 @@ void Free(const platform::CUDAPlace &place,
 template <>
 uint64_t Release(const platform::CUDAPlace &place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return GetGPUBuddyAllocator(place.device)->Release();
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -351,7 +349,7 @@ uint64_t Release(const platform::CUDAPlace &place) {
 #endif
 }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator *ba = nullptr;
@@ -369,7 +367,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
 template <>
 size_t Used(const platform::CUDAPinnedPlace &place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return GetCUDAPinnedBuddyAllocator()->Used();
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -380,7 +378,7 @@ size_t Used(const platform::CUDAPinnedPlace &place) {
 template <>
 void *Alloc(const platform::CUDAPinnedPlace &place, size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void *ptr = buddy_allocator->Alloc(size);
@@ -402,7 +400,7 @@ template <>
 void Free(const platform::CUDAPinnedPlace &place,
           void *p,
           size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   VLOG(10) << "Free " << size << " bytes on " << platform::Place(place);
   GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
@@ -414,7 +412,7 @@ void Free(const platform::CUDAPinnedPlace &place,
 template <>
 uint64_t Release(
     const platform::CUDAPinnedPlace &place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   VLOG(10) << "Release on " << platform::Place(place);
   return GetCUDAPinnedBuddyAllocator()->Release();
 #else
@@ -605,7 +603,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const {
 }
 size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return Used(gpu);
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -614,7 +612,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
 }
 size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return Used(cuda_pinned);
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 206ad954468010..32853f08f94e5a 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -23,8 +23,6 @@ bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
 void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr()));
-#elif defined(PADDLE_WITH_MUSA)
-  PADDLE_ENFORCE_GPU_SUCCESS(musaFreeHost(allocation->ptr()));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
 #endif
@@ -40,8 +38,6 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
   void *ptr;
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable));
-#elif defined(PADDLE_WITH_MUSA)
-  PADDLE_ENFORCE_GPU_SUCCESS(musaHostAlloc(&ptr, size, musaHostAllocPortable));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
 #endif
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 30fe2d9b095eb7..48b18f07456c66 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -92,17 +92,6 @@ bool StreamSafeCUDAAllocation::CanBeFreed() {
   }
   PADDLE_ENFORCE_GPU_SUCCESS(err);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event));
-
-#elif 
defined(PADDLE_WITH_MUSA) - gpuError_t err = musaEventQuery(event); - if (err == musaErrorNotReady) { - VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; - // Erase the completded event before "it" - outstanding_event_map_.erase(outstanding_event_map_.begin(), it); - return false; - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else gpuError_t err = hipEventQuery(event); if (err == hipErrorNotReady) { @@ -139,9 +128,6 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); -#elif defined (PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&new_event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); @@ -156,8 +142,6 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); -#elif defined (PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(record_event, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 79a7c7abf01de2..31508a10799228 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -24,9 +24,6 @@ #ifdef PADDLE_WITH_CUDA #include -#elif defined(PADDLE_WITH_MUSA) -#include -#include #else #include #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index cb9c4afd7b9fcf..e9a9fcbff9831e 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -120,7 +120,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { bool CPUAllocator::UseGpu() const { return false; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr @@ -216,8 +216,6 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { // PINNED memory is visible to all CUDA contexts. #ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); -#elif defined(PADDLE_WITH_MUSA) - musaError_t result = musaHostAlloc(&p, size, musaHostAllocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -261,22 +259,6 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } -#elif defined(PADDLE_WITH_MUSA) - err = musaFreeHost(p); - - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFreeHost after the - // driver has already shutdown. 
This happens only if the - // process is terminating, in which case we don't care if - // cudaFreeHost succeeds. - if (err != musaErrorMusartUnloading) { - PADDLE_ENFORCE_EQ( - err, - 0, - platform::errors::Fatal( - "cudaFreeHost failed in GPUPinnedAllocator, error code is %d", - err)); - } #else err = cudaFreeHost(p); diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index b2cce04a04d37e..67376a3e39a224 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -43,7 +43,7 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 63504621f98c5b..0c40da19d47e5f 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,7 +57,7 @@ void* GetBasePtr(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 48fbc541e5fa91..3b098e5a13e515 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -49,7 +49,7 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index c8ce60e7c39d6e..bffbcbdfad76bc 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" +#include "paddle/utils/test_macros.h" #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_header.h" @@ -110,11 +111,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -void Copy(platform::CPUPlace, - void* dst, - platform::CPUPlace, - const void* src, - size_t num) { +TEST_API void Copy(platform::CPUPlace, + void* dst, + platform::CPUPlace, + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -256,8 +257,7 @@ void Copy(phi::Place dst_place, #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K #ifdef PADDLE_WITH_HIP @@ -272,22 +272,10 @@ inline void SyncCUDAStream() { } #endif } -#elif defined(PADDLE_WITH_MUSA) -inline void SyncCUDAStream() { -#if !defined(_WIN32) - musaStreamSynchronize(0); -#else - musaError_t e_sync = musaSuccess; - while (e_sync = musaStreamQuery(0)) { - if (e_sync == musaErrorNotReady) continue; - break; - } -#endif -} #else inline void SyncCUDAStream() { #if !defined(_WIN32) - cudaStreamSynchronize(0); + cudaStreamSynchronize(nullptr); #else cudaError_t e_sync = cudaSuccess; while (e_sync = cudaStreamQuery(0)) { @@ -305,7 +293,7 @@ inline void SyncCUDAStream() { // https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ template <> -void Copy( +TEST_API void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, @@ -326,12 +314,6 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyDeviceToHost, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -344,8 +326,6 @@ void Copy( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -357,7 +337,7 @@ void Copy( } template <> -void Copy( +TEST_API void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, @@ -378,12 +358,6 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyHostToDevice, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -396,8 +370,6 @@ void Copy( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -432,12 +404,6 @@ void Copy( num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyDeviceToDevice, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -451,8 +417,6 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); 
-#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); #endif @@ -492,7 +456,7 @@ void Copy( } template <> -void Copy( +TEST_API void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, @@ -528,7 +492,7 @@ void Copy( if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, @@ -539,12 +503,6 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyDeviceToHost, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -558,8 +516,6 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -578,7 +534,7 @@ void Copy( platform::SetDeviceId(dst_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, @@ -589,12 +545,6 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyHostToDevice, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -608,8 +558,6 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -796,11 +744,10 @@ void Copy(phi::Place dst_place, VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::CPU) { + dst_place.GetType() == phi::AllocationType::CPU) { // NOLINT std::memcpy(dst, src, num); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::GPUPINNED) { std::memcpy(dst, src, num); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6754c17978ea31..fe5fae7bafaebb 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -102,7 +102,7 @@ op_library(quantize_linear_op DEPS phi common) op_library(save_combine_op DEPS string_array phi common) op_library(load_combine_op DEPS string_array) -if (WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if (WITH_GPU OR WITH_ROCM) register_cu_kernel(class_center_sample_op SRCS class_center_sample_op.cu DEPS ${OP_HEADER_DEPS}) endif() @@ -110,7 +110,7 @@ if (WITH_MKLDNN) register_mkldnn_kernel(layer_norm_op SRCS layer_norm_mkldnn_op.cc DEPS 
 ${OP_HEADER_DEPS})
 endif()
-if (WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if (WITH_GPU OR WITH_ROCM)
   op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
 elseif (WITH_XPU_KP)
   op_library(activation_op SRCS activation_op.cc activation_op.kps DEPS ${OP_HEADER_DEPS})
@@ -118,9 +118,9 @@ else()
   op_library(activation_op SRCS activation_op.cc DEPS ${OP_HEADER_DEPS})
 endif()
-if (WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if (WITH_GPU OR WITH_ROCM)
   op_library(sync_batch_norm_op DEPS processgroup_comm_utils)
-  if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT WITH_MUSA) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
+  if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
     op_library(sparse_attention_op DEPS processgroup_comm_utils)
   endif()
 endif()
@@ -152,10 +152,10 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} beam_search)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils)
-if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL)
+if(WITH_NCCL OR WITH_RCCL)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl)
 endif()
-if (WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if (WITH_GPU OR WITH_ROCM)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
 endif()
 if(WITH_XPU)
diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
index dcbe58ffceb6a1..a07f311c6125ef 100644
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(__NVCC__) || defined(__MUSACC__)
+#ifdef __NVCC__
 #include "cub/cub.cuh"
 #endif
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 79e677034ce0f1..2c85ec6ea2076b 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -55,7 +55,7 @@ struct ArrayToLoDFunctor {
     if (std::is_same::value) {
       Apply(static_cast(pool.Get(place)));
     } else {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       Apply(static_cast(pool.Get(place)));
 #else
       PADDLE_THROW(
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index c25344994cb503..012edde57294a9 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include #include -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index aa03c2b57355c6..ecfae25270f911 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -19,14 +19,6 @@ #include typedef hiprandState curandState; namespace cub = hipcub; - -#elif defined(PADDLE_WITH_MUSA) -#include -#include - -#include -typedef murandState curandState; - #else #include #include @@ -42,7 +34,7 @@ typedef murandState curandState; #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -84,11 +76,6 @@ __global__ void RandomSampleClassCenter(const int64_t n, CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } -#elif defined(PADDLE_WITH_MUSA) - murand_init(local_seed, id, increment, &localState); - CUDA_KERNEL_LOOP(i, n) { - buffer[i] = static_cast(murand(&localState) % max_val); - } #else curand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { @@ -365,7 +352,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device); T* num_classes_per_device_ptr = num_classes_per_device.data(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -410,15 +397,15 @@ void ClassCenterSampleKernel(const Context& dev_ctx, if (comm_ctx) { comm_ctx->AllReduce( - &num_classes_per_device, num_classes_per_device, mcclSum, stream); + &num_classes_per_device, num_classes_per_device, ncclSum, stream); paddle::platform::GpuStreamSync(stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), phi::ToNCCLDataType(num_classes_per_device.dtype()), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index fdecbca81fc590..1c8c8f00217cc5 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -30,7 +30,7 @@ register_operators( DEPS ${COLLECTIVE_DEPS}) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi common) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index b554d658126f54..11b51602d4d75a 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -33,12 +33,12 @@ template class AllToAllOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int send_numel = x->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -114,7 +114,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { comm_ctx->GroupEnd(); VLOG(3) << "new comm_context_manager has rid " << ring_id; } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < nranks; ++i) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); @@ -122,7 +122,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); offset += send_numel; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); VLOG(3) << "old NCCLCommContext has rid " << ring_id; } #else diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index 2b1f04a491d5e3..210c42d30f6d50 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
 */
 #include "paddle/fluid/operators/collective/barrier_op.h"
 #include "paddle/phi/core/distributed/comm_context_manager.h"
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/phi/core/distributed/nccl_comm_context.h"
@@ -30,12 +30,12 @@ template
 class BarrierOpCUDAKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto in = ctx.Input("X");
     auto out = ctx.Output("Out");
     auto place = ctx.GetPlace();
-    mcclDataType_t dtype =
+    ncclDataType_t dtype =
         platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype()));
     int64_t numel = in->numel();
     const void* sendbuff = in->data();
@@ -62,7 +62,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel {
           "NCCLCommContext is nullptr, collective op should "
           "has ring_id attr."));
       auto stream = comm_ctx->GetStream();
-      mcclRedOp_t nccl_red_type = mcclSum;
+      ncclRedOp_t nccl_red_type = ncclSum;
       comm_ctx->AllReduce(out, *in, nccl_red_type, stream);
       platform::GpuStreamSync(stream);
       VLOG(3) << "new NCCLCommContext has rid " << rid;
@@ -70,8 +70,8 @@ class BarrierOpCUDAKernel : public framework::OpKernel {
       auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
       // should ExecutionContext for calc stream.
       auto stream = ctx.cuda_device_context().stream();
-      mcclRedOp_t nccl_red_type = mcclSum;
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff,
+      ncclRedOp_t nccl_red_type = ncclSum;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff,
                                                                   recvbuff,
                                                                   numel,
                                                                   dtype,
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
index 0de5e22aaabeb6..bd105c35886cb0 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -33,10 +33,10 @@ template class CAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int nranks = ctx.Attr("nranks"); @@ -103,10 +103,10 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { comm_ctx->AllGather(out, *in, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(send_buff, + platform::dynload::ncclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index b45f568b835f8d..277988b56916f8 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -28,9 +28,9 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, ALL_LAYOUT, ops::CAllReduceMaxCUDAKernel, float, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif double, int, int64_t, diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 7bf5e59431f8ff..9cd472f4217881 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -25,14 +25,14 @@ limitations under the License. 
*/ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -309,13 +309,13 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); @@ -395,22 +395,22 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { << ", dtype:" << dtype << ", comm:" << comm << ", stream:" << stream; - mcclRedOp_t nccl_red_type = mcclSum; + ncclRedOp_t nccl_red_type = ncclSum; switch (red_type) { case kRedSum: - nccl_red_type = mcclSum; + nccl_red_type = ncclSum; break; case kRedMax: - nccl_red_type = mcclMax; + nccl_red_type = ncclMax; break; case kRedMin: - nccl_red_type = mcclMin; + nccl_red_type = ncclMin; break; case kRedProd: - nccl_red_type = mcclProd; + nccl_red_type = ncclProd; break; default: @@ -421,7 +421,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->AllReduce(out, *in, nccl_red_type, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index f886e4aaab212f..76d809cd234f03 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -28,9 +28,9 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, ALL_LAYOUT, ops::CAllReduceSumCUDAKernel, float, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif double, int, int64_t, diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 348c22bd8be48e..4d49bc4990c6ec 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -29,7 +29,7 @@ template class CBroadcastOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto x = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -50,11 +50,11 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { } else { // NOTE(liyurui): This will be removed after moving this operator to phi. int numel = x->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (root == comm->rank()) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, @@ -71,7 +71,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { static_cast(out)); } } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out->data(), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << common::product(out->dims()); @@ -100,8 +100,8 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast, int64_t, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index 2e84a0e80c2dcc..2dc9af01395468 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/platform/collective_helper.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -56,7 +56,7 @@ class CCommInitAllOp : public framework::OperatorBase { // platform::errors::PreconditionNotMet( // "CCommInitAllOp can run on gpu place only")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector devices = Attr>("devices"); if (devices.empty()) { devices = platform::GetSelectedDevices(); diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index 4d92c369abfebc..39d22fcd5f50d8 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -17,10 +17,6 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_RCCL) #include #endif - -#if defined(PADDLE_WITH_MCCL) -#include -#endif #include #include @@ -32,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" // #include "paddle/fluid/operators/distributed/distributed.h" // #include "paddle/fluid/operators/distributed/request_handler_impl.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -59,8 +55,8 @@ class CCommInitMultiTrainerOp : public framework::OperatorBase { auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( var, platform::errors::InvalidArgument("Input X must be provided.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclUniqueId* nccl_id = var->GetMutable(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclUniqueId* nccl_id = var->GetMutable(); int ntrainers = Attr("ntrainers"); int train_id = Attr("trainer_id"); diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 3f7683fb405cb1..086257eab60383 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -17,11 +17,6 @@ limitations under the License. */ #if defined(PADDLE_WITH_RCCL) #include #endif - -#if defined(PADDLE_WITH_MCCL) -#include -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -29,12 +24,12 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/platform/collective_helper.h" #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #elif defined(PADDLE_WITH_XPU_BKCL) @@ -97,8 +92,8 @@ class CCommInitOp : public framework::OperatorBase { #endif } else { // TODO(wangxi): Put this in the unified header file -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - using UniqueId = mcclUniqueId; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + using UniqueId = ncclUniqueId; using CommContext = platform::NCCLCommContext; #elif defined(PADDLE_WITH_XPU_BKCL) using UniqueId = BKCLUniqueId; @@ -114,7 +109,7 @@ class CCommInitOp : public framework::OperatorBase { platform::errors::PreconditionNotMet( "CCommInitOp can run on gpu or xpu place only.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( @@ -150,7 +145,7 @@ class CCommInitOp : public framework::OperatorBase { return; } #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) VLOG(3) << "#### use old comm lab ####"; UniqueId* comm_id = 
var->GetMutable(); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index f170e07b6532f9..d13179cbae48b1 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/api/include/tensor.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -38,7 +38,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto x = ctx.Input("X"); auto out = ctx.Output("Out"); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int nranks = ctx.Attr("nranks"); @@ -65,7 +65,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { rank, nranks)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) phi::DenseTensor temp_out; framework::DDim temp_out_dims = x->dims(); temp_out_dims[0] *= nranks; @@ -130,10 +130,10 @@ class CConcatOpCUDAKernel : public framework::OpKernel { comm_ctx->AllGather(&temp_out, *x, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(send_buff, + platform::dynload::ncclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } @@ -175,8 +175,8 @@ PD_REGISTER_STRUCT_KERNEL(c_concat, double, int, int64_t, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 9851b9d9d9f685..4a07f7e98f793c 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,14 +27,14 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -static void GenNCCLID(std::vector* nccl_ids) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +static void GenNCCLID(std::vector* nccl_ids) { for (auto& nccl_id : *nccl_ids) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } } -static void CopyNCCLIDToVar(const std::vector& nccl_ids, +static void CopyNCCLIDToVar(const std::vector& nccl_ids, std::function func, const framework::Scope& scope) { for (size_t i = 0; i < nccl_ids.size(); ++i) { @@ -44,8 +44,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, var, platform::errors::NotFound("Variable with name %s is not found", var_name.c_str())); - auto nccl_id = var->GetMutable(); - memcpy(nccl_id, &nccl_ids[i], sizeof(mcclUniqueId)); + auto nccl_id = var->GetMutable(); + memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); } } @@ -68,7 +68,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { std::string endpoint = 
Attr("endpoint"); - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(1); if (!FLAGS_dynamic_static_unified_comm) { diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 26cacdd87fa863..20884d1ae8a969 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -26,14 +26,14 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -236,12 +236,12 @@ template class CReduceOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); @@ -286,22 +286,22 @@ class CReduceOpCUDAKernel : public framework::OpKernel { stream = ctx.cuda_device_context().stream(); } - mcclRedOp_t nccl_red_type = mcclSum; + ncclRedOp_t nccl_red_type = ncclSum; switch (red_type) { case kRedSum: - nccl_red_type = mcclSum; + nccl_red_type = ncclSum; break; case kRedMax: - nccl_red_type = mcclMax; + nccl_red_type = ncclMax; break; case kRedMin: - nccl_red_type = mcclMin; + nccl_red_type = ncclMin; break; case kRedProd: - nccl_red_type = mcclProd; + nccl_red_type = ncclProd; break; default: @@ -315,7 +315,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Reduce(out, *in, nccl_red_type, root, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index af26bf7d858ba0..cd1cf0c0176363 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -30,7 +30,7 @@ template class CReduceScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -105,14 +105,14 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); if (comm_ctx) { - comm_ctx->ReduceScatter(out, *in, mcclSum, stream); + comm_ctx->ReduceScatter(out, *in, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( send_buff, recv_buff, recv_numel, - static_cast(dtype), - mcclSum, + static_cast(dtype), + ncclSum, comm->comm(), stream)); } @@ -135,9 +135,9 @@ PD_REGISTER_STRUCT_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index 86bb602256aefb..7f4b4f6734de0c 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_scatter_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -30,11 +30,11 @@ template class CScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int numel = x->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int nranks = ctx.Attr("nranks"); @@ -123,7 +123,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { } } else { if (root_id == comm->rank()) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, @@ -137,7 +137,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { *platform::DeviceContextPool::Instance().Get(place), static_cast(&temp)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out_ptr, numel, dtype, root_id, comm->comm(), stream)); } } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 7ea80d8a54e9ad..f8f43d5c9da48c 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/softmax_impl.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); @@ -208,17 +208,17 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_logits.maximum(along_axis); if (comm_ctx) { - comm_ctx->AllReduce(&logits_max, logits_max, mcclMax, stream); + comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); } else { void* logits_max_buff = logits_max.mutable_data(place); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(logits_max.dtype())), - mcclMax, + ncclMax, comm->comm(), stream)); } @@ -273,16 +273,16 @@ struct CSoftmaxWithCrossEntropyFunctor { predicted_logits.mutable_data(place); if (comm_ctx) { - comm_ctx->AllReduce(&predicted_logits, predicted_logits, mcclSum, stream); + comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream); } else { void* predict_logits_buff = predicted_logits.data(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( predict_logits_buff, predict_logits_buff, predicted_logits.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(predicted_logits.dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); } @@ -301,16 +301,16 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_softmax.sum(along_axis); if (comm_ctx) { - comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, mcclSum, stream); + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); } else { void* sum_exp_logits_buff = sum_exp_logits.data(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(sum_exp_logits.dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index 79c32bc907045f..e100397924af56 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ template class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) auto place = ctx.GetPlace(); auto dev_ctx = static_cast( diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h index 52f4e6f6d88fee..8d60d633272a98 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h @@ -18,14 +18,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -40,7 +40,7 @@ template class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto place = ctx.GetPlace(); int ring_id = ctx.Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index c97da1a737b0f2..f2eab0532b9df2 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -19,7 +19,7 @@ namespace framework { class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -47,7 +47,7 @@ class CWaitCommOp : public framework::OperatorBase { "wait_comm op can run on gpu place only for now, but got %s", place.DebugString())); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); gpuStream_t compute_stream = @@ -89,9 +89,6 @@ class CWaitCommOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 3088e1ed61d66e..33b56cbe6581d0 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -19,7 +19,7 @@ namespace framework { class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -47,7 +47,7 @@ class CWaitComputeOp : public framework::OperatorBase { "wait_compute op can run on gpu place only for now, but got %s", 
place.DebugString())); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); gpuStream_t compute_stream = @@ -89,9 +89,6 @@ class CWaitComputeOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index da13a5ba800a63..1d03cb151e4a01 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,14 +34,14 @@ class Scope; namespace paddle { namespace operators { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -static void GenNCCLID(std::vector* nccl_ids) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +static void GenNCCLID(std::vector* nccl_ids) { for (auto& nccl_id : *nccl_ids) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } } -static void CopyNCCLIDToVar(const std::vector& nccl_ids, +static void CopyNCCLIDToVar(const std::vector& nccl_ids, std::function func, const framework::Scope& scope) { for (size_t i = 0; i < nccl_ids.size(); ++i) { @@ -51,8 +51,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, var, platform::errors::NotFound("Variable with name %s is not found", var_name.c_str())); - auto nccl_id = var->GetMutable(); - memcpy(nccl_id, &nccl_ids[i], sizeof(mcclUniqueId)); + auto nccl_id = var->GetMutable(); + memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); } } @@ -130,7 +130,7 @@ class GenNCCLIdOp : public framework::OperatorBase { << ", trainers:" << ss.str(); int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(nccl_comm_num); /// 1. init flat diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index a1e09d2c35cbb8..7a9c02628088fd 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
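The c_wait_comm / c_wait_compute hunks above keep only the HIP and CUDA branches of one primitive: record an event on one stream and make the other stream wait on it. A self-contained CUDA-runtime sketch of that pattern follows; the kernel and sizes are placeholders, not Paddle code.

#include <cuda_runtime.h>
#include <cstdio>

#define CHECK_CUDA(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
  printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

__global__ void produce(float* p, int n) {  // stands in for a compute kernel
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i] = static_cast<float>(i);
}

int main() {
  const int n = 1 << 20;
  float* buf = nullptr;
  CHECK_CUDA(cudaMalloc(&buf, n * sizeof(float)));

  cudaStream_t compute_stream, comm_stream;
  CHECK_CUDA(cudaStreamCreate(&compute_stream));
  CHECK_CUDA(cudaStreamCreate(&comm_stream));

  // Event with timing disabled: it is only a synchronization token,
  // not a profiling marker.
  cudaEvent_t event;
  CHECK_CUDA(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

  // "Compute" work on the compute stream ...
  produce<<<(n + 255) / 256, 256, 0, compute_stream>>>(buf, n);
  // ... then make the communication stream wait for it (the c_wait_compute
  // direction; c_wait_comm is the same pattern with the two streams swapped).
  CHECK_CUDA(cudaEventRecord(event, compute_stream));
  CHECK_CUDA(cudaStreamWaitEvent(comm_stream, event, 0));

  // Work enqueued on comm_stream from here on starts only after `produce` finishes.
  CHECK_CUDA(cudaMemsetAsync(buf, 0, n * sizeof(float), comm_stream));

  CHECK_CUDA(cudaStreamSynchronize(comm_stream));
  CHECK_CUDA(cudaEventDestroy(event));
  CHECK_CUDA(cudaStreamDestroy(compute_stream));
  CHECK_CUDA(cudaStreamDestroy(comm_stream));
  CHECK_CUDA(cudaFree(buf));
  return 0;
}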
*/ #include "paddle/fluid/operators/collective/global_gather_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -31,8 +31,8 @@ namespace operators { template struct GlobalGatherFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -73,7 +73,7 @@ struct GlobalGatherFunctor { cpu_global_count_data = cpu_global_count.data(); } - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -165,11 +165,11 @@ struct GlobalGatherFunctor { auto send_buf = x->data(); auto recv_buf = out->data(); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + send_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, @@ -179,7 +179,7 @@ struct GlobalGatherFunctor { send_ptr += cpu_global_count_data[idx]; } if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, @@ -188,13 +188,13 @@ struct GlobalGatherFunctor { stream)); } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } } -// #else - // PADDLE_THROW( - // platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); @@ -205,8 +205,8 @@ struct GlobalGatherFunctor { template struct GlobalGatherProcessGroupFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -304,16 +304,14 @@ struct GlobalGatherProcessGroupFunctor { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -// #else -// PADDLE_THROW( -// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( 
platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 38a992d3baaa31..6b915d35be0430 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ namespace operators { template struct GlobalScatterFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -72,7 +72,7 @@ struct GlobalScatterFunctor { global_count_len = cpu_global_count.numel(); } - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -173,11 +173,11 @@ struct GlobalScatterFunctor { auto recv_buf = out->data(); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, @@ -186,7 +186,7 @@ struct GlobalScatterFunctor { stream)); } if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( recv_buf + recv_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, @@ -196,14 +196,14 @@ struct GlobalScatterFunctor { recv_ptr += cpu_global_count_data[idx]; } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } } -// #else -// PADDLE_THROW( -// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); @@ -214,8 +214,8 @@ struct GlobalScatterFunctor { template struct GlobalScatterProcessGroupFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -311,16 +311,14 @@ struct GlobalScatterProcessGroupFunctor { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif 
defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -// #else -// PADDLE_THROW( -// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc index d53a92369df401..b4773a8eb54562 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc @@ -31,8 +31,8 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, double, int, int64_t, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 863850b6e38396..b0cdabce48503a 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/partial_allgather_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,11 +32,11 @@ template class PartialAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); int64_t numel = in->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int nranks = ctx.Attr("nranks"); @@ -128,10 +128,10 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { const T* send_buff = in->data() + offset; T* recv_buff = out->data(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(send_buff, + platform::dynload::ncclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } @@ -155,9 +155,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_allgather, ops::PartialAllGatherOpCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index fdfb31e7b2eab1..c8844058696e14 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
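partial_allgather above falls back to a direct ncclAllGather when the new comm context is not available. A single-rank sketch of that call, with Paddle's ToNCCLDataType and offset bookkeeping left out; all names below are illustrative.

#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <vector>

int main() {
  int dev = 0;
  ncclComm_t comm;
  ncclCommInitAll(&comm, 1, &dev);   // single-process, single-GPU communicator
  cudaSetDevice(dev);
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  const size_t send_numel = 4;       // per-rank chunk size
  int nranks = 0;
  ncclCommCount(comm, &nranks);      // 1 here; recv buffer holds nranks * send_numel

  float *send_buf = nullptr, *recv_buf = nullptr;
  cudaMalloc(&send_buf, send_numel * sizeof(float));
  cudaMalloc(&recv_buf, nranks * send_numel * sizeof(float));
  std::vector<float> h = {0.f, 1.f, 2.f, 3.f};
  cudaMemcpyAsync(send_buf, h.data(), send_numel * sizeof(float),
                  cudaMemcpyHostToDevice, stream);

  // Each rank contributes send_numel elements; every rank receives the
  // concatenation of all contributions, ordered by rank.
  ncclAllGather(send_buf, recv_buf, send_numel, ncclFloat, comm, stream);

  cudaStreamSynchronize(stream);
  cudaFree(send_buf);
  cudaFree(recv_buf);
  cudaStreamDestroy(stream);
  ncclCommDestroy(comm);
  return 0;
}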
*/ #include "paddle/fluid/operators/collective/partial_recv_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ template class PartialRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) - // NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 auto out = ctx.Output("Out"); auto out_dims = out->dims(); auto numel = out->numel(); @@ -142,7 +142,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { peer, nranks)); - mcclDataType_t dtype = platform::ToNCCLDataType(type); + ncclDataType_t dtype = platform::ToNCCLDataType(type); if (comm_ctx) { auto recv_buf = distributed::GetPartialTensor(*out, offset, recv_numel); @@ -150,7 +150,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { comm_ctx->Recv(&recv_buf, recv_numel, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclRecv(out->data() + offset, + platform::dynload::ncclRecv(out->data() + offset, recv_numel, dtype, peer, @@ -180,9 +180,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_recv, ops::PartialRecvOpCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index d395f3a5febb34..39858b3ed37a26 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/partial_send_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ template class PartialSendCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) - // NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); int numel = x->numel(); int rid = ctx.Attr("ring_id"); @@ -136,7 +136,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { peer, nranks)); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); if (comm_ctx) { @@ -145,7 +145,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { comm_ctx->Send(send_buf, send_numel, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclSend(x->data() + offset, + platform::dynload::ncclSend(x->data() + offset, send_numel, dtype, peer, @@ -176,9 +176,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_send, ops::PartialSendCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 283e75d7a53e87..41c2e70df8c35f 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/recv_v2_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -29,7 +29,8 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 framework::DDim recv_shape_info(const platform::Place &place, const gpuStream_t &stream, platform::NCCLComm *comm, @@ -46,7 +47,7 @@ framework::DDim recv_shape_info(const platform::Place &place, } phi::DataType shape_dtype = phi::DataType::INT32; - mcclDataType_t nccl_dtype = + ncclDataType_t nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dtype)); // step1: recv the shape size @@ -59,7 +60,7 @@ framework::DDim recv_shape_info(const platform::Place &place, if (comm_ctx) { comm_ctx->Recv(&gpu_shape_size_tensor, 1, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( gpu_data, 1, nccl_dtype, peer, comm->comm(), stream)); } } @@ -89,7 +90,7 @@ framework::DDim recv_shape_info(const platform::Place &place, if (comm_ctx) { comm_ctx->Recv(&gpu_shape_tensor, shape_size, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( gpu_shape_data, shape_size, nccl_dtype, peer, comm->comm(), stream)); } } @@ -123,7 +124,8 @@ template class RecvOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 int rid = ctx.Attr("ring_id"); bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( @@ -214,7 +216,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { int data_type = ctx.Attr("dtype"); framework::proto::VarType::Type type = framework::proto::VarType::Type(data_type); - mcclDataType_t dtype = platform::ToNCCLDataType(type); + ncclDataType_t dtype = platform::ToNCCLDataType(type); auto *out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { @@ -233,7 +235,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Recv(out, numel, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << common::product(out_dims) << " from " << peer; @@ -272,7 +274,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { "be less than comm->nranks (%d).", peer, comm->nranks())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << common::product(out->dims()) << " from " << peer; @@ -297,9 +299,9 @@ PD_REGISTER_STRUCT_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && 
CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, int8_t, diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 5ad3124b32017d..86be6908e3cd28 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/send_v2_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -28,7 +28,8 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 void send_shape_info(const phi::DenseTensor& x, const platform::Place& place, const gpuStream_t& stream, @@ -45,7 +46,7 @@ void send_shape_info(const phi::DenseTensor& x, "to send the shape info.")); } phi::DataType shape_dtype = phi::DataType::INT32; - mcclDataType_t nccl_dtype = + ncclDataType_t nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dtype)); auto dims = x.dims(); int shape_size = dims.size(); @@ -72,7 +73,7 @@ void send_shape_info(const phi::DenseTensor& x, comm_ctx->Send(*gpu_shape_size_tensor, 1, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclSend(gpu_shape_size_tensor->data(), + platform::dynload::ncclSend(gpu_shape_size_tensor->data(), 1, nccl_dtype, peer, @@ -105,7 +106,7 @@ void send_shape_info(const phi::DenseTensor& x, comm_ctx->Send(*gpu_shape_tensor, shape_size, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclSend(gpu_shape_tensor->data(), + platform::dynload::ncclSend(gpu_shape_tensor->data(), shape_size, nccl_dtype, peer, @@ -121,7 +122,8 @@ template class SendOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 int rid = ctx.Attr("ring_id"); bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( @@ -215,12 +217,12 @@ class SendOpV2CUDAKernel : public framework::OpKernel { VLOG(3) << "LodTensorArray: idx(" << idx << ")"; auto& x = x_array.at(idx); int numel = x.numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x.dtype())); if (comm_ctx) { comm_ctx->Send(x, numel, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x.data(), numel, dtype, peer, comm->comm(), stream)); } VLOG(3) << "rank " << comm->rank() << " send " @@ -245,9 +247,9 @@ class SendOpV2CUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Send(*x, numel, peer, stream); } else { - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << common::product(x->dims()) << " to " << peer; @@ -272,9 +274,9 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, ops::SendOpV2CUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, int8_t, diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index d5419d2b13a4e0..0f04a295ed263f 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -77,7 +77,7 @@ class ConditionalOp : public framework::OperatorBase { ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::DenseTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index b44be01ca1a8e2..94b946e43dc7a1 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -222,7 +222,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, paddle::operators::FeedSparseCooTensorKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 3fb50e695d1a36..9262ca59af970b 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -26,7 +26,7 @@ namespace imperative { class OpBase; } // namespace imperative } // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -34,7 +34,7 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return platform::GetGPUDeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index ef0dccff7197f0..8ddce0da7faacc 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -227,7 +227,7 @@ bool GetCondData(const phi::DenseTensor &cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git 
a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index da1eec366937d8..509c067e24e421 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -216,7 +216,7 @@ class DataNormGradKernel : public framework::OpKernel { d_batch_square_sum); if (need_sync_stats) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int rid = 0; platform::NCCLComm *comm = nullptr; const auto &comm_context_manager = @@ -247,59 +247,59 @@ class DataNormGradKernel : public framework::OpKernel { } if (comm_ctx) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm_ctx->GetNcclComm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm_ctx->GetNcclComm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm_ctx->GetNcclComm(), stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 688178ac7b5825..d38a72556f7596 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -11,7 +11,7 @@ function(detection_library TARGET_NAME) set(srcs) # filter cuda source file when not build with cuda/rocm foreach(src ${detection_library_SRCS}) - if(NOT 
WITH_GPU AND NOT WITH_ROCM AND NOT WITH_MUSA) + if(NOT WITH_GPU AND NOT WITH_ROCM) if(${src} MATCHES ".*\\.cc$") list(APPEND srcs ${src}) endif() @@ -57,7 +57,7 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(TMPDEPS memory) if(WITH_GPU) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 945678dfd96acd..adb60a8a8d0642 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 6f203e9cca7379..b2bbd9c82095c8 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index 807f7e907e5ce4..d954ea1bf82af7 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -15,7 +15,6 @@ limitations under the License. 
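The bbox_util.cu.h and collect_fpn_proposals_op.cu hunks above go back to selecting cub/cub.cuh under __NVCC__ and hipCUB under __HIPCC__. Below is a sketch of that include pattern together with CUB's two-phase temp-storage idiom; the file name is hypothetical and the body is written against the CUDA path only.

// radix_sort_demo.cu -- hypothetical name. The include block mirrors the restored
// guard: plain CUB under NVCC, hipCUB aliased to cub under HIPCC (the same alias
// margin_cross_entropy_op.cu uses).
#ifdef __NVCC__
#include <cuda_runtime.h>
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include <cstdio>

// The body below uses the CUDA runtime; under HIP the same cub:: calls apply,
// but the memory management would use the hip* equivalents.
int main() {
  const int n = 4;
  float h_keys[n] = {3.f, 1.f, 4.f, 2.f};
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_keys, n * sizeof(float), cudaMemcpyHostToDevice);

  // CUB's two-phase pattern: the first call only reports the temp-storage
  // size, the second call performs the sort.
  void* temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceRadixSort::SortKeys(temp, temp_bytes, d_in, d_out, n);
  cudaMalloc(&temp, temp_bytes);
  cub::DeviceRadixSort::SortKeys(temp, temp_bytes, d_in, d_out, n);

  cudaMemcpy(h_keys, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("%.0f %.0f %.0f %.0f\n", h_keys[0], h_keys[1], h_keys[2], h_keys[3]);
  cudaFree(temp);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}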
*/ #pragma once #include "paddle/fluid/operators/clip_by_norm_op.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/clip_by_norm_kernel.h" #include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" @@ -26,49 +25,48 @@ template class DGCClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(false, "not supported"); - // auto rampup_begin_step = ctx.Attr("rampup_begin_step"); - // if (static_cast(rampup_begin_step) < 0) { - // return; - // } + auto rampup_begin_step = ctx.Attr("rampup_begin_step"); + if (static_cast(rampup_begin_step) < 0) { + return; + } - // auto current_step_tensor = ctx.Input("current_step"); - // auto* current_step = current_step_tensor->data(); + auto current_step_tensor = ctx.Input("current_step"); + auto* current_step = current_step_tensor->data(); - // VLOG(10) << "current_step:" << *current_step - // << ", rampup_begin_step:" << rampup_begin_step; + VLOG(10) << "current_step:" << *current_step + << ", rampup_begin_step:" << rampup_begin_step; - // if (static_cast(*current_step) < static_cast(rampup_begin_step)) { - // VLOG(10) << "current_step:" << *current_step - // << " < rampup_begin_step:" << rampup_begin_step - // << " so does't use dgc_clip_by_norm"; - // return; - // } + if (static_cast(*current_step) < static_cast(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc_clip_by_norm"; + return; + } - // auto in_var = ctx.InputVar("X"); - // auto max_norm = ctx.Attr("max_norm"); - // auto& dev_ctx = ctx.device_context(); + auto in_var = ctx.InputVar("X"); + auto max_norm = ctx.Attr("max_norm"); + auto& dev_ctx = ctx.device_context(); - // if (in_var->IsType()) { - // auto* x = ctx.Input("X"); - // auto* y = ctx.Output("Out"); - // return phi::ClipByNormKernel( - // static_cast::TYPE&>(dev_ctx), - // *x, - // max_norm, - // y); - // } else if (in_var->IsType()) { - // auto* x = ctx.Input("X"); - // phi::SelectedRows* output_selected_rows = - // ctx.Output("Out"); - // return phi::sr::ClipByNormKernel( - // static_cast::TYPE&>(dev_ctx), - // *x, - // max_norm, - // output_selected_rows); - // } + if (in_var->IsType()) { + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Out"); + return phi::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + y); + } else if (in_var->IsType()) { + auto* x = ctx.Input("X"); + phi::SelectedRows* output_selected_rows = + ctx.Output("Out"); + return phi::sr::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + output_selected_rows); + } }; }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 1b2dc157fb4022..face0f758f8484 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -32,14 +32,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ #include #elif defined(__HIPCC__) #include -#elif defined(__MUSACC__) -#include -#include #endif #include @@ -314,7 +311,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template GetReduceDim(const framework::DDim &in, return phi::funcs::GetReduceDim(in, out, axis); } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 8be70c6fc8e933..4c2dd992657812 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,7 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 976ce30d2f0be9..bdf8a80debb649 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -193,8 +193,6 @@ struct FindChannelAbsMaxFunctor { #ifdef PADDLE_WITH_HIP hipMemset(out_abs_max, 0, sizeof(T) * cout); -#elif defined(PADDLE_WITH_MUSA) - musaMemset(out_abs_max, 0, sizeof(T) * cout); #else cudaMemset(out_abs_max, 0, sizeof(T) * cout); #endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 942dd94f4dca22..ced20a0108a527 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -32,16 +32,16 @@ if(WITH_XPU) op_library(fused_feedforward_op) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) # fused_bn_activation_op needs cudnn 7.4.1 above # HIP not support bn act fuse in MIOPEN - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) op_library(fused_bn_activation_op) endif() # HIP not support cudnnTransformTensor # fusion_conv_inception_op needs cudnn 7 above # HIP not support cudnnConvolutionBiasActivationForward - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) op_library(fusion_conv_inception_op) endif() op_library(yolo_box_head_op) @@ -53,12 +53,12 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) endif() # fused_bn_add_activation # HIP not support bn act fuse in MIOPEN - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) op_library(fused_bn_add_activation_op) endif() # fused_dropout # only support CUDA - if(NOT WITH_ROCM AND NOT WITH_MUSA) + if(NOT WITH_ROCM) op_library(fused_feedforward_op) # fused_attention_op op_library(fused_attention_op) @@ -66,7 +66,7 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) 
op_library(fused_multi_transformer_int8_op) endif() # resnet_unit needs cudnn 8.0 above - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) op_library(resnet_unit_op) endif() diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 6b3e435529e715..8ea1e11cd29f41 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h index c37b6e2307b585..b198c4a5792912 100644 --- a/paddle/fluid/operators/fused/fused_attention_utils.h +++ b/paddle/fluid/operators/fused/fused_attention_utils.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -34,7 +34,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int ring_id, const phi::GPUContext &dev_ctx) { if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -86,10 +86,10 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; } if (comm_ctx) { - comm_ctx->AllReduce(&tensor, tensor, mcclSum, stream); + comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( - sendbuff, recvbuff, numel, dtype, mcclSum, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); } } #else diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 7081180ea67667..ccd099109487c9 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include -#include +#include +#include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index ad73be604fddb2..40717402846db5 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ #pragma once -#include +#include #include #include @@ -39,7 +39,7 @@ limitations under the License. 
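The AllReduce helper above picks the wire type through platform::ToNCCLDataType before issuing ncclAllReduce. A reduced stand-in for that mapping, covering only a few element types; ToNcclDtype is a hypothetical name, not Paddle's helper.

#include <nccl.h>
#include <cstdint>
#include <type_traits>

// Map a C++ element type to the ncclDataType_t the collective calls expect.
// Only a handful of types are shown; Paddle's real helper also covers
// float16, bfloat16, uint8, and bool.
template <typename T>
constexpr ncclDataType_t ToNcclDtype() {
  static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value ||
                    std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value,
                "unsupported element type in this sketch");
  return std::is_same<T, float>::value     ? ncclFloat32
         : std::is_same<T, double>::value  ? ncclFloat64
         : std::is_same<T, int32_t>::value ? ncclInt32
                                           : ncclInt64;
}

int main() {
  static_assert(ToNcclDtype<float>() == ncclFloat32, "float maps to ncclFloat32");
  static_assert(ToNcclDtype<int64_t>() == ncclInt64, "int64_t maps to ncclInt64");
  return 0;
}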
*/ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/fusion/gpu/attn_gemm.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -61,7 +61,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int count, const phi::GPUContext &ctx) { if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -117,10 +117,10 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; } if (comm_ctx) { - comm_ctx->AllReduce(&tensor, tensor, mcclSum, stream); + comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( - sendbuff, recvbuff, count, dtype, mcclSum, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, count, dtype, ncclSum, comm->comm(), stream)); } } #else diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index e78579a27c1a94..362860aa23bdf7 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -150,34 +150,6 @@ void FusedSeqpoolCVM(const framework::ExecutionContext lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); -#elif defined(PADDLE_WITH_MUSA) - T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); - platform::GpuMemcpyAsync(gpu_input_values, - input_data.data(), - input_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - T **gpu_output_values = - reinterpret_cast(&gpu_input_values[input_data.size()]); - platform::GpuMemcpyAsync(gpu_output_values, - output_data.data(), - output_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - T **gpu_seqpool_output_values = - reinterpret_cast(&gpu_output_values[output_data.size()]); - platform::GpuMemcpyAsync(gpu_seqpool_output_values, - seqpool_output_data.data(), - seqpool_output_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - size_t **lods_values = reinterpret_cast( - &gpu_seqpool_output_values[seqpool_output_data.size()]); - platform::GpuMemcpyAsync(lods_values, - lods.data(), - lods.size() * sizeof(size_t *), - musaMemcpyHostToDevice, - stream); #else T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, @@ -384,37 +356,6 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); -#elif defined(PADDLE_WITH_MUSA) - T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); - platform::GpuMemcpyAsync(gpu_out_grads_values, - out_grads_data.data(), - out_grads_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - - T **gpu_in_grads_values = - reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]); - platform::GpuMemcpyAsync(gpu_in_grads_values, - in_grads_data.data(), - in_grads_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - - T **gpu_cvm_values = - 
reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]); - platform::GpuMemcpyAsync(gpu_cvm_values, - cvm_data.data(), - cvm_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - - size_t **lods_values = - reinterpret_cast(&gpu_cvm_values[cvm_data.size()]); - platform::GpuMemcpyAsync(lods_values, - lods.data(), - lods.size() * sizeof(size_t *), - musaMemcpyHostToDevice, - stream); #else T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index c6fe13548033ac..72bb97a2aae9ee 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -255,9 +255,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - bbox_count_device_ptr, &bbox_count, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), cudaMemcpyHostToDevice); @@ -271,9 +268,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - &bbox_count, bbox_count_device_ptr, sizeof(int), musaMemcpyDeviceToHost); #else cudaMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), cudaMemcpyDeviceToHost); @@ -289,9 +283,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipFree(bbox_tensor); hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); -#elif defined(PADDLE_WITH_MUSA) - musaFree(bbox_tensor); - musaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); #else cudaFree(bbox_tensor); cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); @@ -305,9 +296,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - bbox_index_device_ptr, &bbox_index, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), cudaMemcpyHostToDevice); @@ -368,13 +356,6 @@ class YoloBoxPostKernel : public framework::OpKernel { anchors.data(), anchors.size() * sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc(reinterpret_cast(&device_anchors), - anchors.size() * sizeof(int)); - musaMemcpy(device_anchors, - anchors.data(), - anchors.size() * sizeof(int), - musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); @@ -407,10 +388,6 @@ class YoloBoxPostKernel : public framework::OpKernel { hipMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc( - reinterpret_cast(&ts_info[i].bboxes_dev_ptr), - ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); #else cudaMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), @@ -421,9 +398,6 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), - sizeof(int)); #else cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), 
sizeof(int)); @@ -435,8 +409,6 @@ class YoloBoxPostKernel : public framework::OpKernel { int* bbox_index_device_ptr; #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #else cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #endif @@ -484,12 +456,6 @@ class YoloBoxPostKernel : public framework::OpKernel { ts_info[ts_id].bboxes_dev_ptr, ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpyAsync( - ts_info[ts_id].bboxes_host_ptr, - ts_info[ts_id].bboxes_dev_ptr, - ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), - musaMemcpyDeviceToHost); #else cudaMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, @@ -568,8 +534,6 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(bbox_index_device_ptr); -#elif defined(PADDLE_WITH_MUSA) - musaFree(bbox_index_device_ptr); #else cudaFree(bbox_index_device_ptr); #endif @@ -577,9 +541,6 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(ts_info[i].bboxes_dev_ptr); hipFree(ts_info[i].bbox_count_device_ptr); -#elif defined(PADDLE_WITH_MUSA) - musaFree(ts_info[i].bboxes_dev_ptr); - musaFree(ts_info[i].bbox_count_device_ptr); #else cudaFree(ts_info[i].bboxes_dev_ptr); cudaFree(ts_info[i].bbox_count_device_ptr); diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index c6a8a4fe7b9822..8ae92b04b7df44 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,7 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index b45fdd9619a61d..b4e0f511f6d61b 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -32,9 +32,6 @@ limitations under the License. 
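After the revert, yolo_box_post_op.cu keeps only the HIP and CUDA branches of plain runtime-API allocation, copy, and free calls. A compact sketch of that lifecycle with error checking added; the anchor values are made up and CHECK_CUDA is not a Paddle macro.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

#define CHECK_CUDA(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
  printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

int main() {
  // Same lifecycle as the anchor/bbox buffers above: allocate on the device,
  // copy host data in, use it, copy results back, free.
  std::vector<int> anchors = {10, 13, 16, 30, 33, 23};
  int* device_anchors = nullptr;
  CHECK_CUDA(cudaMalloc(reinterpret_cast<void**>(&device_anchors),
                        anchors.size() * sizeof(int)));
  CHECK_CUDA(cudaMemcpy(device_anchors, anchors.data(),
                        anchors.size() * sizeof(int), cudaMemcpyHostToDevice));

  std::vector<int> round_trip(anchors.size(), 0);
  CHECK_CUDA(cudaMemcpy(round_trip.data(), device_anchors,
                        anchors.size() * sizeof(int), cudaMemcpyDeviceToHost));
  printf("first anchor: %d\n", round_trip[0]);

  CHECK_CUDA(cudaFree(device_anchors));
  return 0;
}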
*/ #ifdef PADDLE_WITH_HIP #include #include -#elif defined(PADDLE_WITH_MUSA) -#include -#include #else #include #include @@ -98,12 +95,6 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); -#elif defined(PADDLE_WITH_MUSA) - murandState rng; - murand_init(rand_seed * gridDim.x + blockIdx.x, - threadIdx.y * WARP_SIZE + threadIdx.x, - 0, - &rng); #else curandState rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -137,8 +128,6 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); -#elif defined(PADDLE_WITH_MUSA) - const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 3530beda000b4e..c88d36602bd79c 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index ea38db87e63e7d..dea3ce3fe695b8 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index e1e9ca5ef66673..8c123bb8a32f22 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 5c03b7395a4f24..5352ccc99df92e 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,7 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +82,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +99,7 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +116,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 3918ba54599808..8f0b705c8de79f 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,7 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 197aaa74bb3e13..dd85ccff87f2d2 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,7 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index da8ea875e93938..94b03197291174 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor { if (std::is_same::value) { Apply(static_cast(dev_ctx)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) Apply(static_cast(dev_ctx)); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 3f0ccf3bf40ffb..edd8b20da160c5 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -221,9 +221,6 @@ struct LookupTableV2GradCUDAFunctor { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); -#elif 
defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 216e9863a5e277..75ef56accb10b4 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -16,8 +16,6 @@ #ifdef PADDLE_WITH_HIP #include namespace cub = hipcub; -#elif defined(PADDLE_WITH_MUSA) - #else #include #endif @@ -38,7 +36,7 @@ namespace cub = hipcub; #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -74,7 +72,7 @@ void GetClassInterval(const gpuStream_t& stream, return; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DenseTensor num_classes_per_device; phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device); int* num_classes_per_device_ptr = num_classes_per_device.data(); @@ -125,15 +123,15 @@ void GetClassInterval(const gpuStream_t& stream, if (comm_ctx) { comm_ctx->AllReduce(&num_classes_per_device, num_classes_per_device, - mcclSum, + ncclSum, calcu_stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), phi::ToNCCLDataType(num_classes_per_device.dtype()), - mcclSum, + ncclSum, comm->comm(), calcu_stream)); } @@ -272,7 +270,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, DenseTensor* loss) { const auto& place = dev_ctx.GetPlace(); // old code -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) paddle::platform::NCCLComm* comm = nullptr; const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); @@ -407,7 +405,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, phi::kps::IdentityFunctor(), {1}); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -421,14 +419,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(&logits_max, logits_max, mcclMax, stream); + comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(logits_max_buff, + phi::dynload::ncclAllReduce(logits_max_buff, logits_max_buff, logits_max.numel(), phi::ToNCCLDataType(logits_max.dtype()), - mcclMax, + ncclMax, comm->comm(), stream)); } @@ -452,7 +450,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, phi::kps::ExpFunctor(), {1}); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -466,14 
+464,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, mcclSum, stream); + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), phi::ToNCCLDataType(sum_exp_logits.dtype()), - mcclSum, + ncclSum, comm->comm(), stream)); } @@ -514,7 +512,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, class_interval.data()); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -528,14 +526,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(loss, *loss, mcclSum, stream); + comm_ctx->AllReduce(loss, *loss, ncclSum, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(loss_ptr, + phi::dynload::ncclAllReduce(loss_ptr, loss_ptr, loss->numel(), phi::ToNCCLDataType(loss->dtype()), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index d1e0a772f3eaa6..76e27380b90e21 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -20,12 +20,6 @@ limitations under the License. */ #include // NOLINT #endif - -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include @@ -53,7 +47,7 @@ struct CUDATypeTraits { typedef float TYPE; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // This functor involves a fusion calculation in Ernie or Bert. 
// The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index 2b0d3432720dfa..857d870847ee8c 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -28,7 +28,7 @@ struct GRUUnitFunctor { const detail::ActivationType active_node, const detail::ActivationType active_gate, bool origin_mode) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, @@ -92,7 +92,7 @@ struct GRUUnitGradFunctor { const detail::ActivationType active_node, const detail::ActivationType active_gate, bool origin_mode) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) detail::backward_state_grad(detail::backward::gru_stateGrad(), value, grad, @@ -182,7 +182,7 @@ struct GRUUnitFunctorV2 { int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(CblasNoTrans, @@ -234,7 +234,7 @@ struct GRUUnitGradFunctorV2 { int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) // calculate grad_update_gate, grad_frame_state, // grad_reset_output, grad_reset_gate detail::cpu_gru_backward(context, diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 792a08423be0ac..3032b78a2029d0 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 1762353abaa9f2..00ff1fbcbc38db 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -23,7 +23,7 @@ namespace paddle { namespace operators { namespace math { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template class PreluChannelWiseDirectCUDAFunctor { public: diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 87fe1ee33f0f15..bf028c4ada3695 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -160,11 +160,6 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, s_data, sizeof(int64_t) * num_samples, hipMemcpyHostToDevice)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(samples_data + num_true, - s_data, - sizeof(int64_t) * num_samples, - musaMemcpyHostToDevice)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index da8c22aa67bbb3..524ba826a57047 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -106,7 +106,7 @@ class SampleWithProb { } }; 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template class GPUSampleWithProb { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index f082189fa0f370..895a427bae6e20 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -98,7 +98,7 @@ ComputeMatmulImpl(const framework::ExecutionContext &context) { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) head_number = context.Attr("head_number"); #endif @@ -112,7 +112,7 @@ ComputeMatmulImpl(const framework::ExecutionContext &context) { } } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); if (head_number > 1) { @@ -271,7 +271,7 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) if (context.HasAttr("head_number")) { head_number = context.Attr("head_number"); } @@ -403,7 +403,7 @@ class MatMulDoubleGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) head_number = context.Attr("head_number"); #endif @@ -645,7 +645,7 @@ class MatMulOp : public framework::OperatorWithKernel { } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) int head_number = context->Attrs().Get("head_number"); bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); if (context->IsRuntime()) { @@ -788,7 +788,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) AddAttr("head_number", "The number of heads of the matrix") .SetDefault(1); #endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index a4b6e061bfdff0..5f480461d77cdb 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -39,7 +39,7 @@ class MemcpyH2DFunctor { void operator()(const phi::DenseTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 935b93d1c3ae31..3ed27460e16b6c 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -68,7 +68,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       framework::TensorCopy(
           mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
 #else
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 580ea2da8721cd..64bc176d971492 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -157,6 +157,6 @@ REGISTER_OPERATOR(minus,
                   ops::MinusGradMaker);
 PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {}
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {}
 #endif
diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
index 2d079c8ef521d6..629b41b4b582b7 100644
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT (WITH_NCCL OR WITH_RCCL OR WITH_MCCL))
+if(NOT (WITH_NCCL OR WITH_RCCL))
   return()
 endif()
@@ -16,14 +16,7 @@ if(WITH_ROCM AND NOT WIN32)
     DEPS device_context operator)
 endif()
-if(WITH_MUSA AND NOT WIN32)
-  musa_library(
-    nccl_common
-    SRCS nccl_gpu_common.cc
-    DEPS device_context operator)
-endif()
-
-if(WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if(WITH_GPU OR WITH_ROCM)
   op_library(nccl_op DEPS nccl_common)
   set(OPERATOR_DEPS
       ${OPERATOR_DEPS}
       nccl_common
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
index 4916d71b2f73a0..9f7d967a84708e 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -18,7 +18,7 @@ namespace paddle {
 namespace platform {
 namespace {
 // TODO(panyx0718): Where to destroy them.
-std::unique_ptr> global_comms;
+std::unique_ptr> global_comms;
 std::unique_ptr> comm_id_map;
 bool inited = false;
 size_t last_num_gpus = -1;
@@ -41,21 +41,21 @@ void Communicator::InitAll(const std::vector& gpus) {
   if (global_comms) {
     for (size_t i = 0; i < global_comms->size(); ++i) {
       // FIXME(dzh) : PADDLE_ENFORCE return void
-      dynload::mcclCommDestroy((*global_comms)[i]);
+      dynload::ncclCommDestroy((*global_comms)[i]);
     }
   }
-  global_comms = std::make_unique>();
+  global_comms = std::make_unique>();
   comm_id_map = std::make_unique>();
   global_comms->resize(gpus.size());
   for (size_t i = 0; i < gpus.size(); ++i) {
     (*comm_id_map)[gpus[i]] = i;
   }
   PADDLE_ENFORCE_GPU_SUCCESS(
-      dynload::mcclCommInitAll(global_comms->data(), gpus.size(), gpus.data()));
+      dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data()));
   inited = true;
 }
-const std::vector& Communicator::comms() const {
+const std::vector& Communicator::comms() const {
   std::lock_guard guard(comm_mu);
   return *global_comms;
 }
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h
index 0427180d56c04f..01905d8ca84b3b 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -25,8 +25,6 @@ limitations under the License.
*/ #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include "paddle/fluid/platform/dynload/mccl.h" #else #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -44,7 +42,7 @@ struct Communicator { void InitAll(const std::vector& gpus); - const std::vector& comms() const; + const std::vector& comms() const; }; } // namespace platform diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index 7e9b2b1d4dd19f..8b06aa653c070f 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -105,8 +105,8 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { std::string reduction = ctx->Attrs().Get("reduction"); PADDLE_ENFORCE_EQ( - (reduction == "mcclSum" || reduction == "mcclProd" || - reduction == "mcclMin" || reduction == "mcclMax"), + (reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), true, platform::errors::InvalidArgument("invalid nccl reduction.")); @@ -124,9 +124,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of AllReduce op"); AddAttr("reduction", - "(string, default 'mcclSum') " - "{'mcclMin', 'mcclMax', 'mcclProd', 'mcclSum'}.") - .SetDefault("mcclSum"); + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); AddComment(R"DOC( NCCLAllReduce Operator. @@ -151,8 +151,8 @@ class NCCLReduceOp : public framework::OperatorWithKernel { std::string reduction = ctx->Attrs().Get("reduction"); PADDLE_ENFORCE_EQ( - (reduction == "mcclSum" || reduction == "mcclProd" || - reduction == "mcclMin" || reduction == "mcclMax"), + (reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), true, platform::errors::InvalidArgument("invalid nccl reduction.")); @@ -170,9 +170,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of Reduce op"); AddAttr("reduction", - "(string, default 'mcclSum') " - "{'mcclMin', 'mcclMax', 'mcclProd', 'mcclSum'}.") - .SetDefault("mcclSum"); + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); AddAttr("root", "(int, default kInvalidGPUId) " "Root gpu of the parameter. 
If not, " @@ -246,10 +246,10 @@ REGISTER_OPERATOR( ops::NCCLInitOpVarTypeInference, ops::NCCLInitOpShapeInference); -REGISTER_OP_WITHOUT_GRADIENT(mcclAllReduce, +REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(mcclBcast, +REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp, ops::NCCLBcastOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 7b99c47cf13c88..abb24cc8cae10d 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -27,33 +27,33 @@ class NCCLTypeWrapper; template <> class NCCLTypeWrapper { public: - static const mcclDataType_t type = mcclFloat; + static const ncclDataType_t type = ncclFloat; }; template <> class NCCLTypeWrapper { public: - static const mcclDataType_t type = mcclDouble; + static const ncclDataType_t type = ncclDouble; }; -static mcclRedOp_t str_to_nccl_red_type(std::string reduction) { - static const std::unordered_map str_to_type = { - {"mcclSum", mcclSum}, - {"mcclMin", mcclMin}, - {"mcclMax", mcclMax}, - {"mcclProd", mcclProd}, +static ncclRedOp_t str_to_nccl_red_type(std::string reduction) { + static const std::unordered_map str_to_type = { + {"ncclSum", ncclSum}, + {"ncclMin", ncclMin}, + {"ncclMax", ncclMax}, + {"ncclProd", ncclProd}, }; auto it = str_to_type.find(reduction); PADDLE_ENFORCE_EQ(it != str_to_type.end(), true, platform::errors::InvalidArgument( - "Invalid nccl reduction. Must be mcclMin | mcclMax | " - "mcclProd | mcclSum")); + "Invalid nccl reduction. Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); return it->second; } template -class mcclAllReduceKernel : public framework::OpKernel { +class NCCLAllReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), @@ -74,7 +74,7 @@ class mcclAllReduceKernel : public framework::OpKernel { << " invoke allreduce. send " << x->numel() << " recv " << out->numel(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllReduce(x->data(), + platform::dynload::ncclAllReduce(x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, @@ -115,7 +115,7 @@ class NCCLReduceKernel : public framework::OpKernel { VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclReduce(x->data(), + platform::dynload::ncclReduce(x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, @@ -144,7 +144,7 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, @@ -157,7 +157,7 @@ class NCCLBcastKernel : public framework::OpKernel { VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. 
recv buffer " << common::product(out->dims()); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBcast(out->mutable_data(ctx.GetPlace()), + platform::dynload::ncclBcast(out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, @@ -173,8 +173,8 @@ class NCCLBcastKernel : public framework::OpKernel { namespace ops = paddle::operators; PD_REGISTER_STRUCT_KERNEL( - mcclAllReduce, GPU, ALL_LAYOUT, ops::mcclAllReduceKernel, float) {} + ncclAllReduce, GPU, ALL_LAYOUT, ops::NCCLAllReduceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( - mcclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} + ncclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} PD_REGISTER_STRUCT_KERNEL( ncclReduce, GPU, ALL_LAYOUT, ops::NCCLReduceKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 8290da165800b5..6b0a36fc564721 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -30,13 +30,13 @@ #include "paddle/phi/kernels/funcs/tensor_to_string.h" #include "paddle/utils/optional.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #include "math.h" // NOLINT #endif @@ -74,8 +74,6 @@ static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { static_assert(!std::is_same::value, "T cannot be void."); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(x, 0, n * sizeof(T), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); #endif @@ -273,10 +271,6 @@ static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &cpu_value, ptr, sizeof(float), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( - &cpu_value, ptr, sizeof(float), musaMemcpyDeviceToHost, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &cpu_value, ptr, sizeof(float), cudaMemcpyDeviceToHost, stream)); @@ -901,14 +895,14 @@ static void MultiTensorUpdateLambParamAndBetaPows( #undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static bool CreatePreMulScaleOpIfSupported( - mcclDataType_t dtype, - mcclComm_t comm, + ncclDataType_t dtype, + ncclComm_t comm, const void *scale, - mcclRedOp_t *op, + ncclRedOp_t *op, distributed::NCCLCommContext *comm_ctx = nullptr) { -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_NOT_NULL( comm_ctx, @@ -919,32 +913,32 @@ static bool CreatePreMulScaleOpIfSupported( "But parameter of comm_ctx should not be nullptr.")); int ver = comm_ctx->GetNcclVersion(); if (ver >= 21100) { - VLOG(10) << "mcclRedOpCreatePreMulSum is supported."; + VLOG(10) << 
"ncclRedOpCreatePreMulSum is supported."; comm_ctx->RedOpCreatePreMulSum( - op, const_cast(scale), dtype, mcclScalarDevice); + op, const_cast(scale), dtype, ncclScalarDevice); return true; } } else { int ver; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver)); if (ver >= 21100) { - VLOG(10) << "mcclRedOpCreatePreMulSum is supported."; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpCreatePreMulSum( - op, const_cast(scale), dtype, mcclScalarDevice, comm)); + VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + op, const_cast(scale), dtype, ncclScalarDevice, comm)); return true; } } -// #endif - VLOG(10) << "mcclRedOpCreatePreMulSum is not supported."; +#endif + VLOG(10) << "ncclRedOpCreatePreMulSum is not supported."; return false; } static void DestoryOpIfSupported( - mcclRedOp_t op, - mcclComm_t comm, + ncclRedOp_t op, + ncclComm_t comm, distributed::NCCLCommContext *comm_ctx = nullptr) { -// #if NCCL_VERSION_CODE >= 21100 - VLOG(10) << "mcclRedOpDestroy starts"; +#if NCCL_VERSION_CODE >= 21100 + VLOG(10) << "ncclRedOpDestroy starts"; if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_NOT_NULL( @@ -956,12 +950,12 @@ static void DestoryOpIfSupported( "But parameter of comm_ctx should not be nullptr.")); comm_ctx->RedOpDestroy(op); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpDestroy(op, comm)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm)); } - VLOG(10) << "mcclRedOpDestroy ends"; + VLOG(10) << "ncclRedOpDestroy ends"; -// #endif - VLOG(10) << "mcclRedOpDestroy is not supported."; +#endif + VLOG(10) << "ncclRedOpDestroy is not supported."; } template @@ -986,11 +980,11 @@ static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, } template -static void mcclSumWithScaleBase(const T *sendbuff, +static void NCCLSumWithScaleBase(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - mcclComm_t comm, + ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, @@ -1022,9 +1016,9 @@ static void mcclSumWithScaleBase(const T *sendbuff, return; } - mcclRedOp_t op = mcclSum; - mcclDataType_t dtype = - std::is_same::value ? mcclFloat32 : mcclFloat16; + ncclRedOp_t op = ncclSum; + ncclDataType_t dtype = + std::is_same::value ? ncclFloat32 : ncclFloat16; bool should_destroy_op = scale && CreatePreMulScaleOpIfSupported( dtype, comm, scale, &op, comm_ctx); memory_utils::Buffer buffer(dev_ctx.GetPlace()); @@ -1040,7 +1034,7 @@ static void mcclSumWithScaleBase(const T *sendbuff, // TODO(BeingGod): NCCLCommContext::ReduceScatter only accept DenseTensor, // but sendbuff or recvbuff maybe allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclReduceScatter(sendbuff, + phi::dynload::ncclReduceScatter(sendbuff, recvbuff, recvcount, dtype, @@ -1051,7 +1045,7 @@ static void mcclSumWithScaleBase(const T *sendbuff, // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but sendbuff or recvbuff maybe allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(sendbuff, + phi::dynload::ncclAllReduce(sendbuff, recvbuff, recvcount, dtype, @@ -1061,10 +1055,10 @@ static void mcclSumWithScaleBase(const T *sendbuff, } } else { if (UseReduceScatter) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter( sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); } } @@ -1075,16 +1069,16 @@ static void mcclSumWithScaleBase(const T *sendbuff, } template -static void mcclReduceScatterWithScale(const T *sendbuff, +static void NCCLReduceScatterWithScale(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - mcclComm_t comm, + ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - mcclSumWithScaleBase(sendbuff, + NCCLSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, @@ -1096,16 +1090,16 @@ static void mcclReduceScatterWithScale(const T *sendbuff, } template -static void mcclAllReduceWithScale(const T *sendbuff, +static void NCCLAllReduceWithScale(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - mcclComm_t comm, + ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - mcclSumWithScaleBase(sendbuff, + NCCLSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, @@ -1246,10 +1240,6 @@ static std::string GetMinMaxStr(const T *x, size_t n, const phi::Place &place) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( - &ret_cpu[0], ret, 2 * sizeof(T), musaMemcpyDeviceToHost, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), cudaMemcpyDeviceToHost, stream)); @@ -1306,12 +1296,6 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) { sizeof(flag), hipMemcpyDeviceToHost, dev_ctx.stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(&flag, - out.Get(), - sizeof(flag), - musaMemcpyDeviceToHost, - dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag, out.Get(), @@ -1474,7 +1458,7 @@ void DistributedFusedLambKernel( DenseTensor *acc_step, DenseTensor *stop_update, DenseTensor *step) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); found_inf->Resize({1}); @@ -1772,7 +1756,7 @@ void DistributedFusedLambKernel( // Step 6: allreduce + global norm gradient clip int64_t global_rank = 0, local_rank = 0; - mcclComm_t global_comm = nullptr, local_comm = nullptr, + ncclComm_t global_comm = nullptr, local_comm = nullptr, external_comm = nullptr; paddle::platform::NCCLComm *nccl_comm_handle = nullptr, *local_nccl_comm_handle = nullptr; @@ -1884,7 +1868,7 @@ void DistributedFusedLambKernel( // (1) ReduceScater first if (local_shard) { if (use_hierarchical_allreduce) { - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( 
fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -1893,7 +1877,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -1903,7 +1887,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -1912,7 +1896,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -1922,7 +1906,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - mcclAllReduceWithScale(fp32_grad_data, + NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -1930,7 +1914,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclAllReduceWithScale(fp16_grad_data, + NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -1942,7 +1926,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - mcclReduceScatterWithScale(fp32_grad_data, + NCCLReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, nranks, @@ -1950,7 +1934,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclReduceScatterWithScale(fp16_grad_data, + NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, @@ -1973,11 +1957,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(fp32_square_grad_norm, + phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } @@ -2030,7 +2014,7 @@ void DistributedFusedLambKernel( << HasNanInf(dev_ctx, fp16_grad_data, fp16_numel); if (local_shard) { if (use_hierarchical_allreduce) { - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2040,7 +2024,7 @@ void DistributedFusedLambKernel( dev_ctx, local_comm_ctx, fp32_scale); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2049,7 +2033,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, external_comm_ctx); - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2059,7 +2043,7 @@ void DistributedFusedLambKernel( dev_ctx, local_comm_ctx, fp16_scale); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2069,7 +2053,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - mcclAllReduceWithScale(fp32_grad_data, + NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -2078,7 +2062,7 @@ void DistributedFusedLambKernel( dev_ctx, comm_ctx, fp32_scale); - mcclAllReduceWithScale(fp16_grad_data, + NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -2091,7 +2075,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - mcclReduceScatterWithScale(fp32_grad_data, + NCCLReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, nranks, @@ -2100,7 +2084,7 @@ void DistributedFusedLambKernel( dev_ctx, comm_ctx, fp32_scale); - mcclReduceScatterWithScale(fp16_grad_data, + NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, @@ -2125,11 +2109,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(fp32_square_grad_norm, + phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); VLOG(1) << "Grad square norm after all reduce: " @@ -2142,7 +2126,7 @@ void DistributedFusedLambKernel( } else { if (local_shard) { if (use_hierarchical_allreduce) { - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2151,7 +2135,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2160,7 +2144,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, external_comm_ctx); - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2169,7 +2153,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2179,7 +2163,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - mcclAllReduceWithScale(fp32_grad_data, + NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -2187,7 +2171,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclAllReduceWithScale(fp16_grad_data, + NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -2199,7 +2183,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - mcclReduceScatterWithScale(fp32_grad_data, + NCCLReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, num_devices, @@ -2207,7 +2191,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclReduceScatterWithScale(fp16_grad_data, + NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, num_devices, @@ -2227,11 +2211,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(fp32_square_grad_norm, + phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } @@ -2373,26 +2357,26 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but param_square_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(param_square_norm + fp32_global_param_num, + phi::dynload::ncclAllReduce(param_square_norm + fp32_global_param_num, param_square_norm + fp32_global_param_num, 2 * param_num - fp32_global_param_num, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } else { // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but trust_ratio_div_square_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(trust_ratio_div_square_norm, + phi::dynload::ncclAllReduce(trust_ratio_div_square_norm, trust_ratio_div_square_norm, param_num, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } - VLOG(10) << "mcclAllReduce done"; + VLOG(10) << "ncclAllReduce done"; } LogParamAndTrustRatioDivSquareNorm<1>( @@ -2417,7 +2401,7 @@ void DistributedFusedLambKernel( beta1, beta2); if (num_devices > 1) { - // mcclAllGather + // ncclAllGather if (local_comm_ctx) { auto send_buf = distributed::GetPartialTensor( *fp32_param_out, fp32_offset, fp32_numel_each_device); @@ -2426,10 +2410,10 @@ void DistributedFusedLambKernel( local_comm_ctx->AllGather(&recv_buf, send_buf, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllGather(fp32_param_data + fp32_offset, + phi::dynload::ncclAllGather(fp32_param_data + fp32_offset, fp32_param_data, fp32_numel_each_device, - mcclFloat32, + ncclFloat32, local_comm, stream)); } @@ -2455,7 +2439,7 @@ void DistributedFusedLambKernel( beta1, beta2); if (num_devices > 1) { - // mcclAllGather + // ncclAllGather if (local_comm_ctx) { auto send_buf = distributed::GetPartialTensor( *fp16_param_out, fp16_offset, fp16_numel_each_device); @@ -2464,10 +2448,10 @@ void DistributedFusedLambKernel( local_comm_ctx->AllGather(&recv_buf, send_buf, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllGather(fp16_param_data + fp16_offset, + phi::dynload::ncclAllGather(fp16_param_data + fp16_offset, fp16_param_data, fp16_numel_each_device, - mcclFloat16, + ncclFloat16, local_comm, stream)); } diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index 13d925bbe19a19..4c47fd2b621784 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -25,8 +25,7 @@ #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/common/amp_type_traits.h" -#if defined(__NVCC__) || defined(__MUSACC__) - +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ @@ -462,7 +461,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index.mutable_data({num_index}, ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) auto sort_value_ptr = sort_value.mutable_data({num_index}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index cc11601be0be61..4f118565396e11 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -107,7 +107,7 @@ PD_REGISTER_STRUCT_KERNEL(send_and_recv, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL(send_and_recv, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index b9f05d663dba08..ebdddfd41b33f5 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 24457c24a54ace..b73ffe4319be78 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -48,7 +48,7 @@ BufferedReader::BufferedReader( buffer_size_(buffer_size), pin_memory_(pin_memory) { VLOG(1) << "BufferedReader"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; // NOLINT compute_stream_ = @@ -118,7 +118,7 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // @{ Group GPU Place +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // @{ Group GPU Place if (platform::is_gpu_place(place_)) { TensorVec &cuda = cuda_buffer_[i]; if (cuda.empty()) { @@ -197,11 +197,6 @@ void BufferedReader::ReadAsync(size_t i) { hipEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index db849dc70b5da9..032a74b7e23f14 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -21,7 +21,7 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -80,7 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector xpu_buffer_; std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index d0bde6af204893..e69492501c1173 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/phi/kernels/cpu/reduce.h" -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) || defined(__MUSACC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) #include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce_grad.h" #endif @@ -757,7 +757,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. 
virtual std::string GetOpType() const = 0; }; -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) || defined(__MUSACC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) template class ReduceBaseOp, diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 1a26271a97f225..30d4fb0cf9ad4c 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -429,7 +429,7 @@ class ReshapeKernel { pt_scalar_shape, out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeInferKernel(static_cast(dev_ctx), @@ -462,7 +462,7 @@ class ReshapeGradKernel { phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( @@ -492,7 +492,7 @@ class ReshapeDoubleGradKernel { phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( @@ -764,7 +764,7 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 14b86627c3825d..f025d278074215 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -121,7 +121,7 @@ PD_REGISTER_KERNEL(save_sr, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 7e3de57345a4bc..2b7f884f6170c3 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -39,7 +39,7 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { } // when platform::is_gpu_place(mask.place()) is true std::unique_ptr cpu_mask{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 13133e54f04152..2236988025cbc3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -136,7 +136,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { const size_t *lod; size_t lod_count = 
x.lod()[0].size(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto xlod = x.lod()[0]; phi::MixVector mixv_xlod(&xlod); @@ -144,7 +144,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { } else { #endif lod = x.lod()[0].data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 316f8a55cc8034..01f7bb3e928902 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -26,8 +26,44 @@ template class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(false,"not support"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto& lod = x->lod(); + auto& dims = x->dims(); + + const size_t level = lod.size() - 1; + PADDLE_ENFORCE_EQ( + dims[0], + static_cast(lod[level].back()), + platform::errors::InvalidArgument( + "The first dimension of Input(X) should be equal to the sum of all " + "sequences' lengths. But received first dimension of Input(X) is " + "%d, the sum of all sequences' lengths is %d.", + dims[0], + static_cast(lod[level].back()))); + PADDLE_ENFORCE_EQ(dims[0], + x->numel(), + platform::errors::InvalidArgument( + "The width of each timestep in Input(X) of " + "SequenceSoftmaxOp should be 1.")); + + out->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = + // common::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL}); + common::make_ddim({1UL, end_pos - start_pos}); + x_i.Resize(dims_i); + out_i.Resize(dims_i); + phi::funcs::SoftmaxCUDNNFunctor()( + ctx.template device_context(), &x_i, &out_i); + } } }; @@ -35,7 +71,36 @@ template class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(false,"not support"); + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + if (x_grad) { + x_grad->set_lod(x->lod()); + } + auto& lod = x->lod(); + const size_t level = lod.size() - 1; + + x_grad->mutable_data(ctx.GetPlace()); // NOLINT + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = common::make_ddim({1UL, end_pos - start_pos}); + out_i.Resize(dims_i); + out_grad_i.Resize(dims_i); + x_grad_i.Resize(dims_i); + 
phi::funcs::SoftmaxGradCUDNNFunctor()( + ctx.template device_context(), + &out_i, + &out_grad_i, + &x_grad_i); + } } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index a037d0dcf73ccf..12d4f72a91169e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 3262bef2bf5e93..40a7a451a6e21a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -17,10 +17,6 @@ limitations under the License. */ #include #endif -#ifdef __MUSACC__ -#include -#endif - #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 16864b80b5c765..a0aa1f589191ff 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -151,32 +151,26 @@ class SetValueGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { - if (this->HasInput("ValueTensor")) { - op->SetType("set_value_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("ValueTensor", this->Input("ValueTensor")); - if (this->HasInput("StartsTensorList")) { - op->SetInput("StartsTensorList", this->Input("StartsTensorList")); - } - if (this->HasInput("EndsTensorList")) { - op->SetInput("EndsTensorList", this->Input("EndsTensorList")); - } - if (this->HasInput("StepsTensorList")) { - op->SetInput("StepsTensorList", this->Input("StepsTensorList")); - } - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("ValueTensor"), - this->InputGrad("ValueTensor")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - - } else { - op->SetType("assign"); - op->SetInput("X", this->OutputGrad("Out")); - op->SetOutput("Out", this->InputGrad("Input")); + op->SetType("set_value_grad"); + op->SetInput("ValueTensor", this->Input("ValueTensor")); + op->SetOutput(framework::GradVarName("ValueTensor"), + this->InputGrad("ValueTensor")); + + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + if (this->HasInput("StartsTensorList")) { + op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); } + if (this->HasInput("StepsTensorList")) { + op->SetInput("StepsTensorList", this->Input("StepsTensorList")); + } + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); } }; diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index a1e4a328cf439f..6b79d5c35b7838 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -69,7 +69,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if 
(platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index c2911806996ce5..caa31565d4cf3d 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -478,7 +478,7 @@ struct DeviceIndependenceTensorOperations { std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(common::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. ElementwiseComputeEx, DeviceContext, InT>( diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 21406abff8d9f2..af69594f992cde 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/sync_batch_norm_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/sync_batch_norm_kernel.h" @@ -105,8 +104,8 @@ void SyncBatchNormKernel(const Context& ctx, <<>>(x_d, N, H * W * D, C, stats); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclComm_t comm = static_cast(detail::GetCCLComm(x.place(), 0)); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclComm_t comm = static_cast(detail::GetCCLComm(x.place(), 0)); if (comm == nullptr) { comm = ctx.nccl_comm(); } @@ -115,11 +114,11 @@ void SyncBatchNormKernel(const Context& ctx, int dtype = phi::ToNCCLDataType(mean_out->dtype()); // In-place operation PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(stats, + phi::dynload::ncclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - mcclSum, + static_cast(dtype), + ncclSum, comm, stream)); VLOG(3) << "Sync result using all reduce"; @@ -237,28 +236,26 @@ void SyncBatchNormCooKernel(const Context& dev_ctx, DenseTensor* saved_mean, DenseTensor* saved_variance, DenseTensor* reserve_space) { - PADDLE_ENFORCE(false, "error"); - - // EmptyLikeCooKernel(dev_ctx, x, y); - // phi::SyncBatchNormKernel(dev_ctx, - // x.values(), - // mean, - // variance, - // scale, - // bias, - // is_test, - // momentum, - // epsilon, - // data_layout, - // use_global_stats, - // trainable_statistics, - // y->mutable_values(), - // mean_out, - // variance_out, - // saved_mean, - // saved_variance, - // reserve_space); - // y->SetIndicesDict(x.GetIndicesDict()); + EmptyLikeCooKernel(dev_ctx, x, y); + phi::SyncBatchNormKernel(dev_ctx, + x.values(), + mean, + variance, + scale, + bias, + is_test, + momentum, + epsilon, + data_layout, + use_global_stats, + trainable_statistics, + y->mutable_values(), + mean_out, + variance_out, + saved_mean, + saved_variance, + reserve_space); + y->SetIndicesDict(x.GetIndicesDict()); } template @@ -280,27 +277,26 @@ void SyncBatchNormCooGradKernel( SparseCooTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { - PADDLE_ENFORCE(false, "error"); - // EmptyLikeCooKernel(dev_ctx, x, x_grad); - // *scale_grad = 
phi::EmptyLike(dev_ctx, scale); - // *bias_grad = phi::EmptyLike(dev_ctx, bias); - // phi::SyncBatchNormGradKernel(dev_ctx, - // x.values(), - // scale, - // bias, - // saved_mean, - // saved_variance, - // reserve_space, - // y_grad.values(), - // momentum, - // epsilon, - // data_layout, - // is_test, - // use_global_stats, - // trainable_statistics, - // x_grad->mutable_values(), - // scale_grad, - // bias_grad); + EmptyLikeCooKernel(dev_ctx, x, x_grad); + *scale_grad = phi::EmptyLike(dev_ctx, scale); + *bias_grad = phi::EmptyLike(dev_ctx, bias); + phi::SyncBatchNormGradKernel(dev_ctx, + x.values(), + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + y_grad.values(), + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + x_grad->mutable_values(), + scale_grad, + bias_grad); } } // namespace sparse diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index 21f1052e03a289..c132a91bb5346c 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -19,8 +19,7 @@ limitations under the License. */ #include #include #include -#if defined(__NVCC__) || defined(__MUSACC__) - +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ @@ -28,7 +27,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/distributed/collective/process_group.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #include "paddle/common/layout.h" @@ -571,9 +570,9 @@ void SyncBatchNormGradFunctor( } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int global_gid = 0; - mcclComm_t comm = nullptr; + ncclComm_t comm = nullptr; if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( global_gid)) { @@ -589,11 +588,11 @@ void SyncBatchNormGradFunctor( int dtype = paddle::platform::ToNCCLDataType(scale.dtype()); // In-place operation PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(stats, + phi::dynload::ncclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - mcclSum, + static_cast(dtype), + ncclSum, comm, stream)); VLOG(3) << "Sync result using all reduce"; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 63d8614f3c3697..ef6172b6965f22 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -15,8 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include -#if defined(__NVCC__) || defined(__MUSACC__) - +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 20fe009e4c0912..458794223dc743 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include "paddle/phi/core/generator.h" @@ -113,7 +113,7 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template struct UniformGenerator { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1aaafb99cf9696..113ba40ec0cf31 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -64,7 +64,7 @@ if(WITH_DGC) set(dgc_deps dgc) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() @@ -90,14 +90,8 @@ if(WITH_ROCM) SRCS stream_callback_manager.cc DEPS simple_threadpool enforce common) endif() -if(WITH_MUSA) - musa_library( - stream_callback_manager - SRCS stream_callback_manager.cc - DEPS simple_threadpool enforce common) -endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(STREAM_CALLBACK_DEPS stream_callback_manager) else() set(STREAM_CALLBACK_DEPS) @@ -144,7 +138,7 @@ cc_library( SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce common) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) target_link_libraries(device_context gpu_resource_pool) endif() @@ -242,31 +236,6 @@ if(WITH_ROCM) DEPS device_context gpu_info) endif() -if(WITH_MUSA) - musa_library( - device_event_gpu - SRCS device_event_gpu.cc - DEPS device_event_base) - set(DEVICE_EVENT_LIBS - device_event_gpu - CACHE INTERNAL "device event libs") - if(WITH_CUSTOM_DEVICE) - musa_test( - device_event_test - SRCS device_event_test.cc - DEPS device_event_gpu device_event_custom_device) - else() - musa_test( - device_event_test - SRCS device_event_test.cc - DEPS device_event_gpu) - endif() - musa_test( - device_context_test - SRCS device_context_test.cu - DEPS device_context gpu_info) -endif() - cc_library(timer SRCS timer.cc) cc_test( timer_test @@ -316,18 +285,6 @@ elseif(WITH_ROCM) stats op_proto_maker shape_inference) -elseif(WITH_MUSA) - musa_library( - profiler - SRCS profiler.cc profiler.cu - DEPS phi - common - gpu_info - enforce - new_profiler - stats - op_proto_maker - shape_inference) elseif(WITH_XPU) cc_library( profiler @@ -408,23 +365,8 @@ if(WITH_ROCM) DEPS gpu_info) endif() -if(WITH_MUSA) - musa_test( - float16_gpu_test - SRCS float16_test.cu - DEPS lod_tensor) - musa_test( - test_limit_gpu_memory - SRCS test_limit_gpu_memory.cu - DEPS gpu_info phi common) - musa_library( - cuda_device_guard - SRCS cuda_device_guard.cc - DEPS gpu_info) -endif() - if(NOT APPLE AND NOT WIN32) - if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) + if(WITH_GPU OR WITH_ROCM) cc_test( device_code_test SRCS device_code_test.cc diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 0c322075018983..4ffcf53b1a5747 100644 --- 
a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -23,7 +23,7 @@ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLCommImpl : public NCCLComm { public: void set_ring_id(int ring_id) { ring_id_ = ring_id; } @@ -37,8 +37,8 @@ class NCCLCommImpl : public NCCLComm { int device_id() const override { return dev_ctx_->GetPlace().device; } - void set_comm(mcclComm_t comm) { comm_ = comm; } - mcclComm_t comm() const override { return comm_; } + void set_comm(ncclComm_t comm) { comm_ = comm; } + ncclComm_t comm() const override { return comm_; } gpuStream_t stream() const override { return dev_ctx_->stream(); } @@ -64,7 +64,7 @@ class NCCLCommImpl : public NCCLComm { int ring_id_; int nranks_; int rank_; - mcclComm_t comm_; + ncclComm_t comm_; std::unique_ptr dev_ctx_; // used for comm wait compute, compute_stream-->event-->comm_stream @@ -80,7 +80,7 @@ NCCLCommContext& NCCLCommContext::Instance() { } NCCLComm* NCCLCommContext::CreateComm( - mcclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id) { + ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id) { PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument( "The nccl unique id should not be null.")); @@ -106,10 +106,10 @@ NCCLComm* NCCLCommContext::CreateComm( platform::errors::InvalidArgument( "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); - mcclComm_t comm = nullptr; + ncclComm_t comm = nullptr; SetDeviceId(dev_id); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclCommInitRank(&comm, nranks, *nccl_id, rank)); + platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); auto* comm_wrapper = AssignNCCLComm(comm, nranks, rank, dev_id, ring_id); @@ -133,8 +133,8 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, dev_ids.size())); const int kDevices = dev_ids.size(); - mcclComm_t comms[kDevices]; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclCommInitAll( + ncclComm_t comms[kDevices]; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitAll( comms, dev_ids.size(), dev_ids.data())); PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), @@ -156,7 +156,7 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, void NCCLCommContext::CreateNCCLCommMultiTrainer( const std::vector& dev_ids, - mcclUniqueId* nccl_id, + ncclUniqueId* nccl_id, int ntrainers, int train_id, int ring_id) { @@ -169,22 +169,20 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( VLOG(1) << "Begin CreateNCCLCommMultiTrainer. 
device number: " << kDevices << ", ntrainers: " << ntrainers << ", train_id: " << train_id << ", rind_id: " << ring_id; - mcclComm_t comms[kDevices]; + ncclComm_t comms[kDevices]; { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif - platform::dynload::mcclCommInitRank( + platform::dynload::ncclCommInitRank( comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); VLOG(1) << "ncclCommInitRank: " << i; } - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), @@ -210,7 +208,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( } NCCLComm* NCCLCommContext::AssignNCCLComm( - mcclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { + ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new phi::GPUContext(CUDAPlace(dev_id))); dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index d88e6e69fba50b..6636856a0eb6ce 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -28,10 +28,10 @@ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // In order to apply hierarchical communication with NCCL, we need // a communication ring contains NCCL communicators associated to a global -// mcclUniqueId. E.g. for a hierarchical case, +// ncclUniqueId. E.g. 
for a hierarchical case, // // 11 - 12 21 - 22 // | | | | @@ -55,7 +55,7 @@ class NCCLComm { virtual int nranks() const = 0; virtual int rank() const = 0; virtual int device_id() const = 0; - virtual mcclComm_t comm() const = 0; + virtual ncclComm_t comm() const = 0; virtual gpuStream_t stream() const = 0; virtual gpuEvent_t compute_event() const = 0; virtual gpuEvent_t comm_event() const = 0; @@ -69,12 +69,12 @@ class NCCLCommContext { static NCCLCommContext& Instance(); NCCLComm* CreateComm( - mcclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id = 0); + ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id = 0); void CreateAllNCCLComms(const std::vector& dev_ids, int ring_id = 0); void CreateNCCLCommMultiTrainer(const std::vector& dev_ids, - mcclUniqueId* nccl_id, + ncclUniqueId* nccl_id, int nranks, int rank, int ring_id); @@ -82,7 +82,7 @@ class NCCLCommContext { // a latter comm with the same dev_id and the same ring_id // will override the former NCCLComm* AssignNCCLComm( - mcclComm_t comm, int nranks, int rank, int dev_id, int ring_id = 0); + ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id = 0); // retrieve a communicator by the ring id in multiprocessing mode NCCLComm* Get(int ring_id) const { @@ -99,7 +99,7 @@ class NCCLCommContext { return comm_map_.at(ring_id).begin()->second.get(); } - int GetRingId(mcclComm_t comm) const { + int GetRingId(ncclComm_t comm) const { for (const auto& pair : comm_map_) { for (const auto& p : pair.second) { if (p.second.get()->comm() == comm) { diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index b782a45047117b..6f0d86f0a4b176 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,7 +1,7 @@ set(DEV_LIBS custom_device) # GPU -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) add_subdirectory(gpu) endif() diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index bcfb316837a302..aa2dba03c90824 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 3176d042b7146d..65c3fb20631675 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -28,18 +28,6 @@ elseif(WITH_ROCM) cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) -elseif(WITH_MUSA) - # add_subdirectory(musa) - musa_library( - gpu_info - SRCS gpu_info.cc - DEPS phi common glog enforce monitor dynload_cuda) - - musa_test(cuda_helper_test SRCS cuda_helper_test.cu) - musa_test( - cudnn_desc_test - SRCS cudnn_desc_test.cc - DEPS dynload_cuda) endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index f94f5d55b7eeef..878a122a492243 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -13,12 +13,10 @@ // limitations under the License. 
#pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/fluid/platform/device/gpu/musa/musa_helper.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index f82d836e83e770..3a26b73e64b772 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,8 +35,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/fluid/platform/dynload/mudnn.h" #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -217,12 +215,6 @@ class RecordedGpuMallocHelper { } else { result = hipMalloc(ptr, size); } -#elif defined(PADDLE_WITH_MUSA) - if (UNLIKELY(malloc_managed_memory)) { - result = musaMallocManaged(ptr, size); - } else { - result = musaMalloc(ptr, size); - } #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -268,9 +260,6 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { -#elif defined(PADDLE_WITH_MUSA) - auto err = musaFree(ptr); - if (err != musaErrorMusartUnloading) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -317,8 +306,6 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); -#elif defined(PADDLE_WITH_MUSA) - auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index a2fe54ae4dca4f..b5a00e9257a80e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 018fee5f7416f8..98c6e379342f25 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -16,12 +16,10 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include -#elif defined(PADDLE_WITH_MUSA) -#include #else #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 0fb7e061e3243c..9f2168e1cdb8b0 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -30,9 +30,6 @@ CudaStreamResourcePool::CudaStreamResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamCreateWithFlags(&stream, musaStreamNonBlocking)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -44,8 +41,6 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -87,9 +82,6 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -101,8 +93,6 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index 17e649b9ac62a8..2ac13e692f7837 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -14,16 +14,13 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index df8b87ed3a0365..c9afafdef7166c 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -15,19 +15,14 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" -#elif defined(PADDLE_WITH_MUSA) -#include -#include "paddle/fluid/platform/dynload/mublas.h" -#include "paddle/fluid/platform/dynload/mudnn.h" -#include "paddle/phi/backends/gpu/forwards.h" #else #include @@ -39,95 +34,78 @@ namespace paddle { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = MUSA_TYPE; #else // CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + 
+#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp); - - - // DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t, mudnnDataType_t); - // DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, - // cudnnActivationStruct, - // miopenActivationDescriptor, - // mudnnActivationStruct); - // DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, - // cudnnActivationMode_t, - // miopenActivationMode_t, - // mudnnActivationMode_t); - // DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, - // cudnnTensorStruct, - // miopenTensorDescriptor, - // mudnnTensorStruct); - // DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, - // cudnnTensorFormat_t, - // miopenTensorFormat_t, - // mudnnTensorFormat_t); - // DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, - // cudnnFilterStruct, - // miopenTensorDescriptor, - // mudnnFilterStruct); - // DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, - // cudnnFilterDescriptor_t, - // miopenTensorDescriptor_t, - // mudnnFilterDescriptor_t); - // DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, - // cudnnConvolutionStruct, - // miopenConvolutionDescriptor, - // mudnnConvolutionStruct); - // DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, - // cudnnConvolutionDescriptor_t, - // miopenConvolutionDescriptor_t, - // mudnnConvolutionDescriptor_t); - // DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, - // cudnnPoolingDescriptor_t, - // miopenPoolingDescriptor_t, - // mudnnPoolingDescriptor_t); - // DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t,mudnnPoolingMode_t);MUDNN_DNN_ROUTINE_EACH - // DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, - // cudnnDropoutDescriptor_t, - // miopenDropoutDescriptor_t, - // mudnnDropoutDescriptor_t); - DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t,mudnnHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle,mublasHandle_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); + +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, + cudnnActivationMode_t, + miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, + cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + cudnnPoolingDescriptor_t, + 
miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. -// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); using CUDAGraphID = unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ - constexpr auto GPU_CV = MUSA_CV; #else // CDUA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory, - musaErrorMemoryAllocation); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); #undef DECLARE_CONSTANT_FOR_GPU } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/musa/musa_helper.h b/paddle/fluid/platform/device/gpu/musa/musa_helper.h deleted file mode 100644 index 45ded21129a5ad..00000000000000 --- a/paddle/fluid/platform/device/gpu/musa/musa_helper.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // NOLINT - -#include "paddle/fluid/platform/dynload/mublas.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace platform { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why need this macro? ] - * - * The original looping in CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB, the first iteration is no problem here, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * will greater than INT_MAX and overflow becomes negative value, at - * this time, the cycle condition `i < (n)` is still satisfied, so it - * will cause illegal access to cuda memory. 
- * - * Here is a real example in ERINE, it will trigger above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follow, the int64_t __index__ will - * prevent overflow in the loop increment. - * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -class CublasHandleHolder { - public: - explicit CublasHandleHolder(musaStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasCreate(&handle_)); - PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasSetStream(handle_, stream)); - } - - const mublasHandle_t& GetCublasHandle() const { return handle_; } - - ~CublasHandleHolder() PADDLE_MAY_THROW { - PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasDestroy(handle_)); - } - - template - inline void Call(Callback&& callback) const { - std::lock_guard guard(mtx_); - callback(handle_); - } - - private: - DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); - - mublasHandle_t handle_; - mutable std::mutex mtx_; -}; - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index db5bcbc08c5de6..8afcfc9f2b7005 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include #include @@ -29,15 +29,9 @@ #ifdef PADDLE_WITH_NCCL #include "paddle/fluid/platform/dynload/nccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/bfloat16.h" @@ -50,63 +44,63 @@ namespace paddle { namespace platform { -inline mcclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { +inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { if (type == framework::proto::VarType::FP32) { - return mcclFloat; + return ncclFloat; } else if (type == framework::proto::VarType::FP64) { - return mcclFloat; + return ncclDouble; } else if (type == framework::proto::VarType::INT32) { - return mcclInt; + return ncclInt; } else if (type == framework::proto::VarType::INT64) { - return mcclInt64; + return ncclInt64; } else if (type == framework::proto::VarType::FP16) { - return mcclFloat16; + return ncclFloat16; } else if (type == framework::proto::VarType::INT8) { - return mcclInt8; + return ncclInt8; } else if (type == framework::proto::VarType::UINT8) { - return mcclUint8; + return ncclUint8; } else if 
(type == framework::proto::VarType::BOOL) { - return mcclUint8; -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - // } else if (type == framework::proto::VarType::BF16) { - // return mcclBfloat16; -// #endif + return ncclUint8; +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + } else if (type == framework::proto::VarType::BF16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } } -inline mcclDataType_t ToNCCLDataType(phi::DataType type) { +inline ncclDataType_t ToNCCLDataType(phi::DataType type) { if (type == phi::DataType::FLOAT32) { - return mcclFloat; + return ncclFloat; } else if (type == phi::DataType::FLOAT64) { - return mcclFloat; + return ncclDouble; } else if (type == phi::DataType::INT32) { - return mcclInt; + return ncclInt; } else if (type == phi::DataType::INT64) { - return mcclInt64; + return ncclInt64; } else if (type == phi::DataType::FLOAT16) { - return mcclFloat16; + return ncclFloat16; } else if (type == phi::DataType::UINT8) { - return mcclUint8; + return ncclUint8; } else if (type == phi::DataType::INT8) { - return mcclInt8; + return ncclInt8; } else if (type == phi::DataType::BOOL) { - return mcclUint8; -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - // } else if (type == phi::DataType::BFLOAT16) { - // return mcclBfloat16; -// #endif + return ncclUint8; +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + } else if (type == phi::DataType::BFLOAT16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } } -// NOTE(minqiyang): according to the mcclGroupEnd documentations: +// NOTE(minqiyang): according to the ncclGroupEnd documentations: // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, -// mcclGroupEnd will wait for all communicators to be initialized, which will +// ncclGroupEnd will wait for all communicators to be initialized, which will // cause blocking problem when a runtime_error was thrown, so try only guard // NCCL actions when use it. 
class NCCLGroupGuard { @@ -118,18 +112,18 @@ class NCCLGroupGuard { inline NCCLGroupGuard() { NCCLMutex().lock(); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); } inline ~NCCLGroupGuard() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; struct NCCLContext { std::unique_ptr ctx_; - mcclComm_t comm_; + ncclComm_t comm_; explicit NCCLContext(int dev_id) : comm_{nullptr} { ctx_.reset(new phi::GPUContext(CUDAPlace(dev_id))); @@ -156,7 +150,7 @@ struct NCCLContext { } gpuStream_t stream() const { return ctx_->stream(); } - mcclComm_t comm() const { return comm_; } + ncclComm_t comm() const { return comm_; } int device_id() const { return ctx_->GetPlace().device; } }; @@ -166,7 +160,7 @@ struct NCCLContextMap { std::vector order_; explicit NCCLContextMap(const std::vector &places, - mcclUniqueId *nccl_id = nullptr, + ncclUniqueId *nccl_id = nullptr, size_t num_trainers = 1, size_t trainer_id = 0) { PADDLE_ENFORCE_EQ(!places.empty(), @@ -185,11 +179,11 @@ struct NCCLContextMap { platform::errors::Unavailable("NCCL Context Map does not support " "contain two or more same device.")); - std::unique_ptr comms(new mcclComm_t[order_.size()]); + std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); - PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::mcclCommInitAll( + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); } else { PADDLE_ENFORCE_NOT_NULL( @@ -209,7 +203,7 @@ struct NCCLContextMap { VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; SetDeviceId(gpu_id); - PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::mcclCommInitRank( + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); } } @@ -304,7 +298,7 @@ class NCCLCommunicator { } void InitFlatCtxs(const std::vector &places, - const std::vector &nccl_ids, + const std::vector &nccl_ids, size_t trainers_num, size_t trainer_id) { if (nccl_ids.size() == 0) { @@ -336,8 +330,8 @@ class NCCLCommunicator { } void InitHierarchicalCtxs(const std::vector &places, - const std::vector &inter_nccl_ids, - const std::vector &exter_nccl_ids, + const std::vector &inter_nccl_ids, + const std::vector &exter_nccl_ids, size_t trainers_num, size_t trainer_id, size_t inter_trainers_num, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 786b38239e60ef..c4f40767fd52ce 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/expect.h" #include "paddle/phi/core/generator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -53,7 +53,7 @@ DeviceType Place2DeviceType(const platform::Place& place) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template typename std::enable_if::value, DevCtx*>::type @@ -86,7 +86,7 @@ inline std::unique_ptr CreateDeviceContext( DevCtx* dev_ctx = ConstructDevCtx(p, stream_priority); auto& instance = paddle::memory::allocation::AllocatorFacade::Instance(); if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, @@ -184,7 +184,7 @@ void EmplaceDeviceContexts( /*unused*/ stream_priority); #endif } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( place_to_device_context, place, @@ -221,7 +221,7 @@ void EmplaceDeviceContexts( "option.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( place_to_device_context, place, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b015bb9a3e6259..4a75d3ea97f9ae 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -53,18 +53,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT #endif - -#ifdef PADDLE_WITH_MUSA -#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT -#include "paddle/fluid/platform/dynload/mudnn.h" -#include "paddle/fluid/platform/dynload/mublas.h" -#include "paddle/phi/backends/gpu/gpu_context.h" // NOLINT -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include "paddle/fluid/platform/dynload/mccl.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -148,7 +136,7 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAPinnedDeviceContext = phi::GPUPinnedContext; #endif @@ -177,7 +165,7 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> struct DefaultDeviceContextType { using TYPE = paddle::platform::CUDAPinnedDeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index cb43f00f7fe0fb..402974b89e5c90 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -31,7 +31,7 @@ using ::paddle::platform::kXPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_base.cc b/paddle/fluid/platform/device_event_base.cc index c23f395e0e36bb..cd2d31f1fbefb7 100644 --- a/paddle/fluid/platform/device_event_base.cc +++ b/paddle/fluid/platform/device_event_base.cc @@ -53,14 +53,6 @@ unsigned int GenerateDeviceEventFlag(bool enable_timing, return flags; #endif -#ifdef PADDLE_WITH_MUSA - unsigned int flags = - (blocking ? musaEventBlockingSync : musaEventDefault) | - (enable_timing ? musaEventDefault : musaEventDisableTiming) | - (interprocess ? musaEventInterprocess : musaEventDefault); - return flags; -#endif - return 0; } diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index bbeb67821e023d..d64b062cda0acc 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 10f582069e6613..29f7b91a171572 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -22,10 +22,6 @@ endif() if(WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() -if(WITH_MUSA) - list(APPEND MUSA_SRCS mublas.cc mudnn.cc murand.cc mufft.cc) -endif() - # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. 
@@ -43,15 +39,6 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() - if(WITH_MUSA) - list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) - if(WITH_MCCL) - list(APPEND MUSA_SRCS mccl.cc) - endif() - if(CUPTI_FOUND) - list(APPEND MUSA_SRCS mupti.cc) - endif() - endif() endif() if(TENSORRT_FOUND) @@ -75,15 +62,6 @@ if(WITH_ROCM) dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi common) -elseif(WITH_MUSA) - musa_library( - dynload_cuda - SRCS ${MUSA_SRCS} - DEPS dynamic_loader phi common) - cc_library( - dynload_warpctc - SRCS warpctc.cc - DEPS dynamic_loader warpctc phi common) else() nv_library( dynload_cuda diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index aebdd715b9e1cc..93a19645a0a34e 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -47,7 +47,6 @@ void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); void* GetXPTIDsoHandle(); -void* GetMUFFTDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/mccl.cc b/paddle/fluid/platform/dynload/mccl.cc deleted file mode 100644 index 8497d35e2484d2..00000000000000 --- a/paddle/fluid/platform/dynload/mccl.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mccl.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); - -// #if NCCL_VERSION_CODE >= 2212 -MCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -// #endif - -// #if NCCL_VERSION_CODE >= 2304 -MCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -// #endif - -// #if NCCL_VERSION_CODE >= 2703 -MCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -// #endif - -// #if NCCL_VERSION_CODE >= 21100 -MCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -// #endif - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mccl.h b/paddle/fluid/platform/dynload/mccl.h deleted file mode 100644 index 0e1eac41691a58..00000000000000 --- a/paddle/fluid/platform/dynload/mccl.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/mccl.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -MCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(mcclBroadcast); -MCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); -MCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ - __macro(mcclSend); \ - __macro(mcclRecv); -MCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - - -#define MCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ - __macro(mcclRedOpCreatePreMulSum); \ - __macro(mcclRedOpDestroy); -MCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.cc b/paddle/fluid/platform/dynload/mublas.cc deleted file mode 100644 index 0ca4c6c3dac999..00000000000000 --- a/paddle/fluid/platform/dynload/mublas.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mublas.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R2 -MUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R3 -MUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R4 -MUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); -#endif -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.h b/paddle/fluid/platform/dynload/mublas.h deleted file mode 100644 index 0b7d21a4ecb76f..00000000000000 --- a/paddle/fluid/platform/dynload/mublas.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include // NOLINT -#include - -#include "paddle/phi/backends/dynload/mublas.h" - -namespace paddle { -namespace platform { -namespace dynload { - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load mublas routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - - -MUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - - -MUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - - -MUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - - -MUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mudnn.cc b/paddle/fluid/platform/dynload/mudnn.cc deleted file mode 100644 index 8b6ee172e14556..00000000000000 --- a/paddle/fluid/platform/dynload/mudnn.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mudnn.h" - -#include "paddle/phi/backends/dynload/mudnn.h" -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -namespace paddle { -namespace platform { -namespace dynload { - -// MUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); - -bool HasCUDNN() { return phi::dynload::HasCUDNN(); } - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mudnn.h b/paddle/fluid/platform/dynload/mudnn.h deleted file mode 100644 index f980972538a0e4..00000000000000 --- a/paddle/fluid/platform/dynload/mudnn.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include -#include "paddle/phi/backends/dynload/mudnn.h" - -namespace paddle { -namespace platform { -namespace dynload { - -using ::musa::dnn::BatchNorm; -using ::musa::dnn::Convolution; -using ::musa::dnn::Handle; -using ::musa::dnn::MemoryHandler; -using ::musa::dnn::Pooling; -using ::musa::dnn::Softmax; -using ::musa::dnn::Tensor; - -extern bool HasCUDNN(); - -} // namespace dynload -} // namespace platform -} // namespace paddle - -#endif diff --git a/paddle/fluid/platform/dynload/mufft.cc b/paddle/fluid/platform/dynload/mufft.cc deleted file mode 100644 index 1126ab516619c7..00000000000000 --- a/paddle/fluid/platform/dynload/mufft.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mufft.h" - -#include "paddle/phi/backends/dynload/mufft.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); - - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mufft.h b/paddle/fluid/platform/dynload/mufft.h deleted file mode 100644 index 31452acd9d817f..00000000000000 --- a/paddle/fluid/platform/dynload/mufft.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/mufft.h" - -namespace paddle { -namespace platform { -namespace dynload { - - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUFFT_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -/** - * include all needed cufft functions in HPPL - * different cufft version has different interfaces - **/ -#define MUFFT_FFT_ROUTINE_EACH(__macro) \ - __macro(mufftPlan1d); \ - __macro(mufftPlan2d); \ - __macro(mufftPlan3d); \ - __macro(mufftPlanMany); \ - __macro(mufftMakePlan1d); \ - __macro(mufftMakePlan2d); \ - __macro(mufftMakePlan3d); \ - __macro(mufftMakePlanMany); \ - __macro(mufftEstimate1d); \ - __macro(mufftEstimate2d); \ - __macro(mufftEstimate3d); \ - __macro(mufftEstimateMany); \ - __macro(mufftCreate); \ - __macro(mufftGetSize1d); \ - __macro(mufftGetSize2d); \ - __macro(mufftGetSize3d); \ - __macro(mufftGetSizeMany); \ - __macro(mufftGetSize); \ - __macro(mufftSetWorkArea); \ - __macro(mufftSetAutoAllocation); \ - __macro(mufftExecC2C); \ - __macro(mufftExecR2C); \ - __macro(mufftExecC2R); \ - __macro(mufftExecZ2Z); \ - __macro(mufftExecD2Z); \ - __macro(mufftExecZ2D); \ - __macro(mufftSetStream); \ - __macro(mufftDestroy); \ - __macro(mufftGetVersion); \ - __macro(mufftGetProperty); \ - __macro(mufftXtSetGPUs); \ - __macro(mufftXtMalloc); \ - __macro(mufftXtMemcpy); \ - __macro(mufftXtFree); \ - __macro(mufftXtExecDescriptorC2C); \ - __macro(mufftXtExecDescriptorR2C); \ - __macro(mufftXtExecDescriptorC2R); \ - __macro(mufftXtExecDescriptorZ2Z); \ - __macro(mufftXtExecDescriptorD2Z); \ - __macro(mufftXtExecDescriptorZ2D); \ - __macro(mufftXtQueryPlan); \ - __macro(mufftXtSetCallback); \ - __macro(mufftXtClearCallback); \ - __macro(mufftXtMakePlanMany); \ - __macro(mufftXtGetSizeMany); \ - __macro(mufftXtExec); \ - __macro(mufftXtExecDescriptor); - -MUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUFFT_WRAP) - -} // namespace dynload -} // namespace platform -} // namespace paddle - -#endif diff --git a/paddle/fluid/platform/dynload/murand.cc b/paddle/fluid/platform/dynload/murand.cc deleted file mode 100644 index 82b911ead32715..00000000000000 --- a/paddle/fluid/platform/dynload/murand.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/murand.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/murand.h b/paddle/fluid/platform/dynload/murand.h deleted file mode 100644 index b20a49a7043846..00000000000000 --- a/paddle/fluid/platform/dynload/murand.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/murand.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -#define MURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(murandCreateGenerator); \ - __macro(murandSetStream); \ - __macro(murandSetPseudoRandomGeneratorSeed); \ - __macro(murandGenerateUniform); \ - __macro(murandGenerateUniformDouble); \ - __macro(murandGenerateNormal); \ - __macro(murandDestroyGenerator); - -MURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.cc b/paddle/fluid/platform/dynload/musa_driver.cc deleted file mode 100644 index 8898bd4dfb654a..00000000000000 --- a/paddle/fluid/platform/dynload/musa_driver.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/musa_driver.h" - -#include "paddle/phi/backends/dynload/musa_driver.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSA_ROUTINE_EACH(DEFINE_WRAP); - -bool HasCUDADriver() { return phi::dynload::HasCUDADriver(); } - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.h b/paddle/fluid/platform/dynload/musa_driver.h deleted file mode 100644 index 261841e8e73845..00000000000000 --- a/paddle/fluid/platform/dynload/musa_driver.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/musa_driver.h" - -namespace paddle { -namespace platform { -namespace dynload { - -extern bool HasCUDADriver(); - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -/** - * include all needed musa driver functions - **/ -#define PLATFORM_MUSA_ROUTINE_EACH(__macro) \ - __macro(muInit); \ - __macro(muDriverGetVersion); \ - __macro(muGetErrorString); \ - __macro(muModuleLoadData); \ - __macro(muModuleGetFunction); \ - __macro(muModuleUnload); \ - __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ - __macro(muLaunchKernel); \ - __macro(muCtxCreate); \ - __macro(muCtxGetCurrent); \ - __macro(muDeviceGetCount); \ - __macro(muDevicePrimaryCtxGetState); \ - __macro(muDeviceGetAttribute); \ - __macro(muDeviceGet) - -PLATFORM_MUSA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP); - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.cc b/paddle/fluid/platform/dynload/musartc.cc deleted file mode 100644 index 4e15dab9c1359d..00000000000000 --- a/paddle/fluid/platform/dynload/musartc.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/musartc.h" - -#include "paddle/phi/backends/dynload/musartc.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSARTC_ROUTINE_EACH(DEFINE_WRAP); - -bool HasNVRTC() { return phi::dynload::HasNVRTC(); } - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.h b/paddle/fluid/platform/dynload/musartc.h deleted file mode 100644 index fca957131ef4ee..00000000000000 --- a/paddle/fluid/platform/dynload/musartc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/musartc.h" - -namespace paddle { -namespace platform { -namespace dynload { - -extern bool HasNVRTC(); - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -/** - * include all needed musartc functions - **/ -#define MUSARTC_ROUTINE_EACH(__macro) \ - __macro(mtrtcVersion); \ - __macro(mtrtcGetErrorString); \ - __macro(mtrtcCompileProgram); \ - __macro(mtrtcCreateProgram); \ - __macro(mtrtcDestroyProgram); \ - __macro(mtrtcGetMUSA); \ - __macro(mtrtcGetMUSASize); \ - __macro(mtrtcGetProgramLog); \ - __macro(mtrtcGetProgramLogSize) - -MUSARTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musparse.cc b/paddle/fluid/platform/dynload/musparse.cc deleted file mode 100644 index 347059362bc8db..00000000000000 --- a/paddle/fluid/platform/dynload/musparse.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/musparse.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -#ifdef MUSPARSE_ROUTINE_EACH -MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); -#endif - -} // namespace dynload -} // namespace platform -} // namespace paddle - diff --git a/paddle/fluid/platform/dynload/musparse.h b/paddle/fluid/platform/dynload/musparse.h deleted file mode 100644 index 586decb9c55c19..00000000000000 --- a/paddle/fluid/platform/dynload/musparse.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/musparse.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -#if defined(PADDLE_WITH_MUSA) - - -MUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) -#endif // PADDLE_WITH_MUSA - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP -} // namespace dynload -} // namespace platform -} // namespace paddle - diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc index 2cf04248687f27..7b0ea3bb7f3c1f 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -22,21 +22,21 @@ namespace dynload { NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2304 +#if NCCL_VERSION_CODE >= 2304 NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2703 +#if NCCL_VERSION_CODE >= 2703 NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -// #endif +#endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index d2150204b8810a..d9516c9f4de4e8 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -29,18 +29,18 @@ namespace dynload { #define NCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -48,29 +48,29 @@ namespace dynload { NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 #define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2304 -#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#if NCCL_VERSION_CODE >= 2304 +#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2703 +#if NCCL_VERSION_CODE >= 2703 #define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ __macro(ncclSend); \ __macro(ncclRecv); NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 #define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ __macro(ncclRedOpCreatePreMulSum); \ __macro(ncclRedOpDestroy); NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) 
-// #endif +#endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc index 512a8fbafe6f61..62bb6a88af7c0a 100644 --- a/paddle/fluid/platform/dynload/rccl.cc +++ b/paddle/fluid/platform/dynload/rccl.cc @@ -22,21 +22,21 @@ namespace dynload { RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2304 +#if NCCL_VERSION_CODE >= 2304 RCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2703 +#if NCCL_VERSION_CODE >= 2703 RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 RCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -// #endif +#endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index cba083334ce5c1..4d988e4fb08a08 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -29,17 +29,17 @@ namespace dynload { #define RCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclGetErrorString); @@ -52,7 +52,7 @@ RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); RCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 8dab0df5007822..1a82b05f3bc3af 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -38,16 +38,6 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#include -#include -#include -#include -#include -#include -#endif // PADDLE_WITH_CUDA - #ifdef PADDLE_WITH_HIP #include #include @@ -91,20 +81,6 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/murand.h" -// #include "paddle/phi/backends/dynload/musolver.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include - -#include "paddle/phi/backends/dynload/mccl.h" -#endif // __APPLE__ -#endif // PADDLE_WITH_MUSA - - #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -122,7 +98,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/core/enforce.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/phi/core/flags.h" diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d3148257ea6dea..690580d8f9c5de 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -391,7 +391,7 @@ TEST(enforce, hip_success) { EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_ALLOC_FAILED, "HIPFFT error")); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) - EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); + EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Rccl error")); #endif @@ -498,7 +498,7 @@ TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error")); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) - EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); + EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError, diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 68a7a2e462aa7c..e807a54fdee2d7 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -21,11 +21,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif - -#ifdef PADDLE_WITH_MUSA -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index 6bcf6a368331fa..a77e396adee5f4 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -500,8 +500,8 @@ SocketServer& SocketServer::GetInstance(const std::string& end_point) { std::vector* nccl_ids, \ int ring_id = 0); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -INSTANT_TEMPLATE(mcclUniqueId) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +INSTANT_TEMPLATE(ncclUniqueId) #endif #ifdef PADDLE_WITH_XPU_BKCL INSTANT_TEMPLATE(BKCLUniqueId) diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h index 0d975d84093cfd..d97b41311995e1 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.h +++ b/paddle/fluid/platform/gen_comm_id_helper.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include #include diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index c07772e1a1afc6..a3fff528f7903e 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -57,8 +57,8 @@ limitations under the License. */ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/custom_kernel.h" -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -169,7 +169,7 @@ void InitDevices() { #endif /*Init all available devices by default */ std::vector devices; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) try { // use user specified GPUs in single-node multi-process mode. devices = platform::GetSelectedDevices(); @@ -209,7 +209,7 @@ void InitDevices(const std::vector devices) { continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPlace(device)); #endif #ifdef PADDLE_WITH_XPU @@ -220,7 +220,7 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPinnedPlace()); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -431,19 +431,19 @@ void InitMemoryMethod() { memory_method->allocation_deleter = paddle::memory::allocation::Allocator::AllocationDeleter; #if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \ - defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_HIP) memory_method->copy_with_stream = paddle::memory::Copy; #endif memory_method->copy = paddle::memory::Copy; memory_method->device_memory_stat_current_value = paddle::memory::DeviceMemoryStatCurrentValue; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) // TODO(GhostScreaming): Use phi methods later. 
memory_method->get_allocator = [](int device_id, phi::gpuStream_t stream) -> phi::Allocator * { diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index b0bc0a111cdd23..3d215435881cfe 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -58,7 +58,7 @@ typename Visitor::result_type VisitPlace(const Place &place, const Visitor &visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace p(place.GetDeviceId()); return visitor(p); #else @@ -68,7 +68,7 @@ typename Visitor::result_type VisitPlace(const Place &place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPinnedPlace p; return visitor(p); #else diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 1ed73672f0e3e5..44c17c32fa8d56 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -685,7 +685,7 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 84a20f8bf7d3c1..5d1caffd45326d 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -16,11 +16,6 @@ limitations under the License. */ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif @@ -57,20 +52,6 @@ void DummyKernelAndEvent() { PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } -#elif defined(PADDLE_WITH_MUSA) - for (int i = 0; i < 5; i++) { - ForEachDevice([](int d) { - platform::SetDeviceId(d); - musaStream_t stream; - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream)); - Mark("_musa_startup_"); - int *ptr; - PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&ptr, sizeof(int))); - DummyKernel<<<1, 1, 0, stream>>>(ptr); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); - }); - } #else for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 89c78f01ac4872..4d6bc9cc242d47 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -198,7 +198,7 @@ std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); platform::TracerOption GetTracerOption(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index e67b0fbc3c68db..de8fd01a1e59de 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -552,7 +552,7 @@ void ChromeTracingLogger::LogMetaInfo(const std::string& version, span_indx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void ChromeTracingLogger::LogDeviceProperty( const std::map& device_property_map) { // add device property information @@ -664,44 +664,6 @@ void ChromeTracingLogger::LogDeviceProperty( device_nums -= 1; } #endif -#if defined(PADDLE_WITH_MUSA) - for (auto it = device_property_map.begin(); it != device_property_map.end(); - it++) { - const gpuDeviceProp& device_property = it->second; - if (device_nums > 1) { - output_file_stream_ << string_format(std::string( - R"JSON( - { - "id": %u, "name": "%s", "totalGlobalMem": %llu, - "computeMajor": %d, "computeMinor": %d, - "smCount": %d - }, - )JSON"), - it->first, - device_property.name, - device_property.totalGlobalMem, - device_property.major, - device_property.minor, - device_property.multiProcessorCount); - } else { - output_file_stream_ << string_format(std::string( - R"JSON( - { - "id": %u, "name": "%s", "totalGlobalMem": %llu, - "computeMajor": %d, "computeMinor": %d, - "smCount": %d - }], - )JSON"), - it->first, - device_property.name, - device_property.totalGlobalMem, - device_property.major, - device_property.minor, - device_property.multiProcessorCount); - } - device_nums -= 1; - } -#endif } #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index e0cf523ea53eea..37323d1450bf2d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -41,7 +41,7 @@ class ChromeTracingLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index c2020acf35d25a..1fce7edc3e329e 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -129,7 +129,7 @@ std::unique_ptr 
DeserializationReader::Parse() { // restore NodeTrees object std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); // restore gpuDeviceProp -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map device_property_map; for (auto indx = 0; indx < node_trees_proto_->device_property_size(); indx++) { @@ -155,7 +155,7 @@ DeserializationReader::~DeserializationReader() { // NOLINT input_file_stream_.close(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuDeviceProp DeserializationReader::RestoreDeviceProperty( const DevicePropertyProto& device_property_proto) { gpuDeviceProp device_property; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index c8ac33c5bea49b..5f99f6fd82c55d 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -39,7 +39,7 @@ class DeserializationReader { MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( const OperatorSupplementEventNodeProto&); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&); #endif diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 9b5b2636db30bb..6f4ed06de9e8ec 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -40,7 +40,7 @@ void SerializationLogger::OpenFile() { node_trees_proto_ = new NodeTreesProto(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void SerializationLogger::LogDeviceProperty( const std::map& device_property_map) { for (const auto& item : device_property_map) { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 67eafdf44e3cd1..80d5413106dedc 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -37,7 +37,7 @@ class SerializationLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 4ea1b756a458cd..c01b4abcfbbd3d 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -130,7 +130,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { return host_python_node; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ProfilerResult::ProfilerResult( 
std::unique_ptr tree, const ExtraInfo& extra_info, @@ -170,7 +170,7 @@ void ProfilerResult::Save(const std::string& file_name, if (format == std::string("json")) { ChromeTracingLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); @@ -178,7 +178,7 @@ void ProfilerResult::Save(const std::string& file_name, } else if (format == std::string("pb")) { SerializationLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index f1d217674bf6c6..dae32a1902834e 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -138,7 +138,7 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) explicit ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -166,7 +166,7 @@ class ProfilerResult { std::string GetVersion() { return version_; } uint32_t GetSpanIndx() { return span_indx_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map GetDeviceProperty() { return device_property_map_; } @@ -176,7 +176,7 @@ class ProfilerResult { std::map thread_event_trees_map_; std::shared_ptr tree_; ExtraInfo extra_info_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map device_property_map_; #endif std::string version_; diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 2bb7731b0c1599..bcb35f5b7bd352 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -18,16 +18,10 @@ #ifdef PADDLE_WITH_CUDA #include #endif - -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" @@ -53,9 +47,6 @@ void SynchronizeDevice() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : dev_types) { @@ -171,7 +162,7 @@ std::unique_ptr Profiler::Stop() { std::string("%s"), kv.second.c_str()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map device_property_map; std::vector device_ids = GetSelectedDevices(); for (auto device_id : device_ids) { diff --git 
a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 86243e9258dd62..f7f888d9e67396 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -23,9 +23,6 @@ #ifdef PADDLE_WITH_HIP #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_python.h" @@ -83,11 +80,6 @@ TEST(ProfilerTest, TestCudaTracer) { hipStream_t stream; hipStreamCreate(&stream); hipStreamSynchronize(stream); -#endif -#ifdef PADDLE_WITH_MUSA - musaStream_t stream; - musaStreamCreate(&stream); - musaStreamSynchronize(stream); #endif auto profiler_result = profiler->Stop(); auto nodetree = profiler_result->GetNodeTrees(); diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index e1720874e1489c..9835e7525c51ef 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -34,10 +34,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #endif @@ -107,17 +103,6 @@ void SynchronizeAllDevice() { } SetDeviceId(pre_device_id); #endif - -#ifdef PADDLE_WITH_MUSA - int pre_device_id = GetCurrentDeviceId(); - int count = GetGPUDeviceCount(); - for (int i = 0; i < count; i++) { - SetDeviceId(i); - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); - } - SetDeviceId(pre_device_id); -#endif - #ifdef PADDLE_WITH_HIP int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); @@ -156,7 +141,7 @@ void PrintMemProfiler( << " Memory Profiling Report " << "<-------------------------\n\n"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int num_gpus = GetGPUDeviceCount(); std::cout.setf(std::ios::left); if (num_gpus > 0) { @@ -358,7 +343,7 @@ void SetEvent(bool merge_thread, if (rit != pushed_events->rend()) { double event_time = 0; double gpu_time = 0.0f; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu_time = rit->CudaElapsedMs(analyze_event); #endif double cpu_time = rit->CpuElapsedMs(analyze_event); diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 97ca34c0209d39..c55bcb71a7d432 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -24,11 +24,6 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) #endif -#ifdef PADDLE_WITH_MUSA -static void StreamCallbackFunc(gpuStream_t stream, - gpuError_t status, - void *user_data) -#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void *user_data) @@ -63,11 +58,6 @@ void StreamCallbackManager::AddCallback( PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); -#endif - #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -81,7 +71,7 @@ void StreamCallbackManager::AddCallback( template void 
StreamCallbackManager::Wait() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) platform::GpuStreamSync(stream_); #endif { @@ -98,8 +88,5 @@ template struct StreamCallbackManager; #ifdef PADDLE_WITH_HIP template struct StreamCallbackManager; #endif -#ifdef PADDLE_WITH_MUSA -template struct StreamCallbackManager; -#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 1cc0f0e5cf1e9a..7cd6930a9d0d0f 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -25,11 +25,6 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #include #include // NOLINT #include diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index a35095c98d4a29..66f17168ec01a5 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -22,6 +22,9 @@ namespace paddle { namespace primitive { namespace details { +// empty_shape means x.shape=[] +static std::vector empty_shape; + template Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { auto org_dtype = x.dtype(); @@ -345,62 +348,66 @@ std::tuple layer_norm_decomp( // cast dtype to float32 if dtype =float16 or bfloat16 if (need_cast) { - x_cast = cast(x_cast, phi::DataType::FLOAT32); + x_cast = cast(x_cast, DataType::FLOAT32); } auto x_dim = common::vectorize(x.dims()); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { axis.push_back(static_cast(i)); } - auto mean_ = mean_decomp(x_cast, IntArray(axis), true); + auto mean_ = mean_decomp(x_cast, axis, true); auto difference = x_cast - mean_; auto var_tmp1 = difference * difference; - auto variance = mean_decomp(var_tmp1, IntArray(axis), true); + auto variance = mean_decomp(var_tmp1, axis, true); auto var_tmp3 = variance + epsilon; auto rsqrt_var = elementwise_pow( - var_tmp3, - full(common::vectorize(var_tmp3.dims()), -0.5, var_tmp3.dtype())); + var_tmp3, full(empty_shape, -0.5, var_tmp3.dtype())); auto out = difference * rsqrt_var; auto scale_ptr = scale.get_ptr(); auto bias_ptr = bias.get_ptr(); - std::vector slice_shape; - for (int64_t i = begin_norm_axis; i < static_cast(x_dim.size()); - i++) { - slice_shape.push_back(x_dim[i]); + std::vector slice_shape_l; + std::vector slice_shape_r; + for (int64_t i = 0; i < static_cast(x_dim.size()); i++) { + if (i < begin_norm_axis) { + slice_shape_l.push_back(x_dim[i]); + } else { + slice_shape_r.push_back(x_dim[i]); + } } Tensor scale_cast; if (scale_ptr) { - if (slice_shape != scale_ptr->shape()) { - scale_cast = reshape(*scale_ptr, slice_shape); + if (slice_shape_r != scale_ptr->shape()) { + scale_cast = reshape(*scale_ptr, slice_shape_r); } else { scale_cast = *scale_ptr; } if (need_cast) { - scale_cast = cast(scale_cast, phi::DataType::FLOAT32); + scale_cast = cast(scale_cast, DataType::FLOAT32); } out = out * scale_cast; } Tensor bias_cast; if (bias_ptr) { - if (slice_shape != bias_ptr->shape()) { - bias_cast = reshape(*bias_ptr, slice_shape); + if (slice_shape_r != bias_ptr->shape()) { + bias_cast = reshape(*bias_ptr, slice_shape_r); } else { bias_cast = *bias_ptr; } if (need_cast) { - bias_cast = cast(bias_cast, phi::DataType::FLOAT32); + bias_cast = cast(bias_cast, DataType::FLOAT32); } out = out + bias_cast; } - mean_ = reshape(mean_, 
std::vector({-1})); - variance = reshape(variance, std::vector({-1})); + mean_ = reshape(mean_, slice_shape_l); + variance = reshape(variance, slice_shape_l); + // same as LayerNormInferMeta + // x: float32 --> out: float32, mean: float32, variance: float32 + // x: float16 --> out: float16, mean: float32, variance: float32 if (need_cast) { out = cast(out, org_dtype); - mean_ = cast(mean_, org_dtype); - variance = cast(variance, org_dtype); } return std::make_tuple(out, mean_, variance); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5306d282e797ca..4f761aa3c8536d 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -67,7 +67,7 @@ if(WITH_RPC) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc ${EXTERNAL_BRPC_DEPS} zlib phi common) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) endif() @@ -79,7 +79,7 @@ if(WITH_IPU) set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif() @@ -99,7 +99,6 @@ if(WITH_CUSTOM_DEVICE) if(NOT (WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL)) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) @@ -108,7 +107,7 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() @@ -163,7 +162,7 @@ endif() if(WITH_PYTHON) set(PYBIND_DEPS ${PYBIND_DEPS} process_group eager_reducer) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} process_group_nccl) endif() if(WITH_XPU_BKCL) @@ -247,7 +246,7 @@ if(WITH_RPC) set(PYBIND_SRCS rpc.cc ${PYBIND_SRCS}) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() @@ -266,7 +265,7 @@ if(WITH_PYTHON) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context) endif() @@ -287,20 +286,15 @@ if(WITH_PYTHON) eager_legacy_op_function_generator.cc) set(GENERATOR_DEPS ${PYBIND_DEPS}) list(REMOVE_DUPLICATES GENERATOR_DEPS) - if(NOT WITH_ARM) + if(WIN32) list(REMOVE_ITEM GENERATOR_DEPS python) endif() target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS}) - # if(NOT WIN32) - # add_executable(kernel_signature_generator kernel_signature_generator.cc) - # if(WITH_MUSA) - # # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/' - # target_link_options(kernel_signature_generator PRIVATE - # -Wl,-rpath,/usr/lib/x86_64-linux-gnu/) - # endif() - # target_link_libraries(kernel_signature_generator - # ${OP_FUNCTION_GENERETOR_DEPS}) - # endif() + if(NOT WIN32) + add_executable(kernel_signature_generator kernel_signature_generator.cc) + target_link_libraries(kernel_signature_generator + ${OP_FUNCTION_GENERETOR_DEPS}) + endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(eager_legacy_op_function_generator diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc 
index 6351d021dfe8cb..391dbabb1a2109 100644
--- a/paddle/fluid/pybind/communication.cc
+++ b/paddle/fluid/pybind/communication.cc
@@ -48,7 +48,7 @@ void BindCommContextManager(py::module *m) {
       .def_static("set_device_id",
                   &phi::distributed::CommContextManager::SetDeviceId,
                   py::call_guard())
-#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
       .def_static(
           "create_nccl_comm_context",
           &phi::distributed::CommContextManager::CreateNCCLCommContext,
diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc
index a07aef2fb69965..2a6c639735a2b4 100644
--- a/paddle/fluid/pybind/cuda_streams_py.cc
+++ b/paddle/fluid/pybind/cuda_streams_py.cc
@@ -24,7 +24,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 phi::CUDAStream *get_current_stream(int device_id) {
   if (device_id == -1) {
     device_id = phi::backends::gpu::GetCurrentDeviceId();
@@ -51,7 +51,7 @@ void BindCudaStream(py::module *m_ptr) {
   m.def(
       "_get_current_stream",
       [](int deviceId) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         return platform::get_current_stream(deviceId);
 #else
         PADDLE_THROW(
@@ -64,7 +64,7 @@ void BindCudaStream(py::module *m_ptr) {
   m.def(
       "_set_current_stream",
       [](phi::CUDAStream *stream) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         return platform::set_current_stream(stream);
 #else
         PADDLE_THROW(
@@ -75,7 +75,7 @@ void BindCudaStream(py::module *m_ptr) {
       py::return_value_policy::reference);
   m.def("_device_synchronize", [](int device_id) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (device_id == -1) {
       device_id = paddle::platform::GetCurrentDeviceId();
     }
@@ -84,8 +84,6 @@ void BindCudaStream(py::module *m_ptr) {
     paddle::platform::SetDeviceId(device_id);
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#elif defined(PADDLE_WITH_MUSA)
-    PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize());
 #else
     PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
 #endif
@@ -116,7 +114,7 @@ void BindCudaStream(py::module *m_ptr) {
           >>> s3 = paddle.device.cuda.Stream()
       )DOC")
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def(
           "wait_event",
          [](phi::CUDAStream &self, paddle::platform::CudaEvent &event) {
@@ -251,7 +249,7 @@ void BindCudaStream(py::module *m_ptr) {
      .def(
          "__init__",
          [](phi::CUDAStream &self, platform::CUDAPlace *place, int priority) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
            if (priority != 1 && priority != 2) {
              PADDLE_THROW(platform::errors::InvalidArgument(
                  "Priority should be 1(high) or 2(normal) "));
@@ -307,7 +305,7 @@ void BindCudaStream(py::module *m_ptr) {
          py::arg("device") = -1,
          py::arg("priority") = 2)
       .def("__init__", [](phi::CUDAStream &self) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         int device_id = platform::GetCurrentDeviceId();
         auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking;
         new (&self) phi::CUDAStream(
@@ -334,7 +332,7 @@ void BindCudaStream(py::module *m_ptr) {
           >>> event = paddle.device.cuda.Event()
       )DOC")
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def(
           "record",
           [](paddle::platform::CudaEvent &self, phi::CUDAStream *stream) {
@@ -401,7 +399,7 @@ void BindCudaStream(py::module *m_ptr) {
              bool enable_timing,
              bool blocking,
              bool interprocess) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
            unsigned int flags = platform::GenerateDeviceEventFlag(
                enable_timing, blocking, interprocess);
            new (&self) paddle::platform::CudaEvent(flags);
diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h
index 61f27960e25e9d..d10608a6e8ea96 100644
--- a/paddle/fluid/pybind/cuda_streams_py.h
+++ b/paddle/fluid/pybind/cuda_streams_py.h
@@ -17,7 +17,7 @@
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/core/cuda_stream.h"
 #else
 namespace phi {
@@ -29,7 +29,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 phi::CUDAStream* get_current_stream(int device_id = -1);
 phi::CUDAStream* set_current_stream(phi::CUDAStream* stream);
 #endif
diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc
index ea61387ae53e51..4577171fd77bb5 100644
--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/phi/api/all.h"
 #include "paddle/phi/core/distributed/types.h"
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #endif
@@ -1224,7 +1224,7 @@ void BindDistributed(py::module *m) {
           py::arg("id"),
           py::call_guard());
-#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
   py::class_>(
       *m, "ProcessGroupNCCL", ProcessGroup)
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 098c2fa4bdf778..894ede8db18d2b 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -322,7 +322,7 @@ void InitTensorWithNumpyValue(TensorObject* self,
 #endif
     SetTensorFromPyArray(impl_ptr, array, place, zero_copy);
   } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     phi::backends::gpu::SetDeviceId(place.device);
     VLOG(4) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
             << " from " << static_cast(place.device);
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 956de0e9d371a0..df84ca68b9182b 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -58,7 +58,7 @@ typedef SSIZE_T ssize_t;
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/pybind/cuda_streams_py.h"
 #endif
diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc
index e932ecb34201c7..2c01e122914aa4 100644
--- a/paddle/fluid/pybind/eager_math_op_patch.cc
+++ b/paddle/fluid/pybind/eager_math_op_patch.cc
@@ -139,7 +139,7 @@ std::set _complex_dtypes{
 void SetDevice(paddle::platform::Place place) {
   if (paddle::platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     phi::backends::gpu::SetDeviceId(place.device);
     VLOG(6) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
             << " from " << static_cast(place.device);
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 48a8fdc8daa700..584d1b8b58482a 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t;
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
 #include "paddle/common/ddim.h"
 #include "paddle/fluid/eager/amp_utils.h"
+#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/eager/eager_amp_auto_cast.h"
 #include "paddle/fluid/framework/python_headers.h"
@@ -318,13 +319,11 @@ static PyObject* tensor_method_numpy(TensorObject* self,
                         dense_tensor->Holder()->size());
     }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if (self->tensor.is_gpu()) {
     eager_gil_scoped_release guard;
 #if defined(PADDLE_WITH_CUDA)
     gpuMemcpyKind kind =
cudaMemcpyDeviceToHost; -#elif defined(PADDLE_WITH_MUSA) - gpuMemcpyKind kind = musaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; phi::DeviceContextPool::Instance().Get(self->tensor.place())->Wait(); @@ -1361,6 +1360,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Dealing with basic indexing + bool out_is_view = false; auto out = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1369,7 +1369,8 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); if (!has_advanced_index) { return ToPyObject(out); @@ -1377,7 +1378,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, // step3: Dealing with advanced indexing std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_tensor = dealWithAdvancedIndex(out, @@ -1387,7 +1388,9 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); if (transed_index.size() == 1 && transed_index[0].dtype() == phi::DataType::BOOL) { @@ -1417,14 +1420,14 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, if (pos_of_new_dim != 0) { std::vector perm(out.shape().size(), 0); - int tmp1 = pos_of_new_dim, tmp2 = 0, + int tmp1 = rank_of_new_dim, tmp2 = 0, tmp3 = pos_of_new_dim + rank_of_new_dim; for (int i = 0; i < static_cast(out.shape().size()); ++i) { - if (i < rank_of_new_dim) { + if (i < pos_of_new_dim) { perm[i] = - tmp1++; // range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim) - } else if (i >= rank_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { - perm[i] = tmp2++; // range(0, pos_of_new_dim) + tmp1++; // range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim) + } else if (i >= pos_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { + perm[i] = tmp2++; // range(0, rank_of_new_dim) } else { perm[i] = tmp3++; // range(pos_of_new_dim + rank_of_new_dim, out.ndim) } @@ -1609,12 +1612,9 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Parse values - PADDLE_ENFORCE( - PyCheckTensor(value_obj), - platform::errors::InvalidArgument("The value must be a Tensor")); - + std::vector values; paddle::Tensor value_tensor = - reinterpret_cast(value_obj)->tensor; + dealWithValues(tensor, value_obj, &values, has_advanced_index); if (!has_advanced_index) { // use set_value OP if there is no advanced index @@ -1622,45 +1622,60 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // Release gil and do tracing py::gil_scoped_release release; // use inplace set_value_ operator - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "set_value"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "set_value"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = 
cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "set_value"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "set_value"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // step3.1: Only basic indexing, use OP set_value. - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { - ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); - } - self->tensor = set_value_with_tensor__ad_func(self->tensor, - value_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + // step3.1: Only basic indexing, use OP set_value. + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); } + self->tensor = set_value_with_tensor__ad_func(self->tensor, + value_tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes); + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. + if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } + } + } else { + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor); + } + self->tensor = set_value__ad_func(self->tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes, + {1}, + values); } } else { // step3.2: Case for there are advanced indexing. @@ -1670,6 +1685,7 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // 3. assign values to the sliced result by index_put OP; // 4. transpose back and assign the result to original tensor by set_value // OP. 
+ bool out_is_view = false; paddle::Tensor sub_tensor = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1678,12 +1694,13 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; - int pos_of_new_dim = 0, rank_of_new_dim = 0; + int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_sub_tensor = dealWithAdvancedIndex(sub_tensor, @@ -1693,61 +1710,127 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); // Release gil and do tracing py::gil_scoped_release release; - - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "index_put"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "index_put"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "index_put"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "index_put"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // TODO(zoooo0820) 1.Using inplace version index_put - // 2.Remove following code after backward bug fixed. - transed_sub_tensor = assign_ad_func(transed_sub_tensor); + if (value_tensor.dims().size() > 1 && pos_of_new_dim != 0) { + value_tensor = transpose_ad_func(value_tensor, trans_dim); + } - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor( - &mesh, self->tensor, transed_sub_tensor, value_tensor)) { - ConvertAllInputsToDistTensor( - mesh, self->tensor, transed_sub_tensor, value_tensor); - } + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor( + &mesh, self->tensor, transed_sub_tensor, value_tensor)) { + ConvertAllInputsToDistTensor( + mesh, self->tensor, transed_sub_tensor, value_tensor); + } - transed_sub_tensor = - index_put_ad_func(transed_sub_tensor, transed_index, value_tensor); - - paddle::Tensor transback_sub_tensor = - transpose_ad_func(transed_sub_tensor, trans_back_dim); - - self->tensor = set_value_with_tensor__ad_func(self->tensor, - transback_sub_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. 
- if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + if (transed_index.size() == 1 && + transed_index[0].dtype() == phi::DataType::BOOL && + transed_index[0].shape().size() == self->tensor.shape().size()) { + if (value_tensor.shape() != self->tensor.shape()) { + value_tensor = expand_ad_func(value_tensor, self->tensor.shape()); + } + transed_sub_tensor = + where__ad_func(logical_not_ad_func(transed_index[0]), + transed_sub_tensor, + value_tensor); + } else { + transed_sub_tensor = + index_put__ad_func(transed_sub_tensor, transed_index, value_tensor); + } + + if (out_is_view) { + // NOTE(zoooo0820): if out_is_view is true, it is a case of + // combined-indexing setitem, i.e. firstly we get a view of + // self->tensor, then modified it with inplace api index_put_ For now, + // in design of Paddle, the forward result is right. But the backward + // edge can not be established because the Base Tensor cannot sense + // whether it has been modified by other operations. Following codes are + // to add a new node (set_value_with_tensor_grad) to record the backward + // edge, with out ad_function which needs to do the forward calculation. + + egr::AutogradMeta* x_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(self->tensor); + egr::AutogradMeta* values_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(transed_sub_tensor); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, x_autograd_meta, values_autograd_meta); + // Node Declaration + std::shared_ptr grad_node; + // Set grad_node before API Call + if (require_any_grad) { + paddle::Tensor transback_sub_tensor = + transpose_ad_func(transed_sub_tensor, trans_back_dim); + const auto& values_tmp = + (require_any_grad && transback_sub_tensor.is_dense_tensor() && + !std::dynamic_pointer_cast( + transback_sub_tensor.impl()) + ->meta() + .is_contiguous()) + ? paddle::Tensor( + std::make_shared( + std::move(paddle::experimental::Trans2Contiguous( + *(std::dynamic_pointer_cast( + transback_sub_tensor.impl()))))), + transback_sub_tensor.mutable_autograd_meta()) + : transback_sub_tensor; + + grad_node = std::shared_ptr( + new SetValueWithTensorGradNode(1, 2)); // NOLINT + grad_node->SetAttributestarts(slice_starts); + grad_node->SetAttributeends(slice_ends); + grad_node->SetAttributesteps(slice_strides); + grad_node->SetAttributeaxes(slice_axes); + grad_node->SetAttributedecrease_axes(decrease_axis); + grad_node->SetAttributenone_axes(none_axes); + grad_node->SetTensorWrappervalues(values_tmp); + + paddle::memory::LogDeviceMemoryStats( + egr::Controller::Instance().GetExpectedPlace(), + "set_value_with_tensor"); + egr::EagerUtils::CheckInplace( + self->tensor, x_autograd_meta, require_any_grad); + egr::EagerUtils::PassStopGradient(false, x_autograd_meta); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(self->tensor, 0); + grad_node->SetGradOutMeta(transback_sub_tensor, 1); + if (x_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(x_autograd_meta, 0); + egr::EagerUtils::SetHistory(x_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(self->tensor, 0); + } + } + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. 
+ if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } } } } diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 05374b08d8fc25..520fe09bc710cd 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -40,7 +40,7 @@ void BindGenerator(py::module* m_ptr) { [](std::shared_ptr& self) { return self->current_seed; }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) // NOTE(shenliang03): Due to the inability to serialize mt19937_64 // type, resulting in a problem with precision under the cpu. diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 7199eb13c579bc..8ba56008fb2b0b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -869,7 +869,7 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) py::class_()); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_>( @@ -951,7 +951,7 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) py::class_) @@ -373,10 +373,10 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP return true; #else return platform::GetGPUComputeCapability(place.device) >= 53; @@ -384,7 +384,7 @@ void BindPlace(pybind11::module &m) { // NOLINT }); m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 80 support bfloat16 -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP return false; #else return platform::GetGPUComputeCapability(place.device) >= 80; @@ -546,7 +546,7 @@ void BindPlace(pybind11::module &m) { // NOLINT cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 7b9002feed8ed7..3ba9ec3239c371 100644 --- 
a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -268,7 +268,7 @@ void ConcatTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, @@ -325,7 +325,7 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7949d7c1c33946..feafd1fa4333e6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -134,7 +134,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -146,11 +146,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -238,7 +238,7 @@ bool IsCompiledWithAVX() { } bool IsCompiledWithCUDA() { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) return false; #else return true; @@ -279,15 +279,7 @@ bool IsCompiledWithMPIAWARE() { } bool IsCompiledWithROCM() { -#if !defined(PADDLE_WITH_HIP) - return false; -#else - return true; -#endif -} - -bool IsCompiledWithMUSA() { -#if !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP return false; #else return true; @@ -683,16 +675,16 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { string::join_strings(ops, ','))); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static int GetNCCLVersion() { -// #if NCCL_VERSION_CODE >= 2304 +#if NCCL_VERSION_CODE >= 2304 int ver; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetVersion(&ver)); return ver; -// #else -// PADDLE_THROW(platform::errors::External( -// "Cannot get NCCL version successfully when nccl version < 2.3.4")); -// #endif +#else + PADDLE_THROW(platform::errors::External( + "Cannot get NCCL version successfully when nccl version < 2.3.4")); +#endif } #endif @@ -938,7 +930,7 @@ PYBIND11_MODULE(libpaddle, m) { return 
self->OutputMeta(); }); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; @@ -948,7 +940,7 @@ PYBIND11_MODULE(libpaddle, m) { }); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) m.def("nccl_version", &GetNCCLVersion); #endif @@ -990,7 +982,7 @@ PYBIND11_MODULE(libpaddle, m) { if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } @@ -1264,7 +1256,7 @@ All parameter, weight, gradient are variables in Paddle. "get_fetch_list", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) .def( "get_communicator", [](Variable &self) -> platform::Communicator * { @@ -1732,7 +1724,7 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1766,7 +1758,7 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPinnedPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1774,7 +1766,7 @@ All parameter, weight, gradient are variables in Paddle. return new paddle::platform::CUDAPinnedDeviceContext(place); #endif }); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif m.def("get_all_device_type", []() { @@ -2114,7 +2106,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_avx", IsCompiledWithAVX); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_rocm", IsCompiledWithROCM); - m.def("is_compiled_with_musa", IsCompiledWithMUSA); m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice); m.def("is_compiled_with_ipu", IsCompiledWithIPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); @@ -2393,7 +2384,7 @@ All parameter, weight, gradient are variables in Paddle. 
py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { @@ -2439,7 +2430,7 @@ All parameter, weight, gradient are variables in Paddle. return ostr.str(); }); -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) && !defined(_WIN32) +#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); @@ -2521,7 +2512,7 @@ All parameter, weight, gradient are variables in Paddle. .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo) .def("get_version", &paddle::platform::ProfilerResult::GetVersion) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx) .def("get_device_property", &paddle::platform::ProfilerResult::GetDeviceProperty); @@ -2678,7 +2669,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_op_info_recorder", &phi::EnableOpInfoRecorder); m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("set_cublas_switch", phi::SetAllowTF32Cublas); m.def("get_cublas_switch", phi::AllowTF32Cublas); m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn); @@ -2966,7 +2957,7 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_BOX_PS BindBoxWrapper(&m); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) BindNCCLWrapper(&m); #endif #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 918d2eeae4272a..919a3a4650d3e7 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -26,9 +26,11 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -345,11 +347,13 @@ static paddle::Tensor getTensorWithBasicIndexing( std::vector* decrease_axis, std::vector* none_axes, std::vector* infer_flags, - bool* use_strided_slice) { + bool* use_strided_slice, + bool* out_is_view) { paddle::Tensor out; if (slice_axes->empty()) { out = tensor; } else { + *out_is_view = true; if (!(*use_strided_slice)) { eager_gil_scoped_release guard; out = slice_ad_func(tensor, @@ -370,6 +374,7 @@ static paddle::Tensor getTensorWithBasicIndexing( } } if (!none_axes->empty()) { + *out_is_view = true; eager_gil_scoped_release guard; // Deal with cases that decrease_axes is not empty // For example: @@ -397,9 +402,9 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector* transed_index, std::vector* trans_back_dim, int* pos_of_new_dim, - int* rank_of_new_dim) { - std::vector trans_dim; - + int* rank_of_new_dim, + std::vector* trans_dim, + bool* out_is_view) { int p = 0; for (size_t i = 0; i < advanced_index_dim->size(); ++i) { auto index_dim = (*advanced_index_dim)[i]; @@ -408,30 +413,28 @@ static paddle::Tensor dealWithAdvancedIndex( // advanced_index_dim auto index = (*advanced_index)[p++]; - if (!is_for_setitem) { - if (index_dim == 0) { - // case 1: advanced indices at axis 0, the new dim will be at first. - *pos_of_new_dim = 0; - } else if (index_dim > 0 && trans_dim.size() > 0 && - trans_dim[trans_dim.size() - 1] != index_dim - 1) { - // case 2: there are not adjacent advanced indices, the new dim will - // be at first. - *pos_of_new_dim = 0; - } else { - *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); - } - *rank_of_new_dim = - std::max(*rank_of_new_dim, static_cast(index.shape().size())); + if (index_dim == 0) { + // case 1: advanced indices at axis 0, the new dim will be at first. + *pos_of_new_dim = 0; + } else if (index_dim > 0 && trans_dim->size() > 0 && + (*trans_dim)[trans_dim->size() - 1] != index_dim - 1) { + // case 2: there are not adjacent advanced indices, the new dim will + // be at first. 
+ *pos_of_new_dim = 0; + } else { + *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); } + *rank_of_new_dim = + std::max(*rank_of_new_dim, static_cast(index.shape().size())); - trans_dim.push_back(index_dim); + trans_dim->push_back(index_dim); transed_index->push_back(std::move(index)); } } for (size_t i = 0; i < tensor.shape().size(); ++i) { if ((*advanced_index_dim)[i] == -1) { - trans_dim.push_back(i); + trans_dim->push_back(i); } } @@ -441,19 +444,20 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector original_dim_order(tensor.shape().size()); std::iota(original_dim_order.begin(), original_dim_order.end(), 0); - if (original_dim_order == trans_dim) { + if (original_dim_order == *trans_dim) { transed_tensor = tensor; } else { - transed_tensor = transpose_ad_func(tensor, trans_dim); + *out_is_view = true; + transed_tensor = transpose_ad_func(tensor, *trans_dim); } if (is_for_setitem) { - trans_back_dim->resize(trans_dim.size()); + trans_back_dim->resize(trans_dim->size()); std::iota(trans_back_dim->begin(), trans_back_dim->end(), 0); std::sort(trans_back_dim->begin(), trans_back_dim->end(), [&trans_dim](int left, int right) { - return trans_dim[left] < trans_dim[right]; + return (*trans_dim)[left] < (*trans_dim)[right]; }); } return transed_tensor; @@ -511,5 +515,104 @@ static void ParseBoolAndBroadcastIndices( } } +static paddle::Tensor dealWithValues(const paddle::Tensor& tensor, + PyObject* value_obj, + std::vector* values, + const bool trans_to_tensor) { + paddle::Tensor value_tensor; + if (PyCheckTensor(value_obj)) { + value_tensor = reinterpret_cast(value_obj)->tensor; + } else if (py::isinstance(value_obj)) { + paddle::Tensor value_tensor_tmp( + std::make_shared(), + egr::Controller::Instance().GenerateUniqueName()); + py::object value_obj_tmp(py::handle(value_obj), true); + py::object value = value_obj_tmp; + if (tensor.dtype() == phi::DataType::FLOAT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::BOOL) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "When assign a numpy.np value to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, " + "float32, float64, complex64, complex128, int32 or int64, " + "please check the type of tensor.")); + } + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, + tensor.place(), + false); + value_tensor = value_tensor_tmp; + } else { + py::object value_obj_tmp(py::handle(value_obj), true); + // convert the value to self data type + if (py::isinstance(value_obj_tmp) || 
+ py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp) || + PyComplex_Check(value_obj)) { + if (tensor.dtype() == phi::DataType::FLOAT32 || + tensor.dtype() == phi::DataType::FLOAT16 || + tensor.dtype() == phi::DataType::BFLOAT16) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT32 || + tensor.dtype() == phi::DataType::INT16 || + tensor.dtype() == phi::DataType::INT8 || + tensor.dtype() == phi::DataType::UINT8) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::BOOL) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + values->push_back(value_obj_tmp.cast>()); + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + values->push_back(value_obj_tmp.cast>()); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Value type error. The assign value allows " + "Tensor, numpy.ndarray, integer, float, complex or bool, " + "but received %s.", + Py_TYPE(value_obj))); + } + + if (trans_to_tensor) { + value_tensor = + full_ad_func({1}, (*values)[0], tensor.dtype(), tensor.place()); + } + } + return value_tensor; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index cce09cf7fdfd54..44983e3e13df7f 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -117,7 +117,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -126,11 +126,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -1101,7 +1101,7 @@ void BindTensor(pybind11::module &m) { // NOLINT .def("height", &phi::SelectedRows::height) .def("set_rows", [](phi::SelectedRows &self, std::vector rows) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) self.set_rows(rows); #else std::vector new_rows(rows); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 622d054645eff1..dd5bd7f1d91c4d 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -37,7 +37,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -325,7 +325,7 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { #endif } else if (platform::is_gpu_place(self.place()) || platform::is_cuda_pinned_place(self.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const T *a = self.data(); auto p = self.place(); paddle::memory::Copy( @@ -362,7 +362,7 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { #endif } else if (platform::is_gpu_place(self->place()) || platform::is_cuda_pinned_place(self->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy( @@ -457,7 +457,7 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -466,9 +466,6 @@ void SetTensorFromPyArrayT( #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - paddle::platform::GpuMemcpySync( - dst, array.data(), array.nbytes(), musaMemcpyHostToDevice); #else paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); @@ -793,7 +790,7 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, output->mutable_data(place, self.dtype()); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(place, self.dtype()); } else if ((platform::is_gpu_place(place))) { @@ -1050,13 +1047,11 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) gpuMemcpyKind kind = cudaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; -#elif defined(PADDLE_WITH_MUSA) - gpuMemcpyKind kind = musaMemcpyDeviceToHost; #endif phi::DenseTensor cpu_tensor; platform::CPUPlace cpu_place; diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 8636de26c4161e..09b4337ecb40b3 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -51,13 +51,6 @@ if(WITH_GPU) list(APPEND PHI_DEPS external_error_proto) endif() -if(WITH_MUSA) - set(DEPENDENT_LIBRARIES "") - list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmudnn.so") - list(APPEND PHI_DEPS 
${DEPENDENT_LIBRARIES}) -endif() - - if(WITH_ASCEND_CL) list(APPEND PHI_DEPS npu_hccl) endif() @@ -141,11 +134,11 @@ if(WITH_GPU) SRCS ${PHI_SRCS} DEPS ${PHI_DEPS}) elseif(WITH_ROCM) - hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) - target_link_libraries(phi ${PHI_DEPS}) -elseif(WITH_MUSA) - musa_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) - target_link_libraries(phi ${PHI_DEPS}) + hip_library( + phi ${PHI_BUILD_TYPE} + SRCS ${PHI_SRCS} + DEPS ${PHI_DEPS}) + elseif(WITH_XPU_KP) xpu_library( phi ${PHI_BUILD_TYPE} diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index a6f8b3949c20a4..86ba7b9cf75764 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -99,7 +99,7 @@ namespace paddle { */ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * Get the current CUDA stream for the passed CUDA device. */ diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 3ef838410bed07..a6e78686e1e4ce 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -24,11 +24,6 @@ limitations under the License. */ using gpuStream_t = cudaStream_t; #endif -#ifdef PADDLE_WITH_MUSA -#include -using gpuStream_t = musaStream_t; -#endif - #ifdef PADDLE_WITH_HIP #include using gpuStream_t = hipStream_t; @@ -418,7 +413,7 @@ class PADDLE_API Tensor final { */ void set_impl(std::shared_ptr&& impl); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Get the stream where the tensor is currently located * This is a deprecated method and may be removed in the future! diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 2d5d1a49f02e77..ed64ff1c937b64 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -385,7 +385,7 @@ void TransStride(phi::DeviceContext* dev_ctx, delete from; return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] { @@ -437,7 +437,7 @@ void TransStrideLegacy(phi::DeviceContext* dev_ctx, })); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] { @@ -489,7 +489,7 @@ void TransStride(phi::DeviceContext* dev_ctx, delete from[i]; continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to[i]->dtype(), "StridedCopyKernel", ([&] { diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index b2c3f9f28ee79c..ee1e21a58e2f1b 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/core/cuda_stream.h" #endif @@ -75,7 +75,7 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { return const_cast(&dev_ctx->GetAllocator()); // NOLINT } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_ENFORCE_EQ(place.GetType(), phi::AllocationType::GPU, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 2ea7ae4f5e3d84..03ac68d3319915 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -116,7 +116,7 @@ phi::DenseTensor CastDataType(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { @@ -158,7 +158,7 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); @@ -196,7 +196,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, << " dst_place: " << dst_place; auto& pool = phi::DeviceContextPool::Instance(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // NOTE(yy): TransDataPlace should wait for computation of input. 
if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) { pool.Get(tensor.place())->Wait(); @@ -247,7 +247,7 @@ phi::DenseTensor Trans2Contiguous(const phi::DenseTensor& tensor) { if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return TensorContiguous(*dev_ctx, tensor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return TensorContiguous(*dev_ctx, tensor); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index ee88e9fb1b0c88..49c47cbcce363c 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -376,7 +376,7 @@ void Tensor::set_impl(std::shared_ptr &&impl) { impl_ = std::move(impl); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); auto *gpu_context = DeviceContextPool::Instance().Get( diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index 0aad2a6da5fdce..9c11e88260c1df 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include #else -#include +#include #endif #endif @@ -33,26 +33,26 @@ namespace paddle { PD_REGISTER_API(from_blob) phi::Place GetPlaceFromPtr(void* data) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -// #ifdef PADDLE_WITH_CUDA -// #if CUDA_VERSION >= 10000 - musaPointerAttributes attr; - musaError_t status = musaPointerGetAttributes(&attr, data); - if (status == musaSuccess && attr.type == musaMemoryTypeDevice) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + cudaPointerAttributes attr; + cudaError_t status = cudaPointerGetAttributes(&attr, data); + if (status == cudaSuccess && attr.type == cudaMemoryTypeDevice) { return phi::GPUPlace(attr.device); } -// #else -// PADDLE_THROW( -// phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " -// "supported when CUDA version >= 10.0.")); -// #endif -// #else -// hipPointerAttribute_t attr; -// hipError_t status = hipPointerGetAttributes(&attr, data); -// if (status == hipSuccess && attr.memoryType == hipMemoryTypeDevice) { -// return phi::GPUPlace(attr.device); -// } -// #endif +#else + PADDLE_THROW( + phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " + "supported when CUDA version >= 10.0.")); +#endif +#else + hipPointerAttribute_t attr; + hipError_t status = hipPointerGetAttributes(&attr, data); + if (status == hipSuccess && attr.memoryType == hipMemoryTypeDevice) { + return phi::GPUPlace(attr.device); + } +#endif #endif return phi::CPUPlace(); } diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index 70fb4d948986c4..eb765ebdcb9dd3 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -27,10 
+27,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/core/cuda_stream.h" #endif @@ -64,7 +62,7 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } int device() const { return device_; } @@ -83,7 +81,7 @@ class Event { uint64_t cpu_ns_; bool visited_status_{false}; std::string attr_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -139,14 +137,12 @@ class MemEvent { }; class CudaEvent { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) public: CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); -#elif defined(PADDLE_WITH_MUSA) - musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -156,8 +152,6 @@ class CudaEvent { explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); -#elif defined(PADDLE_WITH_MUSA) - musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -167,8 +161,6 @@ class CudaEvent { ~CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventDestroy(event_); -#elif defined(PADDLE_WITH_MUSA) - musaEventDestroy(event_); #else cudaEventDestroy(event_); #endif @@ -177,8 +169,6 @@ class CudaEvent { void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif @@ -193,14 +183,6 @@ class CudaEvent { if (err == hipErrorNotReady) { return false; } -#elif defined(PADDLE_WITH_MUSA) - gpuError_t err = musaEventQuery(event_); - if (err == musaSuccess) { - return true; - } - if (err == musaErrorNotReady) { - return false; - } #else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { @@ -217,8 +199,6 @@ class CudaEvent { void Synchronize() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventSynchronize(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif @@ -228,8 +208,6 @@ class CudaEvent { private: #ifdef PADDLE_WITH_HIP unsigned int flags_ = hipEventDefault; -#elif defined(PADDLE_WITH_MUSA) - unsigned int flags_ = musaEventDefault; #else unsigned int flags_ = cudaEventDefault; #endif diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 3a87826337465b..81339a24c50de8 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -944,8 +944,6 @@ func : gather_nd_grad composite : gather_nd_grad(x, index, out_grad, x_grad) no_need_buffer : x - data_transform : - skip_transform : index - backward_op : 
gaussian_inplace_grad forward : gaussian_inplace(Tensor x, float mean=0, float std=1.0, int seed=0) -> Tensor(out) @@ -1762,8 +1760,8 @@ optional : boxes_num - backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) + forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor values, Tensor out, Tensor out_grad, int axis, str reduce, bool include_self) output : Tensor(arr_grad), Tensor(values_grad) infer_meta : func : GeneralBinaryGradInferMeta diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index 3769155eb27e11..c7ec9ace290ac7 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -425,6 +425,7 @@ def source_include(header_file_path, fw_header_file_path): #include "{fw_header_file_path}" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 04cf57a88bb7cb..3f11781dfe88eb 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -614,14 +614,14 @@ - backward_op : set_value_grad forward : set_value (Tensor x, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes, int64_t[] shape, Scalar[] values) -> Tensor(out) - args : (Tensor out_grad) + args : (Tensor out_grad, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) output : Tensor(x_grad) infer_meta: func: UnchangedInferMeta param: [out_grad] kernel: - func: assign - param: [out_grad] + func: set_value_with_scalar_grad + param: [out_grad, starts, ends, steps, axes, decrease_axes, none_axes] - backward_op : set_value_with_tensor_grad forward: set_value_with_tensor (Tensor x, Tensor values, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) -> Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e4bbb15073f418..dfcdf65673e208 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2432,7 +2432,7 @@ outputs : out : Result attrs : - {axis : Axis, reduce : Reduce} + {axis : Axis, reduce : Reduce, include_self: Include_self} - op : pylayer backward : pylayer_grad diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 092b3d71a60b4d..efc1b17714a854 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2032,7 +2032,7 @@ backward : psroi_pool_grad - op : put_along_axis - args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) output : Tensor(out) infer_meta : func : UnchangedInferMeta diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index db0d463bc67156..ed47487553bee7 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ 
b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT APPLE AND NOT WIN32) list(APPEND BACKENDS_SRCS device_code.cc) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) @@ -16,9 +16,6 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) endif() - if(WITH_MUSA) - list(APPEND BACKENDS_SRCS gpu/musa/musa_info.cc) - endif() endif() if(WITH_XPU) @@ -52,7 +49,6 @@ list( if(WITH_GPU OR WITH_ROCM - OR WITH_MUSA OR WITH_CUSTOM_DEVICE) list(APPEND BACKENDS_SRCS device_base.cc) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index 9e8ecd48e453c5..7824fc3b160b10 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index a0537c779e52f7..52f0ced275ac5e 100644 --- a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -28,7 +28,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ bool AllowTF32Cublas(); @@ -47,7 +47,7 @@ struct DefaultDeviceContextType { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext; diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 48bedd1bd939e4..ddbfc60f19f083 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -383,7 +383,7 @@ class CustomDevice : public DeviceInterface { void* ptr = nullptr; const auto device = &devices_pool[dev_id]; - if (!pimpl_->unified_memory_allocate) { + if (!pimpl_->host_memory_allocate) { PADDLE_THROW(phi::errors::Unavailable( "MemoryAllocateHost is not supported on %s.", Type())); } else { diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index ac16a69aa7bee7..d160b5034f9986 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -78,8 +78,7 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -89,14 +88,12 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GPUDeviceCode::CheckAvailableStatus(); #endif } -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP static bool CheckCUDADriverResult(hipError_t result, std::string caller, @@ -104,13 +101,6 @@ static bool CheckCUDADriverResult(hipError_t result, if (result != hipSuccess) { const char* error = nullptr; error = dynload::hipGetErrorString(result); -#elif defined(PADDLE_WITH_MUSA) -static bool CheckCUDADriverResult(MUresult result, - std::string caller, - std::string kernel_name = "") { - if (result != MUSA_SUCCESS) { - const char* error = nullptr; - dynload::muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -140,8 +130,6 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); -#elif defined(PADDLE_WITH_MUSA) - mtrtcResult nvrtc_result = dynload::mtrtcVersion(&nvrtc_major, &nvrtc_minor); #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -152,9 +140,6 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { -#elif defined(PADDLE_WITH_MUSA) - MUresult driver_result = dynload::muDriverGetVersion(&driver_version); - if (driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { @@ -168,8 +153,6 @@ void GPUDeviceCode::CheckAvailableStatus() { << "." << nvrtc_minor; #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { -#elif defined(PADDLE_WITH_MUSA) - if (nvrtc_result != MTRTC_SUCCESS || driver_result != MUSA_SUCCESS) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -180,9 +163,6 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { -#elif defined(PADDLE_WITH_MUSA) - if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), - "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { @@ -222,8 +202,6 @@ static std::string FindCUDAIncludePath() { #ifdef PADDLE_WITH_HIP cuda_include_path = "/opt/rocm/include"; -#elif defined(PADDLE_WITH_MUSA) - cuda_include_path = "/usr/local/musa/include"; #else cuda_include_path = "/usr/local/cuda/include"; #endif @@ -251,8 +229,6 @@ GPUDeviceCode::GPUDeviceCode(const Place& place, name_ = name; #ifdef PADDLE_WITH_HIP kernel_ = "#include \n" + kernel; -#elif defined(PADDLE_WITH_MUSA) - kernel_ = kernel; #else kernel_ = kernel; #endif @@ -281,12 +257,12 @@ bool GPUDeviceCode::Compile(bool include_path) { auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); int compute_capability = dev_ctx->GetComputeCapability(); - std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"}; + std::vector options = {"-std=c++11"}; std::string include_option; if (include_path) { std::string cuda_include_path = FindCUDAIncludePath(); if (!cuda_include_path.empty()) { - include_option = "--include-path=" + cuda_include_path; + include_option = "-I" + cuda_include_path; options.push_back(include_option.c_str()); } } @@ -342,86 +318,6 @@ bool GPUDeviceCode::Compile(bool include_path) { "hipModuleGetFunction")) { return false; } -#elif 
defined(PADDLE_WITH_MUSA) - mtrtcProgram program; - if (!CheckNVRTCResult(dynload::mtrtcCreateProgram(&program, - kernel_.c_str(), // buffer - name_.c_str(), // name - 0, // numHeaders - nullptr, // headers - nullptr), // includeNames - "mtrtcCreateProgram")) { - return false; - } - - // Compile the program for specified compute_capability - auto* dev_ctx = reinterpret_cast( - DeviceContextPool::Instance().Get(place_)); - int compute_capability = dev_ctx->GetComputeCapability(); - std::string compute_flag = - "--gpu-architecture=compute_" + std::to_string(compute_capability); - std::vector options = {"--std=c++11", compute_flag.c_str()}; - std::string include_option; - if (include_path) { - std::string cuda_include_path = FindCUDAIncludePath(); - if (!cuda_include_path.empty()) { - include_option = "--include-path=" + cuda_include_path; - options.push_back(include_option.c_str()); - } - } - mtrtcResult compile_result = - dynload::mtrtcCompileProgram(program, // program - options.size(), // numOptions - options.data()); // options - if (compile_result == MTRTC_ERROR_COMPILATION) { - // Obtain compilation log from the program - size_t log_size; - if (!CheckNVRTCResult(dynload::mtrtcGetProgramLogSize(program, &log_size), - "mtrtcGetProgramLogSize")) { - return false; - } - std::vector log; - log.resize(log_size + 1); - if (!CheckNVRTCResult(dynload::mtrtcGetProgramLog(program, log.data()), - "nvrtcGetProgramLog")) { - return false; - } - LOG(WARNING) << "JIT compiling of MUSA code failed:" - << "\n Kernel name: " << name_ << "\n Kernel body:\n" - << kernel_ << "\n Compiling log: " << log.data(); - - return false; - } - - // Obtain PTX from the program - size_t ptx_size; - if (!CheckNVRTCResult(dynload::mtrtcGetMUSASize(program, &ptx_size), - "mtrtcGetMUSASize")) { - return false; - } - ptx_.resize(ptx_size + 1); - if (!CheckNVRTCResult(dynload::mtrtcGetMUSA(program, ptx_.data()), - "mtrtcGetMUSA")) { - return false; - } - - if (!CheckNVRTCResult(dynload::mtrtcDestroyProgram(&program), - "mtrtcDestroyProgram")) { - return false; - } - - if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), - "muModuleLoadData", - name_)) { - return false; - } - - if (!CheckCUDADriverResult( - dynload::muModuleGetFunction(&function_, module_, name_.c_str()), - "muModuleGetFunction", - name_)) { - return false; - } #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -540,22 +436,6 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { hipSuccess, errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_EQ( - dynload::muLaunchKernel(function_, - num_blocks, - 1, - 1, // grid dim - num_threads_, - 1, - 1, // block dim - 0, // shared memory - dev_ctx->stream(), // stream - args->data(), // arguments - nullptr), - MUSA_SUCCESS, - errors::External("Fail to launch kernel %s (in muLaunchKernel.)", - name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, @@ -584,18 +464,6 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } - return true; -} -#elif defined(PADDLE_WITH_MUSA) -bool GPUDeviceCode::CheckNVRTCResult(mtrtcResult result, std::string function) { - if (result != MTRTC_SUCCESS) { - LOG_FIRST_N(WARNING, 1) - << "Call " << function << " for < " << name_ - << " > failed: " << dynload::mtrtcGetErrorString(result); - return false; - } - return true; -} 
#else bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { @@ -604,9 +472,9 @@ bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } +#endif return true; } #endif -#endif } // namespace phi diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 964124076e6057..8debb4dc9c45ee 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -26,20 +26,11 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cuda_driver.h" #include "paddle/phi/backends/dynload/nvrtc.h" #endif -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/musa_driver.h" -#include "paddle/phi/backends/dynload/musartc.h" -#endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" #include "paddle/phi/backends/dynload/rocm_driver.h" #endif -#ifdef PADDLE_WITH_MUSA -// #include "paddle/phi/backends/dynload/hiprtc.h" -// #include "paddle/phi/backends/dynload/rocm_driver.h" -#endif - namespace phi { class DeviceCode { @@ -57,7 +48,7 @@ class DeviceCode { std::string kernel_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUDeviceCode : public DeviceCode { public: explicit GPUDeviceCode(const Place& place, @@ -77,8 +68,6 @@ class GPUDeviceCode : public DeviceCode { private: #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); -#elif defined(PADDLE_WITH_MUSA) - bool CheckNVRTCResult(mtrtcResult result, std::string function); #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif @@ -93,9 +82,6 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP hipModule_t module_; hipFunction_t function_; -#elif defined(PADDLE_WITH_MUSA) - MUmodule module_; - MUfunction function_; #else CUmodule module_; CUfunction function_; diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index d731b6b6d1ecf9..c65e06364acd0e 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -36,7 +36,7 @@ inline size_t Alignment(size_t size, if (place.GetType() == phi::AllocationType::CPU) { alignment = phi::backends::cpu::CpuMinChunkSize(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = phi::backends::xpu::XPUMinChunkSize(); diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 2ea6f11aa53a65..2db75d7022f0a5 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -30,17 +30,6 @@ if(WITH_ROCM) rocsparse.cc) endif() -if(WITH_MUSA) - list( - APPEND - MUSA_SRCS - mublas.cc - mudnn.cc - murand.cc - mufft.cc - musparse.cc) -endif() - # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on macOS, and only do an early test on Linux and Windows. 
if(NOT APPLE) @@ -57,15 +46,6 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() - if(WITH_MUSA) - list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) - if(WITH_MCCL) - list(APPEND MUSA_SRCS mccl.cc) - endif() - if(CUPTI_FOUND) - list(APPEND MUSA_SRCS cupti.cc) - endif() - endif() endif() if(TENSORRT_FOUND) @@ -113,8 +93,6 @@ if(WITH_ROCM) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS}) elseif(WITH_GPU) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS}) -elseif(WITH_MUSA) - collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${MUSA_SRCS}) else() collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS}) endif() diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 987f0eefc4397f..bdb9e120d2884b 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -102,29 +102,6 @@ PHI_DEFINE_string(rccl_dir, "dlopen will search rccl from LD_LIBRARY_PATH"); #endif -#ifdef PADDLE_WITH_MUSA - -PHI_DEFINE_string(mudnn_dir, - "", - "Specify path for loading libmudnn.so. For instance, " - "/usr/local/musa/lib. If empty [default], dlopen " - "will search mudnn from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(musa_dir, - "", - "Specify path for loading rocm library, such as libmublas, " - "For instance, /usr/local/musa/lib. " - "If default, dlopen will search rocm from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(mccl_dir, - "", - "Specify path for loading mccl library, such as libmccl.so. " - "For instance, /usr/local/musa/lib. If default, " - "dlopen will search rccl from LD_LIBRARY_PATH"); -#endif - - - #ifdef PADDLE_WITH_XPU PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); #endif @@ -349,8 +326,6 @@ void* GetCublasDsoHandle() { FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmublas.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -392,9 +367,6 @@ void* GetCUDNNDsoHandle() { FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libmudnn.so", false, {cuda_lib_path}); #else return GetDsoHandleFromSearchPath( FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); @@ -419,8 +391,6 @@ void* GetCurandDsoHandle() { FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmurand.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -436,12 +406,6 @@ void* GetROCFFTDsoHandle() { } #endif -#ifdef PADDLE_WITH_MUSA -void* GetMUFFTDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmufft.so"); -} -#endif - void* GetNvjpegDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); @@ -472,8 +436,6 @@ void* GetCusparseDsoHandle() { FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so"); -#elif 
defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusparse.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); #endif @@ -484,8 +446,6 @@ void* GetNVRTCDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusart.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif @@ -496,8 +456,6 @@ void* GetCUDADsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusa.so", false); #elif defined(_WIN32) char system32_dir[MAX_PATH]; GetSystemDirectory(system32_dir, MAX_PATH); @@ -555,9 +513,6 @@ void* GetNCCLDsoHandle() { "You may need to install 'rccl' from ROCM official website: " "https://rocmdocs.amd.com/en/latest/Installation_Guide/" "Installation-Guide.html before install PaddlePaddle."); -#elif defined(PADDLE_WITH_MUSA) - std::string warning_msg( - "You may need to install 'mccl' from musa official website."); #else std::string warning_msg( "You may need to install 'nccl2' from NVIDIA official website: " @@ -571,9 +526,6 @@ void* GetNCCLDsoHandle() { #elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); -#elif defined(PADDLE_WITH_MUSA) && defined(PADDLE_WITH_MCCL) - return GetDsoHandleFromSearchPath( - FLAGS_mccl_dir, "libmccl.so", true, {}, warning_msg); #else return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index 02da303b2020f9..6ddeb1386410f0 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -48,7 +48,6 @@ void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); void* GetXPTIDsoHandle(); -void* GetMUFFTDsoHandle(); void SetPaddleLibPath(const std::string&); diff --git a/paddle/phi/backends/dynload/mccl.cc b/paddle/phi/backends/dynload/mccl.cc deleted file mode 100644 index 3bf5fd8c985d12..00000000000000 --- a/paddle/phi/backends/dynload/mccl.cc +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/backends/dynload/mccl.h" - -namespace phi { -namespace dynload { - -std::once_flag mccl_dso_flag; -void *mccl_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); - -MCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) - -MCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) - -MCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) - -MCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mccl.h b/paddle/phi/backends/dynload/mccl.h deleted file mode 100644 index 4e2eaeea00afa3..00000000000000 --- a/paddle/phi/backends/dynload/mccl.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag mccl_dso_flag; -extern void* mccl_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(&::__name); \ - std::call_once(mccl_dso_flag, []() { \ - mccl_dso_handle = phi::dynload::GetNCCLDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(mccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define MCCL_RAND_ROUTINE_EACH(__macro) \ - __macro(mcclCommInitAll); \ - __macro(mcclGetUniqueId); \ - __macro(mcclCommInitRank); \ - __macro(mcclCommAbort); \ - __macro(mcclCommDestroy); \ - __macro(mcclCommCount); \ - __macro(mcclCommCuDevice); \ - __macro(mcclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclGroupStart); \ - __macro(mcclAllGather); \ - __macro(mcclGroupEnd); \ - __macro(mcclReduce); \ - __macro(mcclReduceScatter); \ - __macro(mcclCommGetAsyncError); \ - __macro(mcclGetErrorString); - -MCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(mcclBroadcast); -MCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); -MCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ - __macro(mcclSend); \ - __macro(mcclRecv); -MCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ - __macro(mcclRedOpCreatePreMulSum); \ - __macro(mcclRedOpDestroy); -MCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.cc b/paddle/phi/backends/dynload/mublas.cc deleted file mode 100644 index fd05d45414b47e..00000000000000 --- a/paddle/phi/backends/dynload/mublas.cc +++ 
/dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/mublas.h" - -namespace phi { -namespace dynload { -std::once_flag mublas_dso_flag; -void *mublas_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R2 -MUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R3 -MUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R4 -MUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); -#endif -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.h b/paddle/phi/backends/dynload/mublas.h deleted file mode 100644 index 9f8db31bd2d060..00000000000000 --- a/paddle/phi/backends/dynload/mublas.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - - -#include -#include - -#include // NOLINT -#include - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag mublas_dso_flag; -extern void *mublas_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load mublas routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using blas_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(mublas_dso_flag, []() { \ - mublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(mublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define MUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(mublasSaxpy); \ - __macro(mublasDaxpy); \ - __macro(mublasCaxpy); \ - __macro(mublasZaxpy); \ - __macro(mublasSscal); \ - __macro(mublasDscal); \ - __macro(mublasScopy); \ - __macro(mublasDcopy); \ - __macro(mublasSgemv); \ - __macro(mublasDgemv); \ - __macro(mublasCgemv); \ - __macro(mublasZgemv); \ - __macro(mublasSgemm); \ - __macro(mublasDgemm); \ - __macro(mublasCgemm); \ - __macro(mublasZgemm); \ - __macro(mublasSgeam); \ - __macro(mublasDgeam); \ - __macro(mublasStrsm); \ - __macro(mublasDtrsm); \ - __macro(mublasCtrsm); \ - __macro(mublasZtrsm); \ - __macro(mublasCreate); \ - __macro(mublasDestroy); \ - __macro(mublasSetStream); \ - __macro(mublasSetPointerMode); \ - __macro(mublasGetPointerMode); \ - __macro(mublasSgemmBatched); \ - __macro(mublasDgemmBatched); \ - __macro(mublasCgemmBatched); \ - __macro(mublasZgemmBatched); \ - __macro(mublasStrsmBatched); \ - __macro(mublasDtrsmBatched); \ - __macro(mublasCtrsmBatched); \ - __macro(mublasZtrsmBatched); - // __macro(mublasHgemm); - //__macro(mublasSgemmEx); - //__macro(mublasSgetrfBatched); - //__macro(mublasSgetriBatched); - //__macro(mublasDgetrfBatched); - //__macro(mublasDgetriBatched); - //__macro(mublasSmatinvBatched); - //__macro(mublasDmatinvBatched); - //__macro(mublasSgetrsBatched); -// __macro(mublasDgetrsBatched); - -MUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#define MUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(mublasGemmEx); \ - __macro(mublasSgemmStridedBatched); \ - __macro(mublasDgemmStridedBatched); \ - __macro(mublasCgemmStridedBatched); \ - __macro(mublasZgemmStridedBatched); \ - __macro(mublasHgemmStridedBatched); - -MUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#define MUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ - __macro(mublasSetMathMode); \ - __macro(mublasGetMathMode); - -MUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#define MUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ - __macro(mublasGemmBatchedEx); - // __macro(mublasGemmStridedBatchedEx); - -MUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#undef DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.cc b/paddle/phi/backends/dynload/mudnn.cc deleted file mode 100644 index cd193688bc347d..00000000000000 --- a/paddle/phi/backends/dynload/mudnn.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mudnn.h" - -namespace phi { -namespace dynload { - -bool HasCUDNN() { - // note: mudnn.so is not imported by dlopen, which will be linked - // in cmakelist.txt. - return true; -} - -void mudnnCreate(Handle** handle, int device) { *handle = new Handle(device); } - -void mudnnSetStream(Handle* handle, musaStream_t stream) { - handle->SetStream(stream); -} - -void mudnnDestroy(Handle* handle) { - if (handle != nullptr) { - delete handle; - handle = nullptr; - } -} - -} // namespace dynload -} // namespace phi -#endif diff --git a/paddle/phi/backends/dynload/mudnn.h b/paddle/phi/backends/dynload/mudnn.h deleted file mode 100644 index d05f32a8b5df05..00000000000000 --- a/paddle/phi/backends/dynload/mudnn.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include - -namespace phi { -namespace dynload { - -using ::musa::dnn::BatchNorm; -using ::musa::dnn::Convolution; -using ::musa::dnn::Handle; -using ::musa::dnn::MemoryHandler; -using ::musa::dnn::Pooling; -using ::musa::dnn::Softmax; -using ::musa::dnn::Tensor; - -extern bool HasCUDNN(); - -void mudnnCreate(Handle** handle, int device); - -void mudnnSetStream(Handle* handle, musaStream_t stream); - -void mudnnDestroy(Handle* handle); - -} // namespace dynload -} // namespace phi -#endif diff --git a/paddle/phi/backends/dynload/mufft.cc b/paddle/phi/backends/dynload/mufft.cc deleted file mode 100644 index 9e30463ea39fa1..00000000000000 --- a/paddle/phi/backends/dynload/mufft.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/mufft.h" - -#include "paddle/phi/core/enforce.h" - -namespace phi { -namespace dynload { -std::once_flag mufft_dso_flag; -void* mufft_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); - -bool HasMUFFT() { - std::call_once(mufft_dso_flag, - []() { mufft_dso_handle = GetMUFFTDsoHandle(); }); - return mufft_dso_handle != nullptr; -} - -void EnforceMUFFTLoaded(const char* fn_name) { - PADDLE_ENFORCE_NOT_NULL( - mufft_dso_handle, - phi::errors::PreconditionNotMet( - "Cannot load mufft shared library. 
Cannot invoke method %s.", - fn_name)); -} - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mufft.h b/paddle/phi/backends/dynload/mufft.h deleted file mode 100644 index 70bfdd4c1efd18..00000000000000 --- a/paddle/phi/backends/dynload/mufft.h +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag mufft_dso_flag; -extern void* mufft_dso_handle; - -extern void EnforceMUFFTLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_MUFFT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using mufft_func = decltype(&::__name); \ - std::call_once(mufft_dso_flag, []() { \ - mufft_dso_handle = phi::dynload::GetMUFFTDsoHandle(); \ - }); \ - EnforceMUFFTLoaded(#__name); \ - static void* p_##__name = dlsym(mufft_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed mufft functions in HPPL - * different mufft version has different interfaces - **/ -#define MUFFT_FFT_ROUTINE_EACH(__macro) \ - __macro(mufftPlan1d); \ - __macro(mufftPlan2d); \ - __macro(mufftPlan3d); \ - __macro(mufftPlanMany); \ - __macro(mufftMakePlan1d); \ - __macro(mufftMakePlan2d); \ - __macro(mufftMakePlan3d); \ - __macro(mufftMakePlanMany); \ - __macro(mufftEstimate1d); \ - __macro(mufftEstimate2d); \ - __macro(mufftEstimate3d); \ - __macro(mufftEstimateMany); \ - __macro(mufftCreate); \ - __macro(mufftGetSize1d); \ - __macro(mufftGetSize2d); \ - __macro(mufftGetSize3d); \ - __macro(mufftGetSizeMany); \ - __macro(mufftGetSize); \ - __macro(mufftSetWorkArea); \ - __macro(mufftSetAutoAllocation); \ - __macro(mufftExecC2C); \ - __macro(mufftExecR2C); \ - __macro(mufftExecC2R); \ - __macro(mufftExecZ2Z); \ - __macro(mufftExecD2Z); \ - __macro(mufftExecZ2D); \ - __macro(mufftSetStream); \ - __macro(mufftDestroy); \ - __macro(mufftGetVersion); \ - __macro(mufftGetProperty); \ - __macro(mufftXtSetGPUs); \ - __macro(mufftXtMalloc); \ - __macro(mufftXtMemcpy); \ - __macro(mufftXtFree); \ - __macro(mufftXtExecDescriptorC2C); \ - __macro(mufftXtExecDescriptorR2C); \ - __macro(mufftXtExecDescriptorC2R); \ - __macro(mufftXtExecDescriptorZ2Z); \ - __macro(mufftXtExecDescriptorD2Z); \ - __macro(mufftXtExecDescriptorZ2D); \ - __macro(mufftXtQueryPlan); \ - __macro(mufftXtSetCallback); \ - __macro(mufftXtClearCallback); \ - __macro(mufftXtMakePlanMany); \ - __macro(mufftXtGetSizeMany); \ - __macro(mufftXtExec); \ - __macro(mufftXtExecDescriptor); -MUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUFFT_WRAP) - - -inline const char *mufftGetErrorString(mufftResult_t status) { - switch (status) { - case 
MUFFT_SUCCESS: - return "'MUFFT_SUCCESS'. The mufft operation was successful."; - case MUFFT_INVALID_PLAN: - return "'MUFFT_INVALID_PLAN'. mufft was passed an invalid plan handle."; - case MUFFT_ALLOC_FAILED: - return "'MUFFT_ALLOC_FAILED'. mufft failed to allocate GPU or CPU " - "memory."; - case MUFFT_INVALID_TYPE: - return "'MUFFT_INVALID_TYPE'. No longer used."; - case MUFFT_INVALID_VALUE: - return "'MUFFT_INVALID_VALUE'. User specified an invalid pointer or " - "parameter."; - case MUFFT_INTERNAL_ERROR: - return "'MUFFT_INTERNAL_ERROR'. Driver or internal mufft library " - "error."; - case MUFFT_EXEC_FAILED: - return "'MUFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."; - case MUFFT_SETUP_FAILED: - return "'MUFFT_SETUP_FAILED'. The mufft library failed to initialize."; - case MUFFT_INVALID_SIZE: - return "'MUFFT_INVALID_SIZE'. User specified an invalid transform size."; - case MUFFT_UNALIGNED_DATA: - return "'MUFFT_UNALIGNED_DATA'. No longer used."; - case MUFFT_INCOMPLETE_PARAMETER_LIST: - return "'MUFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."; - case MUFFT_INVALID_DEVICE: - return "'MUFFT_INVALID_DEVICE'. Execution of a plan was on different " - "GPU than plan creation."; - case MUFFT_PARSE_ERROR: - return "'MUFFT_PARSE_ERROR'. Internal plan database error."; - case MUFFT_NO_WORKSPACE: - return "'MUFFT_NO_WORKSPACE'. No workspace has been provided prior to " - "plan execution."; - case MUFFT_NOT_IMPLEMENTED: - return "'MUFFT_NOT_IMPLEMENTED'. Function does not implement " - "functionality for parameters given."; - case MUFFT_LICENSE_ERROR: - return "'MUFFT_LICENSE_ERROR'. Operation is not supported for " - "parameters given."; - case MUFFT_NOT_SUPPORTED: - return "'MUFFT_NOT_SUPPORTED'. Operation is not supported for " - "parameters given."; - default: - return "mufft_STATUS_UNKNOWN_ERROR"; - } -} - -} // namespace dynload -} // namespace phi - -#endif diff --git a/paddle/phi/backends/dynload/murand.cc b/paddle/phi/backends/dynload/murand.cc deleted file mode 100644 index bbeeb7bcd58981..00000000000000 --- a/paddle/phi/backends/dynload/murand.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/murand.h" - -namespace phi { -namespace dynload { - -std::once_flag murand_dso_flag; -void *murand_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/murand.h b/paddle/phi/backends/dynload/murand.h deleted file mode 100644 index 28380cd9423f04..00000000000000 --- a/paddle/phi/backends/dynload/murand.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { -extern std::once_flag murand_dso_flag; -extern void *murand_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - murandStatus_t operator()(Args... args) { \ - using murandFunc = decltype(&::__name); \ - std::call_once(murand_dso_flag, []() { \ - murand_dso_handle = phi::dynload::GetCurandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(murand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define MURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(murandCreateGenerator); \ - __macro(murandSetStream); \ - __macro(murandSetPseudoRandomGeneratorSeed); \ - __macro(murandGenerateUniform); \ - __macro(murandGenerateUniformDouble); \ - __macro(murandGenerateNormal); \ - __macro(murandDestroyGenerator); - -MURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.cc b/paddle/phi/backends/dynload/musa_driver.cc deleted file mode 100644 index 2173a8d6cdd819..00000000000000 --- a/paddle/phi/backends/dynload/musa_driver.cc +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/musa_driver.h" - -namespace phi { -namespace dynload { - -std::once_flag musa_dso_flag; -void* musa_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSA_ROUTINE_EACH(DEFINE_WRAP); - -bool HasCUDADriver() { - std::call_once(musa_dso_flag, []() { musa_dso_handle = GetCUDADsoHandle(); }); - return musa_dso_handle != nullptr; -} - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.h b/paddle/phi/backends/dynload/musa_driver.h deleted file mode 100644 index 3534ab8213c936..00000000000000 --- a/paddle/phi/backends/dynload/musa_driver.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag musa_dso_flag; -extern void* musa_dso_handle; -extern bool HasCUDADriver(); - -#define DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using musa_func = decltype(&::__name); \ - std::call_once(musa_dso_flag, []() { \ - musa_dso_handle = phi::dynload::GetCUDADsoHandle(); \ - }); \ - static void* p_##__name = dlsym(musa_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed musa driver functions - **/ -#define MUSA_ROUTINE_EACH(__macro) \ - __macro(muInit); \ - __macro(muDriverGetVersion); \ - __macro(muGetErrorString); \ - __macro(muModuleLoadData); \ - __macro(muModuleGetFunction); \ - __macro(muModuleUnload); \ - __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ - __macro(muLaunchKernel); \ - __macro(muCtxCreate); \ - __macro(muCtxGetCurrent); \ - __macro(muDeviceGetCount); \ - __macro(muDevicePrimaryCtxGetState); \ - __macro(muDeviceGetAttribute); \ - __macro(muDeviceGet); - -MUSA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSA_WRAP); - -#undef DECLARE_DYNAMIC_LOAD_MUSA_WRAP - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.cc b/paddle/phi/backends/dynload/musartc.cc deleted file mode 100644 index 9cd25270a10167..00000000000000 --- a/paddle/phi/backends/dynload/musartc.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/musartc.h" - -namespace phi { -namespace dynload { - -std::once_flag musartc_dso_flag; -void* musartc_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSARTC_ROUTINE_EACH(DEFINE_WRAP); - -bool HasNVRTC() { - std::call_once(musartc_dso_flag, - []() { musartc_dso_handle = GetNVRTCDsoHandle(); }); - return musartc_dso_handle != nullptr; -} - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.h b/paddle/phi/backends/dynload/musartc.h deleted file mode 100644 index ee85bebc503ec0..00000000000000 --- a/paddle/phi/backends/dynload/musartc.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// #include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" -#include "paddle/phi/core/enforce.h" - -// TODO(MTAI): The following musa runtime compiling functions are not supported -// now. Here empty implementations are given temporarily. When compiler MCC -// supports these functions, we will replace them. -typedef struct _mtrtcProgram *mtrtcProgram; - -typedef enum { - MTRTC_SUCCESS = 0, - MTRTC_ERROR_OUT_OF_MEMORY = 1, - MTRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, - MTRTC_ERROR_INVALID_INPUT = 3, - MTRTC_ERROR_INVALID_PROGRAM = 4, - MTRTC_ERROR_INVALID_OPTION = 5, - MTRTC_ERROR_COMPILATION = 6, - MTRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, - MTRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, - MTRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, - MTRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, - MTRTC_ERROR_INTERNAL_ERROR = 11 -} mtrtcResult; - -inline mtrtcResult mtrtcVersion(int *major, int *minor) { - PADDLE_THROW( - phi::errors::Unimplemented("mtrtcVersion is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline const char *mtrtcGetErrorString(mtrtcResult result) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcGetErrorString is not supported on MUSA now!")); - return "mtrtcGetErrorString is not supported on MUSA now!"; -} - -inline mtrtcResult mtrtcCompileProgram(mtrtcProgram prog, - int numOptions, - const char *const *options) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcCompileProgram is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcCreateProgram(mtrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, - const char *const *headers, - const char *const *includeNames) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcCreateProgram is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcDestroyProgram(mtrtcProgram *prog) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcDestroyProgram is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetMUSA(mtrtcProgram prog, char *musa) { - PADDLE_THROW( - phi::errors::Unimplemented("mtrtcGetMUSA is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetMUSASize(mtrtcProgram prog, size_t *musaSizeRet) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcGetMUSASize is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetProgramLog(mtrtcProgram prog, char *log) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcGetProgramLog is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetProgramLogSize(mtrtcProgram prog, - size_t *logSizeRet) { - PADDLE_THROW(phi::errors::Unimplemented( - 
"mtrtcGetProgramLogSize is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -namespace phi { -namespace dynload { - -extern std::once_flag musartc_dso_flag; -extern void *musartc_dso_handle; -extern bool HasNVRTC(); - -#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using musartc_func = decltype(&::__name); \ - std::call_once(musartc_dso_flag, []() { \ - musartc_dso_handle = phi::dynload::GetNVRTCDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(musartc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed musartc functions - **/ -#define MUSARTC_ROUTINE_EACH(__macro) \ - __macro(mtrtcVersion); \ - __macro(mtrtcGetErrorString); \ - __macro(mtrtcCompileProgram); \ - __macro(mtrtcCreateProgram); \ - __macro(mtrtcDestroyProgram); \ - __macro(mtrtcGetMUSA); \ - __macro(mtrtcGetMUSASize); \ - __macro(mtrtcGetProgramLog); \ - __macro(mtrtcGetProgramLogSize) - -MUSARTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); - -#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musparse.cc b/paddle/phi/backends/dynload/musparse.cc deleted file mode 100644 index 40d766f963c40c..00000000000000 --- a/paddle/phi/backends/dynload/musparse.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/musparse.h" - -namespace phi { -namespace dynload { - -std::once_flag musparse_dso_flag; -void *musparse_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace phi - diff --git a/paddle/phi/backends/dynload/musparse.h b/paddle/phi/backends/dynload/musparse.h deleted file mode 100644 index e63182943190d5..00000000000000 --- a/paddle/phi/backends/dynload/musparse.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { -extern std::once_flag musparse_dso_flag; -extern void *musparse_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - musparseStatus_t operator()(Args... args) { \ - using Func = decltype(&::__name); \ - std::call_once(musparse_dso_flag, []() { \ - musparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(musparse_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#if defined(PADDLE_WITH_MUSA) -#define MUSPARSE_ROUTINE_EACH(__macro) \ - __macro(musparseCreateHandle); \ - __macro(musparseDestroyHandle); \ - __macro(musparseSetStream); \ - __macro(musparseCreateMatDescr); \ - __macro(musparseSnnz); \ - __macro(musparseDnnz); \ - __macro(musparseSetMatType); \ - __macro(musparseSetMatIndexBase); \ - __macro(musparseCreateCsr); \ - __macro(musparseCreateCoo); \ - __macro(musparseCreateDnMat); \ - __macro(musparseCreateDnVec); \ - __macro(musparseSpMM); \ - __macro(musparseDestroySpMat); \ - __macro(musparseDestroyDnMat); \ - __macro(musparseDestroyDnVec); \ - __macro(musparseSpMV); \ - __macro(musparseSDDMM_bufferSize); \ - __macro(musparseSDDMM_preprocess); \ - __macro(musparseSDDMM); \ - __macro(musparseDnMatSetStridedBatch); \ - __macro(musparseCooSetStridedBatch); \ - __macro(musparseCsrSetStridedBatch); - -MUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) - -#endif // PADDLE_WITH_MUSA - -#undef DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP -} // namespace dynload -} // namespace phi - diff --git a/paddle/phi/backends/dynload/nccl.h b/paddle/phi/backends/dynload/nccl.h index a5759b67e8df78..91b6f5dcd58dc5 100644 --- a/paddle/phi/backends/dynload/nccl.h +++ b/paddle/phi/backends/dynload/nccl.h @@ -42,18 +42,18 @@ extern void* nccl_dso_handle; #define NCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -67,7 +67,7 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h index 651cc9c68b2438..e1018a3f253fa5 100644 --- a/paddle/phi/backends/dynload/rccl.h +++ b/paddle/phi/backends/dynload/rccl.h @@ -42,18 +42,18 @@ extern void* rccl_dso_handle; #define RCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ 
__macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -67,7 +67,7 @@ RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); RCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index 2b733c01bc01b5..e1f3492f768702 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -72,25 +72,6 @@ using cufftHandle = int; // Forward declaration of NCCL types. using ncclComm_t = struct ncclComm *; - - - -// Forward declaration of MUSA runtime types. -using musaStream_t = struct MUstream_st *; -using musaEvent_t = struct MUevent_st *; -using mublasHandle_t = struct _mublasHandle_t *; -namespace musa { -namespace dnn { -struct Handle; -} // namespace dnn -} // namespace musa -using mudnnHandle_t = musa::dnn::Handle *; -using musparseHandle_t = struct _musparse_handle *; -using mublasLtHandle_t = struct mublasLtContext *; -using mcclComm_t = struct mcclComm *; - - - /// Forward declaration of ROCM types. #include diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f250fb365ce85b..8d46c3e34cabdf 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -51,16 +51,6 @@ limitations under the License. */ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #endif // PADDLE_WITH_HIP - -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/mublas.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" -#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#endif // PADDLE_WITH_MUSA - - // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. 
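
For orientation: the DECLARE_DYNAMIC_LOAD_*_WRAP macros touched in the dynload headers above (musartc, musparse, nccl, rccl) all expand to the same lazy-binding pattern — open the shared library once under a std::once_flag, resolve the symbol once with dlsym, then forward the call through a casted function pointer. A minimal standalone sketch of that pattern, using a hypothetical library and symbol (libfoo.so / foo_add) rather than any real Paddle loader:

    #include <dlfcn.h>
    #include <mutex>

    static std::once_flag foo_dso_flag;     // guards the one-time dlopen
    static void* foo_dso_handle = nullptr;  // handle to the shared library

    // Calls ::foo_add(int, int) out of libfoo.so, binding it lazily on first use.
    int CallFooAdd(int a, int b) {
      using foo_add_t = int (*)(int, int);
      std::call_once(foo_dso_flag, [] {
        foo_dso_handle = dlopen("libfoo.so", RTLD_NOW | RTLD_GLOBAL);  // load once
      });
      static void* p_foo_add = dlsym(foo_dso_handle, "foo_add");       // resolve once
      return reinterpret_cast<foo_add_t>(p_foo_add)(a, b);
    }

Each macro invocation in the headers above generates one such functor per API symbol, so switching a routine name from mcclAllReduce back to ncclAllReduce only changes which symbol dlsym resolves at run time.
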
#include "unsupported/Eigen/CXX11/Tensor" @@ -129,9 +119,6 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); @@ -156,11 +143,6 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void* user_data) #endif -#ifdef PADDLE_WITH_MUSA -static void StreamCallbackFunc(gpuStream_t stream, - gpuError_t status, - void* user_data) -#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void* user_data) @@ -188,8 +170,6 @@ void DnnWorkspaceHandle::RunFuncSync( std::lock_guard guard(*mtx_); #ifdef PADDLE_WITH_HIP auto status = hipMalloc(&workspace_ptr, size); -#elif defined(PADDLE_WITH_MUSA) - auto status = musaMalloc(&workspace_ptr, size); #else auto status = cudaMalloc(&workspace_ptr, size); #endif @@ -198,8 +178,6 @@ void DnnWorkspaceHandle::RunFuncSync( phi::backends::gpu::GpuStreamSync(stream_); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaFree(workspace_ptr)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); #endif @@ -270,9 +248,9 @@ struct GPUContext::Impl { DestoryInternalWorkspace(); DestoryInternalEigenDevice(); phi::DestroySparseHandle(sparse_handle_); - // phi::DestroySolverHandle(solver_handle_); + phi::DestroySolverHandle(solver_handle_); phi::DestroyDnnHandle(dnn_handle_); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { // NOTE(liyurui): It is not recommend calling CUDA runtime API // in destructor. 
Since we can not ensure the release order of @@ -286,7 +264,7 @@ struct GPUContext::Impl { phi::DestroyBlasHandle(blas_handle_); phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); - // phi::DestroyBlasLtHandle(blaslt_handle_); + phi::DestroyBlasLtHandle(blaslt_handle_); } if (stream_owned_ && stream_) { delete stream_; @@ -447,24 +425,24 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_creator_ = std::move(handle_creator); } - // void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } + void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } - // void SetBlasLtHandle(std::function&& handle_creator) { - // blaslt_handle_creator_ = std::move(handle_creator); - // } + void SetBlasLtHandle(std::function&& handle_creator) { + blaslt_handle_creator_ = std::move(handle_creator); + } - // blasLtHandle_t GetBlasLtHandle() { - // std::call_once(flag_blaslt_, [&]() { - // if (!blaslt_handle_) { - // if (!blaslt_handle_creator_) - // phi::InitBlasLtHandle(&blaslt_handle_); - // else - // blaslt_handle_ = blaslt_handle_creator_(); - // } - // }); - // PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); - // return blaslt_handle_; - // } + blasLtHandle_t GetBlasLtHandle() { + std::call_once(flag_blaslt_, [&]() { + if (!blaslt_handle_) { + if (!blaslt_handle_creator_) + phi::InitBlasLtHandle(&blaslt_handle_); + else + blaslt_handle_ = blaslt_handle_creator_(); + } + }); + PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); + return blaslt_handle_; + } dnnHandle_t GetDnnHandle() { std::call_once(flag_dnn_, [&]() { @@ -486,11 +464,6 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (owned_ && dnn_handle_ != nullptr) { - phi::dynload::mudnnDestroy(dnn_handle_); - dnn_handle_ = nullptr; - } #else if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); @@ -505,25 +478,25 @@ struct GPUContext::Impl { dnn_handle_creator_ = std::move(handle_creator); } - // solverHandle_t GetSolverHandle() { - // std::call_once(flag_slover_, [&]() { - // if (!solver_handle_) { - // if (!solver_handle_creator_) { - // phi::InitSolverHandle(&solver_handle_, stream()); - // } else { - // solver_handle_ = solver_handle_creator_(); - // } - // } - // }); - // PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); - // return solver_handle_; - // } + solverHandle_t GetSolverHandle() { + std::call_once(flag_slover_, [&]() { + if (!solver_handle_) { + if (!solver_handle_creator_) { + phi::InitSolverHandle(&solver_handle_, stream()); + } else { + solver_handle_ = solver_handle_creator_(); + } + } + }); + PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); + return solver_handle_; + } - // void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } + void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } - // void SetSolverHandle(std::function&& handle_creator) { - // solver_handle_creator_ = std::move(handle_creator); - // } + void SetSolverHandle(std::function&& handle_creator) { + solver_handle_creator_ = std::move(handle_creator); + } sparseHandle_t GetSparseHandle() { std::call_once(flag_sparse_, [&]() { @@ -556,9 +529,6 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#elif defined(PADDLE_WITH_MUSA) - musaError_t e_sync = musaSuccess; - e_sync = 
musaStreamSynchronize(stream()); #else // PADDLE_WITH_HIP cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) @@ -577,23 +547,21 @@ struct GPUContext::Impl { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream(), ev, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream(), ev, 0)); #endif } - mcclComm_t GetNcclComm() const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + ncclComm_t GetNcclComm() const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr."); return nccl_comm_; #endif return nullptr; } - void SetNcclComm(mcclComm_t comm) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + void SetNcclComm(ncclComm_t comm) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_comm_ = comm; #endif } @@ -710,8 +678,6 @@ struct GPUContext::Impl { void RecordEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(ev, stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream())); #endif @@ -734,12 +700,6 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); #endif - -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); -#endif - #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -752,7 +712,7 @@ struct GPUContext::Impl { } void WaitStreamCallback() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) phi::backends::gpu::GpuStreamSync(stream()); #endif { @@ -804,12 +764,12 @@ struct GPUContext::Impl { std::function blas_tensor_core_handle_creator_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; std::function blas_tf32_tensor_core_handle_creator_{nullptr}; - // blasLtHandle_t blaslt_handle_{nullptr}; - // std::function blaslt_handle_creator_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; + std::function blaslt_handle_creator_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; std::function dnn_handle_creator_{nullptr}; - // solverHandle_t solver_handle_{nullptr}; - // std::function solver_handle_creator_{nullptr}; + solverHandle_t solver_handle_{nullptr}; + std::function solver_handle_creator_{nullptr}; sparseHandle_t sparse_handle_{nullptr}; std::function sparse_handle_creator_{nullptr}; DnnWorkspaceHandle* workspace_{nullptr}; @@ -823,7 +783,7 @@ struct GPUContext::Impl { std::once_flag flag_tensorcore_cublas_; std::once_flag flag_eigen_device_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs // both within and across nodes. @@ -832,7 +792,7 @@ struct GPUContext::Impl { // NOTE: Distributed communicator, distributed framework manages its // resources. 
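
The GetBlasLtHandle and GetSolverHandle accessors restored a little earlier follow a common lazy-initialization shape: a std::once_flag guards creation, and an optional caller-supplied creator can replace the default initializer. A reduced sketch of that shape, using a made-up FooHandle type rather than the real blasLt or solver handle types:

    #include <functional>
    #include <mutex>

    struct FooHandle {};  // stand-in for a vendor handle type

    class LazyFooHandle {
     public:
      // Optional: inject a custom creator before first use.
      void SetCreator(std::function<FooHandle*()> creator) { creator_ = std::move(creator); }

      FooHandle* Get() {
        std::call_once(flag_, [&] {
          handle_ = creator_ ? creator_() : new FooHandle();  // default-create if no creator was injected
        });
        return handle_;
      }

     private:
      std::once_flag flag_;
      std::function<FooHandle*()> creator_;
      FooHandle* handle_ = nullptr;
    };

The creator hook exists for the same reason as in the code above: an outer resource manager can hand the context a pre-built handle instead of letting the context create, own, and later destroy one of its own.
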
- mcclComm_t nccl_comm_{nullptr}; + ncclComm_t nccl_comm_{nullptr}; #endif mutable std::mutex blas_mtx_; @@ -879,13 +839,13 @@ blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); } -// blasLtHandle_t GPUContext::cublaslt_handle() const { -// return impl_->GetBlasLtHandle(); -// } +blasLtHandle_t GPUContext::cublaslt_handle() const { + return impl_->GetBlasLtHandle(); +} -// solverHandle_t GPUContext::cusolver_dn_handle() const { -// return impl_->GetSolverHandle(); -// } +solverHandle_t GPUContext::cusolver_dn_handle() const { + return impl_->GetSolverHandle(); +} sparseHandle_t GPUContext::cusparse_handle() const { return impl_->GetSparseHandle(); @@ -954,9 +914,9 @@ void GPUContext::AddStreamCallback( void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); } -mcclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } +ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } -void GPUContext::set_nccl_comm(mcclComm_t comm) { impl_->SetNcclComm(comm); } +void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); } void GPUContext::Init() { impl_->allocator_ = const_cast(&this->GetAllocator()); // NOLINT @@ -1005,13 +965,13 @@ void GPUContext::SetBlasTF32Handle(std::function&& func) { impl_->SetBlasTF32Handle(std::move(func)); } -// void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { -// impl_->SetBlasLtHandle(blaslt); -// } +void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { + impl_->SetBlasLtHandle(blaslt); +} -// void GPUContext::SetBlasLtHandle(std::function&& func) { -// impl_->SetBlasLtHandle(std::move(func)); -// } +void GPUContext::SetBlasLtHandle(std::function&& func) { + impl_->SetBlasLtHandle(std::move(func)); +} void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); @@ -1021,13 +981,13 @@ void GPUContext::SetDnnHandle(std::function&& func) { impl_->SetDnnHandle(std::move(func)); } -// void GPUContext::SetSolverHandle(solverHandle_t handle) { -// impl_->SetSolverHandle(handle); -// } +void GPUContext::SetSolverHandle(solverHandle_t handle) { + impl_->SetSolverHandle(handle); +} -// void GPUContext::SetSolverHandle(std::function&& func) { -// impl_->SetSolverHandle(std::move(func)); -// } +void GPUContext::SetSolverHandle(std::function&& func) { + impl_->SetSolverHandle(std::move(func)); +} void GPUContext::SetSparseHandle(sparseHandle_t handle) { impl_->SetSparseHandle(handle); @@ -1086,7 +1046,7 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { void GPUContext::ClearDnnAttr() { return impl_->ClearDnnAttr(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GPUPinnedContext::GPUPinnedContext() { eigen_device_ = std::make_unique(); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 19eb5dd05cd3c1..8cd0d414bc105b 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU_KP) #include @@ -109,10 +109,10 @@ class PADDLE_API GPUContext : public DeviceContext, blasHandle_t cublas_handle() const; /*! \brief Return cublasLt handle in the device context. 
*/ - // blasLtHandle_t cublaslt_handle() const; + blasLtHandle_t cublaslt_handle() const; /*! \brief Return cusolver handle in the device context. */ - // solverHandle_t cusolver_dn_handle() const; + solverHandle_t cusolver_dn_handle() const; /*! \brief Return cusparse handle in the device context. */ sparseHandle_t cusparse_handle() const; @@ -183,10 +183,10 @@ class PADDLE_API GPUContext : public DeviceContext, public: /*! \brief Return nccl communicators. */ - mcclComm_t nccl_comm() const; + ncclComm_t nccl_comm() const; /*! \brief Set nccl communicators. */ - void set_nccl_comm(mcclComm_t comm); + void set_nccl_comm(ncclComm_t comm); public: // NOTE: DeviceContext hold resources. Used in training scenarios. @@ -232,14 +232,14 @@ class PADDLE_API GPUContext : public DeviceContext, void SetBlasTF32Handle(blasHandle_t); void SetBlasTF32Handle(std::function&&); - // void SetBlasLtHandle(blasLtHandle_t); - // void SetBlasLtHandle(std::function&&); + void SetBlasLtHandle(blasLtHandle_t); + void SetBlasLtHandle(std::function&&); void SetDnnHandle(dnnHandle_t); void SetDnnHandle(std::function&&); - // void SetSolverHandle(solverHandle_t); - // void SetSolverHandle(std::function&&); + void SetSolverHandle(solverHandle_t); + void SetSolverHandle(std::function&&); void SetSparseHandle(sparseHandle_t); void SetSparseHandle(std::function&&); @@ -276,7 +276,7 @@ using GPUDNNContext = GPUContext; // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering // KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using KPSContext = GPUContext; #endif @@ -287,7 +287,7 @@ struct DefaultDevice; } // namespace Eigen namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Currently, GPUPinnedContext is only used to data copying. 
class GPUPinnedContext : public DeviceContext, diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index e791326d71fd49..4a6b9d2fd87f13 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -16,66 +16,57 @@ #pragma once #include "paddle/phi/backends/gpu/forwards.h" -// #include "mudnn/export/c/mudnn_compatible.h" + namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = MUSA_TYPE; + #else // PADDLE_WITH_CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t,musaStream_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t,musaEvent_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); -// DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, -// cudnnActivationStruct, -// miopenActivationDescriptor, -// mudnnActivationStruct); -// DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, -// cudnnTensorStruct, -// miopenTensorDescriptor, -// mudnnTensorStruct); -// DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, -// cudnnFilterStruct, -// miopenTensorDescriptor, -// mudnnFilterStruct); -// DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, -// cudnnFilterDescriptor_t, -// miopenTensorDescriptor_t, -// mudnnFilterDescriptor_t); -// DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, -// cudnnConvolutionStruct, -// miopenConvolutionDescriptor, -// mudnnConvolutionStruct); -// DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, -// cudnnConvolutionDescriptor_t, -// miopenConvolutionDescriptor_t, -// mudnnConvolutionDescriptor_t); -// DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, -// cudnnPoolingDescriptor_t, -// miopenPoolingDescriptor_t, -// mudnnPoolingDescriptor_t); -// DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, -// cudnnDropoutDescriptor_t, -// miopenDropoutDescriptor_t, -// mudnnDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t,mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle,mublasHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. 
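
As a reminder of what the restored two-argument DECLARE_TYPE_FOR_GPU form does, each invocation collapses to a single using-alias picked by the build flags. A reduced sketch with a toy alias name (toyGpuStream_t); the vendor struct names here are only stand-ins, not the real forward declarations:

    #ifdef PADDLE_WITH_HIP
    #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
      using GPU_TYPE = ROCM_TYPE;
    #else  // CUDA build
    #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
      using GPU_TYPE = CUDA_TYPE;
    #endif

    // Toy stand-ins for the vendor stream types the real header forward-declares.
    struct ToyCudaStream;
    struct ToyHipStream;

    // Expands to `using toyGpuStream_t = ToyCudaStream*;` on CUDA builds
    // and to `using toyGpuStream_t = ToyHipStream*;` on HIP builds.
    DECLARE_TYPE_FOR_GPU(toyGpuStream_t, ToyCudaStream*, ToyHipStream*);

    #undef DECLARE_TYPE_FOR_GPU
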
-// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); -// DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle, musolverDnHandle_t); +DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); -DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle, musparseHandle_t); +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); #undef DECLARE_TYPE_FOR_GPU diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index 5c0c475b140ff0..0f79e2a645ab34 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -13,12 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/musa_device_function.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index 30cf3fae80519b..f37afa3deeb746 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -14,14 +14,11 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/mudnn_desc.h" -#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 8afa826408cb7a..2353b42794ffdd 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -13,12 +13,10 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index 2d1b7c1a98f27f..ebf57bd06eb19d 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 4e300a3031a258..fd712baf754803 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -16,12 +16,10 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include -#elif defined(PADDLE_WITH_MUSA) -#include #else #include #endif diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index 98ebea87eedfd8..b9c49cb5696633 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -16,10 +16,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif @@ -147,7 +143,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); // NOLINT } -#if defined(__HIPCC__) || defined(__MUSACC__)|| (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { @@ -399,12 +395,188 @@ CUDA_ATOMIC_WRAPPER(Add, complex) { CudaAtomicAdd(imag, val.imag)); } +// For atomicMul. +CUDA_ATOMIC_WRAPPER(Mul, int) { + int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, unsigned int) { + unsigned int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +CUDA_ATOMIC_WRAPPER(Mul, unsigned long long int) { // NOLINT + unsigned long long int old = *address, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; +} + +CUDA_ATOMIC_WRAPPER(Mul, int64_t) { + // Here, we check long long int must be int64_t. 
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + long long int res = *address, old = res; // NOLINT + do { + old = res; + res = (long long int)atomicCAS( // NOLINT + (unsigned long long int *)address, // NOLINT + (unsigned long long int)old, // NOLINT + (unsigned long long int)val * (unsigned long long int)old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, float) { + int *const address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + + do { + assumed = old; + old = atomicCAS( + address_as_i, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +CUDA_ATOMIC_WRAPPER(Mul, double) { + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val * __longlong_as_double(assumed))); + } while (assumed != old); + + return __longlong_as_double(old); +} + +#ifdef PADDLE_CUDA_FP16 +inline static __device__ uint32_t mul_to_low_half(uint32_t val, float x) { + phi::dtype::float16 low_half; + // The float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t mul_to_high_half(uint32_t val, float x) { + phi::dtype::float16 high_half; + // The float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = + static_cast(static_cast(high_half) * x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::float16) { + if (*address >= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The float16 value stay at higher 16 bits of the address. 
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +inline static __device__ uint32_t bf16_mul_to_low_half(uint32_t val, float x) { + phi::dtype::bfloat16 low_half; + // The bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = + static_cast(static_cast(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_mul_to_high_half(uint32_t val, float x) { + phi::dtype::bfloat16 high_half; + // The bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = + static_cast(static_cast(high_half) * x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::bfloat16) { + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The bfloat16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The bfloat16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} + // For atomicMax USE_CUDA_ATOMIC(Max, int); USE_CUDA_ATOMIC(Max, unsigned int); // CUDA API uses unsigned long long int, we cannot use uint64_t here. // It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) || defined(__MUSACC__) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT #else CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT @@ -590,7 +762,7 @@ USE_CUDA_ATOMIC(Min, int); USE_CUDA_ATOMIC(Min, unsigned int); // CUDA API uses unsigned long long int, we cannot use uint64_t here. 
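
The CUDA_ATOMIC_WRAPPER(Mul, ...) additions above all rely on the same compare-and-swap loop, since CUDA exposes no native atomic multiply. A self-contained device-side sketch of that loop for float (a simplified variant with a hypothetical name, not the wrapper itself):

    #include <cuda_runtime.h>

    // Atomic multiply for float, built on atomicCAS over the raw 32-bit pattern.
    __device__ float ToyAtomicMulFloat(float* address, float val) {
      int* address_as_i = reinterpret_cast<int*>(address);
      int old = *address_as_i, assumed;
      do {
        assumed = old;
        // Try to install old * val; retry if another thread changed *address meanwhile.
        old = atomicCAS(address_as_i, assumed,
                        __float_as_int(val * __int_as_float(assumed)));
      } while (assumed != old);
      return __int_as_float(old);  // value observed before this thread's multiply
    }

The 16-bit float16/bfloat16 variants above use the same loop, but CAS a full 32-bit word and rewrite only the half that holds the value, which is why they branch on the address's 0x02 bit.
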
// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) || defined(__MUSACC__) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT #else CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 89471ba29aee00..a29b5e110922a4 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -37,10 +37,6 @@ #include "paddle/phi/backends/dynload/rocsparse.h" #endif -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/musparse.h" -#endif - #include "glog/logging.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -68,9 +64,10 @@ void InitGpuProperties(Place place, *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId()); *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); -#ifdef PADDLE_WITH_CUDA const gpuDeviceProp& prop = backends::gpu::GetDeviceProperties(place.GetDeviceId()); + +#ifdef PADDLE_WITH_CUDA static const std::set compiled_archs{CUDA_REAL_ARCHS}; // Make sure compiled cuda arch is as same as runtime cuda arch. if (compiled_archs.find(*compute_capability) == compiled_archs.cend() && @@ -118,17 +115,6 @@ void InitGpuProperties(Place place, } #endif -#ifdef PADDLE_WITH_MUSA - LOG_FIRST_N(INFO, 1) << "Please NOTE: device: " - << static_cast(place.device) - << ", GPU Compute Capability: " - << *compute_capability / 10 << "." - << *compute_capability % 10 - << ", Driver API Version: " << *driver_version / 10000 - << "." << (*driver_version % 10000) / 100 - << ", Runtime API Version: " << *runtime_version / 10000 - << "." << (*runtime_version % 10000) / 100; -#else // TODO(wilber): glog may be replaced in the future? LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << static_cast(place.device) @@ -140,7 +126,6 @@ void InitGpuProperties(Place place, << ", Runtime API Version: " << *runtime_version / 1000 << "." << (*runtime_version % 100) / 10; -#endif #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; PADDLE_ENFORCE_GPU_SUCCESS( @@ -159,62 +144,42 @@ void InitGpuProperties(Place place, << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } -#elif defined(PADDLE_WITH_MUSA) - // TODO(@caizhi): mudnnGetVersion is not supported for MUSA now. - // Requests have been submitted to Mudnn. - // size_t mudnn_dso_ver = dynload::mudnnGetVersion(); - size_t mudnn_dso_ver = 2500; - LOG_FIRST_N(INFO, 1) << "device: " << static_cast(place.device) - << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." - << (mudnn_dso_ver % 1000) / 100 << "."; - - // Check MUSA/MUDNN version compatiblity - auto local_musa_version = *driver_version; - int compile_musa_version = MUSA_VERSION; -#if defined(__linux__) - PADDLE_ENFORCE_EQ( - (local_musa_version / 100 < compile_musa_version / 100) && - (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), - false, - phi::errors::InvalidArgument( - "The installed Paddle is compiled with MUSA%d/muDNN%d," - "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d. " - "which will cause serious incompatible bug. 
" - "Please recompile or reinstall Paddle with compatible MUSA/muDNN " - "version.", - compile_musa_version / 10000, - MUDNN_VERSION / 1000, - local_musa_version / 10000, - mudnn_dso_ver / 1000)); -#endif - if (local_musa_version < compile_musa_version) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place.device) - << ". The installed Paddle is compiled with MUSA " - << compile_musa_version / 10000 << "." - << (compile_musa_version % 1000) / 100 - << ", but MUSA runtime version in your machine is " - << local_musa_version / 10000 << "." - << (local_musa_version % 1000) / 100 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible MUSA " - "version."; - } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + auto get_cudnn_major = [](auto version) { + if (version < 9000) { + return version / 1000; + } + // CUDNN changes the CUDNN_VERSION rules after 9.0 + return version / 10000; + }; + auto get_cudnn_minor = [](auto version) { + if (version < 9000) { + return (version % 1000) / 100; + } + // CUDNN changes the CUDNN_VERSION rules after 9.0 + return (version % 10000) / 100; + }; + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." - << (cudnn_dso_ver % 1000) / 100 << "."; + << ", cuDNN Version: " + << get_cudnn_major(cudnn_dso_ver) << "." + << get_cudnn_minor(cudnn_dso_ver) << "."; // Check CUDA/CUDNN version compatiblity auto local_cuda_version = (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; auto compile_cuda_version = (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + + // Compute cuDNN major + auto local_cudnn_major = get_cudnn_major(cudnn_dso_ver); + size_t compile_cudnn_major = CUDNN_MAJOR; + #if defined(__linux__) PADDLE_ENFORCE_EQ( (local_cuda_version / 10 < compile_cuda_version / 10) && - (cudnn_dso_ver / 1000 < CUDNN_VERSION / 1000), + (local_cudnn_major < compile_cudnn_major), false, phi::errors::InvalidArgument( "The installed Paddle is compiled with CUDA%d/cuDNN%d," @@ -223,9 +188,9 @@ void InitGpuProperties(Place place, "Please recompile or reinstall Paddle with compatible CUDA/cuDNN " "version.", compile_cuda_version / 10, - CUDNN_VERSION / 1000, + compile_cudnn_major, local_cuda_version / 10, - cudnn_dso_ver / 1000)); + local_cudnn_major)); #endif if (local_cuda_version < compile_cuda_version) { LOG_FIRST_N(WARNING, 1) @@ -241,14 +206,10 @@ void InitGpuProperties(Place place, #endif } - void InitStream(gpuStream_t* stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamCreateWithPriority(stream, musaStreamDefault, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); @@ -259,8 +220,6 @@ void DestoryStream(gpuStream_t stream) { if (stream != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -272,9 +231,6 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); -#elif defined(PADDLE_WITH_MUSA) - phi::dynload::mublasCreate(blas_handle); - phi::dynload::mublasSetStream(*blas_handle, 
stream); #else // PADDLE_WITH_CUDA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( @@ -288,11 +244,6 @@ void DestroyBlasHandle(blasHandle_t handle) { phi::dynload::rocblas_destroy_handle(handle); handle = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - phi::dynload::mublasDestroy(handle); - handle = nullptr; - } #else if (handle != nullptr) { phi::dynload::cublasDestroy(handle); @@ -301,20 +252,20 @@ void DestroyBlasHandle(blasHandle_t handle) { #endif // PADDLE_WITH_HIP } -// void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 -// phi::dynload::cublasLtCreate(blaslt_handle); -// #endif -// } +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtCreate(blaslt_handle); +#endif +} -// void DestroyBlasLtHandle(blasLtHandle_t handle) { -// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 -// if (handle != nullptr) { -// phi::dynload::cublasLtDestroy(handle); -// handle = nullptr; -// } -// #endif -// } +void DestroyBlasLtHandle(blasLtHandle_t handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + if (handle != nullptr) { + phi::dynload::cublasLtDestroy(handle); + handle = nullptr; + } +#endif +} void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { if (phi::dynload::HasCUDNN()) { @@ -338,9 +289,6 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); -#elif defined(PADDLE_WITH_MUSA) - phi::dynload::mudnnCreate(handle, place.device); - phi::dynload::mudnnSetStream(*handle, stream); #else auto version = phi::dynload::cudnnGetVersion(); auto local_cudnn_major = @@ -371,11 +319,6 @@ void DestroyDnnHandle(dnnHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); handle = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - phi::dynload::mudnnDestroy(handle); - handle = nullptr; - } #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); @@ -384,21 +327,21 @@ void DestroyDnnHandle(dnnHandle_t handle) { #endif // PADDLE_WITH_HIP } -// void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { -// #if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) -// PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); -// PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); -// #endif -// } +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); +#endif +} -// void DestroySolverHandle(solverHandle_t solver_handle) { -// #if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) -// if (solver_handle != nullptr) { -// PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); -// solver_handle = nullptr; -// } -// #endif -// } +void DestroySolverHandle(solverHandle_t solver_handle) { +#ifndef PADDLE_WITH_HIP + if (solver_handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); + solver_handle = nullptr; + } +#endif +} void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { // ROCM is not yet supported @@ -411,9 
+354,6 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { #elif defined(PADDLE_WITH_HIP) phi::dynload::rocsparse_create_handle(handle); phi::dynload::rocsparse_set_stream(*handle, stream); -#elif defined(PADDLE_WITH_MUSA) - phi::dynload::musparseCreateHandle(handle); - phi::dynload::musparseSetStream(*handle, stream); #endif } @@ -430,11 +370,6 @@ void DestroySparseHandle(sparseHandle_t handle) { phi::dynload::rocsparse_destroy_handle(handle); handle = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - phi::dynload::musparseDestroyHandle(handle); - handle = nullptr; - } #endif } diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h index df6a131ff315d7..7bec5eebf5886f 100644 --- a/paddle/phi/backends/gpu/gpu_resources.h +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -35,14 +35,14 @@ void DestoryStream(gpuStream_t stream); void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream); void DestroyBlasHandle(blasHandle_t handle); -// void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); -// void DestroyBlasLtHandle(blasLtHandle_t handle); +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); +void DestroyBlasLtHandle(blasLtHandle_t handle); void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place); void DestroyDnnHandle(dnnHandle_t handle); -// void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); -// void DestroySolverHandle(solverHandle_t solver_handle); +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); +void DestroySolverHandle(solverHandle_t solver_handle); void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream); void DestroySparseHandle(sparseHandle_t handle); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 00c0bdf6c545bc..77f403795b6b3d 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -17,15 +17,11 @@ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/dynload/mudnn.h" #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -33,40 +29,19 @@ namespace phi { -#ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = ROCM_TYPE; - -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = MUSA_TYPE; - -#else // PADDLE_WITH_MUSA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = CUDA_TYPE; -#endif // PADDLE_WITH_CUDA - -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, - cudaMemcpyKind, - hipMemcpyKind, - musaMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, - cudaDeviceProp, - hipDeviceProp_t, - musaDeviceProp); -#undef DECLARE_TYPE_FOR_GPU - -#ifndef PADDLE_WITH_MUSA #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // PADDLE_WITH_MUSA +#else // PADDLE_WITH_CDUA + #define 
DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif // PADDLE_WITH_CUDA +#endif +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, @@ -75,45 +50,34 @@ DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); + #undef DECLARE_TYPE_FOR_GPU -#endif #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ - constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory, - musaErrorMemoryAllocation); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, - cudaErrorNotReady, - hipErrorNotReady, - musaErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, cudaMemcpyKind::cudaMemcpyHostToDevice, - hipMemcpyKind::hipMemcpyHostToDevice, - musaMemcpyKind::musaMemcpyHostToDevice); + hipMemcpyKind::hipMemcpyHostToDevice); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, cudaMemcpyKind::cudaMemcpyDeviceToHost, - hipMemcpyKind::hipMemcpyDeviceToHost, - musaMemcpyKind::musaMemcpyDeviceToHost); + hipMemcpyKind::hipMemcpyDeviceToHost); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, - hipMemcpyKind::hipMemcpyDeviceToDevice, - musaMemcpyKind::musaMemcpyDeviceToDevice); + hipMemcpyKind::hipMemcpyDeviceToDevice); #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi -#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || - // defined(PADDLE_WITH_MUSA ) +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/backends/gpu/musa/mudnn_desc.h b/paddle/phi/backends/gpu/musa/mudnn_desc.h deleted file mode 100644 index 9de12d586bea01..00000000000000 --- a/paddle/phi/backends/gpu/musa/mudnn_desc.h +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
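
The descriptor helpers in the deleted mudnn_desc.h and mudnn_helper.h reproduced below repeatedly derive contiguous (row-major) strides from a dims vector before passing both to SetNdInfo. A standalone sketch of that computation, with a small main() added here purely for illustration:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Contiguous (row-major) strides: the last dim has stride 1, and each earlier
    // stride is the product of all dims to its right.
    std::vector<int64_t> ContiguousStrides(const std::vector<int64_t>& dims) {
      std::vector<int64_t> strides(dims.size());
      if (dims.empty()) return strides;
      strides[dims.size() - 1] = 1;
      for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
        strides[i] = dims[i + 1] * strides[i + 1];
      }
      return strides;
    }

    int main() {
      // NCHW dims {8, 3, 32, 32} -> strides {3072, 1024, 32, 1}.
      for (int64_t s : ContiguousStrides({8, 3, 32, 32})) std::cout << s << ' ';
      std::cout << '\n';
      return 0;
    }
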
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" -#include "paddle/phi/core/utils/data_type.h" - -namespace phi { -namespace backends { -namespace gpu { - -template -inline std::vector TransformDimOrder(const std::vector& dims) { - std::vector transformed_dims(dims.begin(), dims.end()); - if (dims.size() < 4) { - return transformed_dims; - } - T H, W, D, C; - if (dims.size() == 4) { - H = dims[1]; - W = dims[2]; - C = dims[3]; - transformed_dims[1] = C; - transformed_dims[2] = H; - transformed_dims[3] = W; - } else { - D = dims[1]; - H = dims[2]; - W = dims[3]; - C = dims[4]; - transformed_dims[1] = C; - transformed_dims[2] = D; - transformed_dims[3] = H; - transformed_dims[4] = W; - } - return transformed_dims; -} - -inline dynload::Tensor::Type ToCudnnDataType(const phi::DataType& t) { - dynload::Tensor::Type type = dynload::Tensor::Type::FLOAT; - switch (t) { - case phi::DataType::FLOAT16: - type = dynload::Tensor::Type::HALF; - break; - case phi::DataType::FLOAT32: - type = dynload::Tensor::Type::FLOAT; - break; - case phi::DataType::FLOAT64: - type = dynload::Tensor::Type::DOUBLE; - break; - default: - PD_THROW("Don't support this data type ", t); - } - return type; -} - -class TensorDescriptor { - public: - using T = dynload::Tensor; - TensorDescriptor() : desc_(std::make_unique()) {} - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - desc_->SetType(ToCudnnDataType(tensor.dtype())); - desc_->SetNdInfo(static_cast(dims.size()), dims.data(), strides.data()); - desc_->SetAddr(tensor.data()); - } - - template - void set(const phi::DenseTensor& tensor, const Type* data) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - desc_->SetType(ToCudnnDataType(tensor.dtype())); - desc_->SetNdInfo(static_cast(dims.size()), dims.data(), strides.data()); - desc_->SetAddr(data); - } - - void set(const std::vector& dims, - const dynload::Tensor::Format format, - const dynload::Tensor::Type dtype) { - std::vector transformed_dims; - std::vector dims_64(dims.begin(), dims.end()); - if (format == dynload::Tensor::Format::NHWC) { - transformed_dims = TransformDimOrder(dims_64); - } else { - transformed_dims = dims_64; - } - desc_->SetFormat(format); - desc_->SetType(dtype); - desc_->SetNdInfo(static_cast(transformed_dims.size()), transformed_dims.data()); - } - - void set(const phi::DenseTensor& tensor, - const dynload::Tensor::Format format) { - auto dims = phi::vectorize(tensor.dims()); - auto dtype = ToCudnnDataType(tensor.dtype()); - set(dims, format, dtype); - desc_->SetAddr(tensor.data()); - } - - private: - std::unique_ptr desc_; -}; - -class FilterDescriptor { - public: - using T = phi::dynload::Tensor; - FilterDescriptor() : desc_(std::make_unique()) {} - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const std::vector& dims, - const dynload::Tensor::Format format, - const dynload::Tensor::Type dtype, - const int groups = 1) { - std::vector transformed_dims; - std::vector dims_64(dims.begin(), 
dims.end()); - if (format == dynload::Tensor::Format::NHWC) { - transformed_dims = TransformDimOrder(dims_64); - } else { - transformed_dims = dims_64; - } - if (groups > 1) { - transformed_dims[1] = transformed_dims[1] / groups; - } - desc_->SetFormat(format); - desc_->SetType(dtype); - desc_->SetNdInfo(static_cast(transformed_dims.size()), transformed_dims.data()); - } - - void set(const phi::DenseTensor& tensor, - const dynload::Tensor::Format format, - const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - auto dtype = ToCudnnDataType(tensor.dtype()); - set(dims, format, dtype, groups); - desc_->SetAddr(tensor.data()); - } - - private: - std::unique_ptr desc_; -}; - -class ConvolutionDescriptor { - public: - using T = dynload::Convolution; - ConvolutionDescriptor() : desc_(std::make_unique()) {} - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(dynload::Tensor::Type dtype, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations, - bool allow_tf32, - const int groups = 1) { - allow_tf32_ = allow_tf32; - desc_->SetNdInfo( - pads.size(), pads.data(), strides.data(), dilations.data()); - desc_->SetComputeMode(dynload::Convolution::ComputeMode::TENSOR); - desc_->SetGroups(groups); - } - - bool allow_tf32_; - - private: - std::unique_ptr desc_; -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/mudnn_helper.h b/paddle/phi/backends/gpu/musa/mudnn_helper.h deleted file mode 100644 index 55030e860b4213..00000000000000 --- a/paddle/phi/backends/gpu/musa/mudnn_helper.h +++ /dev/null @@ -1,323 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "gflags/gflags.h" -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" - -#define CUDNN_BN_MIN_EPSILON 1e-05 - -DECLARE_bool(cudnn_deterministic); - -namespace phi { -namespace backends { -namespace gpu { - -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) - -enum class DataLayout { // Not use - kNHWC, - kNCHW, - kNCDHW, - kNDHWC, // add, liyamei - kNCHW_VECT_C, -}; - -enum class PoolingMode { - kMaximum, - kMaximumDeterministic, - kAverageExclusive, - kAverageInclusive, -}; - -inline dynload::Pooling::Mode GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - // case PoolingMode::kMaximumDeterministic: - // return CUDNN_POOLING_MAX_DETERMINISTIC; - case PoolingMode::kAverageExclusive: - return dynload::Pooling::Mode::AVGPOOL_COUNT_WITHOUT_PAD; - case PoolingMode::kAverageInclusive: - return dynload::Pooling::Mode::AVGPOOL_COUNT_PAD; - case PoolingMode::kMaximum: - return dynload::Pooling::Mode::MAXPOOL; - default: - PADDLE_THROW( - phi::errors::Unimplemented("Unexpected MUDNN pooling mode.")); - } -} - -template -class CudnnDataType; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::BFLOAT16; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::HALF; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::FLOAT; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::DOUBLE; - using ScalingParamType = const double; - using BatchNormParamType = double; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -inline dynload::Tensor::Format GetCudnnTensorFormat( - const DataLayout& order) { // Not use - switch (order) { - case DataLayout::kNHWC: - return dynload::Tensor::Format::NHWC; - case DataLayout::kNCHW: - return dynload::Tensor::Format::NCHW; - case DataLayout::kNCDHW: - return dynload::Tensor::Format::NCDHW; - case DataLayout::kNDHWC: - return dynload::Tensor::Format::NDHWC; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "MUDNN has no equivalent dataLayout for input order.")); - } - return dynload::Tensor::Format::NCHW; -} - -class ScopedTensorDescriptor { - public: - 
ScopedTensorDescriptor() {} - ~ScopedTensorDescriptor() PADDLE_MAY_THROW {} - - inline dynload::Tensor descriptor(const dynload::Tensor::Format format, - const dynload::Tensor::Type type, - const std::vector& dims, - const int groups = 1) { - // the format is not used now, will add later - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - // Update tensor descriptor dims setting if groups > 1 - // NOTE: Here, Assume using NCHW or NCDHW order - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - - PADDLE_ENFORCE_EQ( - format, - dynload::Tensor::Format::NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MUDNN.")); - - desc_.SetNdInfo( - static_cast(dims_with_group.size()), dims_with_group.data(), strides.data()); - desc_.SetType(type); - desc_.SetFormat(format); - - return desc_; - } - - template - inline dynload::Tensor& descriptor(const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); - return desc_; - } - - template - inline dynload::Tensor& descriptor(const phi::DenseTensor& tensor, - const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - desc_.SetAddr(tensor.data()); - descriptor(order, dims, groups); - return desc_; - } - - template - inline dynload::Tensor& descriptor(const T* data, - const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - desc_.SetAddr(data); - descriptor(order, dims, groups); - return desc_; - } - - inline dynload::Tensor& descriptor(const dynload::Tensor::Type mudnn_type, - const std::vector& dim, - const std::vector& stride) { - std::vector dims_64(dim.begin(), dim.end()); - std::vector stride_64(dim.begin(), dim.end()); - desc_.SetType(mudnn_type); - desc_.SetNdInfo(static_cast(dims_64.size()), dims_64.data(), stride_64.data()); - return desc_; - } - - template - inline dynload::Tensor& descriptor(const std::vector& dim, - const std::vector& stride) { - descriptor(CudnnDataType::type, dim, stride); - return desc_; - } - - inline dynload::Tensor& desc() { return desc_; } - - private: - dynload::Tensor desc_; - DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); -}; - -class ScopedPoolingDescriptor { - public: - ScopedPoolingDescriptor() {} - ~ScopedPoolingDescriptor() PADDLE_MAY_THROW {} - - inline dynload::Pooling& descriptor(const PoolingMode& mode, - const std::vector& kernel, - const std::vector& pads, - const std::vector& strides) { - PADDLE_ENFORCE_EQ(kernel.size(), - pads.size(), - phi::errors::InvalidArgument( - "The size of kernel and pads should be equal. But " - "received size of kernel is %d, size of pads is %d.", - kernel.size(), - pads.size())); - PADDLE_ENFORCE_EQ( - kernel.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of kernel and strides should be equal. 
But " - "received size of kernel is %d, size of strides is %d.", - kernel.size(), - strides.size())); - const std::vector dilation(kernel.size(), 1); - desc_.SetNdInfo(kernel.size(), - kernel.data(), - pads.data(), - strides.data(), - dilation.data()); - desc_.SetMode(GetPoolingMode(mode)); - return desc_; - } - - dynload::Pooling& desc() { return desc_; } - - private: - dynload::Pooling desc_; - DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); -}; - -class ScopedSoftmaxDescriptor { - public: - ScopedSoftmaxDescriptor() {} - ~ScopedSoftmaxDescriptor() PADDLE_MAY_THROW {} - - inline dynload::Softmax& descriptor(const dynload::Softmax::Mode& mode, - const dynload::Softmax::Algorithm& algo, - const int& dim) { - desc_.SetMode(mode); - desc_.SetDim(dim); - desc_.SetAlgorithm(algo); - return desc_; - } - - dynload::Softmax& desc() { return desc_; } - - private: - dynload::Softmax desc_; - DISABLE_COPY_AND_ASSIGN(ScopedSoftmaxDescriptor); -}; - -static void InternalMemFree(void* ptr) { - if (!ptr) { - return; - } - PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); -} - -static dynload::MemoryHandler InternalMemAlloc(size_t s) { - void* data = nullptr; - if (s) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&data, s)); - } - return dynload::MemoryHandler(data, InternalMemFree); -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_device_function.h b/paddle/phi/backends/gpu/musa/musa_device_function.h deleted file mode 100644 index f2847daf4dfacb..00000000000000 --- a/paddle/phi/backends/gpu/musa/musa_device_function.h +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_CUDA_FP16 -// NOTE(): support float16 to half in header file. -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/enforce.h" - -namespace phi { -namespace backends { -namespace gpu { - -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) 
\ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down_sync(mask, val, static_cast(delta), width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor_sync(mask, val, width); -} - - -#if defined(PADDLE_WITH_MUSA) -// Due to the inconsistency between mcc and nvcc, certain type conversions are not implicitly performed, so we specialize here. -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync(unsigned mask, - phi::dtype::float16 val, - int width) { - return (phi::dtype::float16)(__shfl_xor_sync(mask, float(val), width)); -} -#endif - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { - return phi::dtype::float16(__shfl_down_sync( - mask, val.to_half(), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { -#if defined(PADDLE_MUSA_BF16) && defined(__MUSA_ARCH__) && __MUSA_ARCH__ >= 220 - return phi::dtype::bfloat16(__shfl_down_sync( - mask, val.to_mt_bfloat16(), static_cast(delta), width)); -#else - PADDLE_ENFORCE( - false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - float real = static_cast(__shfl_down_sync( - mask, static_cast(val.real), static_cast(delta), width)); - float imag = static_cast(__shfl_down_sync( - mask, static_cast(val.imag), static_cast(delta), width)); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - double real = - static_cast(__shfl_down_sync(mask, - static_cast(val.real), - static_cast(delta), - width)); - double imag = - static_cast(__shfl_down_sync(mask, - static_cast(val.imag), - static_cast(delta), - width)); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { -#if defined(PADDLE_MUSA_BF16) - return phi::dtype::bfloat16( - __shfl_xor_sync(mask, val.to_mt_bfloat16(), width)); -#else - PADDLE_ENFORCE( - false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - float real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - float imag = static_cast( - __shfl_xor_sync(mask, static_cast(val.imag), width)); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - double real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - double imag = static_cast( - __shfl_xor_sync(mask, 
static_cast<double>(val.imag), width)); - return phi::dtype::complex<double>(real, imag); -} - -template <typename T> -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl_sync(mask, val, src_line, width); -} - -template <typename T> -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template <typename T> -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU rather than hard-coded as 32. - // To make reduceSum more efficient, - // I use warp-level parallelism and assume the warp size - // is 32, which may differ between GPUs, - // but most cards' warp size is 32. - const int warpSize = 32; - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - } - return val; -} -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h deleted file mode 100644 index 7463edc5d9ff60..00000000000000 --- a/paddle/phi/backends/gpu/musa/musa_helper.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace phi { -namespace backends { -namespace gpu { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why is this macro needed? ] - * - * The original loop in a CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB; the first iteration is fine, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * can exceed INT_MAX and overflow to a negative value. At that point - * the loop condition `i < (n)` is still satisfied, so the kernel - * performs an illegal access to CUDA memory. - * - * Here is a real example from ERNIE that triggers the above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follows; the int64_t __index__ - * prevents overflow in the loop increment.
- * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc deleted file mode 100644 index cab81b58f5ecb2..00000000000000 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" -#include "paddle/phi/backends/gpu/gpu_info.h" - -#include "paddle/phi/core/enforce.h" - -#include "musa_runtime.h" - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -namespace phi { -namespace backends { -namespace gpu { - -int DnnVersion() { - if (!dynload::HasCUDNN()) return -1; - // TODO(@caizhi): mudnnGetVersion is not supported now. - // version info will be returned from mudnnGetVersion later. - const int version_major = 2; - const int version_minor = 5; - const int version_patch = 0; - return version_major * 1000 + version_minor * 100 + version_patch; -} - -static int GetGPUDeviceCountImpl() { - int driverVersion = 0; - musaError_t status = musaDriverGetVersion(&driverVersion); - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; - return 0; - } - - const auto *musa_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); - - if (musa_visible_devices != nullptr) { - std::string musa_visible_devices_str(musa_visible_devices); - if (!musa_visible_devices_str.empty()) { - musa_visible_devices_str.erase( - 0, musa_visible_devices_str.find_first_not_of('\'')); - musa_visible_devices_str.erase( - musa_visible_devices_str.find_last_not_of('\'') + 1); - musa_visible_devices_str.erase( - 0, musa_visible_devices_str.find_first_not_of('\"')); - musa_visible_devices_str.erase( - musa_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(musa_visible_devices_str.begin(), - musa_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " - "empty. 
No GPU detected."; - return 0; - } - } - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); - return count; -} - -int GetGPUDeviceCount() { - // cache the count - static auto dev_cnt = GetGPUDeviceCountImpl(); - return dev_cnt; -} - -int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int major, minor; - auto major_error_code = - musaDeviceGetAttribute(&major, musaDevAttrComputeCapabilityMajor, id); - auto minor_error_code = - musaDeviceGetAttribute(&minor, musaDevAttrComputeCapabilityMinor, id); - - PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - return major * 10 + minor; -} - -int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int runtime_version = 0; - // Note: runtime_version = MAJOR * 10000 + MINOR * 100 + PATCH - PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); - return runtime_version; -} - -int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int driver_version = 0; - // Note: driver_version = MAJOR * 10000 + MINOR * 100 + PATCH - PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); - return driver_version; -} - -bool TensorCoreAvailable() { return false; } - -int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); - return count; -} - -int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); - - return count; -} - -int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); - return count; -} - -int GetCurrentDeviceId() { - int device_id; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); - return device_id; -} - -std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - std::array ret; - int size; - auto error_code_x = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret[0] = size; - - auto error_code_y = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret[1] = size; - - auto error_code_z = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret[2] = size; - return ret; -} - -std::pair GetGpuStreamPriorityRange() { - int least_priority, greatest_priority; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); - return std::make_pair(least_priority, greatest_priority); -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = GetGPUDeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(phi::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, - static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { - PADDLE_ENFORCE_GPU_SUCCESS( - musaGetDeviceProperties(&g_device_props[id], id)); - }); - //TODO@mtai:we hope not to skip UT that ask compute capacity to be greater than 7/8 - g_device_props[id].major = 9; - g_device_props[id].minor = 9; - return g_device_props[id]; -} - -void SetDeviceId(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); -} - -void GpuMemcpyAsync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); -} - -void GpuMemcpySync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); - -} - -void GpuMemcpyPeerAsync(void *dst, - int dst_device, - const void *src, - int src_device, - size_t count, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -} - -void GpuMemcpyPeerSync( - void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeer(dst, dst_device, src, src_device, count)); -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); -} - -void GpuStreamSync(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); -} - -void GpuDestroyStream(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); -} - -void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } - -gpuError_t GpuGetLastError() { return musaGetLastError(); } - -bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - return false; -} - -bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - return false; -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/capi/include/c_meta_tensor.h b/paddle/phi/capi/include/c_meta_tensor.h index 08f01084c6abf3..f4c9a541e526aa 100644 --- a/paddle/phi/capi/include/c_meta_tensor.h +++ b/paddle/phi/capi/include/c_meta_tensor.h @@ -39,6 +39,13 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, size_t index, PD_Status *status); +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status); + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status); + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status); void PD_MetaTensorSetDims(PD_MetaTensor *tensor, @@ -46,6 +53,11 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, const int64_t *dims, PD_Status *status); +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index c4f706c70ccfb4..2df292c6b946b2 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -41,6 +41,12 @@ int64_t PD_TensorGetDim(const PD_Tensor *tensor, size_t index, PD_Status *status); +int64_t PD_TensorGetNumStrides(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetStride(const PD_Tensor *tensor, + size_t index, + PD_Status *status); + void PD_TensorGetLoD(const PD_Tensor *tensor, PD_List *data, PD_List *offset, @@ -52,11 +58,22 @@ bool PD_TensorIsValid(const PD_Tensor *tensor, PD_Status *status); void *PD_TensorGetHolder(const PD_Tensor *tensor, PD_Status *status); +size_t PD_TensorGetOffset(const PD_Tensor *tensor, PD_Status *status); + void PD_TensorSetDims(PD_Tensor *tensor, int64_t ndims, const int64_t *dims, PD_Status *status); +void PD_TensorSetOffset(PD_Tensor *tensor, + const int64_t offset, + PD_Status *status); + +void PD_TensorSetStrides(PD_Tensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_TensorSetDataType(PD_Tensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 061561008a95e7..75f3e2d9e350eb 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -72,6 +72,19 @@ inline std::vector PD_TensorGetDims(PD_Tensor* tensor, return std::vector(); } +inline std::vector PD_TensorGetStrides(PD_Tensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_TensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_TensorGetStride(tensor, i, status); + } + return shape; + } + return std::vector(); +} + inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, PD_Status* status) { int64_t ndims = PD_MetaTensorGetNumDims(tensor, status); @@ -85,6 +98,19 @@ inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, return std::vector(); } +inline std::vector PD_MetaTensorGetStrides(PD_MetaTensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_MetaTensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_MetaTensorGetStride(tensor, i, status); + } + return shape; + } + 
return std::vector(); +} + template class WrapperBase { public: @@ -134,6 +160,13 @@ class DenseTensor : public WrapperBase { return holder; } + size_t offset() const { + C_Status status; + auto offset = PD_TensorGetOffset(raw_data(), &status); + PD_CHECK_STATUS(status); + return offset; + } + std::vector dims() const { C_Status status; auto dimension = PD_TensorGetDims(raw_data(), &status); @@ -141,6 +174,13 @@ class DenseTensor : public WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_TensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_TensorGetPDDataType(raw_data(), &status); @@ -207,6 +247,18 @@ class DenseTensor : public WrapperBase { PD_CHECK_STATUS(status); } + void set_offset(const int64_t& offset) { + C_Status status; + PD_TensorSetOffset(raw_data(), offset, &status); + PD_CHECK_STATUS(status); + } + + void set_strides(const std::vector& strides) { + C_Status status; + PD_TensorSetStrides(raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; PD_TensorSetDataType(raw_data(), data_type, &status); @@ -513,6 +565,13 @@ class MetaTensor : WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_MetaTensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_MetaTensorGetPDDataType(raw_data(), &status); @@ -540,6 +599,13 @@ class MetaTensor : WrapperBase { PD_CHECK_STATUS(status); } + void set_strides(const std::vector& strides) { + C_Status status; + PD_MetaTensorSetStrides( + raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; PD_MetaTensorSetDataType(raw_data(), data_type, &status); diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc index 6dc1ff768260d7..b415ece7e361d2 100644 --- a/paddle/phi/capi/lib/c_device_context.cc +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -35,7 +35,7 @@ PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, reinterpret_cast(ctx)->stream()); } else if (dev_ctx_type == phi::AllocationType::CPU) { return nullptr; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast( reinterpret_cast(ctx)->stream()); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index 7df79117dbae5d..e9fe2aada1f35f 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -30,7 +30,7 @@ PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { } else if (dev_ctx_type == phi::AllocationType::CPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); diff --git a/paddle/phi/capi/lib/c_meta_tensor.cc b/paddle/phi/capi/lib/c_meta_tensor.cc index 6ea6eda1a7f23e..f436ba9d3cde0d 100644 --- 
a/paddle/phi/capi/lib/c_meta_tensor.cc +++ b/paddle/phi/capi/lib/c_meta_tensor.cc @@ -88,6 +88,36 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, return cc_tensor->dims()[index]; } +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status) { if (status) { if (!tensor) { @@ -117,6 +147,22 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, cc_tensor->set_dims(common::make_ddim(shape)); } +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status) { diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index 31a724447b7c7f..eb8c8c6f4eb47d 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -111,6 +111,35 @@ int64_t PD_TensorGetDim(const PD_Tensor* tensor, return cc_tensor->dims()[index]; } +int64_t PD_TensorGetNumStrides(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_TensorGetStride(const PD_Tensor* tensor, + size_t index, + PD_Status* status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + void PD_TensorGetLoD(const PD_Tensor* tensor, PD_List* data, PD_List* offset, @@ -185,6 +214,19 @@ void* PD_TensorGetHolder(const PD_Tensor* tensor, PD_Status* status) { return cc_tensor->Holder().get(); } +size_t PD_TensorGetOffset(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->offset(); +} + void PD_TensorSetDims(PD_Tensor* tensor, int64_t ndims, const int64_t* dims, @@ -201,6 +243,36 @@ void PD_TensorSetDims(PD_Tensor* tensor, cc_tensor->Resize(common::make_ddim(shape)); } +void PD_TensorSetOffset(PD_Tensor* tensor, + const int64_t offset, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_offset(offset); +} + +void PD_TensorSetStrides(PD_Tensor* tensor, + int64_t nstrides, + const int64_t* strides, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = 
reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_TensorSetDataType(PD_Tensor* tensor, PD_DataType dtype, PD_Status* status) { diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 4f238496c41494..64dab3ccdeb3b4 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -138,7 +138,7 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs // to be converted // to GPU in the GPU environment diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 9609dc50a9a0be..028851e34c8bc7 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -31,13 +31,7 @@ #include #endif -#if defined(__MUSACC__) -#define PADDLE_MUSA_BF16 -#include -#include -#endif - -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -71,14 +65,13 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; -#elif defined(PADDLE_CUDA_BF16) +#else +#if defined(PADDLE_CUDA_BF16) __nv_bfloat16 tmp = __float2bfloat16(val); x = *reinterpret_cast(&tmp); -#elif defined(PADDLE_MUSA_BF16) - __mt_bfloat16 tmp = __float2bfloat16(val); - x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif #endif } @@ -88,12 +81,6 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif -#if defined(PADDLE_MUSA_BF16) - HOSTDEVICE inline explicit bfloat16(const __mt_bfloat16& val) { - x = *reinterpret_cast(&val); // NOLINT - } -#endif - template HOSTDEVICE inline explicit bfloat16(const T& val) : x(bfloat16(static_cast(val)).x) {} @@ -106,13 +93,6 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif -#if defined(PADDLE_MUSA_BF16) - HOSTDEVICE inline bfloat16& operator=(const __mt_bfloat16& val) { - x = *reinterpret_cast(&val); // NOLINT - return *this; - } -#endif - HOSTDEVICE inline bfloat16& operator=(bool b) { x = b ? 
0x3f80 : 0; return *this; @@ -180,16 +160,16 @@ struct PADDLE_ALIGN(2) bfloat16 { // return res; res = res << 16; return *reinterpret_cast(&res); -#elif defined(PADDLE_CUDA_BF16) +#else +#ifdef PADDLE_CUDA_BF16 return __bfloat162float(*reinterpret_cast(&x)); -#elif defined(PADDLE_MUSA_BF16) - return __bfloat162float(*reinterpret_cast(&x)); #else float val = 0.f; uint16_t temp = x; std::memcpy( reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); return val; +#endif #endif } @@ -199,12 +179,6 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif -#ifdef PADDLE_MUSA_BF16 - HOSTDEVICE inline __mt_bfloat16 to_mt_bfloat16() const { - return *reinterpret_cast(&x); - } -#endif - HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator int8_t() const { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 4fb04ed0f7f666..5de6290fb77057 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -26,17 +26,12 @@ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif // PADDLE_WITH_MUSA - #ifdef PADDLE_WITH_HIP #include #include // NOLINT #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -46,7 +41,7 @@ #define PADDLE_ALIGN(x) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // todo #define PADDLE_WITH_CUDA_OR_HIP_COMPLEX #endif @@ -71,7 +66,7 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { @@ -100,14 +95,6 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator hipDoubleComplex() const { return make_hipDoubleComplex(real, imag); } -#elif defined(PADDLE_WITH_MUSA) - HOSTDEVICE inline explicit operator muFloatComplex() const { - return make_muFloatComplex(real, imag); - } - - HOSTDEVICE inline explicit operator muDoubleComplex() const { - return make_muDoubleComplex(real, imag); - } #else HOSTDEVICE inline explicit operator cuFloatComplex() const { return make_cuFloatComplex(real, imag); diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index c88d4ac21cd4a0..1906fd4e57a444 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/common/macros.h" -#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) +#if (defined(__NVCC__) || defined(__HIPCC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ @@ -77,7 +77,7 @@ HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) { } #endif -#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) +#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__)) #define PD_le32toh(x) x #else // PD_PSTRING_LITTLE_ENDIAN #define PD_le32toh(x) swap32(x) @@ -209,7 +209,7 @@ HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); } HOSTDEVICE static inline void *PD_Realloc(void *ptr, size_t old_size UNUSED, size_t new_size) { -#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) +#if (defined(__NVCC__) || defined(__HIPCC__)) if (old_size >= new_size) { return ptr; } diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index e4f4a5ae272eb9..9d60b8c6241ae3 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -37,10 +37,6 @@ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#endif // PADDLE_WITH_MUSA - #ifdef PADDLE_WITH_HIP #include #endif @@ -50,17 +46,12 @@ #include #endif -#ifdef __MUSACC__ -#define PADDLE_CUDA_FP16 -#include -#endif - #ifdef __HIPCC__ #define PADDLE_CUDA_FP16 #include #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -95,8 +86,8 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; @@ -115,7 +106,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline explicit float16(float val) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -157,7 +148,7 @@ struct PADDLE_ALIGN(2) float16 { // Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; #else x = rhs.x; @@ -231,7 +222,7 @@ struct PADDLE_ALIGN(2) float16 { // Conversion operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)|| CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 __half_raw h; h.x = x; return half(h); @@ -251,7 +242,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline operator float() const { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -360,7 +351,7 @@ struct 
PADDLE_ALIGN(2) float16 { // CUDA 9.0 regarding the half data type. // ROCM has built-in arithmetic operators as not defined // __HIP_NO_HALF_OPERATORS__ -#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && !defined(__MUSACC__) && CUDA_VERSION < 9000 +#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000 DEVICE inline half operator+(const half& a, const half& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hadd(a, b); @@ -408,7 +399,7 @@ DEVICE inline half operator-(const half& a) { #endif } -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // not defined __HIP_NO_HALF_OPERATORS__ +#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT a = a + b; return a; @@ -484,7 +475,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { #if defined(PADDLE_CUDA_FP16) // HIPCC has compile error if call __device__ function __hadd, __hsub, etc. // in __host__ __device__ function -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(__hadd(a.to_half(), b.to_half())); } @@ -501,7 +492,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(__hsub(a.to_half(), b.to_half())); } @@ -518,7 +509,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(__hmul(a.to_half(), b.to_half())); } @@ -535,7 +526,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator/(const float16& a, const float16& b) { return float16(__hdiv(a.to_half(), b.to_half())); } @@ -555,7 +546,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator-(const float16& a) { return float16(__hneg(a.to_half())); } @@ -598,7 +589,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT // HIPCC has compile error if call __device__ function __heq, __hne, etc. 
// in __host__ __device__ function -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator==(const float16& a, const float16& b) { return __heq(a.to_half(), b.to_half()); } @@ -615,7 +606,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator!=(const float16& a, const float16& b) { return __hne(a.to_half(), b.to_half()); } @@ -632,7 +623,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator<(const float16& a, const float16& b) { return __hlt(a.to_half(), b.to_half()); } @@ -649,7 +640,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator<=(const float16& a, const float16& b) { return __hle(a.to_half(), b.to_half()); } @@ -666,7 +657,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator>(const float16& a, const float16& b) { return __hgt(a.to_half(), b.to_half()); } @@ -683,7 +674,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator>=(const float16& a, const float16& b) { return __hge(a.to_half(), b.to_half()); } @@ -974,7 +965,7 @@ DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } #else HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(__MUSACC__)) +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(a.to_half()); #else return (a.x & 0x7fff) > 0x7c00; @@ -992,7 +983,7 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) { HOSTDEVICE inline float16(abs)(const float16& a) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) return float16(::fabs(static_cast(a))); #else return float16(std::abs(static_cast(a))); diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index a1fc14073d96ac..1af8cc442a1178 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -69,7 +69,7 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void GpuMemoryUsage(size_t* available, size_t* total) { return MemoryUtils::Instance().GpuMemoryUsage(available, total); } @@ -90,8 +90,8 @@ void EmplaceDeviceContexts( stream_priority); } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL)) const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { return MemoryUtils::Instance().GetAllocator(device_id, stream); } diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index abcc6ac003c644..9e4e573277549a 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -34,11 +34,6 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - namespace phi { struct MemoryInterface { @@ -133,7 +128,7 @@ struct MemoryInterface { int64_t (*device_memory_stat_current_value)(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief get the memory usage of current GPU device. * @@ -166,8 +161,8 @@ struct MemoryInterface { bool disable_setting_default_stream_for_allocator, int stream_priority); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) phi::Allocator* (*get_allocator)(int device_id, phi::gpuStream_t stream); phi::Allocator* (*get_host_allocator)(); phi::Allocator* (*get_zero_allocator)(int device_id); @@ -297,7 +292,7 @@ class MemoryUtils { return memory_method_->device_memory_stat_current_value(stat_type, dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void GpuMemoryUsage(size_t* available, size_t* total) { CheckMemoryMethod(); PADDLE_ENFORCE_NOT_NULL( @@ -349,8 +344,8 @@ class MemoryUtils { "Fluid. 
You can call InitMemoryMethod() for initialization.")); } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { return memory_method_->get_allocator(device_id, stream); } @@ -426,7 +421,7 @@ void Copy(const Place& dst_place, int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void GpuMemoryUsage(size_t* available, size_t* total); #endif @@ -439,8 +434,8 @@ void EmplaceDeviceContexts( bool disable_setting_default_stream_for_allocator, int stream_priority); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) const Allocator* GetAllocator(int device_id, phi::gpuStream_t stream); const Allocator* GetHostAllocator(); diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index c205bb7675393f..008f45aa935544 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -129,7 +129,7 @@ static int8_t GetCorrectDeviceIdByPlaceType( switch (place_type) { case paddle::PlaceType::kCPU: return 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case paddle::PlaceType::kGPU: return phi::backends::gpu::GetCurrentDeviceId(); #endif @@ -175,7 +175,7 @@ bool operator==(PlaceType place_type, const Place &place) { GPUPlace DefaultGPUPlace() { return GPUPlace( -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::backends::gpu::GetCurrentDeviceId()); #else 0); diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index 0b1a94aa0c1b90..e80561284b885f 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include #include "thrust/device_ptr.h" @@ -92,7 +92,7 @@ struct Transform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) // PointerToThrustDevicePtr has two specializations, one casts a (CUDA // device) pointer into thrust::device_ptr, the other keeps rest types @@ -153,12 +153,6 @@ struct Transform { CastToCUDATransformIterator(last), CastToCUDATransformIterator(result), op); -#elif defined(__MUSACC__) - thrust::transform(thrust::musa::par.on(context.stream()), - CastToCUDATransformIterator(first), - CastToCUDATransformIterator(last), - CastToCUDATransformIterator(result), - op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first), @@ -190,13 +184,6 @@ struct Transform { CastToCUDATransformIterator(first2), CastToCUDATransformIterator(result), op); -#elif defined(__MUSACC__) - thrust::transform(thrust::musa::par.on(context.stream()), - CastToCUDATransformIterator(first1), - CastToCUDATransformIterator(last1), - CastToCUDATransformIterator(first2), - CastToCUDATransformIterator(result), - op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first1), diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 15585543417d8e..d4c5de0dbe6dc9 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -61,7 +61,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { return phi::CPUPlace(); case phi::Backend::UNDEFINED: return phi::Place(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case phi::Backend::GPU: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -70,7 +70,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: // NOLINT return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -81,7 +81,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif case phi::Backend::KPS: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #elif defined(PADDLE_WITH_XPU_KP) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index 50c07b6e2cc46b..b27770b0814339 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -23,11 +23,6 @@ limitations under the License. 
*/ using gpuStream_t = cudaStream_t; #endif -#ifdef PADDLE_WITH_MUSA -#include -using gpuStream_t = musaStream_t; -#endif - #ifdef PADDLE_WITH_HIP #include using gpuStream_t = hipStream_t; @@ -78,9 +73,6 @@ class CUDAStream { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream, static_cast(flag), priority)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreateWithPriority( - &stream, static_cast(flag), priority)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream, static_cast(flag), priority)); @@ -100,8 +92,6 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(raw_stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(raw_stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(raw_stream())); #endif @@ -122,14 +112,6 @@ class CUDAStream { if (err == hipErrorNotReady) { return false; } -#elif defined(PADDLE_WITH_MUSA) - musaError_t err = musaStreamQuery(raw_stream()); - if (err == musaSuccess) { - return true; - } - if (err == musaErrorNotReady) { - return false; - } #else cudaError_t err = cudaStreamQuery(raw_stream()); if (err == cudaSuccess) { @@ -152,8 +134,6 @@ class CUDAStream { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(raw_stream(), ev, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(raw_stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(raw_stream(), ev, 0)); #endif @@ -166,8 +146,6 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP hipStreamDestroy(raw_stream()); -#elif defined(PADDLE_WITH_MUSA) - musaStreamDestroy(raw_stream()); #else cudaStreamDestroy(raw_stream()); #endif diff --git a/paddle/phi/core/distributed/CMakeLists.txt b/paddle/phi/core/distributed/CMakeLists.txt index 34046df6013a57..00000c3fff9e0f 100644 --- a/paddle/phi/core/distributed/CMakeLists.txt +++ b/paddle/phi/core/distributed/CMakeLists.txt @@ -4,7 +4,7 @@ add_subdirectory(auto_parallel) set(DISTRIBUTED_COMMON_SRCS comm_context_manager.cc) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) list(APPEND DISTRIBUTED_COMMON_SRCS comm_task_manager.cc) list(APPEND DISTRIBUTED_COMMON_SRCS nccl_comm_context.cc nccl_comm_task.cc nccl_tools.cc) diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc index 9407d1fad7f428..e7a1ec15da307a 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc @@ -101,7 +101,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, store, unique_comm_key, dev_ctx.GetPlace(), rank, world_size); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (phi::GPUContext::classof(&dev_ctx)) { CommContextManager::CreateNCCLCommContext( store, unique_comm_key, rank, world_size); @@ -164,7 +164,7 @@ bool NeedComputationClipForPP( } Place GetDefaultPlace() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (phi::backends::gpu::GetGPUDeviceCount() >= 0) { return paddle::DefaultGPUPlace(); } 
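The hunks above all follow the same shape: the three-way CUDA/HIP/MUSA dispatch added for paddle_musa is collapsed back to the original two-way CUDA/HIP guard, both for the gpuStream_t alias and for the runtime calls behind it. As a rough illustration of that restored pattern only — a minimal sketch with a hypothetical helper name, not Paddle's actual wrapper — the two-way guard typically reads:

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // ROCm build
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // CUDA build (default branch)
#endif

// Hypothetical helper: block the host until all work queued on `stream` is done.
// Error handling is omitted here; Paddle itself wraps such calls in
// PADDLE_ENFORCE_GPU_SUCCESS, as seen in the CUDAStream hunks above.
inline void DemoStreamSync(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  (void)hipStreamSynchronize(stream);
#else
  (void)cudaStreamSynchronize(stream);
#endif
}

This two-branch structure is exactly what the CUDAStream methods above (stream creation, Query, WaitEvent, destruction) return to once the musa* branches are removed.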
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h index 41cfd4efca8fd7..022dc065980641 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h @@ -71,7 +71,7 @@ std::vector BalancedSplit(int64_t total_nums, int64_t num_of_pieces); CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, const std::vector& process_ids); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, ...) \ do { \ if (phi::CPUContext::classof(dev_ctx)) { \ @@ -123,7 +123,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__); \ } while (0) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...) \ do { \ if (phi::CPUContext::classof(dev_ctx)) { \ diff --git a/paddle/phi/core/distributed/check/CMakeLists.txt b/paddle/phi/core/distributed/check/CMakeLists.txt index 964106feac4027..1721a4a4602d10 100644 --- a/paddle/phi/core/distributed/check/CMakeLists.txt +++ b/paddle/phi/core/distributed/check/CMakeLists.txt @@ -1,6 +1,6 @@ set(CHECK_COMMON_SRCS static_check.cc) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) list(APPEND CHECK_COMMON_SRCS nccl_dynamic_check.cc) endif() diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc index 4a7b931ad2b332..9307af45bd622b 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc @@ -30,16 +30,6 @@ #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuFree hipFree -#elif defined(PADDLE_WITH_MCCL) -#include - -#include "paddle/phi/backends/dynload/mccl.h" - -#define gpuMalloc musaMalloc -#define gpuMemcpy musaMemcpy -#define gpuMemcpyDeviceToHost musaMemcpyDeviceToHost -#define gpuMemcpyHostToDevice musaMemcpyHostToDevice -#define gpuFree musaFree #else #include @@ -66,7 +56,7 @@ void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm) { + ncclComm_t comm) { constexpr int kSize = sizeof(int64_t); int64_t dtype_host = static_cast(tensor.dtype()); int64_t* dtype_device; @@ -74,10 +64,10 @@ void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(dtype_device, &dtype_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclBroadcast(dtype_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(dtype_device, dtype_device, 1, - mcclInt64, + ncclInt64, root_rank, comm, kDefaultStream)); @@ -105,7 +95,7 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm) { + ncclComm_t comm) { CheckDataType(tensor, root_rank, cur_rank, comm); constexpr int kSize = sizeof(int64_t); @@ -116,10 +106,10 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, 
PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(shape_device, &shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclBroadcast(shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(shape_device, shape_device, 1, - mcclInt64, + ncclInt64, root_rank, comm, kDefaultStream)); @@ -140,7 +130,7 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, const std::vector& in_size_each_rank, int cur_rank, int world_size, - mcclComm_t comm) { + ncclComm_t comm) { CheckDataType(out_tensor, /*root_rank*/ 0, cur_rank, comm); CheckDataType(in_tensor, /*root_rank*/ 0, cur_rank, comm); @@ -153,11 +143,11 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&in_shape_device, kSize)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( in_shape_device, &in_shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclReduce(in_shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduce(in_shape_device, in_shape_device, 1, - mcclInt64, - mcclSum, + ncclInt64, + ncclSum, rank, comm, kDefaultStream)); @@ -177,7 +167,7 @@ void NCCLDynamicCheck::CheckGatherShape( int root_rank, int cur_rank, int world_size, - mcclComm_t comm) { + ncclComm_t comm) { std::vector shapes(world_size, 0); shapes[cur_rank] = in_tensor.numel(); int64_t* in_shape_device; @@ -188,11 +178,11 @@ void NCCLDynamicCheck::CheckGatherShape( world_size * sizeof(int64_t), gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce(in_shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(in_shape_device, in_shape_device, world_size, - mcclInt64, - mcclSum, + ncclInt64, + ncclSum, comm, kDefaultStream)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy(shapes.data(), diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.h b/paddle/phi/core/distributed/check/nccl_dynamic_check.h index 502ec886211e1b..23e8386d6f2aff 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.h +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.h @@ -21,8 +21,6 @@ #if defined(PADDLE_WITH_RCCL) using gpuStream_t = hipStream_t; -#elif defined(PADDLE_WITH_MCCL) -using gpuStream_t = musaStream_t; #else using gpuStream_t = cudaStream_t; #endif @@ -38,21 +36,21 @@ struct NCCLDynamicCheck { static void CheckDataType(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm); + ncclComm_t comm); static void CheckShape(const phi::DenseTensor& tensor, int64_t shape); static void CheckShape(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm); + ncclComm_t comm); static void CheckShape(const phi::DenseTensor& out_tensor, const phi::DenseTensor& in_tensor, const std::vector& in_size_each_rank, int cur_rank, int world_size, - mcclComm_t comm); + ncclComm_t comm); // can be used to check gather and all gather static void CheckGatherShape(const phi::DenseTensor& in_tensor, @@ -60,7 +58,7 @@ struct NCCLDynamicCheck { int root_rank, int cur_rank, int world_size, - mcclComm_t comm); + ncclComm_t comm); private: // `0` represents default stream for both cuda & hip diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index 2aee7c7c851042..5fd7861cc52b2d 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -29,7 +29,7 @@ #include "paddle/phi/core/distributed/store/gloo_store.h" #endif -#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -49,13 +49,13 @@ namespace distributed { int CommContextManager::device_id = -1; void CommContextManager::SetDeviceId(int dev_id) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) phi::backends::gpu::SetDeviceId(dev_id); CommContextManager::device_id = dev_id; #endif } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void CommContextManager::CreateNCCLCommContext( const std::shared_ptr& store, const std::string& unique_comm_key, @@ -67,16 +67,16 @@ void CommContextManager::CreateNCCLCommContext( if (comm_context_manager.Has(unique_comm_key)) { return; } - mcclUniqueId nccl_id; + ncclUniqueId nccl_id; if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id)); } std::string unique_key = "NCCLCommContext/" + unique_comm_key + hash_key; if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) { std::vector nccl_id_wrapper( reinterpret_cast(&nccl_id), - reinterpret_cast(&nccl_id) + MCCL_UNIQUE_ID_BYTES); + reinterpret_cast(&nccl_id) + NCCL_UNIQUE_ID_BYTES); store->set(unique_key, nccl_id_wrapper); } else { const auto& nccl_id_wrapper = store->get(unique_key); @@ -231,8 +231,8 @@ CommContext* CommContextManager::Get(const std::string& unique_comm_key) const { return id_to_comm_context_.at(unique_comm_key).get(); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -int CommContextManager::GetRingId(const mcclComm_t& comm) const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +int CommContextManager::GetRingId(const ncclComm_t& comm) const { for (auto iter = id_to_comm_context_.begin(); iter != id_to_comm_context_.end(); ++iter) { diff --git a/paddle/phi/core/distributed/comm_context_manager.h b/paddle/phi/core/distributed/comm_context_manager.h index 5c3f3101dcada6..8c4d802294986f 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -24,7 +24,7 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/distributed/comm_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/gpu/forwards.h" #endif @@ -57,8 +57,8 @@ class CommContextManager { CommContext* Get(const std::string& unique_comm_key) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - int GetRingId(const mcclComm_t& comm) const; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + int GetRingId(const ncclComm_t& comm) const; #endif bool Has(const std::string& unique_comm_key) const; @@ -71,7 +71,7 @@ class CommContextManager { std::vector GetGroupRanks(const std::string& pg_key) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void CreateNCCLCommContext(const std::shared_ptr& 
store, const std::string& unique_comm_key, int rank, diff --git a/paddle/phi/core/distributed/comm_task.h b/paddle/phi/core/distributed/comm_task.h index ca7f8495495d2d..47ba01b980479a 100644 --- a/paddle/phi/core/distributed/comm_task.h +++ b/paddle/phi/core/distributed/comm_task.h @@ -25,9 +25,6 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" #endif -#if defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" -#endif #if defined(PADDLE_WITH_NCCL) #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -46,7 +43,7 @@ class CommTask { int gid = 0, uint64_t seq = 0, int64_t numel = 0, - mcclComm_t nccl_comm = nullptr, + ncclComm_t nccl_comm = nullptr, gpuStream_t nccl_stream = nullptr, CommType comm_type = CommType::UNKNOWN) : backend_(backend), @@ -92,7 +89,7 @@ class CommTask { std::shared_ptr GetStore() { return store_; } void SetStore(std::shared_ptr store) { store_ = store; } - mcclComm_t nccl_comm() { return nccl_comm_; } + ncclComm_t nccl_comm() { return nccl_comm_; } gpuStream_t nccl_stream() { return nccl_stream_; } virtual std::string GetTraceMsg() { @@ -163,7 +160,7 @@ class CommTask { int gid_; uint64_t seq_{0}; int64_t numel_; - mcclComm_t nccl_comm_; + ncclComm_t nccl_comm_; gpuStream_t nccl_stream_; CommType comm_type_; bool start_trace_updated_{false}; diff --git a/paddle/phi/core/distributed/comm_task_manager.cc b/paddle/phi/core/distributed/comm_task_manager.cc index 822b3892ec3646..ae7de422913587 100644 --- a/paddle/phi/core/distributed/comm_task_manager.cc +++ b/paddle/phi/core/distributed/comm_task_manager.cc @@ -32,7 +32,7 @@ #include "paddle/phi/core/distributed/store/store.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/comm_task_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #endif diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index 4600d2e14cdbbe..8da676e74d911a 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -30,16 +30,16 @@ namespace distributed { // set this flag to `true` and recompile to enable dynamic checks constexpr bool FLAGS_enable_nccl_dynamic_check = false; -NCCLCommContext::NCCLCommContext(int rank, int size, mcclUniqueId nccl_id) +NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id) : CommContext(rank, size) { - MCCL_CHECK( - phi::dynload::mcclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); - MCCL_CHECK(phi::dynload::mcclGetVersion(&nccl_version_)); + NCCL_CHECK( + phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); + NCCL_CHECK(phi::dynload::ncclGetVersion(&nccl_version_)); } int NCCLCommContext::GetNcclVersion() { return nccl_version_; } -mcclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } +ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); } @@ -77,7 +77,7 @@ void NCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, if (FLAGS_enable_nccl_dynamic_check) { NCCLDynamicCheck::CheckShape(*out_tensor, root, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclBroadcast(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclBroadcast(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -100,7 +100,7 
@@ void NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclAllGather(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclAllGather(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -109,7 +109,7 @@ void NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, } void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream) { phi::distributed::CommStaticCheck::ScatterLikeShape(*out_tensor, in_tensor, @@ -122,7 +122,7 @@ void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclReduceScatter(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclReduceScatter(in_tensor.data(), out_tensor->data(), out_tensor->numel(), ToNCCLDataType(in_tensor.type()), @@ -141,7 +141,7 @@ void NCCLCommContext::Send(const phi::DenseTensor& in_tensor, NCCLDynamicCheck::CheckShape(in_tensor, rank_, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclSend(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclSend(in_tensor.data(), count, ToNCCLDataType(in_tensor.dtype()), peer, @@ -160,7 +160,7 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, NCCLDynamicCheck::CheckShape(*out_tensor, peer, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclRecv(out_tensor->data(), + NCCL_CHECK(phi::dynload::ncclRecv(out_tensor->data(), count, ToNCCLDataType(out_tensor->dtype()), peer, @@ -172,7 +172,7 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream) { phi::distributed::CommStaticCheck::SameShape(*out_tensor, in_tensor, @@ -185,7 +185,7 @@ void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclAllReduce(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclAllReduce(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -196,7 +196,7 @@ void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, int root, gpuStream_t stream) { phi::distributed::CommStaticCheck::SameShape(*out_tensor, @@ -210,7 +210,7 @@ void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclReduce(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclReduce(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -221,23 +221,23 @@ void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, } void NCCLCommContext::GroupStart() { - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); } -void NCCLCommContext::GroupEnd() { MCCL_CHECK(phi::dynload::mcclGroupEnd()); } +void NCCLCommContext::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } -// #if NCCL_VERSION_CODE >= 21100 -void NCCLCommContext::RedOpCreatePreMulSum(mcclRedOp_t* op, +#if NCCL_VERSION_CODE >= 21100 +void NCCLCommContext::RedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - mcclDataType_t dtype, - mcclScalarResidence_t residence) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpCreatePreMulSum( + ncclDataType_t dtype, + ncclScalarResidence_t residence) { + 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( op, scalar, dtype, residence, nccl_comm_)); } -void NCCLCommContext::RedOpDestroy(mcclRedOp_t op) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpDestroy(op, nccl_comm_)); +void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, nccl_comm_)); } -// #endif +#endif } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h index e7a73f12046721..609b5e0defe079 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.h +++ b/paddle/phi/core/distributed/nccl_comm_context.h @@ -18,11 +18,6 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif @@ -34,8 +29,6 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" #else #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -46,12 +39,12 @@ namespace distributed { class NCCLCommContext final : public CommContext { public: - NCCLCommContext(int rank, int size, mcclUniqueId nccl_id); + NCCLCommContext(int rank, int size, ncclUniqueId nccl_id); ~NCCLCommContext() override = default; int GetNcclVersion(); - mcclComm_t GetNcclComm(); + ncclComm_t GetNcclComm(); gpuStream_t GetStream(); @@ -87,7 +80,7 @@ class NCCLCommContext final : public CommContext { void ReduceScatter(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream); void AllGather(phi::DenseTensor* out_tensor, @@ -96,12 +89,12 @@ class NCCLCommContext final : public CommContext { void AllReduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream); void Reduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, int root, gpuStream_t stream); @@ -109,25 +102,25 @@ class NCCLCommContext final : public CommContext { void GroupEnd(); -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 // Creates a new reduction operator which pre-multiplies input values by a // given scalar locally before reducing them with peer values via summation. - void RedOpCreatePreMulSum(mcclRedOp_t* op, + void RedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - mcclDataType_t dtype, - mcclScalarResidence_t residence); + ncclDataType_t dtype, + ncclScalarResidence_t residence); // Destroys the reduction operator op. The operator must have been created by // ncclRedOpCreatePreMul with the matching communicator comm. 
- void RedOpDestroy(mcclRedOp_t op); -// #endif + void RedOpDestroy(ncclRedOp_t op); +#endif private: DISABLE_COPY_AND_ASSIGN(NCCLCommContext); int nccl_version_; - mcclComm_t nccl_comm_; + ncclComm_t nccl_comm_; std::unique_ptr dev_ctx_; diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc index 5f11c8101df938..4e2efea0068eb9 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.cc +++ b/paddle/phi/core/distributed/nccl_comm_task.cc @@ -33,7 +33,7 @@ NCCLCommTask::NCCLCommTask(const phi::Place& place, int64_t numel, bool sync_op, bool use_calc_stream, - mcclComm_t nccl_comm, + ncclComm_t nccl_comm, gpuStream_t stream, CommType comm_type, int64_t timeout) @@ -62,8 +62,6 @@ void NCCLCommTask::StartRecord() { if (!start_event_created_) { #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventCreateWithFlags(&nccl_start_event_, cuda_event_flags_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventCreateWithFlags(&nccl_start_event_, musa_event_flags_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventCreateWithFlags(&nccl_start_event_, hip_event_flags_)); #endif @@ -71,8 +69,6 @@ void NCCLCommTask::StartRecord() { } #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventRecord(nccl_start_event_, nccl_stream_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventRecord(nccl_start_event_, nccl_stream_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventRecord(nccl_start_event_, nccl_stream_)); #endif @@ -82,8 +78,6 @@ void NCCLCommTask::EndRecord() { if (!end_event_created_) { #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventCreateWithFlags(&nccl_end_event_, cuda_event_flags_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventCreateWithFlags(&nccl_end_event_, musa_event_flags_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventCreateWithFlags(&nccl_end_event_, hip_event_flags_)); #endif @@ -91,8 +85,6 @@ void NCCLCommTask::EndRecord() { } #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventRecord(nccl_end_event_, nccl_stream_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventRecord(nccl_end_event_, nccl_stream_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventRecord(nccl_end_event_, nccl_stream_)); #endif @@ -111,19 +103,6 @@ void NCCLCommTask::ClearRecord() { end_event_created_ = false; } } -#elif defined(PADDLE_WITH_MUSA) -void NCCLCommTask::ClearRecord() { - if (start_event_created_) { - backends::gpu::GPUDeviceGuard guard(place_.device); - MUSA_CHECK(musaEventDestroy(nccl_start_event_)); - start_event_created_ = false; - } - if (end_event_created_) { - backends::gpu::GPUDeviceGuard guard(place_.device); - MUSA_CHECK(musaEventDestroy(nccl_end_event_)); - end_event_created_ = false; - } -} #else // PADDLE_WITH_HIP void NCCLCommTask::ClearRecord() { if (start_event_created_) { @@ -150,16 +129,6 @@ bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) { // ignore and clear the error if not ready CUDA_CHECK(cudaGetLastError()); } -#elif defined(PADDLE_WITH_MUSA) - musaError_t ret = musaEventQuery(event); - if (ret == musaSuccess) { - return true; - } else if (ret != musaErrorNotReady) { - MUSA_CHECK(ret); - } else { - // ignore and clear the error if not ready - MUSA_CHECK(musaGetLastError()); - } #else // PADDLE_WITH_HIP hipError_t ret = hipEventQuery(event); if (ret == hipSuccess) { @@ -174,7 +143,7 @@ bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) { return false; } -std::string GetNCCLErrorDetail(mcclResult_t result) { +std::string GetNCCLErrorDetail(ncclResult_t result) { std::string detail; std::string last_error; #ifdef ENABLE_NCCL_GET_LAST_ERROR @@ -182,10 
+151,10 @@ std::string GetNCCLErrorDetail(mcclResult_t result) { ", Last error: " + std::string(phi::dynload::ncclGetLastError(NULL)); #endif switch (result) { - case mcclUnhandledCudaError: + case ncclUnhandledCudaError: detail = "ncclUnhandledCudaError: Call to CUDA function failed."; break; - case mcclSystemError: + case ncclSystemError: detail = "ncclSystemError: System call (e.g. socket, malloc) or external " "library call failed or device error. "; @@ -195,13 +164,13 @@ std::string GetNCCLErrorDetail(mcclResult_t result) { detail += "It can be also caused by unexpected exit of a remote peer."; #endif break; - case mcclInternalError: + case ncclInternalError: detail = "ncclInternalError: Internal check failed."; break; - case mcclInvalidArgument: + case ncclInvalidArgument: detail = "ncclInvalidArgument: Invalid value for an argument."; break; - case mcclInvalidUsage: + case ncclInvalidUsage: detail = "ncclInvalidUsage: This usually reflects invalid usage of NCCL " "library."; @@ -225,10 +194,10 @@ std::string NCCLCommTask::GetCommErrors() { return comm_error_; } - mcclResult_t nccl_async_error; - MCCL_CHECK( - phi::dynload::mcclCommGetAsyncError(nccl_comm_, &nccl_async_error)); - if (nccl_async_error != mcclSuccess) { + ncclResult_t nccl_async_error; + NCCL_CHECK( + phi::dynload::ncclCommGetAsyncError(nccl_comm_, &nccl_async_error)); + if (nccl_async_error != ncclSuccess) { comm_error_ = "\n\t Find nccl comm error: " + GetNCCLErrorDetail(nccl_async_error); } @@ -272,7 +241,7 @@ void NCCLCommTask::AbortComm() { if (aborted_) { return; } - MCCL_CHECK(phi::dynload::mcclCommAbort(nccl_comm_)); + NCCL_CHECK(phi::dynload::ncclCommAbort(nccl_comm_)); aborted_ = true; nccl_comm_ = nullptr; diff --git a/paddle/phi/core/distributed/nccl_comm_task.h b/paddle/phi/core/distributed/nccl_comm_task.h index 11bbbd1c9dcf70..fca9004cf0b2d4 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.h +++ b/paddle/phi/core/distributed/nccl_comm_task.h @@ -21,8 +21,6 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" #else #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -44,7 +42,7 @@ class NCCLCommTask : public CommTask { int64_t numel = 0, bool sync_op = true, bool use_calc_stream = false, - mcclComm_t = nullptr, + ncclComm_t = nullptr, gpuStream_t = nullptr, CommType comm_type = CommType::UNKNOWN, int64_t timeout = DefaultTimeout); @@ -73,8 +71,6 @@ class NCCLCommTask : public CommTask { #ifdef PADDLE_WITH_CUDA unsigned int cuda_event_flags_ = cudaEventDisableTiming; -#elif defined(PADDLE_WITH_MUSA) - unsigned int musa_event_flags_ = musaEventDisableTiming; #else // PADDLE_WITH_HIP unsigned int hip_event_flags_ = hipEventDisableTiming; #endif diff --git a/paddle/phi/core/distributed/nccl_tools.cc b/paddle/phi/core/distributed/nccl_tools.cc index 24a1f3ee7891d1..a5388796d1f45b 100644 --- a/paddle/phi/core/distributed/nccl_tools.cc +++ b/paddle/phi/core/distributed/nccl_tools.cc @@ -19,74 +19,74 @@ #include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" -// #if NCCL_VERSION_CODE >= 21300 +#if NCCL_VERSION_CODE >= 21300 #define ENABLE_NCCL_GET_LAST_ERROR #define NCCL_REMOTE_ERROR -// #endif +#endif namespace phi { namespace distributed { -mcclRedOp_t ToNCCLRedType(ReduceOp reduction) { - static const std::unordered_map red_type = { - {ReduceOp::MIN, mcclMin}, - {ReduceOp::MAX, mcclMax}, - {ReduceOp::SUM, mcclSum}, - {ReduceOp::PRODUCT, mcclProd}, +ncclRedOp_t 
ToNCCLRedType(ReduceOp reduction) { + static const std::unordered_map red_type = { + {ReduceOp::MIN, ncclMin}, + {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, + {ReduceOp::PRODUCT, ncclProd}, }; auto it = red_type.find(reduction); PADDLE_ENFORCE_EQ(it != red_type.end(), true, phi::errors::InvalidArgument( - "Invalid nccl reduction. Must be mcclMin | mcclMax | " - "mcclProd | mcclSum")); + "Invalid nccl reduction. Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); return it->second; } -std::string SerializeNCCLUniqueId(const mcclUniqueId& ncclID) { +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { const uint8_t* bytes = reinterpret_cast(&ncclID); std::ostringstream oss; - for (auto i = 0; i < MCCL_UNIQUE_ID_BYTES; ++i) { + for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { oss << std::hex << static_cast(bytes[i]); } return oss.str(); } -std::string NCCLDTypeToString(mcclDataType_t dtype) { +std::string NCCLDTypeToString(ncclDataType_t dtype) { #define PD_NCCL_DTYPE_TO_STR(__nccl_dtype, __str_dtype) \ if (dtype == __nccl_dtype) return __str_dtype; - PD_NCCL_DTYPE_TO_STR(mcclFloat, "float32"); - PD_NCCL_DTYPE_TO_STR(mcclFloat32, "float32"); - PD_NCCL_DTYPE_TO_STR(mcclHalf, "float16"); - PD_NCCL_DTYPE_TO_STR(mcclFloat16, "float16"); -// // #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 -// PD_NCCL_DTYPE_TO_STR(mcclBfloat16, "bfloat16"); -// // #endif - PD_NCCL_DTYPE_TO_STR(mcclDouble, "float64"); - PD_NCCL_DTYPE_TO_STR(mcclFloat64, "float64"); + PD_NCCL_DTYPE_TO_STR(ncclFloat, "float32"); + PD_NCCL_DTYPE_TO_STR(ncclFloat32, "float32"); + PD_NCCL_DTYPE_TO_STR(ncclHalf, "float16"); + PD_NCCL_DTYPE_TO_STR(ncclFloat16, "float16"); +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + PD_NCCL_DTYPE_TO_STR(ncclBfloat16, "bfloat16"); +#endif + PD_NCCL_DTYPE_TO_STR(ncclDouble, "float64"); + PD_NCCL_DTYPE_TO_STR(ncclFloat64, "float64"); - PD_NCCL_DTYPE_TO_STR(mcclInt8, "int8"); - PD_NCCL_DTYPE_TO_STR(mcclChar, "int8"); - PD_NCCL_DTYPE_TO_STR(mcclUint8, "uint8"); - PD_NCCL_DTYPE_TO_STR(mcclInt32, "int32"); - PD_NCCL_DTYPE_TO_STR(mcclInt, "int32"); - PD_NCCL_DTYPE_TO_STR(mcclUint32, "uint32"); - PD_NCCL_DTYPE_TO_STR(mcclInt64, "int64"); - PD_NCCL_DTYPE_TO_STR(mcclUint64, "uint64"); + PD_NCCL_DTYPE_TO_STR(ncclInt8, "int8"); + PD_NCCL_DTYPE_TO_STR(ncclChar, "int8"); + PD_NCCL_DTYPE_TO_STR(ncclUint8, "uint8"); + PD_NCCL_DTYPE_TO_STR(ncclInt32, "int32"); + PD_NCCL_DTYPE_TO_STR(ncclInt, "int32"); + PD_NCCL_DTYPE_TO_STR(ncclUint32, "uint32"); + PD_NCCL_DTYPE_TO_STR(ncclInt64, "int64"); + PD_NCCL_DTYPE_TO_STR(ncclUint64, "uint64"); #undef PD_NCCL_DTYPE_TO_STR PADDLE_THROW(phi::errors::InvalidArgument( "This datatype %d in nccl is not supported.", static_cast(dtype))); } -std::string NCCLRedTypeToString(mcclRedOp_t op) { - if (op == mcclSum) return "SUM"; - if (op == mcclProd) return "PROD"; - if (op == mcclMin) return "MIN"; - if (op == mcclMax) return "MAX"; -// #if NCCL_VERSION_CODE >= 21000 - if (op == mcclAvg) return "AVG"; -// #endif +std::string NCCLRedTypeToString(ncclRedOp_t op) { + if (op == ncclSum) return "SUM"; + if (op == ncclProd) return "PROD"; + if (op == ncclMin) return "MIN"; + if (op == ncclMax) return "MAX"; +#if NCCL_VERSION_CODE >= 21000 + if (op == ncclAvg) return "AVG"; +#endif return "UDF_" + std::to_string(op); } diff --git a/paddle/phi/core/distributed/nccl_tools.h b/paddle/phi/core/distributed/nccl_tools.h index e256d4ef4d0093..0ab380a4177838 100644 --- a/paddle/phi/core/distributed/nccl_tools.h +++ b/paddle/phi/core/distributed/nccl_tools.h 
@@ -21,9 +21,6 @@ #ifdef PADDLE_WITH_RCCL #include #include "paddle/phi/backends/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include -#include "paddle/phi/backends/dynload/mccl.h" #else #include #include "paddle/phi/backends/dynload/nccl.h" @@ -35,7 +32,7 @@ namespace distributed { #define NCCL_CHECK(cmd) \ do { \ ncclResult_t r = cmd; \ - if (r != mcclSuccess) { \ + if (r != ncclSuccess) { \ PADDLE_THROW( \ phi::errors::External("Failed, NCCL error %s:%d '%s'\n", \ __FILE__, \ @@ -44,18 +41,6 @@ namespace distributed { } \ } while (0) -#define MCCL_CHECK(cmd) \ - do { \ - mcclResult_t r = cmd; \ - if (r != mcclSuccess) { \ - PADDLE_THROW( \ - phi::errors::External("Failed, MCCL error %s:%d '%s'\n", \ - __FILE__, \ - __LINE__, \ - phi::dynload::mcclGetErrorString(r))); \ - } \ - } while (0) - #ifdef PADDLE_WITH_NCCL #define CUDA_CHECK(expr) \ do { \ @@ -67,17 +52,6 @@ namespace distributed { cudaGetErrorString(r))); \ } \ } while (0) -#elif defined(PADDLE_WITH_MCCL) -#define MUSA_CHECK(expr) \ - do { \ - musaError_t r = expr; \ - if (r != musaSuccess) { \ - PADDLE_THROW(phi::errors::External("Failed, musa error %s:%d '%s'\n", \ - __FILE__, \ - __LINE__, \ - musaGetErrorString(r))); \ - } \ - } while (0) #else // PADDLE_WITH_RCCL #define HIP_CHECK(expr) \ do { \ @@ -91,13 +65,13 @@ namespace distributed { } while (0) #endif -mcclRedOp_t ToNCCLRedType(ReduceOp reduction); +ncclRedOp_t ToNCCLRedType(ReduceOp reduction); -std::string SerializeNCCLUniqueId(const mcclUniqueId& ncclID); +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID); -std::string NCCLDTypeToString(mcclDataType_t dtype); +std::string NCCLDTypeToString(ncclDataType_t dtype); -std::string NCCLRedTypeToString(mcclRedOp_t op); +std::string NCCLRedTypeToString(ncclRedOp_t op); } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 0c21ffac88703f..61e502951f24ee 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -23,16 +23,6 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#include -#include -#include -#include -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #include @@ -65,17 +55,6 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mufft.h" -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/murand.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include -#include "paddle/phi/backends/dynload/mccl.h" -#endif // __APPLE__ -#endif - #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -90,7 +69,7 @@ limitations under the License. */ // Note: these headers for simplify demangle type string #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/gpu_types.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -347,17 +326,6 @@ struct EnforceNotMet : public std::exception { abort(); \ } \ } while (0) -#elif defined(__MUSACC__) -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. 
" __FORMAT "\n", \ - __FILE__, \ - __LINE__, \ - #_IS_NOT_ERROR, \ - ##__VA_ARGS__); \ - } \ - } while (0) #else #define PADDLE_ENFORCE(COND, ...) \ do { \ @@ -602,7 +570,7 @@ DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS); DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, mcclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details @@ -698,7 +666,7 @@ inline std::string build_nvidia_error_msg(CUresult stat) { /**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != mcclSuccess; + return nccl_result != ncclSuccess; } inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { @@ -899,7 +867,7 @@ inline std::string build_rocm_error_msg(rocblas_status stat) { /****** RCCL ERROR ******/ #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != mcclSuccess; + return nccl_result != ncclSuccess; } inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { @@ -935,7 +903,7 @@ DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, mcclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details @@ -990,7 +958,7 @@ inline void retry_sleep(unsigned millisecond) { } \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = phi::errors::External( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ + ::phi::enforce::build_rocm_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ } while (0) @@ -998,234 +966,6 @@ inline void retry_sleep(unsigned millisecond) { #undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_HIP - - - - - - - - - - - - - - - -/**************************************************************************/ -/***************************** MUSA ERROR **********************************/ -#ifdef PADDLE_WITH_MUSA - -/***** MUSA ERROR *****/ -inline bool is_error(musaError_t e) { return e != musaSuccess; } - -inline std::string build_musa_error_msg(musaError_t e) { - std::ostringstream sout; - sout << " Musa error(" << e << "), " << musaGetErrorString(e) << "."; - return sout.str(); -} - -/***** MURAND ERROR *****/ -inline bool is_error(murandStatus_t stat) { - return stat != MURAND_STATUS_SUCCESS; -} - -inline const char* murandGetErrorString(murandStatus_t stat) { - switch (stat) { - case MURAND_STATUS_SUCCESS: - return "MURAND_STATUS_SUCCESS"; - case MURAND_STATUS_VERSION_MISMATCH: - return "MURAND_STATUS_VERSION_MISMATCH"; - case MURAND_STATUS_NOT_CREATED: - return "MURAND_STATUS_NOT_CREATED"; - case MURAND_STATUS_ALLOCATION_FAILED: - return "MURAND_STATUS_ALLOCATION_FAILED"; - case MURAND_STATUS_TYPE_ERROR: - return "MURAND_STATUS_TYPE_ERROR"; - case MURAND_STATUS_OUT_OF_RANGE: - return "MURAND_STATUS_OUT_OF_RANGE"; - case MURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "MURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case MURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "MURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case MURAND_STATUS_LAUNCH_FAILURE: - return "MURAND_STATUS_LAUNCH_FAILURE"; - case MURAND_STATUS_INTERNAL_ERROR: - return "MURAND_STATUS_INTERNAL_ERROR"; - case MURAND_STATUS_NOT_IMPLEMENTED: - return "MURAND_STATUS_NOT_IMPLEMENTED"; 
- default: - return "Unknown murand status"; - } -} - -inline std::string build_musa_error_msg(murandStatus_t stat) { - std::string msg(" Murand error, "); - return msg + murandGetErrorString(stat) + " "; -} - -/***** mudnn ERROR *****/ -// inline bool is_error(mudnnStatus_t stat) { -// return stat != cudnnStatusSuccess; -// } - -// inline std::string build_rocm_error_msg(miopenStatus_t stat) { -// std::string msg(" Miopen error, "); -// return msg + phi::dynload::miopenGetErrorString(stat) + " "; -// } - -/***** MUBLAS ERROR *****/ -inline bool is_error(mublasStatus stat) { - return stat != MUBLAS_STATUS_SUCCESS; -} - -inline const char* mublasGetErrorString(mublasStatus stat) { - switch (stat) { - case MUBLAS_STATUS_SUCCESS: - return "MUBLAS_STATUS_SUCCESS"; - case MUBLAS_STATUS_INVALID_HANDLE: - return "MUBLAS_STATUS_INVALID_HANDLE"; - case MUBLAS_STATUS_NOT_IMPLEMENTED: - return "MUBLAS_STATUS_NOT_IMPLEMENTED"; - case MUBLAS_STATUS_INVALID_POINTER: - return "MUBLAS_STATUS_INVALID_POINTER"; - case MUBLAS_STATUS_INVALID_SIZE: - return "MUBLAS_STATUS_INVALID_SIZE"; - case MUBLAS_STATUS_MEMORY_ERROR: - return "MUBLAS_STATUS_MEMORY_ERROR"; - case MUBLAS_STATUS_INTERNAL_ERROR: - return "MUBLAS_STATUS_INTERNAL_ERROR"; - case MUBLAS_STATUS_PERF_DEGRADED: - return "MUBLAS_STATUS_PERF_DEGRADED"; - case MUBLAS_STATUS_SIZE_QUERY_MISMATCH: - return "MUBLAS_STATUS_SIZE_QUERY_MISMATCH"; - case MUBLAS_STATUS_SIZE_INCREASED: - return "MUBLAS_STATUS_SIZE_INCREASED"; - case MUBLAS_STATUS_SIZE_UNCHANGED: - return "MUBLAS_STATUS_SIZE_UNCHANGED"; - case MUBLAS_STATUS_INVALID_VALUE: - return "MUBLAS_STATUS_INVALID_VALUE"; - case MUBLAS_STATUS_CONTINUE: - return "MUBLAS_STATUS_CONTINUE"; - case MUBLAS_STATUS_CHECK_NUMERICS_FAIL: - return "MUBLAS_STATUS_CHECK_NUMERICS_FAIL"; - default: - return "Unknown mublas status"; - } -} - -inline std::string build_musa_error_msg(mublasStatus stat) { - std::string msg(" mublas error, "); - return msg + mublasGetErrorString(stat) + " "; -} - -/****** MCCL ERROR ******/ -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -inline bool is_error(mcclResult_t mccl_result) { - return mccl_result != mcclSuccess; -} - -inline std::string build_musa_error_msg(mcclResult_t mccl_result) { - std::string msg(" Mccl error, "); - return msg + phi::dynload::mcclGetErrorString(mccl_result) + " "; -} -#endif // not(__APPLE__) and PADDLE_WITH_MCCL - -/***** MUFFT ERROR *****/ -inline bool is_error(mufftResult_t stat) { return stat != MUFFT_SUCCESS; } - -inline std::string build_musa_error_msg(mufftResult_t stat) { - std::string msg(" MUFFT error, "); - return msg + phi::dynload::mufftGetErrorString(stat) + " "; -} - -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(musaError_t, musaSuccess); -DEFINE_EXTERNAL_API_TYPE(murandStatus_t, MURAND_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(mublasStatus, MUBLAS_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(mufftResult_t, MUFFT_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -DEFINE_EXTERNAL_API_TYPE(mcclResult_t, mcclSuccess); -#endif - -} // namespace details - -#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - 
if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_WARN_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - ::phi::enforce::ThrowWarnInternal( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ - } \ - } while (0) - -inline void retry_sleep(unsigned millisecond) { -#ifdef _WIN32 - Sleep(millisecond); -#else - sleep(millisecond); -#endif -} - -#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - int retry_count = 1; \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - ::phi::enforce::retry_sleep(10000); \ - __cond__ = (COND); \ - ++retry_count; \ - } \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#undef DEFINE_EXTERNAL_API_TYPE -#endif // PADDLE_WITH_MUSA - - - - - - } // namespace enforce using namespace enforce; // NOLINT } // namespace phi diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 9304b42be1644a..a6764dfcf1c31f 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -14,7 +14,7 @@ // limitations under the License. #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" #endif @@ -120,7 +120,7 @@ PHI_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDA related related FLAG @@ -215,7 +215,7 @@ PHI_DEFINE_EXPORTED_bool( true, "Whether enable api kernel fallback to CPU one when not found"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -322,7 +322,7 @@ PHI_DEFINE_EXPORTED_bool( "batch_norm, default is False."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * NCCL related FLAG @@ -541,7 +541,7 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) /** @@ -785,7 +785,7 @@ PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, * Example: * Note: Check kernel launch status after every kernel compute. 
*/ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, @@ -800,7 +800,7 @@ PHI_DEFINE_EXPORTED_bool( * Example: * Note: Disable cudnn in conv2d. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); @@ -819,7 +819,7 @@ PHI_DEFINE_EXPORTED_bool(use_fast_math, * Note: Get host by name time. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ - defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUSTOM_DEVICE) PHI_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, "The maximum time for get host by name time"); @@ -1190,11 +1190,11 @@ PHI_DEFINE_EXPORTED_bool(multi_node_sample_use_gpu_table, * Note: nccl blocking wait. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool(benchmark_nccl, false, "enable nccl debug mode to synchronize nccl comm"); @@ -1428,7 +1428,7 @@ PHI_DEFINE_EXPORTED_int32( PHI_DEFINE_EXPORTED_bool(print_ir, false, "Whether print ir debug str."); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) /** * Communication library related FLAG diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index a2fe426b0ec47b..82d37be80d3c36 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -63,7 +63,7 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { } const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static int64_t num_cuda_devices = -1; static std::once_flag num_devices_init_flag; @@ -278,7 +278,8 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; VLOG(10) << "cur_offset = " << cur_offset diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index 3295a2f6b37399..decebbe66a5381 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,10 +18,6 @@ #include #endif -#ifdef __MUSACC__ -#include -#endif - #if defined(__xpu__) #include @@ -30,7 +26,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__) || defined(__MUSACC__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/kernel_factory.cc 
b/paddle/phi/core/kernel_factory.cc index 6e534511802bb9..a5c5a3994a81b1 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -124,7 +124,7 @@ const Kernel& KernelFactory::SelectKernelWithGPUDNN( return empty_kernel; } KernelKey kernel_key = KernelKey(const_kernel_key); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); @@ -239,7 +239,7 @@ KernelResult KernelFactory::SelectKernelOrThrowError( KernelKey kernel_key = KernelKey(const_kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, const_kernel_key.dtype()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); diff --git a/paddle/phi/core/kernel_registry.cc b/paddle/phi/core/kernel_registry.cc index 77ae9b45c9d682..fa9d531b6534d6 100644 --- a/paddle/phi/core/kernel_registry.cc +++ b/paddle/phi/core/kernel_registry.cc @@ -34,7 +34,7 @@ void SetKernelArgsDef(const std::vector& args_type, #if defined(PADDLE_WITH_DNNL) || arg_type == std::type_index(typeid(const OneDNNContext&)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || arg_type == std::type_index(typeid(const GPUContext&)) #elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || arg_type == std::type_index(typeid(const XPUContext&)) diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 19f76f60f9a1ba..b24e39b6c75bf1 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -1199,7 +1199,7 @@ struct KernelRegistrar { meta_kernel_fn, \ BACKEND_LIST_EXCEPT_CUSTOM) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define _DEVICE GPU, #elif defined(PADDLE_WITH_XPU) #define _DEVICE XPU, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 3b55ccd3dbc365..715b4f76392d8f 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -300,7 +300,7 @@ struct KernelImpl { /* DeviceContext Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/core/mixed_vector.cc b/paddle/phi/core/mixed_vector.cc index aba6a0f7bfca27..857bd546befcdf 100644 --- a/paddle/phi/core/mixed_vector.cc +++ b/paddle/phi/core/mixed_vector.cc @@ -33,7 +33,7 @@ template void CopyToCPUHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // COPY GPU Data To CPU auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get((*gpu_)->place())); @@ -55,7 +55,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, 
phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_, const phi::Place &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void *src = cpu_->data(); *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 700db5e8d4382e..35c59c2d8d787d 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -116,11 +116,9 @@ void StringTensor::init_holder() { if (place.GetType() == phi::AllocationType::CPU) { std::memset(ptr, 0, bytes_size); } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); -#elif defined(PADDLE_WITH_MUSA) - musaMemset(ptr, 0, bytes_size); #else cudaMemset(ptr, 0, bytes_size); #endif diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 03d8b3a0f661ee..17fdef1b9cfbdd 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -65,7 +65,7 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_DNNL dst->set_layout(src.layout()); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (dst_place.GetType() == AllocationType::GPU || dst_place.GetType() == AllocationType::GPUPINNED) { dst_ptr = dev_ctx.Alloc( @@ -106,7 +106,7 @@ void Copy(const Context& dev_ctx, if (src_place.GetType() == AllocationType::CPU && dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if ((src_place.GetType() == AllocationType::CPU || src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT (dst_place.GetType() == AllocationType::CPU || @@ -394,7 +394,7 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, TensorArray* dst); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -476,7 +476,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -530,7 +530,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -622,7 +622,7 @@ void TensorFromArray(const T* src, if (dst_place.GetType() == AllocationType::CPU) { 
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -722,7 +722,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -764,7 +764,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index ea1caf4ac067d8..449d7cbe8966df 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -211,35 +211,34 @@ inline int TransToProtoVarType(const DataType& dtype) { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -inline mcclDataType_t ToNCCLDataType(DataType type) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +inline ncclDataType_t ToNCCLDataType(DataType type) { if (type == DataType::FLOAT32) { - return mcclFloat; + return ncclFloat; } else if (type == DataType::FLOAT64) { - return mcclDouble; + return ncclDouble; } else if (type == DataType::INT32) { - return mcclInt; + return ncclInt; } else if (type == DataType::INT64) { - return mcclInt64; + return ncclInt64; } else if (type == DataType::FLOAT16) { - return mcclFloat16; + return ncclFloat16; } else if (type == DataType::UINT8) { - return mcclUint8; + return ncclUint8; } else if (type == DataType::INT8) { - return mcclInt8; + return ncclInt8; } else if (type == DataType::BOOL) { - return mcclUint8; - // } else if (type == DataType::BFLOAT16) { - // return ncclBfloat16; + return ncclUint8; +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + } else if (type == DataType::BFLOAT16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW( errors::Unimplemented("This datatype in nccl is not supported.")); } } #endif - - - #if defined(PADDLE_WITH_XPU_BKCL) inline BKCLDataType ToBKCLDataType(DataType type) { if (type == DataType::FLOAT32) { diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 63c9cf63f9a320..b419338401eeac 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -54,12 +54,12 @@ template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU_KP) template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template class TypeInfoTraits; #endif diff --git 
a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 34a8fca61fbbee..6318b17647cd61 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -25,7 +25,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, const Visitor& visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::GPUPlace p(place.GetDeviceId()); return visitor(p); #else @@ -35,7 +35,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::GPUPinnedPlace p; return visitor(p); #else diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index 6c61c3964b52d6..7ee12e26d7d0ef 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -150,7 +150,7 @@ namespace phi { ///////// BOOL and Floating and Integral Dispatch Marco /////////// -#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL) +#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL) #define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ @@ -355,7 +355,7 @@ namespace phi { "`"); \ } \ }() -#if defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP) #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ac3eb1f3cc12fc..eee92aa1380449 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -932,7 +932,7 @@ void CoalesceTensorInferMeta(const std::vector& input, size_of_dtype = static_cast(phi::SizeOf(dtype)); } if (config.is_runtime) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int64_t numel = 0; for (auto item : input) { const auto& dim = item->dims(); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 2df3f34b57936c..f38a842a669873 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -22,9 +22,6 @@ add_subdirectory(autotune) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") -if(WITH_MUSA) - list(REMOVE_ITEM kernel_cu "sparse/*.h") -endif() file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") file(GLOB kernel_primitive_h "primitive/*.h") @@ -43,43 +40,6 @@ file( "strings/gpu/*.cu" "fusion/gpu/*.cu") -if(WITH_MUSA) - # 创建要排除的文件模式列表 - file( - GLOB files_to_remove - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "sparse/gpu/*.cu" - "gpudnn/*.cu") - - list(REMOVE_ITEM kernel_cu ${files_to_remove}) - message(STATUS "files_to_remove:${files_to_remove}") - - list( - REMOVE_ITEM - kernel_cu - "strings/gpu/strings_lower_upper_kernel.cu" - "strings/gpu/strings_copy_kernel.cu" - "fusion/gpu/block_multi_head_attention_kernel.cu" - "gpu/cudnn_lstm_kernel.cu" - "gpu/cudnn_lstm_grad_kernel.cu" - "gpu/instance_norm_kernel.cu" - "gpu/instance_norm_grad_kernel.cu" - "gpu/log_softmax_kernel.cu" - "gpu/log_softmax_grad_kernel.cu" - 
"gpu/weighted_sample_neighbors_kernel.cu" - "gpu/cross_entropy_kernel.cu" - "gpu/cross_entropy_grad_kernel.cu" - "gpu/gelu_kernel.cu" - "gpu/gelu_grad_kernel.cu" - "gpu/rnn_kernel.cu.cc" - "gpu/rnn_grad_kernel.cu.cc" - "gpu/clip_by_norm_kernel.cu" - "selected_rows/gpu/clip_by_norm_kernel.cu" - "gpu/softmax_grad_kernel.cu" - "gpu/softmax_kernel.cu" - ) -endif() - if(APPLE OR WIN32) list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") endif() @@ -217,6 +177,32 @@ if(NOT WITH_CUDNN_FRONTEND) "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu") endif() +# Note(qili93): remove kernels not supported on DCU yet +if(WITH_ROCM) + list( + REMOVE_ITEM + kernel_cu + "gpu/affine_grid_grad_kernel.cu" + "gpu/apply_per_channel_scale_kernel.cu" + "gpu/cholesky_solve_kernel.cu" + "gpu/eigh_kernel.cu" + "gpu/eigvalsh_kernel.cu" + "gpu/lstsq_kernel.cu" + "gpu/lu_kernel.cu" + "gpu/matrix_rank_kernel.cu" + "gpu/matrix_rank_tol_kernel.cu" + "gpu/multiclass_nms3_kernel.cu" + "gpu/put_along_axis_grad_kernel.cu" + "gpu/put_along_axis_kernel.cu" + "gpu/qr_kernel.cu" + "gpu/svd_kernel.cu" + "gpudnn/mha_cudnn_frontend.cu" + "fusion/gpu/block_multi_head_attention_kernel.cu" + "fusion/gpu/fused_bn_add_activation_grad_kernel.cu" + "fusion/gpu/fused_bn_add_activation_kernel.cu" + "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu") +endif() + set(cc_search_pattern "*.cc" "cpu/*.cc" @@ -233,16 +219,16 @@ set(cc_search_pattern "fusion/*.cc" "stride/*.cc" "fusion/cpu/*.cc") -if(WITH_MUSA) - list(REMOVE_ITEM cc_search_pattern "sparse/*.cc") - list(REMOVE_ITEM cc_search_pattern "sparse/cpu/*.cc") -endif() if(WITH_MKLDNN) set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc" "fusion/onednn/*.cc") endif() +if(WITH_CUSTOM_DEVICE) + set(cc_search_pattern ${cc_search_pattern} "custom/*.cc") +endif() + file( GLOB kernel_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" @@ -266,7 +252,7 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) collect_srcs(kernels_srcs SRCS ${kernel_cu}) kernel_declare("${kernel_cu}") endif() diff --git a/paddle/phi/kernels/array_kernel.cc b/paddle/phi/kernels/array_kernel.cc index 5389a26479213a..8a599dcf9d80d8 100644 --- a/paddle/phi/kernels/array_kernel.cc +++ b/paddle/phi/kernels/array_kernel.cc @@ -134,7 +134,7 @@ PD_REGISTER_KERNEL(create_array, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(create_array, GPU, ALL_LAYOUT, @@ -178,7 +178,7 @@ PD_REGISTER_KERNEL(array_read, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(array_read, GPU, ALL_LAYOUT, @@ -208,7 +208,7 @@ PD_REGISTER_KERNEL(array_write, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(array_write, GPU, ALL_LAYOUT, @@ -238,7 +238,7 @@ PD_REGISTER_KERNEL(array_to_tensor, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(array_to_tensor, GPU, ALL_LAYOUT, diff --git 
a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index eb884d53f3cd63..b4504f83818d77 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -139,7 +139,7 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 01ba364ad3d3d5..b04c46351c2cfd 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -30,15 +30,11 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif - namespace phi { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -static void RecordEventTimerCallback(musaStream_t stream, - musaError_t status, +#ifdef PADDLE_WITH_HIP +static void RecordEventTimerCallback(hipStream_t stream, + hipError_t status, void *user_data) { struct timeval time_now {}; gettimeofday(&time_now, nullptr); @@ -64,9 +60,6 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventCreate(&start_); hipEventCreate(&stop_); -#elif defined(PADDLE_WITH_MUSA) - musaEventCreate(&start_); - musaEventCreate(&stop_); #else cudaEventCreate(&start_); cudaEventCreate(&stop_); @@ -81,9 +74,6 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); hipEventDestroy(stop_); -#elif defined(PADDLE_WITH_MUSA) - musaEventDestroy(start_); - musaEventDestroy(stop_); #else cudaEventDestroy(start_); cudaEventDestroy(stop_); @@ -93,8 +83,6 @@ class GpuTimer { void Start(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(start_, stream); -#elif defined(PADDLE_WITH_MUSA) - musaEventRecord(start_, stream); #else cudaEventRecord(start_, stream); #endif @@ -103,8 +91,6 @@ class GpuTimer { void Stop(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(stop_, stream); -#elif defined(PADDLE_WITH_MUSA) - musaEventRecord(stop_, stream); #else cudaEventRecord(stop_, stream); #endif @@ -115,9 +101,6 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventSynchronize(stop_); hipEventElapsedTime(&milliseconds, start_, stop_); -#elif defined(PADDLE_WITH_MUSA) - musaEventSynchronize(stop_); - musaEventElapsedTime(&milliseconds, start_, stop_); #else cudaEventSynchronize(stop_); cudaEventElapsedTime(&milliseconds, start_, stop_); @@ -161,12 +144,6 @@ class CalculateStreamTimer { RecordEventTimerCallback, reinterpret_cast(&start_time_), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(calculated_stream_, - RecordEventTimerCallback, - reinterpret_cast(&start_time_), - 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(calculated_stream_, @@ -186,12 +163,6 @@ class CalculateStreamTimer { RecordEventTimerCallback, reinterpret_cast(&end_time_), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(calculated_stream_, - RecordEventTimerCallback, - reinterpret_cast(&end_time_), - 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(calculated_stream_, @@ -207,8 +178,6 @@ class CalculateStreamTimer { if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(calculated_stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif @@ 
-220,8 +189,6 @@ class CalculateStreamTimer { if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(calculated_stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index dba08b0de366af..bf04c99dab0a3c 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } #endif #endif -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 9f4b51281cd37f..6e496a355302fc 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 2d0ab05a8de78b..a60369af449f4e 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -309,20 +309,6 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#ifdef PADDLE_WITH_MUSA -PD_REGISTER_KERNEL(coalesce_tensor, - GPU, - ALL_LAYOUT, - phi::CoalesceTensorKernel, - phi::dtype::float16, - int, - float, - double) { - kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); -} -#endif - #ifdef PADDLE_WITH_XPU PD_REGISTER_KERNEL(coalesce_tensor, XPU, diff --git a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc index acd84a80be2ad1..47e804b7de2775 100644 --- a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc @@ -38,10 +38,10 @@ void CummaxGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } @@ -61,10 +61,10 @@ void CumminGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc b/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc index 0b11e3d6f98da9..aceced1ce85313 100644 --- a/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc +++ b/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc @@ -29,4 +29,4 @@ void DecodeJpegKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - decode_jpeg, CPU, ALL_LAYOUT, phi::DecodeJpegKernel, uint8_t) {} \ 
No newline at end of file + decode_jpeg, CPU, ALL_LAYOUT, phi::DecodeJpegKernel, uint8_t) {} diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 81ed7170d7a24f..65ee3c1851003e 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -64,7 +64,7 @@ struct GeluGradFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) auto x_data = x.data(); auto dx_data = dx.data(); auto dout_data = dout.data(); diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index 47ab1a78390662..dbab3bd3266649 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -53,7 +53,7 @@ struct GeluFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) auto x_data = x.data(); auto out_data = out.data(); int n = std::min(x.size(), out.size()); diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc index dd7b762849d16b..aeb2071b136de8 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -25,11 +25,14 @@ namespace phi { template void PutAlongAxisGradKernel(const Context& dev_ctx, - const DenseTensor& x UNUSED, + const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, - const std::string& reduce UNUSED, + const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ( @@ -40,31 +43,135 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_input_grad_kernel( - // Here passing an unused argument out_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. - out_grad, - axis, - index, - *x_grad, - dev_ctx); - } else { - phi::funcs::cpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); + if (include_self == false || reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. 
+ out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } } } if (value_grad) { value_grad->Resize(index.dims()); dev_ctx.template Alloc(value_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); - } else { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); + auto* grad_data = value_grad->data(); + int64_t grad_size = value_grad->numel(); + memset(grad_data, 0, sizeof(T) * grad_size); + if (reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } + } else if (reduce == "add" || reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mul" || reduce == "multiply" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } } } } diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc index 5417f9463a62f8..4411755d61cbaf 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -30,6 +30,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out) { PADDLE_ENFORCE_EQ( dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, @@ -41,31 +42,56 @@ void PutAlongAxisKernel(const Context& dev_ctx, if (reduce == "add") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, 
include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "assign") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amin") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); } } else { PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "support reduce op: 'add', 'assign', 'mul', 'mean', 'amin', 'amax' and " + "'multiply', the " "default reduce " "op is 'assign' ", reduce)); diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc index b7b33d4290daec..66f3ef0cd790d1 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc @@ -104,7 +104,8 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_grad, CPU, @@ -113,4 +114,5 @@ PD_REGISTER_KERNEL(repeat_interleave_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc index 388e243eff42a0..8b00d7e38f304c 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, CPU, @@ -34,4 +35,5 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc index 
ed35513d985505..237a892dbb356c 100644 --- a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -35,3 +35,20 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + CPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + double, + int, + int64_t, + bool, + int16_t, + uint8_t, + int8_t, + phi::dtype::bfloat16, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc index 8a7238203ec647..4e5fc0c305100c 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc @@ -50,10 +50,11 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, axis, index, out_grad, + true, dev_ctx); // the gradient of gather is scatter } else if (index_type == phi::DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, index, out_grad, dev_ctx); + *x_grad, axis, index, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc index d1b4a24b54eba5..d006f688ae2434 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -38,9 +38,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (index_type == DataType::INT32) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else if (index_type == DataType::INT64) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } } diff --git a/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc new file mode 100644 index 00000000000000..ff61688513b139 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_grad_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingGradKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + const DenseTensor& out_grad, + int64_t start_index, + DenseTensor* w_grad) { + w_grad->Resize(w.dims()); + dev_ctx.template Alloc(w_grad, w.dtype()); + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->set_meta(w.meta()); + dev_ctx.Alloc(w_tmp.get(), w_tmp->dtype()); + auto out_grad_tmp = std::make_shared(); + out_grad_tmp->ShareDataWith(out_grad).Resize({K, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp), + out_grad_tensor(out_grad_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto real_ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_grad_tensor_mul_mask = + paddle::experimental::reshape(out_grad_tensor, {K, D}) + .multiply(paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w.dtype()), + {K, 1})); + paddle::Tensor w_grad_tensor; + paddle::experimental::embedding_grad(real_ids_tensor, + w_tensor, + out_grad_tensor_mul_mask, + -1, + false, + &w_grad_tensor); + w_grad->ShareDataWith( + *reinterpret_cast(w_grad_tensor.impl().get())); + + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding_grad ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding_grad, + Custom, + ALL_LAYOUT, + phi::CEmbeddingGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/custom/c_embedding_kernel.cc b/paddle/phi/kernels/custom/c_embedding_kernel.cc new file mode 100644 index 00000000000000..0cacf61d46f3a8 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + int64_t start_index, + int64_t vocab_size, + DenseTensor* out) { + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto out_dims = out->dims(); + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->ShareDataWith(w).Resize({N, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_tensor = + paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w_tensor.dtype()), + {K, 1}) + .multiply(paddle::experimental::reshape( + paddle::experimental::embedding( + ids_tensor, w_tensor, -1, false), + {K, D})); + out->ShareDataWith( + *reinterpret_cast(out_tensor.impl().get())) + .Resize(out_dims); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding, + Custom, + ALL_LAYOUT, + phi::CEmbeddingKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 170f9a3a4d6082..088a4fe4ffd266 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,7 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 60fc5236abc940..d2391a5702d4b1 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 2b7c400bc64641..ebe1b1d24e50a5 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(flatten_grad, int, 
int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 6b22ac75181791..dc61e6a650efa1 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,7 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 1886f5af4c1cb7..cd603dd57e64d1 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index f2d43a19a246d6..d124e269e5c007 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,16 +8,16 @@ file( GLOB func_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) file( GLOB func_cu_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") endif() -if(WITH_MUSA) - list(REMOVE_ITEM func_cu_srcs - "softmax.cu") +# Note(qili93): remove kernels not supported on DCU yet +if(WITH_ROCM) + list(REMOVE_ITEM func_cu_srcs "weight_only_gemv.cu") endif() collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index dcad9755ee4e05..06b59644cf11d4 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -3013,7 +3013,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaLogitFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/algorithm.h b/paddle/phi/kernels/funcs/algorithm.h index cab4d32a998268..5f66f6f1abd4d2 100644 --- a/paddle/phi/kernels/funcs/algorithm.h +++ b/paddle/phi/kernels/funcs/algorithm.h @@ -40,7 +40,7 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { template HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group LowerBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group LowerBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/lower_bound auto *first = x; @@ -63,7 +63,7 @@ HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { template HOSTDEVICE inline size_t UpperBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group UpperBound 
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group UpperBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto *first = x; diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 69e13d29874d51..140eca890480f9 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -175,7 +175,7 @@ class Blas { T* c, const int* ldc) const; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) template void MatMulWithHead(const phi::DenseTensor& mat_a, const MatDescriptor& dim_a, @@ -303,7 +303,7 @@ class Blas { int batchCount) const; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, @@ -360,7 +360,7 @@ class Blas { T* B, int ldb) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; @@ -445,7 +445,7 @@ class BlasT : private Blas { Base()->template CSRMM(args...); } -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) template void MatMulWithHead(ARGS... args) const { Base()->template MatMulWithHead(args...); @@ -543,7 +543,7 @@ class BlasT : private Blas { Base()->template TRSM(args...); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void BatchedGETRF(ARGS... args) const { Base()->template BatchedGETRF(args...); @@ -593,7 +593,3 @@ inline BlasT GetBlas(const DeviceContext& dev_ctx) { #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" #endif - -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/kernels/funcs/blas/blas_impl.mu.h" -#endif diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index a4233d9a4147ac..ffafe15b8fcf2d 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1451,7 +1451,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_MUSA) && \ +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template @@ -1698,7 +1698,7 @@ void Blas::MatMul(const T *mat_a, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: MatMulWithHead /* * Multiple two matrixes with multiple heads diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.mu.h b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h deleted file mode 100644 index c6391acab6d894..00000000000000 --- a/paddle/phi/kernels/funcs/blas/blas_impl.mu.h +++ /dev/null @@ -1,1602 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#if defined(__MUSACC__) -#include -#endif -#include "glog/logging.h" -#include "paddle/utils/flags.h" - -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/flags.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -PHI_DECLARE_bool(enable_cublas_tensor_op_math); -PHI_DECLARE_bool(gemm_use_half_precision_compute_type); - -namespace phi { -namespace funcs { - -template -struct CUBlas; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasScopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemmBatched(args...)); - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mublasSgemmStridedBatched(args...)); - } - - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const float *beta, - void *C, - musaDataType_t Ctype, - int ldc) { -// Because the gcc 4.8 doesn't expand template parameter pack that -// appears in a lambda-expression, I can not use template parameter pack -// here. - // VLOG(5) << "use_tensor_op_math: " - // << (dev_ctx->tensor_core_available() ? "True" : "False"); - // dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemmEx(handle, - // transa, - // transb, - // m, - // n, - // k, - // alpha, - // A, - // Atype, - // lda, - // B, - // Btype, - // ldb, - // beta, - // C, - // Ctype, - // ldc)); - // }); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgemmEx.")); - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasStrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetrfBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgetrfBatched.")); - } - - template - static void GETRI_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetriBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgetriBatched.")); - } - - template - static void MATINV_BATCH(ARGS... 
args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSmatinvBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSmatinvBatched.")); - } - - template - static void GETRS_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetrsBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgetrsBatched.")); - } - - template - static void TRSM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasStrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDcopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemmBatched(args...)); - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mublasDgemmStridedBatched(args...)); - } - - template - static void GEMM_EX(ARGS... args UNUSED) { - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgemmEx.")); - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDtrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetrfBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgetrfBatched.")); - } - - template - static void GETRI_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetriBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgetriBatched.")); - } - - template - static void MATINV_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDmatinvBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDmatinvBatched.")); - } - - template - static void GETRS_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetrsBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgetrsBatched.")); - } - - template - static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDtrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - using float16 = phi::dtype::float16; - - static void GEMM(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - const float16 *B, - int ldb, - const float16 *beta, - float16 *C, - int ldc) { - // PADDLE_ENFORCE_GPU_SUCCESS( - // phi::dynload::mublasHgemm(handle, - // transa, - // transb, - // m, - // n, - // k, - // reinterpret_cast(alpha), - // reinterpret_cast(A), - // lda, - // reinterpret_cast(B), - // ldb, - // reinterpret_cast(beta), - // reinterpret_cast<__half *>(C), - // ldc)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasHgemm.")); - } - - static void GEMM_BATCH(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const float16 **A, - musaDataType_t Atype, - int lda, - const float16 **B, - musaDataType_t Btype, - int ldb, - const float *beta, - float16 **C, - musaDataType_t Ctype, - int ldc, - int batchCount, - musaDataType_t computeType) { - PADDLE_THROW(phi::errors::Unimplemented( - "mublasGemmBatchedEx is not supported")); - } - - static void GEMM_STRIDED_BATCH(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - long long int strideA, // NOLINT - const float16 *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const float16 *beta, - float16 *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { - PADDLE_THROW(phi::errors::Unimplemented( - "mublasHgemmStridedBatched is not supported")); - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasHgemmStridedBatched( - // handle, - // transa, - // transb, - // m, - // n, - // k, - // reinterpret_cast(alpha), - // reinterpret_cast(A), - // lda, - // strideA, - // reinterpret_cast(B), - // ldb, - // strideB, - // reinterpret_cast(beta), - // reinterpret_cast<__half *>(C), - // ldc, - // strideC, - // batchCount)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const void *beta, - void *C, - musaDataType_t Ctype, - int ldc, - musaDataType_t computeType) { - mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? 
"True" : "False"); - - dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); - } -}; - -template <> -struct CUBlas> { - static void GEMV(mublasHandle_t handle, - mublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(mublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); - } - - static void GEMM(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void TRSM(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
- // https://docs.nvidia.com/muda/mublas/index.html#mublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const void *beta, - void *C, - musaDataType_t Ctype, - int ldc, - musaDataType_t computeType) { - mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - - dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); - } - - static void TRSM_BATCH(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } -}; - -template <> -struct CUBlas> { - static void GEMV(mublasHandle_t handle, - mublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(mublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH( - mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); - } - - static void GEMM(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void TRSM(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - static void TRSM_BATCH(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const void *beta, - void *C, - musaDataType_t Ctype, - int ldc, - musaDataType_t computeType) { - mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - - dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); - - } -}; - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }); -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // // Note that cublas follows fortran order, so the order is different from - // // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? 
MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - MUSA_R_16F, - ldb, - A, - MUSA_R_16F, - lda, - &h_beta, - C, - MUSA_R_16F, - N, - (musaDataType_t)0);//MUSA_R_32F https://jira.mthreads.com/browse/SW-37038 -} - - - - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::GEMM for dtype complex is not supported on MUSA now!")); -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = - thrust::complex(beta.real, beta.imag); - auto &cuda_ctx = const_cast(context_); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - B, - // Originally, this was MUSA_C_64F, but due to some bugs, it was necessary to manually specify a value - // jira:https://jira.mthreads.com/browse/SW-37038 - (musaDataType_t)5,//MUSA_C_64F - ldb, - A, - (musaDataType_t)5,//MUSA_C_64F - lda, - &c_beta, - C, - (musaDataType_t)5,//MUSA_C_64F - N, - (musaDataType_t)5);//MUSA_C_64F -} - - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - mublasOperation_t cuTransA = transA ? MUBLAS_OP_T : MUBLAS_OP_N; - mublasOperation_t cuTransB = transB ? 
MUBLAS_OP_T : MUBLAS_OP_N; - - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - (musaDataType_t)0,//MUSA_R_32F, - ldb, - A, - (musaDataType_t)0,//MUSA_R_32F, - lda, - &beta, - C, - (musaDataType_t)0,//MUSA_R_32F, - ldc); - } else { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); - } -} - -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - int lda, - const phi::dtype::float16 *B, - int ldb, - phi::dtype::float16 beta, - phi::dtype::float16 *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - mublasOperation_t cuTransA = transA ? MUBLAS_OP_T : MUBLAS_OP_N; - mublasOperation_t cuTransB = transB ? MUBLAS_OP_T : MUBLAS_OP_N; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); -} -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - int lda, - const phi::dtype::bfloat16 *B, - int ldb, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int ldc) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::GEMM for dtype bfloat16 is not supported on MUSA now!")); -} - -template <> -template -void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); -} - -template <> -template -void Blas::SCAL(int n, const T alpha, T *x) const { - context_.CublasCall( - [&](mublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - context_.CublasCall( - [&](mublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - mublasOperation_t cuTransA = !trans_a ? MUBLAS_OP_T : MUBLAS_OP_N; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve - // it. 
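The float16 GEMV specialization above, and the bfloat16 one whose body follows, both lower a matrix-vector product to GEMM with a unit output dimension, since the half-precision BLAS entry points provide no GEMV. A minimal, self-contained sketch of the identity they rely on (plain C++, illustrative only; NaiveGemm is an invented helper, not Paddle code):

// Illustrative only -- standalone C++, not part of Paddle. It checks the
// identity used by the fp16/bf16 GEMV fallbacks: a matrix-vector product
// is a GEMM in which one output dimension equals 1.
#include <cassert>
#include <cmath>
#include <vector>

// Naive row-major GEMM: C(MxN) = alpha * A(MxK) * B(KxN) + beta * C.
static void NaiveGemm(int M, int N, int K, float alpha, const float* A,
                      const float* B, float beta, float* C) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[i * K + k] * B[k * N + j];
      C[i * N + j] = alpha * acc + beta * C[i * N + j];
    }
}

int main() {
  const int M = 3, N = 4;
  std::vector<float> A(M * N), x(N), y_gemv(M, 0.f), y_gemm(M, 0.f);
  for (int i = 0; i < M * N; ++i) A[i] = 0.5f * i;
  for (int j = 0; j < N; ++j) x[j] = 1.f + j;

  // Reference GEMV: y = A * x.
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) y_gemv[i] += A[i * N + j] * x[j];

  // Same result as a GEMM whose second output dimension is 1:
  // C(Mx1) = A(MxN) * x(Nx1), matching the !trans_a branch.
  NaiveGemm(M, 1, N, 1.f, A.data(), x.data(), 0.f, y_gemm.data());

  for (int i = 0; i < M; ++i) assert(std::fabs(y_gemv[i] - y_gemm[i]) < 1e-5f);
  return 0;
}

The trans_a branch that follows is the same idea with the roles swapped: x is treated as a 1 x M row vector multiplied against A, giving a 1 x N result.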
- if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - const int64_t strideC = M * N; - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - // int lda = (transA == CblasNoTrans) ? K : M; - // int ldb = (transB == CblasNoTrans) ? N : K; - // int ldc = N; - // mublasOperation_t cuTransA = - // (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - // mublasOperation_t cuTransB = - // (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - // const int64_t strideC = M * N; - - // float h_alpha = static_cast(alpha); - // float h_beta = static_cast(beta); - - // mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - // bool use_tensor_op_math = context_.tensor_core_available(); - // if (use_tensor_op_math) { - // algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - // } - // VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - - // context_.TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - // PADDLE_ENFORCE_GPU_SUCCESS( - // phi::dynload::mublasGemmStridedBatchedEx(handle, - // cuTransB, - // cuTransA, - // N, - // M, - // K, - // &h_alpha, - // B, - // MUSA_R_16BF, - // ldb, - // strideB, - // A, - // MUSA_R_16BF, - // lda, - // strideA, - // &h_beta, - // C, - // MUSA_R_16BF, - // ldc, - // strideC, - // batchCount, - // MUBLAS_COMPUTE_32F, - // algo)); - // }); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasGemmStridedBatchedEx.")); -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - -#if defined(__MUSACC__) -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - double alpha, - const double **A, - const double **B, - double beta, - double **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. 
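The "fortran order" note above is the reason every wrapper in this file swaps its arguments: the transposes are passed as cuTransB, cuTransA, the dimensions as N, M, K, and B is handed over before A. For row-major data, C = A * B is obtained from a column-major GEMM by computing C^T = B^T * A^T, and C^T stored column-major occupies exactly the same bytes as C stored row-major. A standalone check of that identity (plain C++, illustrative only; ColMajorGemm is an invented stand-in for a cuBLAS/muBLAS-style column-major GEMM):

// Illustrative only -- standalone C++, not part of Paddle. It shows why the
// wrappers call a column-major (Fortran-order) GEMM with swapped operands.
#include <cassert>
#include <cmath>
#include <vector>

// Column-major GEMM without transposes: C(m x n) = A(m x k) * B(k x n),
// with leading dimensions lda, ldb, ldc, as a BLAS call would take.
static void ColMajorGemm(int m, int n, int k, const float* A, int lda,
                         const float* B, int ldb, float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  const int M = 2, N = 3, K = 4;
  std::vector<float> A(M * K), B(K * N), C(M * N, 0.f), C_ref(M * N, 0.f);
  for (int i = 0; i < M * K; ++i) A[i] = 0.25f * (i + 1);  // row-major A
  for (int i = 0; i < K * N; ++i) B[i] = 0.50f * (i + 2);  // row-major B

  // Row-major reference: C_ref = A * B.
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      for (int p = 0; p < K; ++p)
        C_ref[i * N + j] += A[i * K + p] * B[p * N + j];

  // The swapped call the wrappers use: row-major B reinterpreted as
  // column-major B^T (N x K, ldb = N), row-major A as column-major A^T
  // (K x M, lda = K); the N x M column-major result is row-major C (M x N).
  ColMajorGemm(N, M, K, B.data(), N, A.data(), K, C.data(), N);

  for (int i = 0; i < M * N; ++i) assert(std::fabs(C[i] - C_ref[i]) < 1e-5f);
  return 0;
}

The batched call that follows applies the same per-matrix mapping to each pointer in the A/B/C arrays.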
- int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - float alpha, - const float **A, - const float **B, - float beta, - float **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 **A, - const phi::dtype::float16 **B, - phi::dtype::float16 beta, - phi::dtype::float16 **C, - int batchCount) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::BatchedGEMM for dtype float16 is not supported on MUSA now!")); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 **A, - const phi::dtype::bfloat16 **B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 **C, - int batchCount) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::BatchedGEMM for bfloat16 is not supported on MUSA now!")); -} -#endif -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - mublasSideMode_t cuSide = - (side == CblasLeft) ? MUBLAS_SIDE_RIGHT : MUBLAS_SIDE_LEFT; - mublasFillMode_t cuUplo = - (uplo == CblasLower) ? MUBLAS_FILL_MODE_UPPER : MUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasDiagType_t cuDiag = - (diag == CblasUnit) ? 
MUBLAS_DIAG_UNIT : MUBLAS_DIAG_NON_UNIT; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::TRSM( - handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); - }); -} - -template <> -template -void Blas::BatchedGETRF( - int n, T **a, int *ipiv, int *info, int batch_size) const { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRI(int n, - const T **a, - const int *ipiv, - T **a_inv, - int *info, - int batch_size) const { - PADDLE_ENFORCE_NE( - a_inv, - a, - phi::errors::InvalidArgument( - "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " - "in-place. The memory space of output matrix (address: %p) cannot " - "overlap memory space of input matrix (address: %p).", - a_inv, - a)); - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedMatInv( - int n, const T **a, T **a_inv, int *info, int batch_size) const { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T **a, - int lda, - int *ipiv, - T **b, - int ldb, - int *info, - int batch_size) const { - // use CUBLAS_OP_C (conjugate transpose) for complex - mublasOperation_t cuTrans = - (trans == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedTRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T **A, - int lda, - T **B, - int ldb, - int batch_size) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - mublasSideMode_t cuSide = - (side == CblasLeft) ? MUBLAS_SIDE_RIGHT : MUBLAS_SIDE_LEFT; - mublasFillMode_t cuUplo = - (uplo == CblasLower) ? MUBLAS_FILL_MODE_UPPER : MUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasDiagType_t cuDiag = - (diag == CblasUnit) ? MUBLAS_DIAG_UNIT : MUBLAS_DIAG_NON_UNIT; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }); -} - -} // namespace funcs -} // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index c25ab4b55cb53d..822801e10c357c 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,7 +27,7 @@ namespace kps = phi::kps; namespace phi { namespace funcs { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) enum BroadcastType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; diff --git a/paddle/phi/kernels/funcs/check_numerics_utils.h b/paddle/phi/kernels/funcs/check_numerics_utils.h index 6d426d764e2214..76adc40c4f9f95 100644 --- a/paddle/phi/kernels/funcs/check_numerics_utils.h +++ b/paddle/phi/kernels/funcs/check_numerics_utils.h @@ -86,7 +86,7 @@ HOSTDEVICE static void PrintAndThrowError(const char* debug_info, int64_t num_nan, int64_t num_inf, int64_t num_zero) { -#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) && !defined(__MUSACC__) +#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) PADDLE_THROW(phi::errors::PreconditionNotMet( "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in " "%s.", diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 877bd056ac5426..f2b7de681bcfce 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -49,7 +49,7 @@ static inline void GetBlockDims(const phi::GPUContext& context, *grid_dims = dim3(grid_cols, grid_rows, 1); } -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else diff --git a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h index 2d210f32009370..e6d587a61e11a7 100644 --- a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h @@ -29,7 +29,7 @@ template using EigenVector = phi::EigenVector; -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group for GRU CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group for GRU CPU template void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, T *gate_value, diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index d0f714831549bc..b491cbe120d06f 100644 --- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -144,7 +144,7 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, } for (int i = 0; i < Tiled_size; ++i) { -#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700) || defined(__MUSACC__) +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i]; #else c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i]; @@ -206,7 +206,7 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight, } for (int i = 0; i < Tiled_size; ++i) { -#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700) || defined(__MUSACC__) +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i]; #else c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i]; diff --git a/paddle/phi/kernels/funcs/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h index f5a16ade4fd23d..9e2aef19406191 
100644 --- a/paddle/phi/kernels/funcs/detail/gru_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h @@ -44,7 +44,7 @@ class gru_resetOutput { (*value_reset_output + *value_reset_bias) * (*value_reset_gate); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU reset output +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU reset output #ifndef __AVX__ static const bool avx = false; #else @@ -90,7 +90,7 @@ class gru_finalOutput { ((*value_update_gate) * (*value_frame_state)); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___)// @{ Group GRU final output +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU final output #ifndef __AVX__ static const bool avx = false; #else @@ -150,7 +150,7 @@ class gru_stateGrad { *grad_output * (*value_update_gate), *value_frame_state, act_input); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU state grad +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU state grad #ifndef __AVX__ static const bool avx = false; #else @@ -211,7 +211,7 @@ class gru_resetGrad { *grad_reset_gate = activation(*grad_reset_gate, *value_reset_gate, act_gate); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU reset grad +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU reset grad #ifndef __AVX__ static const bool avx = false; #else @@ -265,7 +265,7 @@ class gru { reset_output * (*grad_frame_state), *value_reset_gate, act_gate); *grad_reset_output = (*value_reset_gate) * (*grad_frame_state); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU CPU #ifndef __AVX__ static const bool avx = false; #else diff --git a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h index b0702d560fa518..e8b8e957c80d1c 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h @@ -36,7 +36,7 @@ template using EigenVector = phi::EigenVector; -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM CPU template void naive_lstm_forward_one_sequence(Op op, diff --git a/paddle/phi/kernels/funcs/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h index 264322521d477f..0846f05a0c2c53 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h @@ -59,7 +59,7 @@ class lstm { *state_atv = activation(*state, active_state); *output = (*value_og) * (*state_atv); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM FWD +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM FWD #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default static const bool avx = false; #else @@ -163,7 +163,7 @@ class lstm { *checkFGrad = (*grad_fg) * (*prev_state); *checkOGrad = (*grad_og) * (*state); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM BWD +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM BWD #ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default static const bool avx = false; #else diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 555b1d3fb250e0..03e3bdde05ad09 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/gpu_context.h" #endif @@ -41,7 +41,7 @@ struct StridedMemcpyFunctor { auto& cpu_place = place; memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy( @@ -68,7 +68,7 @@ struct StridedMemcpyFunctor { memory_utils::Copy( cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index 6f4e5fceec4739..5504a337e88f2e 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include @@ -109,7 +109,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) thrust::device_vector diag_vec(common::vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); @@ -146,7 +146,7 @@ std::vector ComputeDimStride(const std::vector dim) { return dim_strides; } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template __global__ void DiagonalCuda(const T* data1, T* data2, diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index f9c6a0934dc6a2..abade7ac0ef877 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -21,10 +21,6 @@ limitations under the License. */ #include #endif -#ifdef __MUSACC__ -#include -#endif - #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/amp_type_traits.h" @@ -32,7 +28,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -53,7 +49,7 @@ struct exponential_transform { explicit exponential_transform(T lambda) : lambda_(lambda) {} HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) T log = -std::numeric_limits::epsilon() / 2; if (val < static_cast(1.) - std::numeric_limits::epsilon() / 2) { if (std::is_same::value) { @@ -117,7 +113,7 @@ struct normal_transform { T std_; }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) namespace kps = phi::kps; @@ -126,19 +122,19 @@ namespace kps = phi::kps; template struct normal_distribution; -#if defined(__MUSACC__) +#if defined(__NVCC__) template struct uniform_distribution { - __device__ inline T operator()(murandStatePhilox4_32_10_t *state) const { - return static_cast(murand_uniform(state)); + __device__ inline T operator()(curandStatePhilox4_32_10_t *state) const { + return static_cast(curand_uniform(state)); } static constexpr int kReturnsCount = 1; }; template <> struct uniform_distribution { - __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { - return murand_uniform4(state); + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_uniform4(state); } static constexpr int kReturnsCount = 4; }; @@ -146,16 +142,16 @@ struct uniform_distribution { template <> struct uniform_distribution { __device__ inline double2 operator()( - murandStatePhilox4_32_10_t *state) const { - return murand_uniform2_double(state); + curandStatePhilox4_32_10_t *state) const { + return curand_uniform2_double(state); } static constexpr int kReturnsCount = 2; }; template <> struct uniform_distribution { - __device__ inline uint4 operator()(murandStatePhilox4_32_10_t *state) const { - return murand4(state); + __device__ inline uint4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand4(state); } static constexpr int kReturnsCount = 4; }; @@ -163,9 +159,9 @@ struct uniform_distribution { template <> struct uniform_distribution { __device__ inline ulonglong2 operator()( - murandStatePhilox4_32_10_t *state) const { + curandStatePhilox4_32_10_t *state) const { ulonglong2 result; - uint4 rand = murand4(state); + uint4 rand = curand4(state); result.x = (uint64_t)rand.x << 32 | rand.y; result.y = (uint64_t)rand.z << 32 | rand.w; return result; @@ -175,8 +171,8 @@ struct uniform_distribution { template <> struct normal_distribution { - __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { - return murand_normal4(state); + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_normal4(state); } static constexpr int kReturnsCount = 4; }; @@ -184,8 +180,8 @@ struct normal_distribution { template <> struct normal_distribution { __device__ inline double2 operator()( - murandStatePhilox4_32_10_t *state) const { - return murand_normal2_double(state); + curandStatePhilox4_32_10_t *state) const { + return curand_normal2_double(state); } static constexpr int kReturnsCount = 2; }; @@ -268,10 +264,10 @@ __global__ void DistributionKernel(size_t size, size_t stride) { size_t idx = 
static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = DistOp::kReturnsCount; -#if defined(__MUSACC__) - murandStatePhilox4_32_10_t state; - murand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = murandStatePhilox4_32_10_t; +#if defined(__NVCC__) + curandStatePhilox4_32_10_t state; + curand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = curandStatePhilox4_32_10_t; #else hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, offset, &state); diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 87283549f8e294..985c028afb2a88 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -20,12 +20,6 @@ limitations under the License. */ #include #include #endif - -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #include @@ -152,10 +146,6 @@ __global__ void VectorizedRandomGenerator( hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; -#elif defined(PADDLE_WITH_MUSA) - murandStatePhilox4_32_10_t state; - murand_init(seed, idx + THREAD_ID_X, increment, &state); - using SType = murandStatePhilox4_32_10_t; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -226,10 +216,6 @@ __global__ void VectorizedGeneratorMask(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; -#elif defined(PADDLE_WITH_MUSA) - murandStatePhilox4_32_10_t state; - murand_init(seed, idx + THREAD_ID_X, increment, &state); - using SType = murandStatePhilox4_32_10_t; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -302,11 +288,6 @@ void DropoutFwGPUKernelDriver( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); @@ -368,7 +349,7 @@ void DropoutFwGPUKernelDriver( } else { bool copy_in_kernel = GetSeedDataAndIncrement( dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP VectorizedRandomGenerator <<>>(0, size, @@ -468,8 +449,6 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, if (upscale_in_train && dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); -#elif defined(PADDLE_WITH_MUSA) - musaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 5b2657704367e5..c92acdaf4180be 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/function_traits.h" @@ -150,7 +150,7 @@ class MidWiseTransformIterator int64_t post_; }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor, @@ -485,7 +485,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) // static unroller template
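Many of the hunks above make the same one-line change: dropping defined(__MUSACC__) (or PADDLE_WITH_MUSA) from the __NVCC__ / __HIPCC__ / __xpu__ compiler guard in otherwise unrelated headers. As a hedged aside on the design choice, that kind of churn could be reduced by centralizing the guard behind a single macro so a backend is added or removed in exactly one place. The sketch below is hypothetical: the header path and the PHI_GPU_KERNEL_COMPILER name are invented for illustration and are not part of Paddle.

// Hypothetical consolidation sketch -- not part of Paddle.
// e.g. a single header such as paddle/phi/core/gpu_lang_guard.h
#pragma once

// True when compiling with any supported GPU/accelerator device compiler;
// extend this one expression per new backend instead of editing every header.
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
#define PHI_GPU_KERNEL_COMPILER 1
#else
#define PHI_GPU_KERNEL_COMPILER 0
#endif

// Usage in a kernel header, replacing the repeated defined(...) chain:
//   #if PHI_GPU_KERNEL_COMPILER
//   #include "paddle/phi/kernels/funcs/aligned_vector.h"
//   ...
//   #endif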